@elizaos/capacitor-bun-runtime 2.0.3-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/ElizaosCapacitorBunRuntime.podspec +54 -0
  2. package/LICENSE +21 -0
  3. package/README.md +127 -0
  4. package/dist/esm/definitions.d.ts +136 -0
  5. package/dist/esm/definitions.d.ts.map +1 -0
  6. package/dist/esm/definitions.js +14 -0
  7. package/dist/esm/definitions.js.map +1 -0
  8. package/dist/esm/index.d.ts +9 -0
  9. package/dist/esm/index.d.ts.map +1 -0
  10. package/dist/esm/index.js +11 -0
  11. package/dist/esm/index.js.map +1 -0
  12. package/dist/esm/web.d.ts +19 -0
  13. package/dist/esm/web.d.ts.map +1 -0
  14. package/dist/esm/web.js +44 -0
  15. package/dist/esm/web.js.map +1 -0
  16. package/dist/plugin.cjs.js +63 -0
  17. package/dist/plugin.cjs.js.map +1 -0
  18. package/dist/plugin.js +66 -0
  19. package/dist/plugin.js.map +1 -0
  20. package/ios/Sources/ElizaBunRuntimePlugin/BridgeInstaller.swift +94 -0
  21. package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntime.swift +705 -0
  22. package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntimePlugin.swift +1109 -0
  23. package/ios/Sources/ElizaBunRuntimePlugin/FullBunEngineHost.swift +677 -0
  24. package/ios/Sources/ElizaBunRuntimePlugin/JSContextHelpers.swift +226 -0
  25. package/ios/Sources/ElizaBunRuntimePlugin/SandboxPaths.swift +46 -0
  26. package/ios/Sources/ElizaBunRuntimePlugin/bridge/CryptoBridge.swift +238 -0
  27. package/ios/Sources/ElizaBunRuntimePlugin/bridge/ElizaSqliteVecBridge.m +28 -0
  28. package/ios/Sources/ElizaBunRuntimePlugin/bridge/FSBridge.swift +270 -0
  29. package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPBridge.swift +153 -0
  30. package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPServerBridge.swift +32 -0
  31. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridge.swift +233 -0
  32. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridgeImpl.swift +1863 -0
  33. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LogBridge.swift +36 -0
  34. package/ios/Sources/ElizaBunRuntimePlugin/bridge/PathsBridge.swift +41 -0
  35. package/ios/Sources/ElizaBunRuntimePlugin/bridge/ProcessBridge.swift +80 -0
  36. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridge.swift +406 -0
  37. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridgeInstaller.swift +17 -0
  38. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteVecLoader.swift +66 -0
  39. package/ios/Sources/ElizaBunRuntimePlugin/bridge/UIBridge.swift +72 -0
  40. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlChinesePhonemizer.swift +313 -0
  41. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlConfiguration.swift +28 -0
  42. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlEngine.swift +325 -0
  43. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlHindiPhonemizer.swift +150 -0
  44. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlJapanesePhonemizer.swift +209 -0
  45. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlLatinPhonemizer.swift +374 -0
  46. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlModel.swift +87 -0
  47. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPhonemizer.swift +679 -0
  48. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPronunciationDicts.swift +131 -0
  49. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlSupport.swift +24 -0
  50. package/ios/Tests/llama-bridge-smoke-main.swift +92 -0
  51. package/package.json +68 -0
  52. package/src/bridge-contract.test.ts +127 -0
  53. package/src/definitions.d.ts +136 -0
  54. package/src/definitions.d.ts.map +1 -0
  55. package/src/definitions.ts +152 -0
  56. package/src/index.d.ts +9 -0
  57. package/src/index.d.ts.map +1 -0
  58. package/src/index.ts +16 -0
  59. package/src/web.d.ts +19 -0
  60. package/src/web.d.ts.map +1 -0
  61. package/src/web.ts +80 -0
@@ -0,0 +1,1863 @@
1
+ import Foundation
2
+ #if !ELIZA_IOS_FULL_BUN_ENGINE
3
+ import JavaScriptCore
4
+ #endif
5
+ import Darwin.Mach
6
+
7
+ #if ELIZA_IOS_INCLUDE_LLAMA
8
+
9
+ // MARK: - LlamaBridgeImpl
10
+ //
11
+ // Real llama.cpp-backed implementation. Pure Swift API surface — does NOT
12
+ // install JS bridge functions. `LlamaBridge.swift` owns that JS-facing layer
13
+ // and delegates llama.cpp work to this class.
14
+ //
15
+ // The split keeps responsibilities clean:
16
+ // * `LlamaBridge.swift` owns the JS-facing contract (parses JSValue args,
17
+ // builds promises, schedules ManagedCallback streaming).
18
+ // * `LlamaBridgeImpl.swift` (this file) owns the C-API plumbing
19
+ // (@_silgen_name bindings, batch/sampler setup, decode loop).
20
+ //
21
+ // The impl is thread-safe: it does its own queueing via a per-session
22
+ // serial queue and a session registry guarded by a sync lock.
23
+
24
+ // MARK: - C-API bindings via @_silgen_name
25
+ //
26
+ // We call llama.cpp's C symbols directly through @_silgen_name rather than
27
+ // importing a generated module. This keeps us provider-agnostic: the same
28
+ // Swift code works whether the binary slice came from `LlamaCpp.xcframework`
29
+ // (built by the app-core iOS local-inference pipeline) or from a different
30
+ // distribution. The contract is the linker — at link time
31
+ // the symbols must resolve, otherwise we get a clear "Undefined symbol"
32
+ // error.
33
+ //
34
+ // Symbol names track upstream llama.cpp >= b4404 (Jan 2025 sampler-chain
35
+ // API). If you bump
36
+ // the pinned llama.cpp version to one that renamed any of these symbols, this
37
+ // file is where you update them.
38
+
39
+ private let LLAMA_DEFAULT_SEED: UInt32 = 0xFFFFFFFF
40
+ private let LLAMA_TOKEN_NULL: Int32 = -1
41
+
42
+ typealias LlamaModelPtr = OpaquePointer
43
+ typealias LlamaContextPtr = OpaquePointer
44
+ typealias LlamaMemoryPtr = OpaquePointer
45
+ typealias LlamaVocabPtr = OpaquePointer
46
+ typealias LlamaSamplerPtr = OpaquePointer
47
+
48
+ @_silgen_name("llama_backend_init")
49
+ private func c_llama_backend_init()
50
+
51
+ @_silgen_name("llama_backend_free")
52
+ private func c_llama_backend_free()
53
+
54
+ @_silgen_name("llama_model_load_from_file")
55
+ private func c_llama_model_load_from_file(
56
+ _ path: UnsafePointer<CChar>,
57
+ _ params: LlamaModelParamsBag
58
+ ) -> LlamaModelPtr?
59
+
60
+ @_silgen_name("llama_model_free")
61
+ private func c_llama_model_free(_ model: LlamaModelPtr)
62
+
63
+ @_silgen_name("llama_model_default_params")
64
+ private func c_llama_model_default_params() -> LlamaModelParamsBag
65
+
66
+ @_silgen_name("llama_init_from_model")
67
+ private func c_llama_init_from_model(
68
+ _ model: LlamaModelPtr,
69
+ _ params: LlamaContextParamsBag
70
+ ) -> LlamaContextPtr?
71
+
72
+ @_silgen_name("llama_free")
73
+ private func c_llama_free(_ ctx: LlamaContextPtr)
74
+
75
+ @_silgen_name("llama_context_default_params")
76
+ private func c_llama_context_default_params() -> LlamaContextParamsBag
77
+
78
+ @_silgen_name("llama_model_get_vocab")
79
+ private func c_llama_model_get_vocab(_ model: LlamaModelPtr) -> LlamaVocabPtr
80
+
81
+ @_silgen_name("llama_n_ctx")
82
+ private func c_llama_n_ctx(_ ctx: LlamaContextPtr) -> UInt32
83
+
84
+ @_silgen_name("llama_get_memory")
85
+ private func c_llama_get_memory(_ ctx: LlamaContextPtr) -> LlamaMemoryPtr?
86
+
87
+ @_silgen_name("llama_memory_clear")
88
+ private func c_llama_memory_clear(_ memory: LlamaMemoryPtr, _ data: Bool)
89
+
90
+ @_silgen_name("llama_tokenize")
91
+ private func c_llama_tokenize(
92
+ _ vocab: LlamaVocabPtr,
93
+ _ text: UnsafePointer<CChar>,
94
+ _ text_len: Int32,
95
+ _ tokens: UnsafeMutablePointer<Int32>,
96
+ _ n_tokens_max: Int32,
97
+ _ add_special: Bool,
98
+ _ parse_special: Bool
99
+ ) -> Int32
100
+
101
+ @_silgen_name("llama_token_to_piece")
102
+ private func c_llama_token_to_piece(
103
+ _ vocab: LlamaVocabPtr,
104
+ _ token: Int32,
105
+ _ buf: UnsafeMutablePointer<CChar>,
106
+ _ length: Int32,
107
+ _ lstrip: Int32,
108
+ _ special: Bool
109
+ ) -> Int32
110
+
111
+ @_silgen_name("llama_vocab_is_eog")
112
+ private func c_llama_vocab_is_eog(_ vocab: LlamaVocabPtr, _ token: Int32) -> Bool
113
+
114
+ @_silgen_name("llama_batch_init")
115
+ private func c_llama_batch_init(_ n_tokens: Int32, _ embd: Int32, _ n_seq_max: Int32) -> LlamaBatch
116
+
117
+ @_silgen_name("llama_batch_free")
118
+ private func c_llama_batch_free(_ batch: LlamaBatch)
119
+
120
+ @_silgen_name("llama_decode")
121
+ private func c_llama_decode(_ ctx: LlamaContextPtr, _ batch: LlamaBatch) -> Int32
122
+
123
+ @_silgen_name("llama_sampler_chain_default_params")
124
+ private func c_llama_sampler_chain_default_params() -> LlamaSamplerChainParams
125
+
126
+ @_silgen_name("llama_sampler_chain_init")
127
+ private func c_llama_sampler_chain_init(_ params: LlamaSamplerChainParams) -> LlamaSamplerPtr?
128
+
129
+ @_silgen_name("llama_sampler_chain_add")
130
+ private func c_llama_sampler_chain_add(_ chain: LlamaSamplerPtr, _ sampler: LlamaSamplerPtr)
131
+
132
+ @_silgen_name("llama_sampler_init_temp")
133
+ private func c_llama_sampler_init_temp(_ t: Float) -> LlamaSamplerPtr?
134
+
135
+ @_silgen_name("llama_sampler_init_top_p")
136
+ private func c_llama_sampler_init_top_p(_ p: Float, _ min_keep: Int) -> LlamaSamplerPtr?
137
+
138
+ @_silgen_name("llama_sampler_init_top_k")
139
+ private func c_llama_sampler_init_top_k(_ k: Int32) -> LlamaSamplerPtr?
140
+
141
+ @_silgen_name("llama_sampler_init_dist")
142
+ private func c_llama_sampler_init_dist(_ seed: UInt32) -> LlamaSamplerPtr?
143
+
144
+ @_silgen_name("llama_sampler_sample")
145
+ private func c_llama_sampler_sample(_ smpl: LlamaSamplerPtr, _ ctx: LlamaContextPtr, _ idx: Int32) -> Int32
146
+
147
+ @_silgen_name("llama_sampler_accept")
148
+ private func c_llama_sampler_accept(_ smpl: LlamaSamplerPtr, _ token: Int32)
149
+
150
+ @_silgen_name("llama_sampler_free")
151
+ private func c_llama_sampler_free(_ smpl: LlamaSamplerPtr)
152
+
153
+ // MARK: - Opaque parameter bags
154
+ //
155
+ // llama.cpp's `llama_model_params`, `llama_context_params`, and `llama_batch`
156
+ // are POD structs but their layouts drift across upstream releases. We treat
157
+ // the params structs as opaque byte bags sized exactly to the pinned layout, and use a tiny C
158
+ // shim (LlamaShim.c) for the few field reads/writes Swift needs. That keeps
159
+ // Swift agnostic to layout drift.
160
+ //
161
+ // `LlamaBatch` we mirror in Swift because its layout has been stable since
162
+ // the b3000-era refactor and we need to pass it back into C functions by
163
+ // value. Six pointers + n_tokens; alignment is automatic.
164
+
165
/// Opaque stand-in for the pinned iOS `llama_model_params` struct.
///
/// Values of this type are returned from and passed into C by value, so the
/// storage has to match the C struct's size exactly (72 bytes); a merely
/// "large enough" buffer would not be ABI-safe.
struct LlamaModelParamsBag {
    // Nine zero-initialised 64-bit words = 72 bytes. Swift code does not read
    // individual fields through this property; the shim is the authorized
    // writer.
    private var storage: (
        UInt64, UInt64, UInt64, UInt64, UInt64, UInt64, UInt64, UInt64, UInt64
    ) = (0, 0, 0, 0, 0, 0, 0, 0, 0)
}
174
+
175
/// Opaque stand-in for the pinned iOS `llama_context_params` struct.
///
/// The storage is exactly 136 bytes so the value can cross the C boundary by
/// value without changing the ABI.
struct LlamaContextParamsBag {
    // Seventeen zero-initialised 64-bit words = 136 bytes.
    private var storage: (
        UInt64, UInt64, UInt64, UInt64, UInt64, UInt64,
        UInt64, UInt64, UInt64, UInt64, UInt64, UInt64,
        UInt64, UInt64, UInt64, UInt64, UInt64
    ) = (
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0
    )
}
185
+
186
/// Swift mirror of the sampler-chain params value that is handed to
/// `llama_sampler_chain_init`. It carries a single boolean flag.
struct LlamaSamplerChainParams {
    var no_perf = false
}
189
+
190
/// Swift mirror of `llama_batch`: a token count followed by six buffer
/// pointers. Field order and types must stay as declared here because the
/// value is passed to and returned from C functions by value.
struct LlamaBatch {
    // Number of entries currently stored in the batch.
    var n_tokens: Int32 = 0

    // Per-entry buffers; all start out nil.
    var token: UnsafeMutablePointer<Int32>?
    var embd: UnsafeMutablePointer<Float>?
    var pos: UnsafeMutablePointer<Int32>?
    var n_seq_id: UnsafeMutablePointer<Int32>?
    var seq_id: UnsafeMutablePointer<UnsafeMutablePointer<Int32>?>?
    var logits: UnsafeMutablePointer<Int8>?
}
199
+
200
+ // Shim symbols — implemented in LlamaShim.c. The shim is folded into libllama.a
201
+ // by `vendor-deps/llama.cpp/build-ios.sh`.
202
+
203
+ @_silgen_name("eliza_llama_model_params_set_n_gpu_layers")
204
+ private func shim_model_params_set_n_gpu_layers(_ params: UnsafeMutablePointer<LlamaModelParamsBag>, _ n: Int32)
205
+
206
+ @_silgen_name("eliza_llama_context_params_set_n_ctx")
207
+ private func shim_context_params_set_n_ctx(_ params: UnsafeMutablePointer<LlamaContextParamsBag>, _ n: UInt32)
208
+
209
+ @_silgen_name("eliza_llama_context_params_set_n_threads")
210
+ private func shim_context_params_set_n_threads(_ params: UnsafeMutablePointer<LlamaContextParamsBag>, _ n: Int32, _ n_batch: Int32)
211
+
212
+ @_silgen_name("eliza_llama_context_params_set_batch_sizes")
213
+ private func shim_context_params_set_batch_sizes(_ params: UnsafeMutablePointer<LlamaContextParamsBag>, _ nBatch: UInt32, _ nUbatch: UInt32)
214
+
215
+ @_silgen_name("eliza_llama_batch_set_single")
216
+ private func shim_batch_set_single(_ batch: UnsafeMutablePointer<LlamaBatch>, _ token: Int32, _ pos: Int32, _ logits_out: Bool)
217
+
218
+ @_silgen_name("eliza_llama_batch_append")
219
+ private func shim_batch_append(_ batch: UnsafeMutablePointer<LlamaBatch>, _ token: Int32, _ pos: Int32, _ logits_out: Bool)
220
+
221
+ @_silgen_name("eliza_llama_batch_reset")
222
+ private func shim_batch_reset(_ batch: UnsafeMutablePointer<LlamaBatch>)
223
+
224
+ @_silgen_name("eliza_llama_log_silence")
225
+ private func shim_log_silence()
226
+
227
+ @_silgen_name("eliza_llama_has_metal")
228
+ private func shim_has_metal() -> Bool
229
+
230
+ // KV cache-type setters. `type` is the integer value of llama.cpp's
231
+ // `ggml_type` enum (e.g. 1=f16, 8=q8_0, 2=q4_0). The Swift wrapper
232
+ // maps the string-typed cacheType{K,V} from JS to the enum value
233
+ // via `ggmlTypeFromString` and only invokes these when a mapping
234
+ // exists; otherwise the field keeps its default and llama.cpp uses
235
+ // the build-time default (typically f16).
236
+ @_silgen_name("eliza_llama_context_params_set_type_k")
237
+ private func shim_context_params_set_type_k(_ params: UnsafeMutablePointer<LlamaContextParamsBag>, _ type: Int32)
238
+
239
+ @_silgen_name("eliza_llama_context_params_set_type_v")
240
+ private func shim_context_params_set_type_v(_ params: UnsafeMutablePointer<LlamaContextParamsBag>, _ type: Int32)
241
+
242
+ // MTP speculative-decode bridge. `shim_speculative_supported()`
243
+ // returns true only when the linked slice has the buun fork's
244
+ // libcommon (with `common_speculative_draft_gen`) folded into it.
245
+ // On stock slices the helper is absent and `supported()` is false;
246
+ // the generate loop then falls back to plain decode.
247
+ @_silgen_name("eliza_llama_speculative_supported")
248
+ private func shim_speculative_supported() -> Bool
249
+
250
+ @_silgen_name("eliza_llama_speculative_draft_gen")
251
+ private func shim_speculative_draft_gen(
252
+ _ targetCtx: LlamaContextPtr,
253
+ _ drafterCtx: LlamaContextPtr,
254
+ _ pastTokens: UnsafePointer<Int32>,
255
+ _ nPast: Int32,
256
+ _ draftMin: Int32,
257
+ _ draftMax: Int32,
258
+ _ outDrafted: UnsafeMutablePointer<Int32>,
259
+ _ outCapacity: Int32
260
+ ) -> Int32
261
+
262
+ // Token-tree sampler constructor. Returns NULL when the slice does
263
+ // not link `llama_sampler_init_logit_bias` or when the payload is
264
+ // malformed. The Swift caller checks for NULL before adding the stage
265
+ // to the sampler chain.
266
+ @_silgen_name("eliza_llama_sampler_init_token_tree")
267
+ private func shim_sampler_init_token_tree(
268
+ _ nVocab: Int32,
269
+ _ trieBytes: UnsafePointer<UInt8>,
270
+ _ trieSize: Int
271
+ ) -> LlamaSamplerPtr?
272
+
273
+ // Vocab size lookup used when constructing the token-tree sampler.
274
+ @_silgen_name("llama_vocab_n_tokens")
275
+ private func c_llama_vocab_n_tokens(_ vocab: LlamaVocabPtr) -> Int32
276
+
277
+ typealias ElizaInferenceContextPtr = OpaquePointer
278
+
279
+ @_silgen_name("eliza_inference_abi_version")
280
+ private func c_eliza_inference_abi_version() -> UnsafePointer<CChar>?
281
+
282
+ @_silgen_name("eliza_inference_create")
283
+ private func c_eliza_inference_create(
284
+ _ bundleDir: UnsafePointer<CChar>,
285
+ _ outError: UnsafeMutablePointer<UnsafeMutablePointer<CChar>?>
286
+ ) -> ElizaInferenceContextPtr?
287
+
288
+ @_silgen_name("eliza_inference_destroy")
289
+ private func c_eliza_inference_destroy(_ ctx: ElizaInferenceContextPtr?)
290
+
291
+ @_silgen_name("eliza_inference_mmap_acquire")
292
+ private func c_eliza_inference_mmap_acquire(
293
+ _ ctx: ElizaInferenceContextPtr?,
294
+ _ regionName: UnsafePointer<CChar>,
295
+ _ outError: UnsafeMutablePointer<UnsafeMutablePointer<CChar>?>
296
+ ) -> Int32
297
+
298
+ @_silgen_name("eliza_inference_tts_synthesize")
299
+ private func c_eliza_inference_tts_synthesize(
300
+ _ ctx: ElizaInferenceContextPtr?,
301
+ _ text: UnsafePointer<CChar>,
302
+ _ textLen: Int,
303
+ _ speakerPresetId: UnsafePointer<CChar>?,
304
+ _ outPcm: UnsafeMutablePointer<Float>,
305
+ _ maxSamples: Int,
306
+ _ outError: UnsafeMutablePointer<UnsafeMutablePointer<CChar>?>
307
+ ) -> Int32
308
+
309
+ @_silgen_name("eliza_inference_asr_transcribe")
310
+ private func c_eliza_inference_asr_transcribe(
311
+ _ ctx: ElizaInferenceContextPtr?,
312
+ _ pcm: UnsafePointer<Float>?,
313
+ _ nSamples: Int,
314
+ _ sampleRate: Int32,
315
+ _ outText: UnsafeMutablePointer<CChar>?,
316
+ _ maxTextBytes: Int,
317
+ _ outError: UnsafeMutablePointer<UnsafeMutablePointer<CChar>?>?
318
+ ) -> Int32
319
+
320
+ @_silgen_name("eliza_inference_free_string")
321
+ private func c_eliza_inference_free_string(_ value: UnsafeMutablePointer<CChar>?)
322
+
323
+ // MARK: - Result types
324
+
325
/// Speculative-decode switch chosen per generation.
///
/// - `auto`: follow the session state (drafter loaded and the slice supports
///   speculative decode).
/// - `on`: require speculative decode; falls back with a log line if it is
///   unsupported.
/// - `off`: force plain decode.
public enum SpecDecodeMode {
    case auto, on, off
}
333
+
334
/// Outcome of a model load: a context id, an error message, or both.
public struct LlamaLoadResult {
    public let contextId: Int64?
    public let error: String?

    /// A successful load identified by `id`; `error` is nil.
    public static func success(_ id: Int64) -> LlamaLoadResult {
        return LlamaLoadResult(contextId: id, error: nil)
    }

    /// A failure that may still carry a context id and/or a message.
    public static func failure(_ id: Int64?, _ msg: String?) -> LlamaLoadResult {
        return LlamaLoadResult(contextId: id, error: msg)
    }

    /// A failure with a message and no context id.
    public static func failure(_ msg: String) -> LlamaLoadResult {
        return LlamaLoadResult(contextId: nil, error: msg)
    }
}
341
+
342
/// Outcome of a text generation: the produced text with token counts and
/// timing, or an error message.
public struct LlamaGenerateResult {
    public let text: String
    public let promptTokens: Int
    public let outputTokens: Int
    public let durationMs: Double
    public let error: String?

    /// Builds a successful result; `error` is nil.
    public static func success(text: String, promptTokens: Int, outputTokens: Int, durationMs: Double) -> LlamaGenerateResult {
        return LlamaGenerateResult(
            text: text,
            promptTokens: promptTokens,
            outputTokens: outputTokens,
            durationMs: durationMs,
            error: nil
        )
    }

    /// Builds a failed result: empty text, zeroed counters, and `msg` as the error.
    public static func failure(_ msg: String) -> LlamaGenerateResult {
        return LlamaGenerateResult(
            text: "",
            promptTokens: 0,
            outputTokens: 0,
            durationMs: 0,
            error: msg
        )
    }
}
355
+
356
/// Outcome of a text-to-speech synthesis call.
public struct LlamaTtsSynthesizeResult {
    public let audioBase64: String
    public let audioFilePath: String?
    public let contentType: String
    public let sampleRate: Int
    public let samples: Int
    public let durationMs: Double
    public let error: String?

    /// Successful synthesis whose audio was written to `audioFilePath`.
    /// `audioBase64` is left empty and the content type is `audio/wav`.
    public static func success(audioFilePath: String, sampleRate: Int, samples: Int, durationMs: Double) -> LlamaTtsSynthesizeResult {
        let result = LlamaTtsSynthesizeResult(
            audioBase64: "",
            audioFilePath: audioFilePath,
            contentType: "audio/wav",
            sampleRate: sampleRate,
            samples: samples,
            durationMs: durationMs,
            error: nil
        )
        return result
    }

    /// Failed synthesis: no audio, zero samples, a 24 kHz placeholder sample
    /// rate, and `msg` as the error.
    public static func failure(_ msg: String) -> LlamaTtsSynthesizeResult {
        let result = LlamaTtsSynthesizeResult(
            audioBase64: "",
            audioFilePath: nil,
            contentType: "audio/wav",
            sampleRate: 24_000,
            samples: 0,
            durationMs: 0,
            error: msg
        )
        return result
    }
}
389
+
390
/// Outcome of a speech-to-text transcription call.
public struct LlamaAsrTranscribeResult {
    public let text: String
    public let durationMs: Double
    public let error: String?

    /// Successful transcription; `error` is nil.
    public static func success(text: String, durationMs: Double) -> LlamaAsrTranscribeResult {
        return LlamaAsrTranscribeResult(text: text, durationMs: durationMs, error: nil)
    }

    /// Failed transcription: empty text, zero duration, and `msg` as the error.
    public static func failure(_ msg: String) -> LlamaAsrTranscribeResult {
        return LlamaAsrTranscribeResult(text: "", durationMs: 0, error: msg)
    }
}
403
+
404
/// Snapshot of the device capabilities reported to the bridge.
public struct LlamaHardwareInfo {
    /// Either "metal" or "cpu".
    public let backend: String
    public let totalRamGB: Double
    public let availableRamGB: Double
    public let cpuCores: Int
    public let isSimulator: Bool
    public let metalSupported: Bool
    /// True when the linked slice exposes a usable `common_speculative_draft_gen`
    /// AND the device has enough headroom to run target + drafter side-by-side.
    public let mtpSupported: Bool
    /// Optional human-readable reason when `mtpSupported` is false.
    public let mtpReason: String?

    /// Converts the snapshot to the `[String: Any]` shape the bridge contract
    /// expects. Numeric and boolean fields are boxed as `NSNumber`; the
    /// `mtp_reason` key is present only when a reason was provided.
    public func asDict() -> [String: Any] {
        var payload = [String: Any]()
        payload["backend"] = backend
        payload["total_ram_gb"] = NSNumber(value: totalRamGB)
        payload["available_ram_gb"] = NSNumber(value: availableRamGB)
        payload["cpu_cores"] = NSNumber(value: cpuCores)
        payload["is_simulator"] = NSNumber(value: isSimulator)
        payload["metal_supported"] = NSNumber(value: metalSupported)
        payload["mtp_supported"] = NSNumber(value: mtpSupported)

        guard let why = mtpReason else {
            return payload
        }
        payload["mtp_reason"] = why
        return payload
    }
}
434
+
435
+ // MARK: - Session bookkeeping
436
+
437
/// One loaded model plus its context, and optionally a drafter model/context
/// pair used for speculative decode. Each session owns a dispatch queue
/// labelled with its id.
private final class LlamaSession {
    // Identity and primary llama.cpp handles.
    let id: Int64
    let model: LlamaModelPtr
    let ctx: LlamaContextPtr
    let vocab: LlamaVocabPtr
    let workQueue: DispatchQueue
    let nCtx: UInt32
    let nBatch: UInt32
    var cancelled: Bool = false

    // MTP drafter state. Non-nil iff the user passed a `draftModelPath`
    // at load time AND the slice supports speculative decode (the
    // `eliza_llama_speculative_supported` shim probe returned true).
    let drafterModel: LlamaModelPtr?
    let drafterCtx: LlamaContextPtr?
    let draftMinDefault: Int32
    let draftMaxDefault: Int32

    /// Stores the supplied handles and settings and creates the session's
    /// work queue. The drafter arguments default to "no drafter" with a
    /// draft window of 1...3.
    init(
        id: Int64,
        model: LlamaModelPtr,
        ctx: LlamaContextPtr,
        vocab: LlamaVocabPtr,
        nCtx: UInt32,
        nBatch: UInt32,
        drafterModel: LlamaModelPtr? = nil,
        drafterCtx: LlamaContextPtr? = nil,
        draftMinDefault: Int32 = 1,
        draftMaxDefault: Int32 = 3
    ) {
        let queueLabel = "ai.eliza.bun.llama.session.\(id)"
        self.workQueue = DispatchQueue(label: queueLabel)

        self.id = id
        self.model = model
        self.ctx = ctx
        self.vocab = vocab
        self.nCtx = nCtx
        self.nBatch = nBatch

        self.drafterModel = drafterModel
        self.drafterCtx = drafterCtx
        self.draftMinDefault = draftMinDefault
        self.draftMaxDefault = draftMaxDefault
    }

    /// Releases the native handles: drafter context, drafter model, main
    /// context, main model — in that order.
    func free() {
        if let draftContext = drafterCtx {
            c_llama_free(draftContext)
        }
        if let draftModel = drafterModel {
            c_llama_model_free(draftModel)
        }
        c_llama_free(ctx)
        c_llama_model_free(model)
    }
}
491
+
492
/// Immutable record tying an inference context handle to the bundle
/// directory and backend name it was stored with.
private final class CachedVoiceContext {
    let bundleDir: String
    let backend: String
    let context: ElizaInferenceContextPtr

    /// Stores the three values as given.
    init(bundleDir: String, backend: String, context: ElizaInferenceContextPtr) {
        self.context = context
        self.backend = backend
        self.bundleDir = bundleDir
    }
}
503
+
504
/// Translates a KV cache type name ("f16", "q8_0", "q4_0", "tbq3", ...) into
/// the integer value of llama.cpp's `ggml_type` enum.
///
/// The lookup is case-insensitive. `nil`, the empty string, and unrecognised
/// names all yield `nil`, which lets the caller leave the params struct at
/// its default. The fork-specific TBQ / QJL / Q4_POLAR codes mirror the
/// patched `ggml_type` enum values introduced by
/// `packages/app-core/scripts/build-llama-cpp-mtp.mjs`; when the linked slice
/// doesn't have those kernels compiled in, llama.cpp reports the error at
/// context-init time and it is surfaced through `loadModel`'s failure path.
///
/// - Parameter raw: Cache type name, or nil.
/// - Returns: The matching `ggml_type` code, or nil when there is no mapping.
private func ggmlTypeFromString(_ raw: String?) -> Int32? {
    let codesByName: [String: Int32] = [
        "f32": 0,
        "f16": 1,
        "q4_0": 2,
        "q4_1": 3,
        "q5_0": 6,
        "q5_1": 7,
        "q8_0": 8,
        "q8_1": 9,
        "q2_k": 10,
        "q3_k": 11,
        "q4_k": 12,
        "q5_k": 13,
        "q6_k": 14,
        "q8_k": 15,
        // Buun fork codes. Values mirror the patched enum in build-llama-cpp-mtp.mjs.
        "tbq3": 64,
        "q4_tq3": 64,
        "tbq4": 65,
        "q4_tq4": 65,
        "qjl4": 66,
        "q4_polar": 67
    ]

    guard let name = raw, !name.isEmpty else {
        return nil
    }
    return codesByName[name.lowercased()]
}
536
+ }
537
+
538
/// Process-wide table of live sessions keyed by id. All state is read and
/// written inside `queue.sync` blocks on a private dispatch queue.
private final class SessionRegistry {
    static let shared = SessionRegistry()

    private let queue = DispatchQueue(label: "ai.eliza.bun.llama.sessions")
    private var sessions: [Int64: LlamaSession] = [:]
    private var nextId: Int64 = 1
    private var backendInitialized = false

    /// Silences llama logging and initialises the llama backend the first
    /// time it is called; later calls do nothing.
    func ensureBackend() {
        queue.sync {
            guard !backendInitialized else { return }
            shim_log_silence()
            c_llama_backend_init()
            backendInitialized = true
        }
    }

    /// Registers `session` under its own id, replacing any existing entry.
    func add(_ session: LlamaSession) {
        queue.sync {
            sessions[session.id] = session
        }
    }

    /// Returns the session registered under `id`, or nil.
    func get(_ id: Int64) -> LlamaSession? {
        var found: LlamaSession?
        queue.sync {
            found = sessions[id]
        }
        return found
    }

    /// Unregisters and returns the session stored under `id`, or nil if
    /// there was none.
    func remove(_ id: Int64) -> LlamaSession? {
        return queue.sync {
            sessions.removeValue(forKey: id)
        }
    }

    /// Returns the next unused id and advances the counter.
    func allocateId() -> Int64 {
        return queue.sync { () -> Int64 in
            let issued = nextId
            nextId = issued + 1
            return issued
        }
    }
}
578
+
579
+ // MARK: - LlamaBridgeImpl public API
580
+
581
+ public final class LlamaBridgeImpl {
582
+ public static let shared = LlamaBridgeImpl()
583
+ private let ttsQueue = DispatchQueue(label: "ai.eliza.bun.llama.tts")
584
+ private var cachedTtsContext: CachedVoiceContext?
585
+ private var cachedAsrContext: CachedVoiceContext?
586
+
587
+ private init() {}
588
+
589
/// Whether this build targets the simulator; decided at compile time.
private static var isRunningInSimulator: Bool {
    let simulated: Bool
    #if targetEnvironment(simulator)
    simulated = true
    #else
    simulated = false
    #endif
    return simulated
}
596
+
597
/// Patches a model-params bag in place so the model load stays on the CPU.
///
/// Keep the byte offsets in sync with the pinned `llama_model_params` layout
/// mirrored in `runtime-symbol-shim.c`. Upstream defaults keep
/// `use_extra_bufts` enabled, which can still touch Metal buffer types even
/// when `n_gpu_layers` is zero. `split_mode = LLAMA_SPLIT_MODE_NONE` plus
/// `main_gpu = -1` tells llama.cpp to clear discovered GPU devices.
///
/// - Parameter params: The params bag to modify.
private static func forceModelCpuOnly(_ params: UnsafeMutablePointer<LlamaModelParamsBag>) {
    let base = UnsafeMutableRawPointer(params)
    // split_mode = LLAMA_SPLIT_MODE_NONE
    base.storeBytes(of: Int32(0), toByteOffset: 20, as: Int32.self)
    // main_gpu = disabled
    base.storeBytes(of: Int32(-1), toByteOffset: 24, as: Int32.self)
    // use_extra_bufts
    base.storeBytes(of: UInt8(0), toByteOffset: 69, as: UInt8.self)
}
608
+
609
/// Turns GPU offload for a context-params bag on or off in place.
///
/// Keep the byte offsets in sync with the pinned `llama_context_params`
/// layout mirrored in `runtime-symbol-shim.c`. CPU-only simulator loads must
/// also disable KQV/op offload and flash attention, otherwise llama.cpp can
/// initialize ggml-metal during context creation.
///
/// - Parameters:
///   - params: The params bag to modify.
///   - enabled: When false, flash attention is zeroed and both offload flags
///     are cleared; when true, both offload flags are set and flash attention
///     is left untouched.
private static func setContextGpuOffload(
    _ params: UnsafeMutablePointer<LlamaContextParamsBag>,
    enabled: Bool
) {
    let base = UnsafeMutableRawPointer(params)
    let offloadFlag: UInt8 = enabled ? 1 : 0

    if enabled == false {
        // flash_attn_type disabled
        base.storeBytes(of: Int32(0), toByteOffset: 36, as: Int32.self)
    }
    // offload_kqv
    base.storeBytes(of: offloadFlag, toByteOffset: 113, as: UInt8.self)
    // op_offload
    base.storeBytes(of: offloadFlag, toByteOffset: 115, as: UInt8.self)
}
624
+
625
/// Derives the logical and physical batch sizes for a given context size.
///
/// The logical size is `contextSize` clamped to 1...4096. The physical size
/// is the logical size capped at 512 in the simulator and 1024 otherwise,
/// and never below 1.
private static func mobileBatchSizes(contextSize: UInt32) -> (logical: UInt32, physical: UInt32) {
    let floor: UInt32 = 1
    let logicalCeiling: UInt32 = 4096
    let physicalCeiling: UInt32
    if Self.isRunningInSimulator {
        physicalCeiling = 512
    } else {
        physicalCeiling = 1024
    }

    let logical = max(floor, min(contextSize, logicalCeiling))
    let physical = max(floor, min(logical, physicalCeiling))
    return (logical, physical)
}
630
+
631
+ /// Synchronously loads a GGUF and returns either a context_id or an error.
632
+ /// Heavy operation (file I/O + model mmap + Metal init); the caller should
633
+ /// dispatch onto a background queue before invoking.
634
+ ///
635
+ /// When `draftModelPath` is set AND the linked slice supports speculative
636
+ /// decode (`shim_speculative_supported()` is true), a second model +
637
+ /// context is loaded as the MTP drafter and stored on the session.
638
+ /// Drafter load failures are non-fatal: we log and proceed without spec
639
+ /// decode rather than failing the entire load.
640
+ public func loadModel(
641
+ path: String,
642
+ contextSize: UInt32 = 4096,
643
+ useGPU: Bool = true,
644
+ threads: Int32? = nil,
645
+ draftModelPath: String? = nil,
646
+ draftContextSize: UInt32 = 4096,
647
+ draftGpuLayers: Int32? = nil,
648
+ draftMin: Int32 = 1,
649
+ draftMax: Int32 = 3,
650
+ cacheTypeK: String? = nil,
651
+ cacheTypeV: String? = nil
652
+ ) -> LlamaLoadResult {
653
+ guard FileManager.default.fileExists(atPath: path) else {
654
+ return .failure("llama_load_model: file not found at \(path)")
655
+ }
656
+ SessionRegistry.shared.ensureBackend()
657
+
658
+ let resolvedThreads = threads ?? min(4, Int32(ProcessInfo.processInfo.activeProcessorCount))
659
+
660
+ var modelParams = c_llama_model_default_params()
661
+ let canUseGPU = useGPU && shim_has_metal() && !Self.isRunningInSimulator
662
+ let nGpuLayers: Int32 = canUseGPU ? 999 : 0
663
+ withUnsafeMutablePointer(to: &modelParams) { ptr in
664
+ shim_model_params_set_n_gpu_layers(ptr, nGpuLayers)
665
+ if !canUseGPU {
666
+ Self.forceModelCpuOnly(ptr)
667
+ }
668
+ }
669
+
670
+ guard let modelPtr = path.withCString({ cpath in
671
+ c_llama_model_load_from_file(cpath, modelParams)
672
+ }) else {
673
+ return .failure("llama_model_load_from_file failed for \(path)")
674
+ }
675
+
676
+ let batchSizes = Self.mobileBatchSizes(contextSize: contextSize)
677
+ var ctxParams = c_llama_context_default_params()
678
+ withUnsafeMutablePointer(to: &ctxParams) { ptr in
679
+ shim_context_params_set_n_ctx(ptr, contextSize)
680
+ shim_context_params_set_batch_sizes(ptr, batchSizes.logical, batchSizes.physical)
681
+ shim_context_params_set_n_threads(ptr, resolvedThreads, resolvedThreads)
682
+ Self.setContextGpuOffload(ptr, enabled: canUseGPU)
683
+ if let kCode = ggmlTypeFromString(cacheTypeK) {
684
+ shim_context_params_set_type_k(ptr, kCode)
685
+ }
686
+ if let vCode = ggmlTypeFromString(cacheTypeV) {
687
+ shim_context_params_set_type_v(ptr, vCode)
688
+ }
689
+ }
690
+
691
+ guard let llamaCtx = c_llama_init_from_model(modelPtr, ctxParams) else {
692
+ c_llama_model_free(modelPtr)
693
+ return .failure("llama_init_from_model failed")
694
+ }
695
+
696
+ // Optional MTP drafter. Non-fatal: drafter load failures fall back
697
+ // to plain decode rather than aborting the main model load.
698
+ var drafterModelPtr: LlamaModelPtr? = nil
699
+ var drafterCtxPtr: LlamaContextPtr? = nil
700
+ if let drafterPath = draftModelPath, !drafterPath.isEmpty {
701
+ if !FileManager.default.fileExists(atPath: drafterPath) {
702
+ NSLog("[LlamaBridgeImpl] drafter not found at \(drafterPath); spec decode disabled")
703
+ } else if !shim_speculative_supported() {
704
+ NSLog("[LlamaBridgeImpl] linked slice has no common_speculative_draft_gen; spec decode disabled")
705
+ } else {
706
+ var drafterModelParams = c_llama_model_default_params()
707
+ let drafterLayers: Int32 = draftGpuLayers ?? (canUseGPU ? 999 : 0)
708
+ withUnsafeMutablePointer(to: &drafterModelParams) { ptr in
709
+ shim_model_params_set_n_gpu_layers(ptr, drafterLayers)
710
+ if !canUseGPU { Self.forceModelCpuOnly(ptr) }
711
+ }
712
+ let loadedDrafter = drafterPath.withCString { cpath in
713
+ c_llama_model_load_from_file(cpath, drafterModelParams)
714
+ }
715
+ if let dm = loadedDrafter {
716
+ let draftBatchSizes = Self.mobileBatchSizes(contextSize: draftContextSize)
717
+ var drafterCtxParams = c_llama_context_default_params()
718
+ withUnsafeMutablePointer(to: &drafterCtxParams) { ptr in
719
+ shim_context_params_set_n_ctx(ptr, draftContextSize)
720
+ shim_context_params_set_batch_sizes(ptr, draftBatchSizes.logical, draftBatchSizes.physical)
721
+ shim_context_params_set_n_threads(ptr, resolvedThreads, resolvedThreads)
722
+ Self.setContextGpuOffload(ptr, enabled: canUseGPU)
723
+ if let kCode = ggmlTypeFromString(cacheTypeK) {
724
+ shim_context_params_set_type_k(ptr, kCode)
725
+ }
726
+ if let vCode = ggmlTypeFromString(cacheTypeV) {
727
+ shim_context_params_set_type_v(ptr, vCode)
728
+ }
729
+ }
730
+ if let dctx = c_llama_init_from_model(dm, drafterCtxParams) {
731
+ drafterModelPtr = dm
732
+ drafterCtxPtr = dctx
733
+ } else {
734
+ c_llama_model_free(dm)
735
+ NSLog("[LlamaBridgeImpl] drafter context init failed; spec decode disabled")
736
+ }
737
+ } else {
738
+ NSLog("[LlamaBridgeImpl] drafter model load failed for \(drafterPath); spec decode disabled")
739
+ }
740
+ }
741
+ }
742
+
743
+ let vocab = c_llama_model_get_vocab(modelPtr)
744
+ let nCtxActual = c_llama_n_ctx(llamaCtx)
745
+ let id = SessionRegistry.shared.allocateId()
746
+ let resolvedDraftMin = max(1, draftMin)
747
+ let resolvedDraftMax = max(resolvedDraftMin, draftMax)
748
+ let session = LlamaSession(
749
+ id: id,
750
+ model: modelPtr,
751
+ ctx: llamaCtx,
752
+ vocab: vocab,
753
+ nCtx: nCtxActual,
754
+ nBatch: batchSizes.logical,
755
+ drafterModel: drafterModelPtr,
756
+ drafterCtx: drafterCtxPtr,
757
+ draftMinDefault: resolvedDraftMin,
758
+ draftMaxDefault: resolvedDraftMax
759
+ )
760
+ SessionRegistry.shared.add(session)
761
+ return .success(id)
762
+ }
763
+
764
/// Streaming generation. Returns the final result after the loop ends.
///
/// `onToken` is called with `(piece, false)` for every token appended to
/// the output. Once the generation loop has been entered, `onToken("", true)`
/// is sent exactly once: on normal completion and on decode failures inside
/// the loop. Failures before the loop (unknown context, tokenization,
/// prefill, sampler setup) return without invoking `onToken`. The caller is
/// responsible for marshalling `onToken` invocations back to the JS thread
/// (we don't do that here so this class stays JSC-agnostic).
///
/// - Parameters:
///   - contextId: Session id looked up in `SessionRegistry.shared`.
///   - prompt: Text to tokenize (with special tokens) and prefill.
///   - maxTokens: Upper bound on the number of generated tokens.
///   - temperature: Value passed to the temperature sampler stage.
///   - topP: Value passed to the top-p sampler stage.
///   - topK: Value passed to the top-k sampler stage.
///   - stopSequences: Generation stops once the accumulated text ends with
///     any non-empty entry; that suffix is stripped from the returned text
///     (the streaming consumer has already seen it).
///   - specDecode:
///     - `.auto` (default): use spec decode iff the session has a drafter
///       AND the slice supports it.
///     - `.on`: prefer spec decode; fall back to plain decode with a log
///       line when unsupported.
///     - `.off`: force plain decode even when a drafter is loaded.
///   - draftMin: Per-call override of the session default (clamped to >= 1).
///   - draftMax: Per-call override of the session default (clamped to >=
///     the effective `draftMin`).
///   - tokenTreeTrie: Serialized token-tree payload (see `token-tree.ts`).
///     When non-empty and the shim returns a sampler for it, that stage is
///     added to the sampler chain before top-k / top-p / temperature so the
///     trie constraints fire first.
///   - onToken: Optional streaming callback, see above.
/// - Returns: `.success` with the text, token counts and wall-clock duration
///   in milliseconds, or `.failure` with a message.
public func generate(
    contextId: Int64,
    prompt: String,
    maxTokens: Int32 = 256,
    temperature: Float = 0.7,
    topP: Float = 0.95,
    topK: Int32 = 40,
    stopSequences: [String] = [],
    specDecode: SpecDecodeMode = .auto,
    draftMin: Int32? = nil,
    draftMax: Int32? = nil,
    tokenTreeTrie: Data? = nil,
    onToken: ((String, Bool) -> Void)? = nil
) -> LlamaGenerateResult {
    guard let session = SessionRegistry.shared.get(contextId) else {
        return .failure("llama_generate: unknown context_id \(contextId)")
    }
    // Clear any cancellation request left over from an earlier call.
    session.cancelled = false
    let start = DispatchTime.now()

    // 1. Tokenize prompt.
    let promptTokens = LlamaBridgeImpl.tokenize(
        vocab: session.vocab,
        text: prompt,
        addSpecial: true
    )
    if promptTokens.isEmpty {
        return .failure("tokenize returned 0 tokens (prompt empty?)")
    }
    if Int32(promptTokens.count) >= Int32(session.nCtx) {
        return .failure("prompt (\(promptTokens.count) tokens) exceeds context (\(session.nCtx))")
    }

    // Reset KV cache for a clean generation.
    if let memory = c_llama_get_memory(session.ctx) {
        c_llama_memory_clear(memory, true)
    }

    // 2. Prefill the prompt in chunks that fit the context's logical
    // batch size. llama.cpp aborts, rather than returning an error, when a
    // single decode exceeds cparams.n_batch.
    let prefillChunkSize = max(1, min(Int(session.nBatch), promptTokens.count))
    let batch = c_llama_batch_init(Int32(prefillChunkSize), 0, 1)
    defer { c_llama_batch_free(batch) }

    var mutableBatch = batch
    var prefilled = 0
    while prefilled < promptTokens.count {
        let chunkEnd = min(prefilled + prefillChunkSize, promptTokens.count)
        withUnsafeMutablePointer(to: &mutableBatch) { ptr in
            shim_batch_reset(ptr)
            for i in prefilled..<chunkEnd {
                // Only the very last prompt token is flagged in the batch.
                let isLast = i == promptTokens.count - 1
                shim_batch_append(ptr, promptTokens[i], Int32(i), isLast)
            }
        }
        if c_llama_decode(session.ctx, mutableBatch) != 0 {
            return .failure("llama_decode (prompt chunk \(prefilled)..<\(chunkEnd)) failed")
        }
        prefilled = chunkEnd
    }

    // 3. Sampler chain.
    var chainParams = c_llama_sampler_chain_default_params()
    chainParams.no_perf = true
    guard let chain = c_llama_sampler_chain_init(chainParams) else {
        return .failure("llama_sampler_chain_init failed")
    }
    defer { c_llama_sampler_free(chain) }

    // Token-tree logit-bias stage fires first so the trie constrains
    // the distribution before temperature / top-k / top-p shrink it.
    // We feature-detect by NULL return: stock builds without
    // `llama_sampler_init_logit_bias` get a NULL here and we skip.
    if let trie = tokenTreeTrie, !trie.isEmpty {
        let nVocab = c_llama_vocab_n_tokens(session.vocab)
        let trieSampler: LlamaSamplerPtr? = trie.withUnsafeBytes { raw -> LlamaSamplerPtr? in
            guard let base = raw.baseAddress?.assumingMemoryBound(to: UInt8.self) else { return nil }
            return shim_sampler_init_token_tree(nVocab, base, trie.count)
        }
        if let s = trieSampler {
            c_llama_sampler_chain_add(chain, s)
        } else {
            NSLog("[LlamaBridgeImpl] token-tree sampler unavailable; skipping trie stage")
        }
    }

    if let s = c_llama_sampler_init_top_k(topK) { c_llama_sampler_chain_add(chain, s) }
    if let s = c_llama_sampler_init_top_p(topP, 1) { c_llama_sampler_chain_add(chain, s) }
    if let s = c_llama_sampler_init_temp(temperature) { c_llama_sampler_chain_add(chain, s) }
    if let s = c_llama_sampler_init_dist(LLAMA_DEFAULT_SEED) { c_llama_sampler_chain_add(chain, s) }

    // Resolve spec-decode mode for this call. The auto path defers
    // entirely to whether the session has a drafter AND the slice
    // supports spec decode. `.on` falls back gracefully (logs and
    // proceeds with plain decode) rather than failing.
    let wantsSpec: Bool
    switch specDecode {
    case .off:
        wantsSpec = false
    case .on:
        wantsSpec = true
        if session.drafterCtx == nil || !shim_speculative_supported() {
            NSLog("[LlamaBridgeImpl] spec decode requested but unavailable; falling back to plain decode")
        }
    case .auto:
        wantsSpec = session.drafterCtx != nil && shim_speculative_supported()
    }
    let useSpec = wantsSpec && session.drafterCtx != nil && shim_speculative_supported()
    let effectiveDraftMin = max(1, draftMin ?? session.draftMinDefault)
    let effectiveDraftMax = max(effectiveDraftMin, draftMax ?? session.draftMaxDefault)
    // Per-generation past-token buffer used by the spec-decode call.
    // Capacity is reserved up-front (nCtx) so we don't realloc each step.
    var pastTokenBuffer: [Int32] = useSpec ? promptTokens : []
    pastTokenBuffer.reserveCapacity(Int(session.nCtx))
    // Scratch buffer for drafted tokens. Sized to effectiveDraftMax; its
    // length is also handed to the helper as the writable capacity.
    var draftScratch = [Int32](repeating: 0, count: Int(effectiveDraftMax))

    // 4. Generation loop.
    var generated = ""
    var generatedTokens: Int32 = 0
    var nPast: Int32 = Int32(promptTokens.count)
    var stoppedByStopSeq = false

    // True when the accumulated text currently ends with a non-empty stop sequence.
    func endsWithStopSequence() -> Bool {
        stopSequences.contains(where: { !$0.isEmpty && generated.hasSuffix($0) })
    }

    generation: while generatedTokens < maxTokens {
        if session.cancelled { break generation }

        // First, sample one token from the target's current distribution.
        let newTokenId = c_llama_sampler_sample(chain, session.ctx, -1)
        c_llama_sampler_accept(chain, newTokenId)

        if c_llama_vocab_is_eog(session.vocab, newTokenId) {
            break generation
        }

        let piece = LlamaBridgeImpl.tokenToPiece(vocab: session.vocab, token: newTokenId)
        generated.append(piece)
        generatedTokens += 1

        onToken?(piece, false)
        if useSpec {
            pastTokenBuffer.append(newTokenId)
        }

        if endsWithStopSequence() {
            stoppedByStopSeq = true
            break generation
        }

        // Feed sampled token back to extend KV cache on the main context.
        withUnsafeMutablePointer(to: &mutableBatch) { ptr in
            shim_batch_set_single(ptr, newTokenId, nPast, true)
        }
        if c_llama_decode(session.ctx, mutableBatch) != 0 {
            onToken?("", true)
            return .failure("llama_decode (decode-loop) failed at token \(generatedTokens)")
        }
        nPast += 1

        if nPast >= Int32(session.nCtx) { break generation }
        if generatedTokens >= maxTokens { break generation }

        // Optional speculative-decode burst.
        //
        // After every verified token we ask the drafter to propose up to
        // `effectiveDraftMax` continuation tokens. Each proposal is decoded
        // on the target context and compared with what the shared sampler
        // chain then picks; the burst ends at the first disagreement and
        // the next outer iteration resumes per-token sampling.
        //
        // NOTE(review): on disagreement `proposed` has already been decoded
        // into the target context at `nPast`, while `verified` is only
        // recorded and never decoded. Confirm whether the KV cache needs to
        // be rolled back / re-fed here.
        if useSpec, let drafterCtx = session.drafterCtx {
            let reportedDrafts: Int32 = pastTokenBuffer.withUnsafeBufferPointer { pastBuf in
                guard let pastBase = pastBuf.baseAddress else { return 0 }
                return draftScratch.withUnsafeMutableBufferPointer { draftBuf in
                    guard let draftBase = draftBuf.baseAddress else { return 0 }
                    return shim_speculative_draft_gen(
                        session.ctx,
                        drafterCtx,
                        pastBase,
                        Int32(pastBuf.count),
                        effectiveDraftMin,
                        effectiveDraftMax,
                        draftBase,
                        Int32(draftBuf.count)
                    )
                }
            }
            // Never trust the helper's count beyond the scratch capacity;
            // indexing past it would trap.
            let nDrafted = min(Int(reportedDrafts), draftScratch.count)
            if nDrafted <= 0 { continue generation }

            for di in 0..<nDrafted {
                if generatedTokens >= maxTokens { break }
                let proposed = draftScratch[di]

                // Decode the proposal, then sample with the same chain so
                // temperature, top-k, top-p, and token-tree all apply
                // equally to drafted tokens.
                withUnsafeMutablePointer(to: &mutableBatch) { ptr in
                    shim_batch_set_single(ptr, proposed, nPast, true)
                }
                if c_llama_decode(session.ctx, mutableBatch) != 0 {
                    // Hard error: bubble up.
                    onToken?("", true)
                    return .failure("llama_decode (spec-verify) failed at token \(generatedTokens)")
                }
                let verified = c_llama_sampler_sample(chain, session.ctx, -1)
                if verified != proposed {
                    // Disagree: accept the verified token, drop rest of burst.
                    c_llama_sampler_accept(chain, verified)
                    let verifiedIsEog = c_llama_vocab_is_eog(session.vocab, verified)
                    let p = LlamaBridgeImpl.tokenToPiece(vocab: session.vocab, token: verified)
                    generated.append(p)
                    generatedTokens += 1
                    if !verifiedIsEog {
                        pastTokenBuffer.append(verified)
                    }
                    onToken?(p, false)
                    nPast += 1
                    // End-of-generation must stop the whole generation, not
                    // just this burst.
                    if verifiedIsEog { break generation }
                    // The verified token may itself complete a stop sequence.
                    if endsWithStopSequence() { stoppedByStopSeq = true }
                    break
                }
                // Agree: accept the proposal verbatim.
                c_llama_sampler_accept(chain, proposed)
                let p = LlamaBridgeImpl.tokenToPiece(vocab: session.vocab, token: proposed)
                generated.append(p)
                generatedTokens += 1
                pastTokenBuffer.append(proposed)
                onToken?(p, false)
                nPast += 1
                if c_llama_vocab_is_eog(session.vocab, proposed) { break generation }
                if endsWithStopSequence() {
                    stoppedByStopSeq = true
                    break
                }
                if nPast >= Int32(session.nCtx) { break }
            }
            if stoppedByStopSeq { break generation }
            if nPast >= Int32(session.nCtx) { break generation }
        }
    }

    onToken?("", true)

    // Strip stop sequence from the bulk text (streaming consumer already saw it).
    var finalText = generated
    if stoppedByStopSeq {
        for stop in stopSequences where !stop.isEmpty && finalText.hasSuffix(stop) {
            finalText = String(finalText.dropLast(stop.count))
            break
        }
    }

    let elapsedNs = DispatchTime.now().uptimeNanoseconds - start.uptimeNanoseconds
    return .success(
        text: finalText,
        promptTokens: promptTokens.count,
        outputTokens: Int(generatedTokens),
        durationMs: Double(elapsedNs) / 1_000_000.0
    )
}
1059
+
1060
/// Requests cancellation of the generation running on `contextId`.
///
/// Only raises the session's `cancelled` flag, which the generation loop
/// polls between sampled tokens. An id with no registered session is
/// ignored.
/// - Parameter contextId: Session id looked up in `SessionRegistry.shared`.
public func cancel(contextId: Int64) {
    guard let session = SessionRegistry.shared.get(contextId) else { return }
    session.cancelled = true
}
1065
+
1066
/// Synthesizes speech for `text`, serialized on `ttsQueue`.
///
/// When the Kokoro CoreML path is enabled and yields a result (success or
/// failure), that result is returned. Otherwise the request goes through
/// `synthesizeSpeechAttempt` with three environment variables temporarily
/// overridden for the duration of the attempt.
/// - Parameters:
///   - bundleDir: Directory of the voice bundle.
///   - text: Text to speak.
///   - speakerPresetId: Optional voice / speaker preset.
///   - maxSamples: Sample budget for the output buffer.
/// - Returns: The synthesis outcome.
public func synthesizeSpeech(
    bundleDir: String,
    text: String,
    speakerPresetId: String? = nil,
    maxSamples: Int = 24_000 * 60
) -> LlamaTtsSynthesizeResult {
    // Innermost step of the fallback path, run with all overrides in place.
    func runFallbackAttempt() -> LlamaTtsSynthesizeResult {
        synthesizeSpeechAttempt(
            bundleDir: bundleDir,
            text: text,
            speakerPresetId: speakerPresetId,
            maxSamples: maxSamples
        )
    }

    return ttsQueue.sync { () -> LlamaTtsSynthesizeResult in
        if Self.kokoroCoreMlTtsEnabled() {
            let coreMlOutcome = synthesizeKokoroCoreMl(
                bundleDir: bundleDir,
                text: text,
                speakerPresetId: speakerPresetId,
                maxSamples: maxSamples
            )
            if let coreMlOutcome {
                return coreMlOutcome
            }
        }
        // When CoreML Kokoro is unavailable, route through OmniVoice — the
        // tier DEFAULT voice engine (ELIZA_1_VOICE_BACKENDS) and the validated
        // fused engine. The fork GGUF-Kokoro path is intentionally disabled on
        // iOS ("not production speech"); OmniVoice is not gated.
        return Self.withTemporaryEnvironment("ELIZA_TTS_BACKEND", value: "omnivoice") {
            Self.withTemporaryEnvironment("ELIZA_TTS_MAX_BACKEND_ALLOC_MB", value: "768") {
                Self.withTemporaryEnvironment("GGML_BACKEND", value: "CPU", body: runFallbackAttempt)
            }
        }
    }
}
1100
+
1101
/// On-device speech-to-text, the counterpart of `synthesizeSpeech`.
///
/// Runs synchronously on `ttsQueue` with `GGML_BACKEND` temporarily set to
/// `"CPU"`, and delegates to `transcribeSpeechAttempt`, whose result is
/// returned unchanged.
/// - Parameters:
///   - bundleDir: Directory of the inference bundle.
///   - pcm: Audio samples; described upstream as mono fp32 in [-1, 1].
///   - sampleRate: Source sample rate in Hz, forwarded to the native call.
/// - Returns: The transcription outcome.
public func transcribeSpeech(
    bundleDir: String,
    pcm: [Float],
    sampleRate: Int
) -> LlamaAsrTranscribeResult {
    func runAttempt() -> LlamaAsrTranscribeResult {
        transcribeSpeechAttempt(
            bundleDir: bundleDir,
            pcm: pcm,
            sampleRate: sampleRate
        )
    }

    return ttsQueue.sync {
        Self.withTemporaryEnvironment("GGML_BACKEND", value: "CPU", body: runAttempt)
    }
}
1121
+
1122
/// Performs one ASR attempt under the currently configured backend.
///
/// Validates the bundle path, the PCM buffer, the sample rate and the
/// linked inference ABI (version >= 4), obtains an ASR context through
/// `prepareAsrContext`, and calls the native transcribe entry point. On a
/// native failure the cached ASR context is dropped before the error is
/// returned.
///
/// Log lines are passed to `NSLog` as an argument of a fixed `"%@"` format,
/// so `%` characters in paths or native error text are never interpreted
/// as format specifiers.
/// - Parameters:
///   - bundleDir: Directory of the inference bundle; must exist.
///   - pcm: Audio samples; must be non-empty.
///   - sampleRate: Source sample rate; must be representable as `Int32`.
/// - Returns: `.success` with the transcript and elapsed milliseconds, or
///   `.failure` with a message.
private func transcribeSpeechAttempt(
    bundleDir: String,
    pcm: [Float],
    sampleRate: Int
) -> LlamaAsrTranscribeResult {
    let attemptBackend = Self.currentBackendEnv()
    NSLog("%@", "[LlamaBridgeImpl] ASR attempt start backend=\(attemptBackend) bundle=\(bundleDir) samples=\(pcm.count) sampleRate=\(sampleRate)")
    guard FileManager.default.fileExists(atPath: bundleDir) else {
        NSLog("%@", "[LlamaBridgeImpl] ASR attempt failed stage=bundle-check backend=\(attemptBackend) bundle=\(bundleDir)")
        return .failure("eliza_asr_transcribe: bundle not found at \(bundleDir)")
    }
    guard !pcm.isEmpty else {
        NSLog("%@", "[LlamaBridgeImpl] ASR attempt failed stage=pcm-check backend=\(attemptBackend)")
        return .failure("eliza_asr_transcribe: empty pcm")
    }
    // The native call takes an Int32; a plain `Int32(sampleRate)` would trap
    // on an out-of-range value, so reject it as an ordinary failure instead.
    guard let nativeSampleRate = Int32(exactly: sampleRate) else {
        NSLog("%@", "[LlamaBridgeImpl] ASR attempt failed stage=sample-rate-check backend=\(attemptBackend) sampleRate=\(sampleRate)")
        return .failure("eliza_asr_transcribe: sampleRate \(sampleRate) is out of range")
    }
    guard let abiPtr = c_eliza_inference_abi_version() else {
        NSLog("%@", "[LlamaBridgeImpl] ASR attempt failed stage=abi backend=\(attemptBackend) reason=missing")
        return .failure("eliza_asr_transcribe: missing eliza inference ABI")
    }
    let abi = String(cString: abiPtr)
    guard let abiVersion = Int(abi), abiVersion >= 4 else {
        NSLog("%@", "[LlamaBridgeImpl] ASR attempt failed stage=abi backend=\(attemptBackend) abi=\(abi)")
        return .failure("eliza_asr_transcribe: linked iOS inference slice is the smoke-build ABI \(abi); rebuild with fused iOS local inference")
    }

    let start = DispatchTime.now()
    let prepared = prepareAsrContext(bundleDir: bundleDir, backend: attemptBackend)
    guard let ctx = prepared.context else {
        let error = prepared.error ?? "eliza_inference_mmap_acquire(asr) failed"
        return .failure(error)
    }

    // Zero-filled output buffer for the transcript.
    // NOTE(review): decoding below assumes the native side leaves a NUL
    // terminator inside these 4096 bytes; confirm against the C API.
    var out = [CChar](repeating: 0, count: 4096)
    var asrError: UnsafeMutablePointer<CChar>? = nil
    NSLog("%@", "[LlamaBridgeImpl] ASR stage=transcribe begin backend=\(attemptBackend) samples=\(pcm.count)")
    let bytesWritten = pcm.withUnsafeBufferPointer { pcmBuffer -> Int32 in
        guard let pcmPtr = pcmBuffer.baseAddress else { return -2 }
        return out.withUnsafeMutableBufferPointer { outBuffer -> Int32 in
            guard let outPtr = outBuffer.baseAddress else { return -2 }
            return c_eliza_inference_asr_transcribe(
                ctx,
                pcmPtr,
                pcm.count,
                nativeSampleRate,
                outPtr,
                outBuffer.count,
                &asrError
            )
        }
    }
    guard bytesWritten >= 0 else {
        let error = Self.takeInferenceError(&asrError, fallback: "eliza_inference_asr_transcribe failed with code \(bytesWritten)")
        NSLog("%@", "[LlamaBridgeImpl] ASR stage=transcribe failed backend=\(attemptBackend) code=\(bytesWritten) error=\(error)")
        // Drop the cached context after a native failure so the next
        // attempt creates a fresh one.
        clearCachedAsrContext()
        return .failure(error)
    }
    let transcript = out.withUnsafeBufferPointer { buffer -> String in
        guard let base = buffer.baseAddress else { return "" }
        return String(cString: base)
    }
    let elapsedNs = DispatchTime.now().uptimeNanoseconds - start.uptimeNanoseconds
    let durationMs = Double(elapsedNs) / 1_000_000.0
    NSLog("%@", "[LlamaBridgeImpl] ASR attempt ok backend=\(attemptBackend) bytes=\(bytesWritten) durationMs=\(durationMs)")
    return .success(text: transcript, durationMs: durationMs)
}
1186
+
1187
/// Builds a diagnostics dictionary describing the voice engine state.
///
/// Always reports the inference ABI version (or `"missing"`), the current
/// `GGML_BACKEND` / `ELIZA_TTS_BACKEND` values, the Kokoro feature flags,
/// whether TTS / ASR contexts are cached, and the hardware info. When
/// `bundleDir` is given, it is echoed back together with Kokoro CoreML
/// diagnostics for that bundle.
/// - Parameter bundleDir: Optional bundle directory to inspect.
/// - Returns: The diagnostics payload.
public func ttsEngineDiagnostics(bundleDir: String?) -> [String: Any] {
    let device = hardwareInfo()
    let baseReport: [String: Any] = [
        "available": true,
        "abiVersion": Self.elizaInferenceAbiVersion() ?? "missing",
        "ggmlBackendEnv": Self.currentBackendEnv(),
        "ttsBackendEnv": Self.currentTtsBackendEnv(),
        "kokoroGgufTtsEnabled": Self.experimentalKokoroGgufTtsEnabled(),
        "kokoroCoreMlTtsEnabled": Self.kokoroCoreMlTtsEnabled(),
        "cachedTtsContext": cachedTtsContext != nil,
        "cachedAsrContext": cachedAsrContext != nil,
        "hardware": device.asDict(),
    ]
    guard let bundleDir else {
        return baseReport
    }

    var bundleReport = baseReport
    bundleReport["bundleDir"] = bundleDir
    bundleReport["kokoroCoreMl"] = Self.kokoroCoreMlDiagnostics(bundleDir: bundleDir)
    return bundleReport
}
1206
+
1207
/// Attempts synthesis through the Kokoro CoreML engine.
///
/// Log lines are passed to `NSLog` as an argument of a fixed `"%@"` format,
/// so `%` characters in paths or error descriptions are never interpreted
/// as format specifiers.
/// - Parameters:
///   - bundleDir: Bundle directory searched for a Kokoro CoreML model.
///   - text: Text to speak.
///   - speakerPresetId: Optional voice name forwarded to the engine.
///   - maxSamples: Requested sample budget; raised to at least 24_000.
/// - Returns: `nil` when no CoreML model directory is resolved for the
///   bundle (the caller then falls back to another engine); `.failure` on
///   systems older than iOS 18 or when synthesis / writing the WAV throws;
///   otherwise `.success` pointing at a WAV file in the temporary directory.
private func synthesizeKokoroCoreMl(
    bundleDir: String,
    text: String,
    speakerPresetId: String?,
    maxSamples: Int
) -> LlamaTtsSynthesizeResult? {
    guard let coreMlDir = Self.kokoroCoreMlDirectory(bundleDir: bundleDir) else {
        return nil
    }
    guard #available(iOS 18.0, *) else {
        return .failure("iOS Kokoro CoreML TTS requires iOS 18 or newer")
    }
    do {
        NSLog("%@", "[LlamaBridgeImpl] TTS attempt start backend=kokoro-coreml bundle=\(bundleDir) modelDir=\(coreMlDir.path) textBytes=\(text.lengthOfBytes(using: .utf8)) maxSamples=\(maxSamples)")
        let result = try KokoroCoreMlEngine.shared.synthesize(
            modelDirectory: coreMlDir,
            text: text,
            voice: speakerPresetId,
            maxSamples: max(maxSamples, 24_000)
        )
        let wav = Self.wavData(from: result.samples, sampleRate: result.sampleRate)
        // A unique file name per call keeps concurrent outputs apart.
        let audioFileUrl = FileManager.default.temporaryDirectory
            .appendingPathComponent("eliza-kokoro-coreml-\(UUID().uuidString)")
            .appendingPathExtension("wav")
        try wav.write(to: audioFileUrl, options: [.atomic])
        NSLog("%@", "[LlamaBridgeImpl] TTS attempt ok backend=kokoro-coreml voice=\(result.voice) samples=\(result.samples.count) durationMs=\(result.durationMs)")
        return .success(
            audioFilePath: audioFileUrl.path,
            sampleRate: result.sampleRate,
            samples: result.samples.count,
            durationMs: result.durationMs
        )
    } catch {
        NSLog("%@", "[LlamaBridgeImpl] TTS attempt failed backend=kokoro-coreml error=\(error.localizedDescription)")
        return .failure("Kokoro CoreML TTS failed: \(error.localizedDescription)")
    }
}
1244
+
1245
/// Performs one native TTS attempt under the currently configured backend.
///
/// Validates the bundle path and the linked inference ABI (version >= 4),
/// refuses the `kokoro` TTS backend unless the experimental GGUF flag is
/// set, obtains a TTS context through `prepareTtsContext`, synthesizes into
/// a float buffer, and writes the samples as a 24_000 Hz WAV into the
/// temporary directory. On a native failure the cached TTS context is
/// dropped before the error is returned.
///
/// Log lines are passed to `NSLog` as an argument of a fixed `"%@"` format,
/// so `%` characters in paths or native error text are never interpreted
/// as format specifiers.
/// - Parameters:
///   - bundleDir: Directory of the inference bundle; must exist.
///   - text: Text to speak; its UTF-8 byte length is passed to the native call.
///   - speakerPresetId: Optional speaker preset; `nil` or empty sends a
///     null speaker pointer.
///   - maxSamples: Requested sample budget, clamped to
///     `24_000 ... 24_000 * 120`.
/// - Returns: `.success` with the WAV path, sample rate, sample count and
///   elapsed milliseconds, or `.failure` with a message.
private func synthesizeSpeechAttempt(
    bundleDir: String,
    text: String,
    speakerPresetId: String?,
    maxSamples: Int
) -> LlamaTtsSynthesizeResult {
    let attemptBackend = Self.currentBackendEnv()
    NSLog("%@", "[LlamaBridgeImpl] TTS attempt start backend=\(attemptBackend) bundle=\(bundleDir) textBytes=\(text.lengthOfBytes(using: .utf8)) maxSamples=\(maxSamples)")
    guard FileManager.default.fileExists(atPath: bundleDir) else {
        NSLog("%@", "[LlamaBridgeImpl] TTS attempt failed stage=bundle-check backend=\(attemptBackend) bundle=\(bundleDir)")
        return .failure("eliza_tts_synthesize: bundle not found at \(bundleDir)")
    }
    guard let abiPtr = c_eliza_inference_abi_version() else {
        NSLog("%@", "[LlamaBridgeImpl] TTS attempt failed stage=abi backend=\(attemptBackend) reason=missing")
        return .failure("eliza_tts_synthesize: missing eliza inference ABI")
    }
    let abi = String(cString: abiPtr)
    guard let abiVersion = Int(abi), abiVersion >= 4 else {
        NSLog("%@", "[LlamaBridgeImpl] TTS attempt failed stage=abi backend=\(attemptBackend) abi=\(abi)")
        return .failure("eliza_tts_synthesize: linked iOS inference slice is the smoke-build ABI \(abi); rebuild with fused iOS local inference")
    }
    if Self.currentTtsBackendEnv() == "kokoro" && !Self.experimentalKokoroGgufTtsEnabled() {
        NSLog("%@", "[LlamaBridgeImpl] TTS attempt blocked stage=backend-gate backend=\(attemptBackend) ttsBackend=kokoro")
        return .failure("iOS Kokoro GGUF TTS is not enabled because this fork path does not produce production speech. Use the CoreML/ONNX Kokoro backend for real local voice.")
    }

    let start = DispatchTime.now()
    let prepared = prepareTtsContext(bundleDir: bundleDir, backend: attemptBackend)
    guard let ctx = prepared.context else {
        let error = prepared.error ?? "eliza_inference_mmap_acquire(tts) failed"
        return .failure(error)
    }

    // Clamp the caller's budget so the output buffer is neither tiny nor unbounded.
    let boundedMaxSamples = min(max(maxSamples, 24_000), 24_000 * 120)
    var pcm = [Float](repeating: 0, count: boundedMaxSamples)
    var ttsError: UnsafeMutablePointer<CChar>? = nil
    let textLength = text.lengthOfBytes(using: .utf8)
    NSLog("%@", "[LlamaBridgeImpl] TTS stage=synthesize begin backend=\(attemptBackend) maxSamples=\(boundedMaxSamples)")
    let sampleCount = pcm.withUnsafeMutableBufferPointer { pcmBuffer -> Int32 in
        guard let pcmPtr = pcmBuffer.baseAddress else { return -2 }
        return text.withCString { textPtr in
            // A non-empty preset is passed as a C string; otherwise the
            // speaker argument is null.
            if let speakerPresetId, !speakerPresetId.isEmpty {
                return speakerPresetId.withCString { speakerPtr in
                    c_eliza_inference_tts_synthesize(
                        ctx,
                        textPtr,
                        textLength,
                        speakerPtr,
                        pcmPtr,
                        boundedMaxSamples,
                        &ttsError
                    )
                }
            }
            return c_eliza_inference_tts_synthesize(
                ctx,
                textPtr,
                textLength,
                nil,
                pcmPtr,
                boundedMaxSamples,
                &ttsError
            )
        }
    }
    guard sampleCount >= 0 else {
        let error = Self.takeInferenceError(&ttsError, fallback: "eliza_inference_tts_synthesize failed with code \(sampleCount)")
        NSLog("%@", "[LlamaBridgeImpl] TTS stage=synthesize failed backend=\(attemptBackend) code=\(sampleCount) error=\(error)")
        // Drop the cached context after a native failure so the next
        // attempt creates a fresh one.
        clearCachedTtsContext()
        return .failure(error)
    }
    NSLog("%@", "[LlamaBridgeImpl] TTS stage=synthesize ok backend=\(attemptBackend) samples=\(sampleCount)")
    let samples = Array(pcm.prefix(Int(sampleCount)))
    let wav = Self.wavData(from: samples, sampleRate: 24_000)
    let audioFileUrl = FileManager.default.temporaryDirectory
        .appendingPathComponent("eliza-tts-\(UUID().uuidString)")
        .appendingPathExtension("wav")
    do {
        try wav.write(to: audioFileUrl, options: [.atomic])
    } catch {
        NSLog("%@", "[LlamaBridgeImpl] TTS stage=write-wav failed backend=\(attemptBackend) error=\(error.localizedDescription)")
        return .failure("eliza_tts_synthesize: failed to write synthesized audio: \(error.localizedDescription)")
    }
    let elapsedNs = DispatchTime.now().uptimeNanoseconds - start.uptimeNanoseconds
    let durationMs = Double(elapsedNs) / 1_000_000.0
    NSLog("%@", "[LlamaBridgeImpl] TTS attempt ok backend=\(attemptBackend) samples=\(samples.count) durationMs=\(durationMs)")
    return .success(
        audioFilePath: audioFileUrl.path,
        sampleRate: 24_000,
        samples: samples.count,
        durationMs: durationMs
    )
}
1337
+
1338
/// Resolves the inference context for the `"tts"` region.
///
/// Thin wrapper over `prepareVoiceContext`.
/// - Returns: The context on success, or `nil` plus an error message.
private func prepareTtsContext(
    bundleDir: String,
    backend: String
) -> (context: ElizaInferenceContextPtr?, error: String?) {
    prepareVoiceContext(bundleDir: bundleDir, backend: backend, region: "tts")
}
1344
+
1345
/// Resolves the inference context for the `"asr"` region.
///
/// Thin wrapper over `prepareVoiceContext`.
/// - Returns: The context on success, or `nil` plus an error message.
private func prepareAsrContext(
    bundleDir: String,
    backend: String
) -> (context: ElizaInferenceContextPtr?, error: String?) {
    prepareVoiceContext(bundleDir: bundleDir, backend: backend, region: "asr")
}
1351
+
1352
/// Returns an inference context for `region`, reusing the cached one when
/// it was created for the same bundle and backend.
///
/// On a cache miss the stale cached context for that region (if any) is
/// destroyed, a new context is created for `bundleDir`, the region's mmap
/// is acquired, and the result is stored in the matching cache slot
/// (`"asr"` uses the ASR slot, every other value uses the TTS slot).
///
/// Log lines are passed to `NSLog` as an argument of a fixed `"%@"` format,
/// so `%` characters in native error text are never interpreted as format
/// specifiers.
/// - Parameters:
///   - bundleDir: Directory of the inference bundle.
///   - backend: Backend label stored with the cache entry and used in logs.
///   - region: Region name passed to the native mmap-acquire call.
/// - Returns: `(context, nil)` on success, `(nil, message)` on failure.
private func prepareVoiceContext(
    bundleDir: String,
    backend: String,
    region: String
) -> (context: ElizaInferenceContextPtr?, error: String?) {
    let regionTag = region.uppercased()
    let cachedContext = region == "asr" ? cachedAsrContext : cachedTtsContext
    if let cachedContext,
       cachedContext.bundleDir == bundleDir,
       cachedContext.backend == backend {
        NSLog("%@", "[LlamaBridgeImpl] \(regionTag) stage=mmap-acquire cached backend=\(backend) region=\(region)")
        return (cachedContext.context, nil)
    }
    // Cache miss (or different bundle/backend): release the old entry first.
    clearCachedVoiceContext(region: region)

    var createError: UnsafeMutablePointer<CChar>? = nil
    NSLog("%@", "[LlamaBridgeImpl] \(regionTag) stage=create begin backend=\(backend)")
    guard let ctx = bundleDir.withCString({ bundlePtr in
        c_eliza_inference_create(bundlePtr, &createError)
    }) else {
        let error = Self.takeInferenceError(&createError, fallback: "eliza_inference_create failed")
        NSLog("%@", "[LlamaBridgeImpl] \(regionTag) stage=create failed backend=\(backend) error=\(error)")
        return (nil, error)
    }
    NSLog("%@", "[LlamaBridgeImpl] \(regionTag) stage=create ok backend=\(backend)")

    var acquireError: UnsafeMutablePointer<CChar>? = nil
    NSLog("%@", "[LlamaBridgeImpl] \(regionTag) stage=mmap-acquire begin backend=\(backend) region=\(region)")
    let acquireCode = region.withCString { regionPtr in
        c_eliza_inference_mmap_acquire(ctx, regionPtr, &acquireError)
    }
    guard acquireCode >= 0 else {
        let error = Self.takeInferenceError(&acquireError, fallback: "eliza_inference_mmap_acquire(\(region)) failed with code \(acquireCode)")
        NSLog("%@", "[LlamaBridgeImpl] \(regionTag) stage=mmap-acquire failed backend=\(backend) code=\(acquireCode) error=\(error)")
        // The freshly created context is not cached yet, so destroy it here.
        c_eliza_inference_destroy(ctx)
        return (nil, error)
    }
    NSLog("%@", "[LlamaBridgeImpl] \(regionTag) stage=mmap-acquire ok backend=\(backend) region=\(region)")
    let context = CachedVoiceContext(bundleDir: bundleDir, backend: backend, context: ctx)
    if region == "asr" {
        cachedAsrContext = context
    } else {
        cachedTtsContext = context
    }
    return (ctx, nil)
}
1397
+
1398
/// Destroys and forgets the cached TTS context, if one exists.
private func clearCachedTtsContext() {
    let ttsRegion = "tts"
    clearCachedVoiceContext(region: ttsRegion)
}
1401
+
1402
/// Destroys and forgets the cached ASR context, if one exists.
private func clearCachedAsrContext() {
    let asrRegion = "asr"
    clearCachedVoiceContext(region: asrRegion)
}
1405
+
1406
/// Destroys the native context held in one cache slot and empties the slot.
///
/// `"asr"` selects the ASR slot; any other region selects the TTS slot. An
/// empty slot is left untouched.
/// - Parameter region: Region name selecting the cache slot.
private func clearCachedVoiceContext(region: String) {
    if region == "asr" {
        guard let stale = cachedAsrContext else { return }
        c_eliza_inference_destroy(stale.context)
        cachedAsrContext = nil
    } else {
        guard let stale = cachedTtsContext else { return }
        c_eliza_inference_destroy(stale.context)
        cachedTtsContext = nil
    }
}
1419
+
1420
/// Tells whether a TTS error message matches one of the known markers
/// that make a CPU retry worthwhile.
///
/// Matching is case-insensitive and substring-based.
/// - Parameter error: Error message to inspect.
/// - Returns: `true` when any marker occurs in the message.
private static func shouldRetryTtsOnCpu(_ error: String) -> Bool {
    let retryMarkers = [
        "ov_init failed",
        "pipeline_tts_load failed",
        "failed to allocate backend buffer",
        "metal",
    ]
    let normalized = error.lowercased()
    return retryMarkers.contains(where: { marker in normalized.contains(marker) })
}
1427
+
1428
/// Decides whether TTS should default to the CPU backend.
///
/// Returns `false` as soon as `GGML_BACKEND` is set to anything. Otherwise
/// CPU is preferred unless `ELIZA_IOS_TTS_BACKEND` (case-insensitive) is
/// `gpu` or `metal`.
private static func shouldPreferCpuTtsBackend() -> Bool {
    guard currentBackendEnv() == "default" else {
        return false
    }
    guard let rawOverride = getenv("ELIZA_IOS_TTS_BACKEND") else {
        return true
    }
    let requested = String(cString: rawOverride).lowercased()
    return !(requested == "gpu" || requested == "metal")
}
1435
+
1436
/// Reads the ABI version string reported by the linked inference library.
/// - Returns: The version string, or `nil` when the native call yields no pointer.
private static func elizaInferenceAbiVersion() -> String? {
    if let versionPtr = c_eliza_inference_abi_version() {
        return String(cString: versionPtr)
    }
    return nil
}
1442
+
1443
/// Current value of the `GGML_BACKEND` environment variable.
/// - Returns: The variable's value, or `"default"` when it is unset.
private static func currentBackendEnv() -> String {
    guard let raw = getenv("GGML_BACKEND") else {
        return "default"
    }
    return String(cString: raw)
}
1446
+
1447
/// Returns the value of the `ELIZA_TTS_BACKEND` environment variable, or
/// `"default"` when the variable is not set.
private static func currentTtsBackendEnv() -> String {
    guard let rawValue = getenv("ELIZA_TTS_BACKEND") else {
        return "default"
    }
    return String(cString: rawValue)
}
1450
+
1451
/// Looks up the Kokoro CoreML model directory inside a bundle directory.
///
/// - Parameter bundleDir: Path handed to `KokoroCoreMlEngine.modelDirectory(in:)`.
/// - Returns: Whatever `KokoroCoreMlEngine.modelDirectory(in:)` reports on
///   iOS 18 or newer; always `nil` on older systems.
private static func kokoroCoreMlDirectory(bundleDir: String) -> URL? {
    if #available(iOS 18.0, *) {
        return KokoroCoreMlEngine.modelDirectory(in: bundleDir)
    }
    return nil
}
1455
+
1456
/// Builds a diagnostics dictionary for the Kokoro CoreML TTS engine.
///
/// - Parameter bundleDir: Bundle directory used to resolve the model directory.
/// - Returns: On iOS 18 or newer, the dictionary produced by
///   `KokoroCoreMlEngine.shared.diagnostics(modelDirectory:)`. On older
///   systems, a fixed payload marking the engine unavailable.
private static func kokoroCoreMlDiagnostics(bundleDir: String) -> [String: Any] {
    guard #available(iOS 18.0, *) else {
        return [
            "available": false,
            "requiresIos": "18.0",
            "error": "iOS Kokoro CoreML TTS requires iOS 18 or newer",
        ]
    }
    let modelDirectory = kokoroCoreMlDirectory(bundleDir: bundleDir)
    return KokoroCoreMlEngine.shared.diagnostics(modelDirectory: modelDirectory)
}
1468
+
1469
/// Reads the `ELIZA_IOS_ALLOW_EXPERIMENTAL_KOKORO_GGUF_TTS` opt-in flag.
///
/// - Returns: `true` only when the variable is set to `1`, `true`, `yes` or
///   `on` (compared case-insensitively); `false` when unset or anything else.
private static func experimentalKokoroGgufTtsEnabled() -> Bool {
    guard let rawFlag = getenv("ELIZA_IOS_ALLOW_EXPERIMENTAL_KOKORO_GGUF_TTS") else {
        return false
    }
    switch String(cString: rawFlag).lowercased() {
    case "1", "true", "yes", "on":
        return true
    default:
        return false
    }
}
1475
+
1476
/// Reads the `ELIZA_IOS_ENABLE_KOKORO_COREML_TTS` opt-in flag.
///
/// - Returns: `true` only when the variable is set to `1`, `true`, `yes` or
///   `on` (compared case-insensitively); `false` when unset or anything else.
private static func kokoroCoreMlTtsEnabled() -> Bool {
    guard let rawFlag = getenv("ELIZA_IOS_ENABLE_KOKORO_COREML_TTS") else {
        return false
    }
    let truthyValues: Set<String> = ["1", "true", "yes", "on"]
    return truthyValues.contains(String(cString: rawFlag).lowercased())
}
1482
+
1483
/// Runs `body` with one environment variable temporarily overridden.
///
/// The previous value is captured first. Once `body` returns, the variable
/// is restored to that value, or removed if it was not set beforehand.
///
/// - Parameters:
///   - name: Environment variable to override.
///   - value: Value the variable holds while `body` runs.
///   - body: Work to perform under the override.
/// - Returns: Whatever `body` returns.
///
/// NOTE(review): this mutates the process-wide environment via
/// `setenv`/`unsetenv`, so concurrent readers would see the override.
private static func withTemporaryEnvironment<T>(_ name: String, value: String, body: () -> T) -> T {
    let savedValue: String?
    if let existing = getenv(name) {
        savedValue = String(cString: existing)
    } else {
        savedValue = nil
    }

    setenv(name, value, 1)
    defer {
        switch savedValue {
        case .some(let original):
            setenv(name, original, 1)
        case .none:
            unsetenv(name)
        }
    }
    return body()
}
1495
+
1496
/// Tears down the session registered under `contextId`.
///
/// The session is removed from `SessionRegistry` immediately. Its own
/// `free()` is then enqueued asynchronously on the session's work queue, so
/// the release runs in order with other work submitted to that queue.
/// Unknown ids are ignored.
///
/// - Parameter contextId: Identifier of the session to release.
public func free(contextId: Int64) {
    guard let session = SessionRegistry.shared.remove(contextId) else {
        return
    }
    session.workQueue.async {
        session.free()
    }
}
1503
+
1504
/// Looks up the work queue owned by the session for `contextId`.
///
/// Callers are expected to schedule `generate(...)` on this per-session
/// queue so that several calls into the same context run one after another.
///
/// - Parameter contextId: Identifier of a previously loaded session.
/// - Returns: The session's queue, or `nil` when no session is registered
///   under that id.
public func workQueue(for contextId: Int64) -> DispatchQueue? {
    guard let session = SessionRegistry.shared.get(contextId) else {
        return nil
    }
    return session.workQueue
}
1510
+
1511
/// Snapshots the runtime capabilities of this device/process.
///
/// `mtpSupported` is true only when all three checks pass:
///   1. `shim_speculative_supported()` reports that the linked slice has
///      speculative-draft support.
///   2. Metal is usable: `shim_has_metal()` is true and the process is not
///      running in the simulator.
///   3. `availableMemoryGB()` reports at least 3 GB.
///
/// When `mtpSupported` is false, `mtpReason` names the first failing check
/// in the order above; it is `nil` otherwise.
///
/// - Returns: A populated `LlamaHardwareInfo`; `backend` is `"metal"` when
///   Metal is usable and `"cpu"` otherwise.
public func hardwareInfo() -> LlamaHardwareInfo {
    let bytesPerGB = 1024.0 * 1024.0 * 1024.0
    let process = ProcessInfo.processInfo
    let simulator = Self.isRunningInSimulator

    let totalGB = Double(process.physicalMemory) / bytesPerGB
    let freeGB = LlamaBridgeImpl.availableMemoryGB()

    let metalUsable = shim_has_metal() && !simulator
    let speculativeSlice = shim_speculative_supported()
    let enoughHeadroom = freeGB >= 3.0
    let mtpUsable = speculativeSlice && metalUsable && enoughHeadroom

    // Explain the first failed prerequisite, checked in a fixed order.
    let reason: String? = {
        guard !mtpUsable else { return nil }
        guard speculativeSlice else {
            return "linked llama slice has no common_speculative_draft_gen"
        }
        guard metalUsable else {
            return simulator ? "simulator: GPU unavailable" : "Metal unsupported"
        }
        guard enoughHeadroom else {
            return "insufficient free RAM (need >= 3 GB, got \(String(format: "%.2f", freeGB)))"
        }
        return "unknown"
    }()

    return LlamaHardwareInfo(
        backend: metalUsable ? "metal" : "cpu",
        totalRamGB: totalGB,
        availableRamGB: freeGB,
        cpuCores: process.activeProcessorCount,
        isSimulator: simulator,
        metalSupported: metalUsable,
        mtpSupported: mtpUsable,
        mtpReason: reason
    )
}
1553
+
1554
+ // MARK: - Private helpers
1555
+
1556
/// Estimates the memory still available to this process, in GiB.
///
/// The figure is derived as physical RAM minus this task's `phys_footprint`
/// (queried via `task_info` with `TASK_VM_INFO`), clamped at zero. It does
/// not account for memory used by other processes.
///
/// - Returns: The estimate in GiB, or `0` when `task_info` fails.
private static func availableMemoryGB() -> Double {
    var vmInfo = task_vm_info_data_t()
    // task_info wants the buffer size expressed in `integer_t` words.
    let wordCount = MemoryLayout<task_vm_info_data_t>.size / MemoryLayout<integer_t>.size
    var infoCount = mach_msg_type_number_t(wordCount)

    let status: kern_return_t = withUnsafeMutablePointer(to: &vmInfo) { infoPointer in
        infoPointer.withMemoryRebound(to: integer_t.self, capacity: wordCount) { words in
            task_info(mach_task_self_, task_flavor_t(TASK_VM_INFO), words, &infoCount)
        }
    }
    if status != KERN_SUCCESS {
        return 0
    }

    let footprintBytes = Double(vmInfo.phys_footprint)
    let physicalBytes = Double(ProcessInfo.processInfo.physicalMemory)
    let remainingBytes = max(0, physicalBytes - footprintBytes)
    return remainingBytes / (1024.0 * 1024.0 * 1024.0)
}
1570
+
1571
/// Tokenizes `text` with the given vocabulary.
///
/// A first call into `c_llama_tokenize` uses a small 8-slot buffer. A
/// non-negative result is the token count and the answer is taken directly
/// from that buffer. A negative result is read as the negated number of
/// tokens required; a buffer of that size is allocated and the call is
/// repeated.
///
/// - Parameters:
///   - vocab: Vocabulary handle passed through to `c_llama_tokenize`.
///   - text: Text to tokenize; its UTF-8 byte length is passed explicitly.
///   - addSpecial: Forwarded to the tokenizer's add-special flag.
/// - Returns: The token ids, or `[]` when tokenization fails, yields nothing,
///   or the input/result size cannot be represented as `Int32`.
private static func tokenize(vocab: LlamaVocabPtr, text: String, addSpecial: Bool) -> [Int32] {
    let utf8 = text.utf8CString
    // The C API takes a 32-bit length; refuse oversized input instead of
    // trapping on the conversion.
    guard let textLen = Int32(exactly: text.utf8.count) else { return [] }

    // One tokenizer invocation writing into `buffer`; returns the raw result.
    func runTokenizer(into buffer: inout [Int32]) -> Int32 {
        utf8.withUnsafeBufferPointer { input -> Int32 in
            guard let base = input.baseAddress else { return 0 }
            return buffer.withUnsafeMutableBufferPointer { output in
                // Every buffer passed here is non-empty, so baseAddress is non-nil.
                c_llama_tokenize(vocab, base, textLen, output.baseAddress!, Int32(output.count), addSpecial, true)
            }
        }
    }

    var probeBuf = [Int32](repeating: 0, count: 8)
    let probe = runTokenizer(into: &probeBuf)
    if probe >= 0 {
        return Array(probeBuf.prefix(Int(probe)))
    }

    // Negating Int32.min overflows and traps. The underlying tokenizer
    // presumably uses that value as an overflow sentinel (as llama.cpp's
    // llama_tokenize documents), so treat it as a failure.
    guard probe != Int32.min else { return [] }

    var tokens = [Int32](repeating: 0, count: Int(probe.magnitude))
    let written = runTokenizer(into: &tokens)
    guard written > 0 else { return [] }
    return Array(tokens.prefix(Int(written)))
}
1595
+
1596
/// Converts one token id into its text piece.
///
/// `c_llama_token_to_piece` is first tried with a 64-byte buffer. A negative
/// result is read as the negated size required, so the buffer is regrown to
/// that size plus one and the call is repeated.
///
/// - Parameters:
///   - vocab: Vocabulary handle passed through to `c_llama_token_to_piece`.
///   - token: Token id to render.
/// - Returns: The piece decoded as UTF-8, or `""` when nothing was written.
private static func tokenToPiece(vocab: LlamaVocabPtr, token: Int32) -> String {
    // One native call writing into `storage`; returns the raw result.
    func render(into storage: inout [CChar]) -> Int32 {
        storage.withUnsafeMutableBufferPointer { raw -> Int32 in
            c_llama_token_to_piece(vocab, token, raw.baseAddress!, Int32(raw.count), 0, false)
        }
    }

    var storage = [CChar](repeating: 0, count: 64)
    var length = render(into: &storage)
    if length < 0 {
        let required = Int(-length)
        storage = [CChar](repeating: 0, count: required + 1)
        length = render(into: &storage)
    }
    guard length > 0 else { return "" }

    // The buffer is not guaranteed to be NUL-terminated, so decode exactly
    // the reported number of bytes rather than using a C-string initializer.
    let pieceBytes = storage.prefix(Int(length)).map { UInt8(bitPattern: $0) }
    return String(decoding: pieceBytes, as: UTF8.self)
}
1619
+
1620
/// Consumes a native error string and returns it as a Swift `String`.
///
/// When the out-pointer holds a message, it is copied, released with
/// `c_eliza_inference_free_string`, and the slot is reset to `nil` so it
/// cannot be freed twice.
///
/// - Parameters:
///   - errorPtr: Out-parameter slot that may hold a native error string.
///   - fallback: Text to return when no message is present or it is empty.
/// - Returns: The native message, or `fallback`.
private static func takeInferenceError(
    _ errorPtr: UnsafeMutablePointer<UnsafeMutablePointer<CChar>?>,
    fallback: String
) -> String {
    guard let nativeMessage = errorPtr.pointee else {
        return fallback
    }
    // Runs after the message has been copied into `text` below.
    defer {
        c_eliza_inference_free_string(nativeMessage)
        errorPtr.pointee = nil
    }
    let text = String(cString: nativeMessage)
    if text.isEmpty {
        return fallback
    }
    return text
}
1630
+
1631
/// Encodes float samples as a mono, 16-bit PCM WAV file held in memory.
///
/// The output is a 44-byte RIFF/WAVE header (format tag 1, one channel,
/// 16 bits per sample) followed by the samples as little-endian `Int16`.
/// Each sample is clamped to `[-1, 1]`; negative values are scaled by 32768
/// and non-negative values by 32767, then rounded.
///
/// - Parameters:
///   - pcm: Samples to encode.
///   - sampleRate: Sample rate written into the header.
/// - Returns: The complete WAV byte stream.
private static func wavData(from pcm: [Float], sampleRate: Int) -> Data {
    let channels = 1
    let sampleWidth = 2
    let frameSize = channels * sampleWidth
    let bytesPerSecond = sampleRate * frameSize
    let payloadSize = pcm.count * sampleWidth

    var wav = Data()

    // RIFF container header; the size field counts everything after it.
    wav.append(contentsOf: "RIFF".utf8)
    appendLittleEndian(UInt32(36 + payloadSize), to: &wav)
    wav.append(contentsOf: "WAVE".utf8)

    // "fmt " chunk: 16-byte body.
    wav.append(contentsOf: "fmt ".utf8)
    appendLittleEndian(UInt32(16), to: &wav)
    appendLittleEndian(UInt16(1), to: &wav)
    appendLittleEndian(UInt16(channels), to: &wav)
    appendLittleEndian(UInt32(sampleRate), to: &wav)
    appendLittleEndian(UInt32(bytesPerSecond), to: &wav)
    appendLittleEndian(UInt16(frameSize), to: &wav)
    appendLittleEndian(UInt16(16), to: &wav)

    // "data" chunk header followed by the samples.
    wav.append(contentsOf: "data".utf8)
    appendLittleEndian(UInt32(payloadSize), to: &wav)

    for sample in pcm {
        let bounded = max(-1.0, min(1.0, sample))
        // Asymmetric scale so both -1.0 and +1.0 fit the Int16 range.
        let fullScale: Float = bounded < 0 ? 32768.0 : 32767.0
        appendLittleEndian(Int16((bounded * fullScale).rounded()), to: &wav)
    }
    return wav
}
1662
+
1663
/// Appends the little-endian byte representation of an integer to `data`.
///
/// - Parameters:
///   - value: Any fixed-width integer; its full bit width is written.
///   - data: Buffer that receives the bytes.
private static func appendLittleEndian<T: FixedWidthInteger>(_ value: T, to data: inout Data) {
    withUnsafeBytes(of: value.littleEndian) { rawBytes in
        data.append(contentsOf: rawBytes)
    }
}
1669
+ }
1670
+
1671
+ #else
1672
+
1673
/// Outcome of a model-load request: a context id on success, or an error
/// message on failure.
public struct LlamaLoadResult {
    /// Identifier of the loaded context; `nil` on failure.
    public let contextId: Int64?
    /// Failure description; `nil` on success.
    public let error: String?

    /// Builds a successful result carrying `id`.
    public static func success(_ id: Int64) -> LlamaLoadResult {
        return LlamaLoadResult(contextId: id, error: nil)
    }

    /// Builds a failed result carrying `msg`.
    public static func failure(_ msg: String) -> LlamaLoadResult {
        return LlamaLoadResult(contextId: nil, error: msg)
    }
}
1679
+
1680
/// Outcome of a text-generation request.
public struct LlamaGenerateResult {
    /// Generated text; empty on failure.
    public let text: String
    /// Number of prompt tokens reported for the request.
    public let promptTokens: Int
    /// Number of generated tokens reported for the request.
    public let outputTokens: Int
    /// Reported duration in milliseconds.
    public let durationMs: Double
    /// Failure description; `nil` on success.
    public let error: String?

    /// Builds a successful result from the supplied values.
    public static func success(text: String, promptTokens: Int, outputTokens: Int, durationMs: Double) -> LlamaGenerateResult {
        return LlamaGenerateResult(
            text: text,
            promptTokens: promptTokens,
            outputTokens: outputTokens,
            durationMs: durationMs,
            error: nil
        )
    }

    /// Builds a failed result: empty text, zeroed counters, and `msg` as the error.
    public static func failure(_ msg: String) -> LlamaGenerateResult {
        return LlamaGenerateResult(
            text: "",
            promptTokens: 0,
            outputTokens: 0,
            durationMs: 0,
            error: msg
        )
    }
}
1693
+
1694
/// Outcome of a text-to-speech synthesis request.
public struct LlamaTtsSynthesizeResult {
    /// Base64-encoded audio payload; empty on failure.
    public let audioBase64: String
    /// Optional path of an audio file associated with the result.
    public let audioFilePath: String?
    /// MIME type of the audio payload.
    public let contentType: String
    /// Sample rate of the audio, in Hz.
    public let sampleRate: Int
    /// Number of audio samples.
    public let samples: Int
    /// Reported duration in milliseconds.
    public let durationMs: Double
    /// Failure description; `nil` when synthesis succeeded.
    public let error: String?

    /// Builds a failed result: no audio, `audio/wav` content type, a
    /// 24 kHz sample rate, zero samples/duration, and `msg` as the error.
    public static func failure(_ msg: String) -> LlamaTtsSynthesizeResult {
        return LlamaTtsSynthesizeResult(
            audioBase64: "",
            audioFilePath: nil,
            contentType: "audio/wav",
            sampleRate: 24_000,
            samples: 0,
            durationMs: 0,
            error: msg
        )
    }
}
1715
+
1716
/// Outcome of a speech-recognition (ASR) request.
public struct LlamaAsrTranscribeResult {
    /// Transcribed text; empty on failure.
    public let text: String
    /// Reported duration in milliseconds.
    public let durationMs: Double
    /// Failure description; `nil` on success.
    public let error: String?

    /// Builds a successful result from the supplied values.
    public static func success(text: String, durationMs: Double) -> LlamaAsrTranscribeResult {
        return LlamaAsrTranscribeResult(text: text, durationMs: durationMs, error: nil)
    }

    /// Builds a failed result: empty text, zero duration, and `msg` as the error.
    public static func failure(_ msg: String) -> LlamaAsrTranscribeResult {
        return LlamaAsrTranscribeResult(text: "", durationMs: 0, error: msg)
    }
}
1729
+
1730
/// Speculative-decoding preference accepted by `generate(...)`:
/// automatic, forced on, or forced off.
public enum SpecDecodeMode {
    case auto, on, off
}
1735
+
1736
/// Snapshot of hardware/runtime capabilities reported to callers.
public struct LlamaHardwareInfo {
    /// Name of the compute backend.
    public let backend: String
    /// Total RAM, in GB.
    public let totalRamGB: Double
    /// Available RAM, in GB.
    public let availableRamGB: Double
    /// Number of CPU cores.
    public let cpuCores: Int
    /// Whether the process runs in the simulator.
    public let isSimulator: Bool
    /// Whether Metal is usable.
    public let metalSupported: Bool
    /// Whether MTP is supported.
    public let mtpSupported: Bool
    /// Explanation for `mtpSupported` being false, when one is available.
    public let mtpReason: String?

    /// Serializes the snapshot into a dictionary with snake_case keys.
    ///
    /// Numeric and boolean values are boxed as `NSNumber`. The `mtp_reason`
    /// key is present only when `mtpReason` is non-nil.
    public func asDict() -> [String: Any] {
        let required: [String: Any] = [
            "backend": backend,
            "total_ram_gb": NSNumber(value: totalRamGB),
            "available_ram_gb": NSNumber(value: availableRamGB),
            "cpu_cores": NSNumber(value: cpuCores),
            "is_simulator": NSNumber(value: isSimulator),
            "metal_supported": NSNumber(value: metalSupported),
            "mtp_supported": NSNumber(value: mtpSupported),
        ]
        guard let mtpReason else {
            return required
        }
        var withReason = required
        withReason["mtp_reason"] = mtpReason
        return withReason
    }
}
1762
+
1763
/// Stand-in bridge used on the `#else` side of the build condition.
///
/// Every inference entry point fails immediately with a fixed message, and
/// the lifecycle calls (`cancel`, `free`) do nothing, so callers can link
/// against the same surface without the native library.
public final class LlamaBridgeImpl {
    /// Process-wide instance.
    public static let shared = LlamaBridgeImpl()

    /// Message returned by every operation this stub cannot perform.
    private static let unavailableMessage = "llama.cpp is not bundled in this iOS build"

    private init() {}

    /// `true` when compiled for the simulator target environment.
    private static var isRunningInSimulator: Bool {
        #if targetEnvironment(simulator)
        return true
        #else
        return false
        #endif
    }

    /// Always fails: no model can be loaded by this stub.
    public func loadModel(
        path: String,
        contextSize: UInt32 = 4096,
        useGPU: Bool = true,
        threads: Int32? = nil,
        draftModelPath: String? = nil,
        draftContextSize: UInt32 = 4096,
        draftGpuLayers: Int32? = nil,
        draftMin: Int32 = 1,
        draftMax: Int32 = 3,
        cacheTypeK: String? = nil,
        cacheTypeV: String? = nil
    ) -> LlamaLoadResult {
        return LlamaLoadResult.failure(Self.unavailableMessage)
    }

    /// Always fails. If a token callback is supplied it is invoked once with
    /// an empty piece and `true` before the failure is returned.
    public func generate(
        contextId: Int64,
        prompt: String,
        maxTokens: Int32 = 256,
        temperature: Float = 0.7,
        topP: Float = 0.95,
        topK: Int32 = 40,
        stopSequences: [String] = [],
        specDecode: SpecDecodeMode = .auto,
        draftMin: Int32? = nil,
        draftMax: Int32? = nil,
        tokenTreeTrie: Data? = nil,
        onToken: ((String, Bool) -> Void)? = nil
    ) -> LlamaGenerateResult {
        if let onToken {
            onToken("", true)
        }
        return LlamaGenerateResult.failure(Self.unavailableMessage)
    }

    /// No-op: there is nothing to cancel.
    public func cancel(contextId: Int64) {}

    /// Always fails: speech synthesis is unavailable in this stub.
    public func synthesizeSpeech(
        bundleDir: String,
        text: String,
        speakerPresetId: String? = nil,
        maxSamples: Int = 24_000 * 60
    ) -> LlamaTtsSynthesizeResult {
        return LlamaTtsSynthesizeResult.failure(Self.unavailableMessage)
    }

    /// Always fails: speech transcription is unavailable in this stub.
    public func transcribeSpeech(
        bundleDir: String,
        pcm: [Float],
        sampleRate: Int
    ) -> LlamaAsrTranscribeResult {
        return LlamaAsrTranscribeResult.failure(Self.unavailableMessage)
    }

    /// Reports the TTS engine as unavailable, together with the hardware
    /// snapshot. `bundleDir` is echoed back only when it was provided.
    public func ttsEngineDiagnostics(bundleDir: String?) -> [String: Any] {
        let base: [String: Any] = [
            "available": false,
            "message": Self.unavailableMessage,
            "hardware": hardwareInfo().asDict(),
        ]
        guard let bundleDir else {
            return base
        }
        var withBundle = base
        withBundle["bundleDir"] = bundleDir
        return withBundle
    }

    /// No-op: no contexts are ever created.
    public func free(contextId: Int64) {}

    /// Always `nil`: no sessions, hence no work queues.
    public func workQueue(for contextId: Int64) -> DispatchQueue? {
        return nil
    }

    /// Describes the device with the backend marked `"unavailable"`.
    ///
    /// Available RAM is reported as equal to total physical RAM, and both
    /// Metal and MTP are reported as unsupported.
    public func hardwareInfo() -> LlamaHardwareInfo {
        let process = ProcessInfo.processInfo
        let totalGB = Double(process.physicalMemory) / (1024.0 * 1024.0 * 1024.0)
        return LlamaHardwareInfo(
            backend: "unavailable",
            totalRamGB: totalGB,
            availableRamGB: totalGB,
            cpuCores: process.activeProcessorCount,
            isSimulator: Self.isRunningInSimulator,
            metalSupported: false,
            mtpSupported: false,
            mtpReason: Self.unavailableMessage
        )
    }
}
1862
+
1863
+ #endif