@dvai-bridge/ios-llama-core 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ import Foundation
2
+
3
/// Failure modes of `ImageDecoder.resolve(url:)`: each case names one way an
/// input URL string could not be converted into image bytes.
enum ImageSourceError: Error {
    /// A `data:` URL did not have the expected shape; carries the offending URL.
    case malformedDataURL(String)

    /// The URL could not be parsed, had no scheme, or used an unsupported
    /// scheme; carries either the scheme or the raw URL string.
    case invalidScheme(String)

    /// An HTTP(S) fetch completed with a non-2xx status code.
    case httpError(status: Int)

    /// The payload of a base64 `data:` URL was not valid base64.
    case base64DecodeFailed
}
11
+
12
/// Resolves any of the three image URL schemes accepted by the DVAI bridge
/// (`data:`, `https:`/`http:`, `file:`) into the raw encoded image bytes
/// (PNG/JPEG/etc.). The bytes are returned as-is — actual format decoding
/// is performed downstream by `mtmd_helper_eval` inside llama.cpp.
struct ImageDecoder {
    /// Resolve any supported URL scheme into raw image bytes.
    ///
    /// - `data:` URLs (scheme matched case-insensitively) are parsed for an
    ///   optional `;base64` token and decoded accordingly; otherwise the
    ///   payload is percent-decoded byte-for-byte.
    /// - `https:` / `http:` URLs are fetched via `URLSession` with a 30s
    ///   timeout; non-2xx responses throw `httpError`.
    /// - `file:` URLs are read off disk via `Data(contentsOf:)`.
    /// - Any other scheme throws `invalidScheme`.
    ///
    /// - Parameter url: The URL string to resolve.
    /// - Returns: The raw (still encoded) bytes the URL refers to.
    /// - Throws: `ImageSourceError` for malformed/unsupported input or a
    ///   non-2xx HTTP status; otherwise whatever error `URLSession` or
    ///   `Data(contentsOf:)` reports.
    static func resolve(url: String) async throws -> Data {
        // URL schemes are case-insensitive. `prefix(5)` counts Characters, so a
        // match also guarantees the 5-Character offset used by `resolveDataURL`
        // is in bounds.
        if url.prefix(5).lowercased() == "data:" {
            return try resolveDataURL(url)
        }
        guard let parsed = URL(string: url) else {
            throw ImageSourceError.invalidScheme(url)
        }
        switch parsed.scheme?.lowercased() {
        case "https", "http":
            return try await resolveHTTP(parsed)
        case "file":
            return try Data(contentsOf: parsed)
        case let other?:
            throw ImageSourceError.invalidScheme(other)
        case nil:
            throw ImageSourceError.invalidScheme(url)
        }
    }

    /// Parse a `data:[<mediatype>][;base64],<payload>` URL into raw bytes.
    /// Strict: a missing comma is treated as malformed (we don't try to
    /// guess intent). Empty header and/or empty body are well-formed and
    /// produce an empty `Data` (e.g. `data:,`).
    ///
    /// - Parameter url: A string whose first five Characters are `data:` in
    ///   any letter case (the caller checks this).
    /// - Throws: `malformedDataURL` when no comma is present,
    ///   `base64DecodeFailed` when a base64 payload does not decode.
    private static func resolveDataURL(_ url: String) throws -> Data {
        // RFC 2397: data:[<mediatype>][;base64],<data>
        guard let commaIdx = url.firstIndex(of: ",") else {
            throw ImageSourceError.malformedDataURL(url)
        }
        // Skip the leading "data:" (5 chars) and isolate the header / body.
        let prefixEnd = url.index(url.startIndex, offsetBy: 5)
        let header = url[prefixEnd..<commaIdx]
        let body = String(url[url.index(after: commaIdx)...])
        if declaresBase64(header) {
            guard let decoded = Data(base64Encoded: body) else {
                throw ImageSourceError.base64DecodeFailed
            }
            return decoded
        }
        // Non-base64: the payload is percent-encoded *bytes*. Decoding through
        // a String (`removingPercentEncoding`) fails for any escaped byte
        // sequence that is not valid UTF-8 — i.e. virtually all image data —
        // so decode at byte level instead.
        return percentDecodedBytes(body)
    }

    /// Whether the data-URL header (the text between `data:` and the comma)
    /// carries a `base64` parameter. The first `;`-separated field is the
    /// media type and is skipped; the token is compared case-insensitively.
    private static func declaresBase64(_ header: Substring) -> Bool {
        return header
            .split(separator: ";", omittingEmptySubsequences: false)
            .dropFirst()
            .contains { $0.lowercased() == "base64" }
    }

    /// Decode `%XX` escapes in `text` into raw bytes. Every other byte is
    /// copied through as its UTF-8 encoding; a `%` that is not followed by two
    /// hex digits is kept literally rather than rejected.
    private static func percentDecodedBytes(_ text: String) -> Data {
        let source = Array(text.utf8)
        var decoded = Data(capacity: source.count)
        var cursor = 0
        while cursor < source.count {
            let byte = source[cursor]
            if byte == UInt8(ascii: "%"),
               cursor + 2 < source.count,
               let high = hexValue(source[cursor + 1]),
               let low = hexValue(source[cursor + 2]) {
                decoded.append((high << 4) | low)
                cursor += 3
            } else {
                decoded.append(byte)
                cursor += 1
            }
        }
        return decoded
    }

    /// Numeric value (0–15) of an ASCII hex digit, or nil for any other byte.
    private static func hexValue(_ byte: UInt8) -> UInt8? {
        switch byte {
        case UInt8(ascii: "0")...UInt8(ascii: "9"):
            return byte - UInt8(ascii: "0")
        case UInt8(ascii: "a")...UInt8(ascii: "f"):
            return byte - UInt8(ascii: "a") + 10
        case UInt8(ascii: "A")...UInt8(ascii: "F"):
            return byte - UInt8(ascii: "A") + 10
        default:
            return nil
        }
    }

    /// Fetch over HTTP(S) with a 30-second timeout. Uses the older
    /// dataTask + continuation pattern so we still work on iOS 14
    /// (the package's deployment target); `URLSession.data(for:)` is
    /// iOS 15+.
    ///
    /// - Throws: The transport error reported by `URLSession`, or
    ///   `httpError(status:)` for a non-2xx HTTP response.
    private static func resolveHTTP(_ url: URL) async throws -> Data {
        var request = URLRequest(url: url)
        request.timeoutInterval = 30
        return try await withCheckedThrowingContinuation { continuation in
            // Each path below resumes the continuation once and then returns.
            let task = URLSession.shared.dataTask(with: request) { data, response, error in
                if let error = error {
                    continuation.resume(throwing: error)
                    return
                }
                if let http = response as? HTTPURLResponse, !(200...299).contains(http.statusCode) {
                    continuation.resume(throwing: ImageSourceError.httpError(status: http.statusCode))
                    return
                }
                // A successful response with no body resolves to empty bytes.
                continuation.resume(returning: data ?? Data())
            }
            task.resume()
        }
    }
}
@@ -0,0 +1,59 @@
1
+ // Internal/LlamaCppBridgeProtocol.swift
2
+ import Foundation
3
+ #if !COCOAPODS
4
+ import DVAILlamaCoreObjC
5
+ #endif
6
+
7
/// Abstraction over the ObjC++ `LlamaCppBridge`, introduced as a test seam:
/// `LlamaHandlers` depends on this protocol rather than the concrete class, so
/// unit tests can plug in a canned-response fake and never load a real GGUF
/// model. Follows the same pattern as `ImageDecoderProtocol`, used by Task 35's
/// `ContentPartsTranslator`.
///
/// The throwing requirements rely on Swift's automatic NSError bridging: the
/// ObjC selectors of the form `(NSString *) … error:(NSError **)` are imported
/// as `throws -> String` / `throws -> [NSNumber]`.
protocol LlamaCppBridgeProtocol: AnyObject {
    // MARK: Text inference

    /// Loaded-state flag exposed by the bridge.
    // NOTE(review): presumably true once a model is loaded — confirm against
    // the ObjC header.
    var isLoaded: Bool { get }

    /// Run a text-only completion for `prompt` with the given sampling settings.
    func completePrompt(_ prompt: String, maxTokens: Int32, temperature: Float, topP: Float) throws -> String

    /// Compute the embedding vector for `text`.
    func embedding(_ text: String) throws -> [NSNumber]

    // MARK: Multimodal projector (mmproj) lifecycle — Phase 2A Pass 2

    /// Whether a multimodal projector is currently loaded.
    var isMmprojLoaded: Bool { get }

    /// Load the multimodal projector file located at `path`.
    func loadMmproj(atPath path: String) throws

    /// Unload the multimodal projector.
    func unloadMmproj()

    /// Whether the loaded model declares an audio encoder (`mtmd_support_audio`).
    /// Always false when mmproj is not loaded.
    func hasAudioEncoder() -> Bool

    // MARK: Chat template + multimodal completion — Phase 2A Pass 2

    /// Apply `llama_chat_apply_template` and return the rendered prompt string.
    /// A nil or empty `templateOverride` falls back to the model's bundled chat
    /// template. Every message dictionary must provide `role` and `content`
    /// string entries.
    func applyChatTemplate(_ templateOverride: String?, messages: [[String: String]], addAssistant: Bool) throws -> String

    /// Multimodal completion. `prompt` must contain exactly `media.count`
    /// `<__media__>` markers; each media blob is auto-detected as image
    /// (PNG/JPEG/etc.) or audio (WAV/MP3/FLAC).
    func completeMultimodalPrompt(_ prompt: String, media: [Data], maxTokens: Int32, temperature: Float, topP: Float) throws -> String
}

// The ObjC class `LlamaCppBridge` satisfies every requirement through its
// imported selectors: the four Pass 2 additions as well as the members that
// already conformed in Pass 1 (completePrompt, embedding, loadMmproj,
// isMmprojLoaded). No Swift-side implementation is needed.
extension LlamaCppBridge: LlamaCppBridgeProtocol {}
@@ -0,0 +1,422 @@
1
+ // Internal/LlamaHandlers.swift
2
+ import Foundation
3
+ #if !COCOAPODS
4
+ import DVAILlamaCoreObjC
5
+ #endif
6
+ #if !COCOAPODS
7
+ import DVAISharedCore
8
+ #endif
9
+
10
+ /// OpenAI-compatible handler set for the llama backend. Wires
11
+ /// `ContentPartsTranslator` → `bridge.completePrompt` → OpenAI response shape
12
+ /// per spec §6 + §8.
13
+ ///
14
+ /// Phase 1 scope (all `false` until Phase 2 lands the corresponding loaders):
15
+ /// - `mmprojLoaded`: true once a multimodal projector is loaded; gates image parts.
16
+ /// - `modelHasAudioEncoder`: true once a model with native audio is loaded; gates audio parts.
17
+ /// - `embeddingMode`: mirrored from the start opts; gates POST /v1/embeddings.
18
+ ///
19
+ /// Streaming: SSE chunks are emitted in 4 frames (role / content / finish /
20
+ /// `[DONE]`). Telegraph 0.40 buffers the whole SSE body server-side anyway, so
21
+ /// 4-chunk vs 1-chunk is identical to the client. Real per-token streaming
22
+ /// lands when Telegraph (or its replacement) supports chunked-encoding flush.
23
+ ///
24
+ /// Note: this 4-frame shape with a separate empty-delta finish frame matches
25
+ /// `FoundationHandlers` (iOS, capacitor-foundation) but intentionally differs
26
+ /// from `MediaPipeHandlers` (Android, capacitor-mediapipe), which folds
27
+ /// `finish_reason: "stop"` onto its final content delta and emits a variable
28
+ /// number of frames. See `MediaPipeHandlers`' "Streaming envelope parity"
29
+ /// KDoc section, and `docs/development/handler-parity.md`, for the full
30
+ /// comparison.
31
+ ///
32
+ /// All bridge-touching paths are serialized via `bridgeLock` because
33
+ /// llama.cpp's `llama_context` is not thread-safe; concurrent requests
34
+ /// would corrupt the shared KV cache.
35
+ public final class LlamaHandlers: DVAIHandlers, @unchecked Sendable {
36
+ private let bridge: LlamaCppBridgeProtocol
37
+ private let bridgeLock = NSLock()
38
+ private let modelId: String
39
+ private let mmprojLoaded: Bool
40
+ private let modelHasAudioEncoder: Bool
41
+ private let embeddingMode: Bool
42
+ private let chatTemplate: String?
43
+ private let translator: ContentPartsTranslator
44
+
45
/// Public entry point used by `PluginState`. Accepts the concrete
/// `LlamaCppBridge` and forwards it, typed as the protocol, to the internal
/// initializer — the one tests call directly with a fake bridge. No translator
/// is passed, so the internal initializer builds the default one.
public convenience init(
    bridge: LlamaCppBridge,
    modelId: String,
    mmprojLoaded: Bool = false,
    modelHasAudioEncoder: Bool = false,
    embeddingMode: Bool = false,
    chatTemplate: String? = nil
) {
    self.init(bridgeProtocol: bridge,
              modelId: modelId,
              mmprojLoaded: mmprojLoaded,
              modelHasAudioEncoder: modelHasAudioEncoder,
              embeddingMode: embeddingMode,
              chatTemplate: chatTemplate)
}
64
+
65
/// Internal initializer that takes the bridge as a protocol existential so
/// tests can inject a mock; the public convenience initializer forwards here.
/// When `translator` is nil, a `ContentPartsTranslator` is created from the
/// `mmprojLoaded` / `modelHasAudioEncoder` flags.
init(
    bridgeProtocol: LlamaCppBridgeProtocol,
    modelId: String,
    mmprojLoaded: Bool = false,
    modelHasAudioEncoder: Bool = false,
    embeddingMode: Bool = false,
    chatTemplate: String? = nil,
    translator: ContentPartsTranslator? = nil
) {
    // Prefer the injected translator; otherwise derive one from the same
    // capability flags stored on the handler.
    if let injected = translator {
        self.translator = injected
    } else {
        self.translator = ContentPartsTranslator(
            mmprojLoaded: mmprojLoaded,
            modelHasAudioEncoder: modelHasAudioEncoder
        )
    }
    self.chatTemplate = chatTemplate
    self.embeddingMode = embeddingMode
    self.modelHasAudioEncoder = modelHasAudioEncoder
    self.mmprojLoaded = mmprojLoaded
    self.modelId = modelId
    self.bridge = bridgeProtocol
}
87
+
88
+ // MARK: - /v1/chat/completions
89
+
90
/// Handle `POST /v1/chat/completions`.
///
/// Translates the OpenAI-style `messages` into a prompt plus media list,
/// renders the chat template on the bridge, runs a text-only or multimodal
/// completion, and wraps the result either in a `chat.completion` JSON body
/// or, when `stream` is true, in a 4-frame SSE stream.
///
/// - Parameters:
///   - body: Decoded JSON request body. Reads `messages` (required),
///     `max_tokens` (default 256, clamped to the `Int32` range),
///     `temperature` (default 1.0), `top_p` (default 1.0) and `stream`
///     (default false).
///   - ctx: Handler context; not consulted by this handler.
/// - Returns: `.error(400, …)` when `messages` is missing or malformed, the
///   status chosen by `translatorErrorToStatus` when the translator rejects
///   the request, `.error(500, …)` when the bridge throws, otherwise
///   `.json(200, …)` or `.sse(…)`.
/// - Throws: Any non-`TranslatorError` error thrown by the translator.
public func handleChatCompletion(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
    guard let messages = body["messages"] as? [[String: Any]] else {
        return .error(400, "Missing 'messages' field")
    }

    let promptInput: LlamaPromptInput
    do {
        promptInput = try await translator.translate(messages: messages)
    } catch let e as TranslatorError {
        return .error(translatorErrorToStatus(e), translatorErrorMessage(e))
    }

    // TODO(strict-mode): currently silently defaults if max_tokens/temperature/top_p
    // arrive as strings instead of numbers; OpenAI rejects this with 400.
    let requestedMaxTokens = body["max_tokens"] as? Int ?? 256
    // `max_tokens` is client-controlled. `Int32(_:)` traps (crashing the whole
    // process) on values outside the Int32 range, so clamp instead.
    let maxTokens = Int32(clamping: requestedMaxTokens)
    let temperature = Float(body["temperature"] as? Double ?? 1.0)
    let topP = Float(body["top_p"] as? Double ?? 1.0)
    let stream = body["stream"] as? Bool ?? false

    // Render the chat template. The bridge falls back to the model's
    // bundled tokenizer.chat_template when our override is nil/empty.
    // Marker positions inside content fields are preserved by the
    // translator, so the rendered prompt has N <__media__> markers
    // matching media.count in declaration order.
    let chatPrompt: String
    do {
        chatPrompt = try runOnBridge {
            try bridge.applyChatTemplate(
                chatTemplate,
                messages: promptInput.messagesWithMarkers.map { ["role": $0.role, "content": $0.content] },
                addAssistant: true
            )
        }
    } catch {
        return .error(500, "chat template apply failed: \(error.localizedDescription)")
    }

    // Text-only requests take the plain completion path; anything carrying
    // media goes through the multimodal entry point.
    let completion: String
    do {
        if promptInput.media.isEmpty {
            completion = try runOnBridge {
                try bridge.completePrompt(
                    chatPrompt,
                    maxTokens: maxTokens,
                    temperature: temperature,
                    topP: topP
                )
            }
        } else {
            completion = try runOnBridge {
                try bridge.completeMultimodalPrompt(
                    chatPrompt,
                    media: promptInput.media,
                    maxTokens: maxTokens,
                    temperature: temperature,
                    topP: topP
                )
            }
        }
    } catch {
        return .error(500, error.localizedDescription)
    }

    let id = "chatcmpl-\(UUID().uuidString.prefix(24).lowercased())"
    let created = Int(Date().timeIntervalSince1970)

    if stream {
        // 4-chunk SSE: role delta, content delta with full body, finish, [DONE].
        let chunks = buildChatStreamChunks(
            id: id,
            created: created,
            completion: completion
        )
        let asyncStream = AsyncStream<String> { continuation in
            for chunk in chunks { continuation.yield(chunk) }
            continuation.finish()
        }
        return .sse(asyncStream)
    }

    // Token accounting is not reported here: every usage counter is zero.
    let response: [String: Any] = [
        "id": id,
        "object": "chat.completion",
        "created": created,
        "model": modelId,
        "choices": [[
            "index": 0,
            "message": ["role": "assistant", "content": completion],
            "finish_reason": "stop",
        ] as [String: Any]],
        "usage": ["prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0],
    ]
    return .json(200, response)
}
184
+
185
+ // MARK: - /v1/completions (legacy)
186
+
187
/// Handle the legacy `POST /v1/completions` endpoint by delegating to
/// `handleChatCompletion` and converting the result to the legacy shape.
///
/// `prompt` may be a string, an array of strings (joined with newlines), or
/// absent (treated as an empty prompt); anything else yields a 400. The
/// prompt becomes a single `user` message. Successful JSON responses are
/// converted with `chatToLegacyCompletion`, SSE frames with
/// `adaptChunkToLegacy`; every other response is returned unchanged.
public func handleCompletion(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
    let prompt: String
    switch body["prompt"] {
    case let text as String:
        prompt = text
    case let parts as [String]:
        prompt = parts.joined(separator: "\n")
    case nil:
        prompt = ""
    default:
        return .error(400, "'prompt' must be a string or array of strings")
    }

    // Re-shape the request as a one-message chat request.
    var chatBody = body
    chatBody.removeValue(forKey: "prompt")
    chatBody["messages"] = [["role": "user", "content": prompt]]

    let chatResp = try await handleChatCompletion(body: chatBody, ctx: ctx)

    if case .json(let status, let payload) = chatResp {
        // Only a 200 with a dictionary body is converted; the rest passes through.
        guard status == 200, let chat = payload as? [String: Any] else {
            return chatResp
        }
        return .json(200, chatToLegacyCompletion(chat))
    }

    // Anything that is not a stream (i.e. an error) is handed back untouched.
    guard case .sse(let chatStream) = chatResp else {
        return chatResp
    }

    let model = (body["model"] as? String) ?? modelId
    let legacyStream = AsyncStream<String> { continuation in
        // Forward each chat frame, rewritten to the legacy chunk shape.
        Task {
            for await chunk in chatStream {
                continuation.yield(adaptChunkToLegacy(chunk, model: model))
            }
            continuation.finish()
        }
    }
    return .sse(legacyStream)
}
226
+
227
+ // MARK: - /v1/embeddings
228
+
229
/// Handle `POST /v1/embeddings`.
///
/// Returns a 400 unless the handler was created with `embeddingMode` on.
/// `input` may be a single string or an array of strings. Each input is
/// embedded through the bridge in order; the first bridge failure aborts the
/// request with a 500. Usage counters in the response are always zero.
public func handleEmbeddings(body: [String: Any], ctx: HandlerContext) async throws -> HandlerResponse {
    guard embeddingMode else {
        return .error(400, "Embeddings require nativeEmbeddingMode: true at start time.")
    }

    let inputs: [String]
    switch body["input"] {
    case let single as String:
        inputs = [single]
    case let many as [String]:
        inputs = many
    default:
        return .error(400, "Missing or malformed 'input' field")
    }

    var rows: [[String: Any]] = []
    for index in inputs.indices {
        let vector: [NSNumber]
        do {
            vector = try runOnBridge { try bridge.embedding(inputs[index]) }
        } catch {
            return .error(500, error.localizedDescription)
        }
        rows.append([
            "object": "embedding",
            "embedding": vector.map(\.doubleValue),
            "index": index,
        ])
    }

    let response: [String: Any] = [
        "object": "list",
        "data": rows,
        "model": modelId,
        "usage": ["prompt_tokens": 0, "total_tokens": 0],
    ]
    return .json(200, response)
}
261
+
262
+ // MARK: - /v1/models
263
+
264
/// Handle `GET /v1/models`: reports a one-element model list whose id is
/// taken from the handler context (`ctx.modelId`).
public func handleModels(ctx: HandlerContext) async throws -> HandlerResponse {
    let entry: [String: Any] = [
        "id": ctx.modelId,
        "object": "model",
        "owned_by": "dvai-bridge",
    ]
    let listing: [String: Any] = [
        "object": "list",
        "data": [entry],
    ]
    return .json(200, listing)
}
270
+
271
+ // MARK: - Helpers
272
+
273
/// Map a translator failure to an HTTP status code: 502 when fetching an
/// image failed, 400 for every other translator error.
private func translatorErrorToStatus(_ e: TranslatorError) -> Int {
    if case .imageFetchFailed = e {
        return 502
    }
    return 400
}
279
+
280
/// Produce the client-facing error message for a translator failure.
/// The switch is deliberately exhaustive (no `default`) so that adding a
/// `TranslatorError` case forces a message to be written for it.
private func translatorErrorMessage(_ e: TranslatorError) -> String {
    let message: String
    switch e {
    case .noMmprojForImage:
        message = "Request includes an image but no mmproj was loaded. Set nativeMmprojPath when starting."
    case .audioWithoutAudioEncoder:
        message = "Loaded model has no native audio encoder. Use a multimodal model like Gemma 4 or Phi-4 Multimodal."
    case .unsupportedAudioFormat(let format, let supportedFormats):
        message = "Unsupported audio format: \(format). Supported on this platform: \(supportedFormats.joined(separator: ", "))."
    case .audioDecodeFailed(let detail):
        message = "Audio decode failed: \(detail)"
    case .imageFetchFailed(let detail):
        message = "Failed to fetch image: \(detail)"
    case .malformedRequest(let detail):
        message = detail
    }
    return message
}
296
+
297
// Server-side buffering: Telegraph 0.40 gathers the whole AsyncStream before
// flushing the response, so clients cannot tell 4-chunk emission from a
// single chunk.
/// Build the SSE frames for a streaming chat.completion response, in
/// protocol order: role delta, content delta (the full completion text),
/// finish frame with an empty delta, then `[DONE]`. Every entry is a complete
/// `data: <json>\n\n` (or `data: [DONE]\n\n`) frame. A JSON frame that fails
/// to serialize is omitted.
private func buildChatStreamChunks(id: String, created: Int, completion: String) -> [String] {
    // The three JSON frames differ only in their single `choices` entry.
    let choices: [[String: Any]] = [
        ["index": 0, "delta": ["role": "assistant"]],
        ["index": 0, "delta": ["content": completion]],
        ["index": 0, "delta": [:] as [String: Any], "finish_reason": "stop"],
    ]

    var frames = choices.compactMap { choice -> String? in
        let envelope: [String: Any] = [
            "id": id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": modelId,
            "choices": [choice],
        ]
        return serialize(envelope).map { "data: \($0)\n\n" }
    }

    frames.append("data: [DONE]\n\n")
    return frames
}
345
+
346
/// Encode `obj` as a compact JSON string.
///
/// - Parameter obj: The value to encode; it must be a JSON-compatible array
///   or dictionary to produce a result.
/// - Returns: The JSON text, or nil when `obj` is not a valid JSON object or
///   encoding fails.
private func serialize(_ obj: Any) -> String? {
    // `data(withJSONObject:)` does not *throw* for inputs that cannot be
    // represented as JSON — it raises an exception that `try?` cannot
    // intercept. Validate first so this function honours its optional
    // return contract instead of crashing.
    guard JSONSerialization.isValidJSONObject(obj),
          let data = try? JSONSerialization.data(withJSONObject: obj, options: []) else {
        return nil
    }
    return String(data: data, encoding: .utf8)
}
353
+
354
/// Re-shape a `chat.completion` JSON body as a legacy `text_completion` body.
/// Counterpart of `chatToLegacyCompletion()` in `packages/dvai-bridge-core`.
///
/// The id has its `chatcmpl-` prefix swapped for `cmpl-` (or is synthesized
/// from the current time when missing/empty). Each choice's message content
/// becomes `text`. Missing `created`, `model`, `usage`, `index` and
/// `finish_reason` fields are filled with defaults.
private func chatToLegacyCompletion(_ chat: [String: Any]) -> [String: Any] {
    let legacyId: String
    if let chatId = chat["id"] as? String, !chatId.isEmpty {
        legacyId = chatId.replacingOccurrences(of: "chatcmpl-", with: "cmpl-")
    } else {
        legacyId = "cmpl-\(Int(Date().timeIntervalSince1970))"
    }

    var legacyChoices: [[String: Any]] = []
    for choice in (chat["choices"] as? [[String: Any]]) ?? [] {
        let message = choice["message"] as? [String: Any]
        let text = (message?["content"] as? String) ?? ""
        let index: Any = choice["index"] ?? 0
        let finishReason: Any = choice["finish_reason"] ?? "stop"
        legacyChoices.append([
            "text": text,
            "index": index,
            "finish_reason": finishReason,
            "logprobs": NSNull(),
        ])
    }

    let created: Any = chat["created"] ?? Int(Date().timeIntervalSince1970)
    let model: Any = chat["model"] ?? modelId
    let usage: Any = chat["usage"] ?? ["prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0]

    return [
        "id": legacyId,
        "object": "text_completion",
        "created": created,
        "model": model,
        "choices": legacyChoices,
        "usage": usage,
    ]
}
378
+
379
/// Rewrite one SSE frame from the `chat.completion.chunk` shape to the
/// `text_completion.chunk` shape. `[DONE]` is re-emitted as a normalized
/// `data: [DONE]\n\n` frame. Input that is not a `data:` frame, does not
/// parse as a JSON object, or cannot be re-serialized is returned unchanged.
///
/// - Parameters:
///   - chunk: A complete SSE frame as produced by the chat handler.
///   - model: Model name to use when the frame itself carries none.
private func adaptChunkToLegacy(_ chunk: String, model: String) -> String {
    let frame = chunk.trimmingCharacters(in: .whitespacesAndNewlines)
    guard frame.hasPrefix("data:") else { return chunk }

    let payload = String(frame.dropFirst("data:".count)).trimmingCharacters(in: .whitespacesAndNewlines)
    guard payload != "[DONE]" else { return "data: [DONE]\n\n" }

    guard let data = payload.data(using: .utf8),
          let parsed = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
        return chunk
    }

    // Each chat delta becomes a legacy choice carrying the delta's text.
    let chatChoices = (parsed["choices"] as? [[String: Any]]) ?? []
    let legacyChoices = chatChoices.map { choice -> [String: Any] in
        let delta = choice["delta"] as? [String: Any]
        let text = (delta?["content"] as? String) ?? ""
        let index: Any = choice["index"] ?? 0
        let finishReason: Any = choice["finish_reason"] ?? NSNull()
        return [
            "text": text,
            "index": index,
            "finish_reason": finishReason,
            "logprobs": NSNull(),
        ]
    }

    let legacyId = ((parsed["id"] as? String) ?? "").replacingOccurrences(of: "chatcmpl-", with: "cmpl-")
    let created: Any = parsed["created"] ?? Int(Date().timeIntervalSince1970)
    let modelName: Any = parsed["model"] ?? model
    let legacy: [String: Any] = [
        "id": legacyId,
        "object": "text_completion.chunk",
        "created": created,
        "model": modelName,
        "choices": legacyChoices,
    ]

    guard let json = serialize(legacy) else { return chunk }
    return "data: \(json)\n\n"
}
412
+ }
413
+
414
private extension LlamaHandlers {
    /// Run `block` while holding `bridgeLock`, so that bridge calls from
    /// concurrent requests execute one at a time and cannot corrupt the
    /// shared `llama_context` KV cache. The lock is released before the
    /// result is returned or the error is rethrown.
    func runOnBridge<T>(_ block: () throws -> T) throws -> T {
        bridgeLock.lock()
        // Capture success or failure first so the unlock happens on both paths.
        let outcome = Result { try block() }
        bridgeLock.unlock()
        return try outcome.get()
    }
}