@dvai-bridge/ios-llama-core 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,51 @@
1
+ # Deep Voice Ai Limited - Software License Agreement
2
+
3
+ **Version 1.0.0**
4
+
5
+ This License Agreement governs the use of the DVAI-Bridge software (the "Software"). By downloading, installing, or using the Software, you agree to be bound by the terms of this License.
6
+
7
+ ---
8
+
9
+ ## 1. LICENSE GRANTS
10
+
11
+ ### 1.1 Development and Personal Use (Free Tier)
12
+ Deep Voice Ai Limited ("Licensor") grants you a non-exclusive, non-transferable, royalty-free license to use the Software solely for:
13
+ - Internal development and testing purposes.
14
+ - Non-commercial personal projects.
15
+ - Academic and non-profit research.
16
+
17
+ ### 1.2 Commercial Use (Paid Tier)
18
+ Any use of the Software for **Commercial Purposes** requires a separate, paid Commercial License from Licensor. "Commercial Purposes" include:
19
+ - Use in production environments.
20
+ - Integration into revenue-generating products or services.
21
+ - Distribution to third-party customers for a fee.
22
+ - Use by an entity with more than $100,000 USD in annual revenue.
23
+
24
+ To obtain a Commercial License, contact `info@deepvoiceai.co` or visit `https://deepvoiceai.co/licensing`.
25
+
26
+ ---
27
+
28
+ ## 2. RESTRICTIONS
29
+ Except as expressly permitted, you may not:
30
+ - Sublicense, rent, lease, or resell the Software without express permission.
31
+ - Remove any proprietary notices or branding from the Software.
32
+ - Use the Software for any illegal or malicious purposes.
33
+
34
+ ---
35
+
36
+ ## 3. INTELLECTUAL PROPERTY
37
+ The Software is owned by **Deep Voice Ai Limited** and is protected by copyright and intellectual property laws. This agreement does not transfer ownership of the Software.
38
+
39
+ ---
40
+
41
+ ## 4. NO WARRANTY
42
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. IN NO EVENT SHALL THE LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE.
43
+
44
+ ---
45
+
46
+ ## 5. GOVERNING LAW
47
+ This License shall be governed by and construed in accordance with the laws of the jurisdiction where Deep Voice Ai Limited is registered.
48
+
49
+ ---
50
+
51
+ © 2026 Deep Voice Ai Limited. All rights reserved.
package/Package.swift ADDED
@@ -0,0 +1,71 @@
1
+ // swift-tools-version: 5.9
2
+ import PackageDescription
3
+
// DVAISharedCore carries the shared HTTP-server / handler-dispatch types that
// every backend core re-uses. Both the Swift target and its tests consume the
// same product, so the dependency is spelled out once here.
let sharedCoreProduct: Target.Dependency = .product(
    name: "DVAISharedCore",
    package: "dvai-bridge-ios-shared-core"
)

let package = Package(
    name: "DVAILlamaCore",
    // Minimum OS is iOS 17 / macOS 14 since v3.2.0: DVAISharedCore switched
    // to Hummingbird (swift-nio), which imposes that floor. The Telegraph-era
    // releases supported iOS 14 / macOS 12.
    platforms: [
        .iOS(.v17),
        .macOS(.v14),
    ],
    products: [
        .library(name: "DVAILlamaCore", targets: ["DVAILlamaCore"]),
        .library(name: "DVAILlamaCoreObjC", targets: ["DVAILlamaCoreObjC"]),
    ],
    dependencies: [
        // Path dependency; its SPM identity is "dvai-bridge-ios-shared-core".
        // As of v3.2.0 it pulls in Hummingbird transitively — Telegraph is no
        // longer the iOS HTTP server backbone.
        .package(path: "../dvai-bridge-ios-shared-core"),
    ],
    targets: [
        // Prebuilt xcframeworks. They are produced by upstream's
        // build-xcframework.sh and materialized by
        // `bash scripts/mac-side-prepare-xcframework.sh`. They are gitignored,
        // so CI has to run the prepare step before any iOS job. The submodule
        // and the xcframeworks live in dvai-bridge-android-llama-core
        // (Phase 3A Task 9 relocation).
        .binaryTarget(
            name: "llama",
            path: "../dvai-bridge-android-llama-core/android/src/main/cpp/native/llama.cpp/build-apple/llama.xcframework"
        ),
        .binaryTarget(
            name: "mtmd",
            path: "../dvai-bridge-android-llama-core/android/src/main/cpp/native/llama.cpp/build-apple/mtmd.xcframework"
        ),
        // ObjC++ bridge target.
        //
        // This manifest sits at the package root rather than under `ios/`,
        // which is why target paths carry the `ios/` prefix. SPM derives a
        // path dependency's identity from the last path component; if the
        // manifest lived at `<pkg>/ios/Package.swift` like some sibling
        // packages, "ios" would alias several packages and produce a false
        // cyclic-dependency error.
        .target(
            name: "DVAILlamaCoreObjC",
            dependencies: ["llama", "mtmd"],
            path: "ios/Sources/DVAILlamaCoreObjC",
            publicHeadersPath: "include",
            linkerSettings: [
                .linkedFramework("Foundation"),
            ]
        ),
        // Swift target: builds on the ObjC++ bridge plus the shared HTTP
        // types extracted into DVAISharedCore.
        .target(
            name: "DVAILlamaCore",
            dependencies: [
                "DVAILlamaCoreObjC",
                sharedCoreProduct,
            ],
            path: "ios/Sources/DVAILlamaCore"
        ),
        .testTarget(
            name: "DVAILlamaCoreTests",
            dependencies: [
                "DVAILlamaCore",
                "DVAILlamaCoreObjC",
                sharedCoreProduct,
            ],
            path: "ios/Tests/DVAILlamaCoreTests"
        ),
    ],
    cxxLanguageStandard: .cxx17
)
package/README.md ADDED
@@ -0,0 +1,199 @@
1
+ ![DVAI-Bridge](/assets/banner.png)
2
+
3
+ # DVAI-Bridge
4
+
5
+ <!-- [![Smoke — real models](https://github.com/Westenets/dvai-bridge/actions/workflows/smoke-real-models.yml/badge.svg?branch=main)](https://github.com/Westenets/dvai-bridge/actions/workflows/smoke-real-models.yml) -->
6
+
7
+ [![License](https://img.shields.io/badge/License-Commercial-blue.svg)](LICENSE) ![Node.js](https://img.shields.io/badge/Node.js-22+-green?logo=node.js) ![TypeScript](https://img.shields.io/badge/TypeScript-5.6+-blue?logo=typescript) ![Swift](https://img.shields.io/badge/Swift-5.9+-F05138?logo=swift) ![Kotlin](https://img.shields.io/badge/Kotlin-2.0+-7F52FF?logo=kotlin) ![Flutter](https://img.shields.io/badge/Flutter-3.39+-02569B?logo=flutter) ![.NET](https://img.shields.io/badge/.NET-10.0_LTS-512BD4?logo=dotnet)
8
+
9
+ > **The local OpenAI server you embed inside your app.**
10
+ > One library. One HTTP wire. Every platform. Zero install for your users.
11
+
12
+ **Docs:** [dvai-bridge.deepvoiceai.co](https://dvai-bridge.deepvoiceai.co)
13
+
14
+ ```ts
15
+ import { DVAI } from "@dvai-bridge/core";
16
+ import OpenAI from "openai";
17
+
18
+ const dvai = new DVAI({ backend: "transformers" });
19
+ await dvai.initialize();
20
+
21
+ const openai = new OpenAI({ baseURL: dvai.baseUrl, apiKey: "ignored" });
22
+ await openai.chat.completions.create({
23
+ model: dvai.transformersModelId,
24
+ messages: [{ role: "user", content: "Hello!" }],
25
+ });
26
+ ```
27
+
28
+ That's it. A real OpenAI-compatible server is now running inside your app's
29
+ own process. Point any OpenAI client — LangChain, the OpenAI SDK, the Vercel
30
+ AI SDK, anything — at `dvai.baseUrl` and your agent code keeps working.
31
+
32
+ Built by **[Deep Voice AI](https://deepvoiceai.co)**.
33
+
34
+ ---
35
+
36
+ ## Why it exists
37
+
38
+ Local AI works beautifully on a laptop with **Ollama + LangChain**. Then you
39
+ try to ship the app and your users don't have Ollama. Mobile can't run it.
40
+ Corporate IT won't add another daemon. So you reinvent the same plumbing —
41
+ spawn an inference engine, bind a port, translate to OpenAI HTTP, handle
42
+ CORS, manage lifecycle, wrap the accelerator of the day per platform — and
43
+ do it all over again for every target OS.
44
+
45
+ DVAI-Bridge is that plumbing, packaged as a library, for every client
46
+ platform.
47
+
48
+ ---
49
+
50
+ ## What you get
51
+
52
+ - **One OpenAI HTTP surface.** Bound on `127.0.0.1` (or `0.0.0.0` for
53
+ device-to-device). Streaming, embeddings, models, recovery — all built in.
54
+ - **Six SDKs.** `@dvai-bridge/core` + `react` + `vanilla` + `capacitor`,
55
+ `DVAIBridge` (Swift / iOS), `co.deepvoiceai:dvai-bridge` (Kotlin / Android),
56
+ `@dvai-bridge/react-native`, `dvai_bridge` (Flutter), `co.deepvoiceai.dvai-bridge` (.NET).
57
+ - **Nine backends.** WebLLM, Transformers.js, llama.cpp, Apple Foundation
58
+ Models, MLX, CoreML / ANE, MediaPipe LLM, LiteRT, ONNX Runtime GenAI —
59
+ selected per-platform, invisible to your agent code.
60
+ - **Native acceleration** wherever it runs: WebGPU in browsers, CUDA / Metal
61
+ / Vulkan / DirectML on desktop, ANE / Metal / MLX on iOS, NNAPI / QNN
62
+ Hexagon / GPU delegate on Android.
63
+ - **Multimodal.** Text, image, audio, video — declarative loader for
64
+ cutting-edge models (Gemma 4, LLaVA, Idefics) without waiting for library
65
+ updates.
66
+ - **Distributed inference (v3.0+).** Phone too slow? Offload to your laptop
67
+ on the same Wi-Fi via mDNS pairing — same OpenAI wire, transparent to
68
+ your code. Internet path via a self-hostable rendezvous server.
69
+ - **DVAI Hub (v3.1+).** A first-party desktop utility that turns any device
70
+ into a strong-peer for the rest of your fleet. Brand-neutral install via
71
+ Homebrew / winget / GitHub Releases, OR fork it for your own branded
72
+ companion. Routes through Ollama / LM Studio / vLLM / llama-server /
73
+ llamafile if you've already got those running.
74
+ - **Zero user install.** It's a library, not a daemon. `npm install`,
75
+ `cocoapods`, gradle — your CI already has the muscle for it.
76
+
77
+ ---
78
+
79
+ ## Supported platforms
80
+
81
+ | Stack | Package | Backends |
82
+ | --- | --- | --- |
83
+ | Browser (React, Vue, Svelte, vanilla JS) | `@dvai-bridge/core` + `react` / `vanilla` | WebLLM (WebGPU), Transformers.js (WebGPU / WASM SIMD) |
84
+ | Node / Bun / Electron | `@dvai-bridge/core` | Transformers.js, native llama.cpp |
85
+ | Capacitor hybrid mobile | `@dvai-bridge/capacitor` + backend slice | Native llama.cpp (Metal iOS, Vulkan / CPU Android) |
86
+ | iOS native (Swift) | `DVAIBridge` (SPM / CocoaPods) | llama.cpp (Metal), CoreML / ANE, Apple Foundation Models, MLX |
87
+ | Android native (Kotlin / Java) | `co.deepvoiceai:dvai-bridge` (AAR) | llama.cpp, MediaPipe LLM, LiteRT, NNAPI / QNN |
88
+ | React Native (≥0.77, TurboModule) | `@dvai-bridge/react-native` | All iOS + Android backends (delegates) |
89
+ | Flutter (≥3.39) | `dvai_bridge` (pub.dev) | All iOS + Android backends (Pigeon channels) |
90
+ | .NET 10 LTS (MAUI / Avalonia / WinUI / Catalyst / desktop) | `co.deepvoiceai.dvai-bridge*` (NuGet) | iOS / Android delegate to native; desktop = llama.cpp + ONNX Runtime GenAI + ML.NET |
91
+
92
+ Full quickstart per platform: [dvai-bridge.deepvoiceai.co/guide/getting-started](https://dvai-bridge.deepvoiceai.co/guide/getting-started)
93
+
94
+ ---
95
+
96
+ ## Examples
97
+
98
+ ```ts
99
+ // React
100
+ import { DVAIProvider, useDVAI } from "@dvai-bridge/react";
101
+ <DVAIProvider config={{ backend: "transformers" }}>
102
+ <Chat />
103
+ </DVAIProvider>;
104
+ function Chat() {
105
+ const { isReady, baseUrl } = useDVAI();
106
+ return isReady ? <div>Local AI live at {baseUrl}</div> : <Loading />;
107
+ }
108
+ ```
109
+
110
+ ```swift
111
+ // iOS
112
+ let server = try await DVAIBridge.shared.start()
113
+ // server.baseUrl = "http://127.0.0.1:38883/v1"
114
+ ```
115
+
116
+ ```kotlin
117
+ // Android
118
+ val server = DVAIBridge.start(context)
119
+ // server.baseUrl = "http://127.0.0.1:38883/v1"
120
+ ```
121
+
122
+ ```dart
123
+ // Flutter
124
+ final state = await DVAIBridge.instance.start(
125
+ backend: BackendKind.auto,
126
+ modelPath: '/path/to/model.gguf',
127
+ );
128
+ // state.baseUrl = "http://127.0.0.1:38883/v1"
129
+ ```
130
+
131
+ ```csharp
132
+ // .NET
133
+ var server = await DVAIBridge.Shared.StartAsync(new StartOptions {
134
+ Backend = BackendKind.Auto,
135
+ ModelPath = "/path/to/model.gguf",
136
+ });
137
+ // server.BaseUrl = "http://127.0.0.1:38883/v1"
138
+ ```
139
+
140
+ Multimodal, streaming, embeddings, distributed offload, the Hub —
141
+ everything's at the [docs site](https://dvai-bridge.deepvoiceai.co).
142
+
143
+ ---
144
+
145
+ ## What's new in v3.1
146
+
147
+ - **DVAI Hub** — Tauri desktop utility that's the strong-peer side of v3
148
+ distributed inference. `brew install deepvoiceai/dvai-hub/dvai-hub` (or
149
+ `winget install DeepVoiceAI.DVAIHub`) → mobile apps on the same Wi-Fi
150
+ pair with it and offload heavy inference. [Guide →](https://dvai-bridge.deepvoiceai.co/guide/dvai-hub)
151
+ - **External-engine bridge.** Hub surfaces Ollama / LM Studio / vLLM /
152
+ llama-server / llamafile as additional backend pools so paired apps
153
+ serve from whatever's already cached. Opt-in per engine.
154
+ - **Strict substitution policy.** Models with mismatched family / version /
155
+ size / type are refused by default; quant-only mismatches gated behind a
156
+ per-pairing `preferBetterQuant` flag. No silent mis-routing.
157
+ - **HMAC-signed identity** on `/v1/chat/completions`. Per-app audit logs
158
+ surface who served what, with structured `(appId, peerDeviceId,
159
+ engine, requestedModel, servedModel, outcome)` rows.
160
+ - **Library finalization.** `httpBindHost` (LAN bind), `chatCompletionInterceptor`
161
+ (extension point), HMAC primitives re-exported, `/v1/dvai/*` routes
162
+ actually dispatched, TransformersBackend Node-mode device fix.
163
+ [Migration v3.0 → v3.1 →](https://dvai-bridge.deepvoiceai.co/migration/v3.0-to-v3.1)
164
+
165
+ ---
166
+
167
+ ## Robustness
168
+
169
+ Streaming-correct (SSE passthrough + blank-chunk detection), generation
170
+ timeout, automatic engine-state recovery on fatal errors, port fallback,
171
+ worker offloading, Private Network Access ready, CORS configured. The
172
+ boring substrate so your agent code never has to think about it.
173
+
174
+ ---
175
+
176
+ ## Licensing
177
+
178
+ Dual: **free for development & personal use** on `localhost` (verified at
179
+ runtime). **Commercial use** requires a license key — `info@deepvoiceai.co`.
180
+
181
+ ---
182
+
183
+ ## Contributing
184
+
185
+ PRs welcome.
186
+
187
+ ```bash
188
+ pnpm install
189
+ pnpm build
190
+ bash scripts/build-all.sh # full matrix (auto-skips per-host)
191
+ ```
192
+
193
+ See [`CONTRIBUTING.md`](./CONTRIBUTING.md) for the PR flow. Per-platform
194
+ contributor docs (iOS / Android / RN / Flutter / .NET) live under
195
+ [`docs/development/`](./docs/development/).
196
+
197
+ ---
198
+
199
+ © Deep Voice AI Limited. All rights reserved.
@@ -0,0 +1,112 @@
1
+ import Foundation
2
+ import AVFoundation
3
+
/// Audio encodings that `AudioDecoder.decode(data:format:)` accepts.
///
/// Each case's raw value is its own lowercase name (`"wav"`, `"mp3"`, ...),
/// so `AudioFormat(rawValue:)` can be used to parse a format string.
///
/// `pcm16` denotes input that is already raw little-endian 16 kHz mono PCM16;
/// the decoder returns it untouched. Every other case is decoded through
/// `AVAudioFile` + `AVAudioConverter` into that same target layout.
enum AudioFormat: String {
    case pcm16
    case wav
    case mp3
    case m4a
    case aac
    case flac
}
12
+
/// Decodes supported audio formats to 16 kHz mono PCM16 little-endian samples
/// suitable for feeding into a multimodal projector.
struct AudioDecoder {
    /// Number of frames read from the file, and requested from the converter,
    /// per pass.
    private static let chunkFrames: AVAudioFrameCount = 4096

    /// Decode `data` (encoded in `format`) to 16 kHz mono PCM16 LE samples.
    ///
    /// - Parameters:
    ///   - data: The encoded audio bytes.
    ///   - format: The encoding of `data`. `.pcm16` is a pass-through.
    /// - Returns: Headerless interleaved PCM16 samples (16 kHz, one channel).
    /// - Throws: Any error from writing the temp file, opening it with
    ///   `AVAudioFile`, or converting; or an `NSError` in domain
    ///   `"AudioDecoder"` (codes 1–3) when the output format, converter or
    ///   buffers cannot be created.
    static func decode(data: Data, format: AudioFormat) async throws -> Data {
        switch format {
        case .pcm16:
            return data
        case .wav, .mp3, .m4a, .aac, .flac:
            return try await decodeViaAVAudioFile(data: data)
        }
    }

    /// Writes `data` to a temporary file, opens it with `AVAudioFile`, and
    /// streams it through an `AVAudioConverter` in `chunkFrames`-sized passes.
    private static func decodeViaAVAudioFile(data: Data) async throws -> Data {
        // AVAudioFile requires a file URL, so write to a temp file first.
        let tmpURL = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
        try data.write(to: tmpURL)
        defer { try? FileManager.default.removeItem(at: tmpURL) }

        let inputFile = try AVAudioFile(forReading: tmpURL)
        guard let outputFormat = AVAudioFormat(
            commonFormat: .pcmFormatInt16,
            sampleRate: 16000,
            channels: 1,
            interleaved: true
        ) else {
            throw NSError(
                domain: "AudioDecoder",
                code: 1,
                userInfo: [NSLocalizedDescriptionKey: "Unable to create output format"]
            )
        }
        guard let converter = AVAudioConverter(from: inputFile.processingFormat, to: outputFormat) else {
            throw NSError(
                domain: "AudioDecoder",
                code: 2,
                userInfo: [NSLocalizedDescriptionKey: "Unable to create converter"]
            )
        }
        guard
            let inputBuf = AVAudioPCMBuffer(pcmFormat: inputFile.processingFormat, frameCapacity: chunkFrames),
            let outputBuf = AVAudioPCMBuffer(pcmFormat: outputFormat, frameCapacity: chunkFrames)
        else {
            throw NSError(
                domain: "AudioDecoder",
                code: 3,
                userInfo: [NSLocalizedDescriptionKey: "Unable to allocate buffers"]
            )
        }

        var result = Data()
        while inputFile.framePosition < inputFile.length {
            try inputFile.read(into: inputBuf)
            // A read that yields no frames would never advance the file
            // position; stop instead of spinning forever.
            guard inputBuf.frameLength > 0 else { break }

            // The input callback may be invoked multiple times per
            // `convert(to:error:withInputFrom:)` call. Hand `inputBuf` over
            // exactly once; afterwards report `.noDataNow` ("nothing more for
            // the moment"). `.endOfStream` must NOT be used here: it tells the
            // converter the whole stream is finished, which is only true after
            // the last chunk.
            var consumed = false
            var status: AVAudioConverterOutputStatus
            repeat {
                var error: NSError?
                status = converter.convert(to: outputBuf, error: &error) { _, inputStatus in
                    if consumed {
                        inputStatus.pointee = .noDataNow
                        return nil
                    }
                    consumed = true
                    inputStatus.pointee = .haveData
                    return inputBuf
                }
                if let error = error { throw error }
                appendSamples(from: outputBuf, to: &result)
                // When the source rate is below 16 kHz one input chunk yields
                // more than `chunkFrames` output frames. A completely filled
                // output buffer means more may be pending, so keep pulling
                // until the converter runs dry.
            } while status == .haveData && outputBuf.frameLength == outputBuf.frameCapacity
        }

        // Drain: tell the converter we're done so any buffered tail samples
        // (e.g. AAC priming / codec lookahead) are flushed to outputBuf.
        repeat {
            var drainError: NSError?
            let drainStatus = converter.convert(to: outputBuf, error: &drainError) { _, inputStatus in
                inputStatus.pointee = .endOfStream
                return nil
            }
            // An error on flush is acceptable — some codecs return one when
            // there's nothing left — so it ends the drain instead of throwing.
            guard drainStatus == .haveData, drainError == nil else { break }
            appendSamples(from: outputBuf, to: &result)
        } while outputBuf.frameLength == outputBuf.frameCapacity

        return result
    }

    /// Appends the valid frames of `buffer` (mono, interleaved Int16) to
    /// `output` as raw bytes. Does nothing for an empty buffer.
    private static func appendSamples(from buffer: AVAudioPCMBuffer, to output: inout Data) {
        let frameCount = Int(buffer.frameLength)
        guard frameCount > 0, let channels = buffer.int16ChannelData else { return }
        output.append(
            Data(bytes: UnsafeRawPointer(channels[0]), count: frameCount * MemoryLayout<Int16>.size)
        )
    }
}
@@ -0,0 +1,232 @@
1
+ import Foundation
2
+
/// Placeholder token that marks where an image or audio clip is spliced into
/// a rendered prompt.
///
/// The value is a hard-coded copy of what `mtmd_default_marker()` (declared
/// in `tools/mtmd/mtmd.h`) returns, so this translation unit does not need an
/// FFI call to obtain it.
public let MTMD_MEDIA_MARKER: String = "<__media__>"
7
+
/// A single chat message after translation, in the shape
/// `bridge.applyChatTemplate(...)` consumes.
///
/// Any `image_url` / `input_audio` part of the original message has been
/// replaced in `content` by the `<__media__>` marker. The raw bytes for those
/// parts are carried separately in `LlamaPromptInput.media`, ordered the same
/// way the markers are declared.
struct LlamaTranslatedMessage: Equatable {
    /// The message's `role` string, copied from the request.
    let role: String

    /// Rendered text with media parts replaced by markers.
    let content: String
}
16
+
/// Result of `ContentPartsTranslator.translate(messages:)`: everything the
/// llama.cpp handler passes on to `bridge.applyChatTemplate(...)` and
/// `bridge.completeMultimodalPrompt(...)`.
struct LlamaPromptInput: Equatable {
    /// One entry per source message, in source order, whose content has its
    /// media parts swapped for `<__media__>` markers. Intended to be passed
    /// straight to `applyChatTemplate`.
    let messagesWithMarkers: [LlamaTranslatedMessage]

    /// Every media payload (image bytes and audio bytes) across all
    /// messages, in declaration order — i.e. the same order in which the
    /// markers occur in the rendered content. mtmd's `tokenize` pairs markers
    /// with bitmaps positionally and tells image from audio by magic bytes,
    /// which is why one flat ordered list is enough.
    let media: [Data]

    /// Legacy field: all `text` parts concatenated, kept for diagnostics and
    /// logging. The handler no longer sends this to the model;
    /// `messagesWithMarkers` together with `media` is authoritative.
    let prompt: String
}
35
+
/// Typed failures thrown by `ContentPartsTranslator.translate(messages:)`.
///
/// The translator only throws the case; turning a case into an HTTP status
/// and message (per spec §8.5) is the handler layer's job. The status and
/// message noted on each case below describe that intended mapping.
enum TranslatorError: Error, Equatable {
    /// HTTP 400 — `Request includes an image but no mmproj was loaded. Set nativeMmprojPath when starting.`
    case noMmprojForImage

    /// HTTP 400 — `Loaded model has no native audio encoder. Use a multimodal model like Gemma 4 or Phi-4 Multimodal.`
    case audioWithoutAudioEncoder

    /// HTTP 400 — `Unsupported audio format: <fmt>. Supported on this platform: <list>.`
    /// Carries the rejected format string and the list of accepted ones.
    case unsupportedAudioFormat(String, supported: [String])

    /// HTTP 400 — `Audio decode failed: <reason>`.
    case audioDecodeFailed(String)

    /// HTTP 502 — `Failed to fetch image: <reason>`.
    case imageFetchFailed(String)

    /// HTTP 400 — `<reason>`. Covers request-shape problems such as a missing
    /// role or an unknown content-part type.
    case malformedRequest(String)
}
53
+
/// Abstraction over "turn an image URL string into image bytes", used as a
/// test seam by the content-parts translator.
///
/// Production code uses a conformer that forwards to
/// `ImageDecoder.resolve(url:)`. Tests can supply a mock that returns canned
/// bytes, so they need not exercise the real data-URL / file / HTTP paths
/// (those are covered in `ImageDecoderTest`).
protocol ImageDecoderProtocol {
    /// Resolves `url` to the raw bytes of the image it refers to.
    ///
    /// - Parameter url: The image location as it appeared in the request.
    /// - Returns: The image bytes.
    /// - Throws: Whatever the conforming decoder throws on failure.
    func resolve(url: String) async throws -> Data
}
61
+
/// Production `ImageDecoderProtocol` conformer: a stateless adapter that
/// forwards every request to the static `ImageDecoder.resolve(url:)`.
struct DefaultImageDecoder: ImageDecoderProtocol {
    /// Resolves `url` via `ImageDecoder.resolve(url:)`, propagating its result
    /// or error unchanged.
    func resolve(url: String) async throws -> Data {
        let imageBytes = try await ImageDecoder.resolve(url: url)
        return imageBytes
    }
}
67
+
/// Converts an OpenAI-style `messages` array into a `LlamaPromptInput`.
///
/// For every message the content is rendered to a single string in which each
/// media part is replaced by the `<__media__>` marker. The bytes behind those
/// markers — image bytes obtained from the image decoder, and audio bytes
/// obtained by base64-decoding `input_audio.data` (left in their original
/// WAV/MP3/FLAC container) — are collected into `media` in declaration order.
///
/// Why audio is not decoded here: mtmd performs its own format detection
/// (via miniaudio) inside `mtmd_helper_bitmap_init_from_buf` by looking at
/// magic bytes. That helper only recognizes WAV/MP3/FLAC and would fail
/// silently on headerless PCM, so it must receive the original encoded audio.
///
/// Audio data contract: `input_audio.data` must be standard base64 (RFC 4648
/// §4); URL-safe base64 (`-` / `_` chars) is rejected. This matches OpenAI's
/// documented input format.
///
/// Spec reference: §8.1 (content-part shape), §8.2 (image translation), §8.3
/// (audio translation), §8.5 (error mapping).
final class ContentPartsTranslator {
    /// Format strings accepted in `input_audio.format`. Any other value makes
    /// `translate(messages:)` throw `unsupportedAudioFormat`. iOS has flac
    /// (via `AVAudioFile`); Android has ogg instead.
    ///
    /// NOTE(review): this list admits `pcm16`, `m4a` and `aac`, while the
    /// bytes are passed to mtmd undecoded and mtmd is described above as
    /// recognizing only WAV/MP3/FLAC — confirm whether those three formats
    /// actually work end to end.
    static let supportedAudioFormats: [String] = ["pcm16", "wav", "mp3", "m4a", "aac", "flac"]

    /// Whether a multimodal projector is loaded; gates `image_url` parts.
    private let mmprojLoaded: Bool
    /// Whether the loaded model can take audio; gates `input_audio` parts.
    private let modelHasAudioEncoder: Bool
    /// Collaborator that turns an image URL into bytes.
    private let imageDecoder: ImageDecoderProtocol
    /// Not called by `translate(messages:)`: mtmd decodes audio itself via
    /// miniaudio, so the base64-decoded bytes go through untouched. The
    /// parameter is retained for backward compatibility with existing tests
    /// and as a possible future fallback for formats mtmd cannot decode.
    private let audioDecoder: (Data, AudioFormat) async throws -> Data

    /// - Parameters:
    ///   - mmprojLoaded: `true` when a multimodal projector is available.
    ///   - modelHasAudioEncoder: `true` when the model accepts audio input.
    ///   - imageDecoder: Image resolver; defaults to `DefaultImageDecoder()`.
    ///   - audioDecoder: Audio decoding closure; defaults to
    ///     `AudioDecoder.decode(data:format:)`. Stored but currently unused.
    init(
        mmprojLoaded: Bool,
        modelHasAudioEncoder: Bool,
        imageDecoder: ImageDecoderProtocol = DefaultImageDecoder(),
        audioDecoder: @escaping (Data, AudioFormat) async throws -> Data = { try await AudioDecoder.decode(data: $0, format: $1) }
    ) {
        self.mmprojLoaded = mmprojLoaded
        self.modelHasAudioEncoder = modelHasAudioEncoder
        self.imageDecoder = imageDecoder
        self.audioDecoder = audioDecoder
    }

    /// Translates an OpenAI `messages` array (decoded JSON, one
    /// `[String: Any]` per message) into a `LlamaPromptInput`.
    ///
    /// Each message's `content` is walked in order. A plain string is handled
    /// as a single text part; otherwise it must be an array of content parts
    /// of type `text`, `image_url` or `input_audio`.
    ///
    /// - Throws: `TranslatorError` — `malformedRequest` for shape problems,
    ///   `noMmprojForImage` / `audioWithoutAudioEncoder` when the model cannot
    ///   take the media, `unsupportedAudioFormat` for unknown audio formats,
    ///   and `imageFetchFailed` when the image decoder throws.
    func translate(messages: [[String: Any]]) async throws -> LlamaPromptInput {
        var rendered: [LlamaTranslatedMessage] = []
        var collectedMedia: [Data] = []
        var textOnly: [String] = []

        for (msgIdx, message) in messages.enumerated() {
            guard let role = message["role"] as? String else {
                throw TranslatorError.malformedRequest("messages[\(msgIdx)] missing string 'role'")
            }

            // Text segments and markers of this message, in order.
            var segments: [String] = []
            let rawContent = message["content"]

            if let text = rawContent as? String {
                // Legacy string content: one text segment, no media.
                textOnly.append(text)
                segments.append(text)
            } else if let parts = rawContent as? [[String: Any]] {
                for (partIdx, part) in parts.enumerated() {
                    let path = "messages[\(msgIdx)].content[\(partIdx)]"
                    guard let type = part["type"] as? String else {
                        throw TranslatorError.malformedRequest("\(path) missing string 'type'")
                    }

                    if type == "text" {
                        guard let text = part["text"] as? String else {
                            throw TranslatorError.malformedRequest("\(path) text part missing string 'text'")
                        }
                        textOnly.append(text)
                        segments.append(text)
                    } else if type == "image_url" {
                        collectedMedia.append(try await imageBytes(from: part, path: path))
                        segments.append(MTMD_MEDIA_MARKER)
                    } else if type == "input_audio" {
                        collectedMedia.append(try audioBytes(from: part, path: path))
                        segments.append(MTMD_MEDIA_MARKER)
                    } else {
                        throw TranslatorError.malformedRequest("unsupported content part type: \(type)")
                    }
                }
            } else {
                throw TranslatorError.malformedRequest(
                    "messages[\(msgIdx)].content must be a string or array of content parts"
                )
            }

            // Segments are joined with a single space so that text next to a
            // marker reads "before <__media__> after", the canonical mtmd-cli
            // prompt shape.
            let content = segments.joined(separator: " ")
            rendered.append(LlamaTranslatedMessage(role: role, content: content))
        }

        return LlamaPromptInput(
            messagesWithMarkers: rendered,
            media: collectedMedia,
            prompt: textOnly.joined(separator: "\n")
        )
    }

    /// Validates an `image_url` part and resolves it to image bytes.
    /// Any error from the image decoder is rewrapped as `imageFetchFailed`.
    private func imageBytes(from part: [String: Any], path: String) async throws -> Data {
        guard mmprojLoaded else {
            throw TranslatorError.noMmprojForImage
        }
        guard let imageObject = part["image_url"] as? [String: Any],
              let url = imageObject["url"] as? String else {
            throw TranslatorError.malformedRequest("\(path) image_url part missing image_url.url")
        }
        do {
            return try await imageDecoder.resolve(url: url)
        } catch {
            throw TranslatorError.imageFetchFailed(String(describing: error))
        }
    }

    /// Validates an `input_audio` part and returns its base64-decoded bytes,
    /// still in their original container.
    private func audioBytes(from part: [String: Any], path: String) throws -> Data {
        guard modelHasAudioEncoder else {
            throw TranslatorError.audioWithoutAudioEncoder
        }
        guard let audioObject = part["input_audio"] as? [String: Any],
              let base64Payload = audioObject["data"] as? String,
              let formatName = audioObject["format"] as? String else {
            throw TranslatorError.malformedRequest(
                "\(path) input_audio part missing input_audio.data or input_audio.format"
            )
        }
        // `supportedAudioFormats` is the source of truth. The `AudioFormat`
        // check keeps the enum and the list honest with each other and can
        // only fail if the two diverge. AudioDecoder itself is not invoked —
        // mtmd detects the format from magic bytes.
        guard Self.supportedAudioFormats.contains(formatName),
              AudioFormat(rawValue: formatName) != nil else {
            throw TranslatorError.unsupportedAudioFormat(
                formatName,
                supported: Self.supportedAudioFormats
            )
        }
        if base64Payload.isEmpty {
            throw TranslatorError.malformedRequest("input_audio.data is empty")
        }
        // Standard base64 only (RFC 4648 §4); URL-safe input fails here.
        guard let encodedBytes = Data(base64Encoded: base64Payload) else {
            throw TranslatorError.malformedRequest("input_audio.data is not valid base64")
        }
        // Returned as-is: handing mtmd headerless PCM (e.g. the output of
        // `AudioDecoder.decode`) makes bitmap-init fail silently with mtmd
        // error 52, because only WAV/MP3/FLAC are recognized by magic bytes.
        return encodedBytes
    }
}