@dvai-bridge/ios-llama-core 4.0.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +341 -34
- package/Package.swift +71 -71
- package/ios/Sources/DVAILlamaCore/AudioDecoder.swift +112 -112
- package/ios/Sources/DVAILlamaCore/ContentPartsTranslator.swift +232 -232
- package/ios/Sources/DVAILlamaCore/ImageDecoder.swift +91 -91
- package/ios/Sources/DVAILlamaCore/LlamaCppBridgeProtocol.swift +59 -59
- package/ios/Sources/DVAILlamaCore/LlamaHandlers.swift +422 -422
- package/ios/Sources/DVAILlamaCore/ModelDownloader.swift +445 -445
- package/ios/Sources/DVAILlamaCore/PluginState.swift +158 -158
- package/ios/Sources/DVAILlamaCoreObjC/LlamaCppBridge.mm +649 -649
- package/ios/Sources/DVAILlamaCoreObjC/include/LlamaCppBridge.h +101 -101
- package/ios/Tests/DVAILlamaCoreTests/AudioDecoderTest.swift +46 -46
- package/ios/Tests/DVAILlamaCoreTests/ContentPartsTranslatorTest.swift +361 -361
- package/ios/Tests/DVAILlamaCoreTests/ImageDecoderTest.swift +139 -139
- package/ios/Tests/DVAILlamaCoreTests/LlamaCppBridgeTest.swift +131 -131
- package/ios/Tests/DVAILlamaCoreTests/LlamaHandlersTest.swift +515 -515
- package/ios/Tests/DVAILlamaCoreTests/ModelDownloaderTest.swift +89 -89
- package/ios/Tests/DVAILlamaCoreTests/PluginStateTest.swift +51 -51
- package/package.json +3 -3
- package/README.md +0 -199
|
@@ -1,232 +1,232 @@
|
|
|
1
|
-
import Foundation
|
|
2
|
-
|
|
3
|
-
/// Placeholder token written into rendered message text wherever an image or
/// audio part was removed, marking the splice point for that media item.
///
/// The literal is a hand-copied mirror of what `mtmd_default_marker()`
/// (declared in `tools/mtmd/mtmd.h`) returns; keeping it as a Swift constant
/// means this file does not need an FFI call to obtain it.
public let MTMD_MEDIA_MARKER: String = "<__media__>"
|
|
7
|
-
|
|
8
|
-
/// A single chat message after translation, in the shape expected by
/// `bridge.applyChatTemplate(...)`.
///
/// Every `image_url` / `input_audio` part of the source message has been
/// swapped for the `<__media__>` marker inside `content`. The bytes those
/// markers stand for are kept separately in `LlamaPromptInput.media`, ordered
/// the same way the markers are.
struct LlamaTranslatedMessage: Equatable {
    /// The message's `role` string, copied through unchanged.
    let role: String
    /// Rendered text of the message: text segments and media markers.
    let content: String
}
|
|
16
|
-
|
|
17
|
-
/// Result bundle returned by `ContentPartsTranslator.translate(messages:)`.
///
/// It carries what the llama.cpp handler passes on to
/// `bridge.applyChatTemplate(...)` and `bridge.completeMultimodalPrompt(...)`.
struct LlamaPromptInput: Equatable {
    /// One entry per source message, in source order, with each media part
    /// replaced by a `<__media__>` marker. Intended to be handed to
    /// `applyChatTemplate` as-is.
    let messagesWithMarkers: [LlamaTranslatedMessage]

    /// Every media payload from every message, in declaration order, i.e. the
    /// same order in which the markers appear in the rendered content. Holds
    /// resolved image bytes and base64-decoded audio bytes; the audio is still
    /// in its original container format, not raw samples.
    ///
    /// A single flat list is enough because mtmd's `tokenize` pairs markers
    /// with bitmaps by position and tells image from audio by magic bytes.
    let media: [Data]

    /// Legacy diagnostic field: all `text` parts concatenated. The handler no
    /// longer feeds this to the model (`messagesWithMarkers` together with
    /// `media` is the source of truth); it is retained for logging.
    let prompt: String
}
|
|
35
|
-
|
|
36
|
-
/// Typed failures thrown by `ContentPartsTranslator.translate(messages:)`.
///
/// The translator only throws the case; turning a case into an HTTP status
/// (spec §8.5) is the handler layer's job. The status and message quoted on
/// each case below are the intended mapping, not something enforced here.
enum TranslatorError: Error, Equatable {
    /// Intended mapping: 400,
    /// `Request includes an image but no mmproj was loaded. Set nativeMmprojPath when starting.`
    case noMmprojForImage

    /// Intended mapping: 400,
    /// `Loaded model has no native audio encoder. Use a multimodal model like Gemma 4 or Phi-4 Multimodal.`
    case audioWithoutAudioEncoder

    /// Intended mapping: 400,
    /// `Unsupported audio format: <fmt>. Supported on this platform: <list>.`
    /// The first payload is the rejected format, `supported` the accepted list.
    case unsupportedAudioFormat(String, supported: [String])

    /// Intended mapping: 400, `Audio decode failed: <reason>`.
    case audioDecodeFailed(String)

    /// Intended mapping: 502, `Failed to fetch image: <reason>`.
    case imageFetchFailed(String)

    /// Intended mapping: 400, `<reason>`. Covers request-shape problems such
    /// as a missing role or an unknown content-part type.
    case malformedRequest(String)
}
|
|
53
|
-
|
|
54
|
-
/// Abstraction over image resolution so the translator can be tested without
/// the real decoder.
///
/// Production code uses `DefaultImageDecoder`, which forwards to
/// `ImageDecoder.resolve(url:)`. Tests can plug in a stub that returns canned
/// bytes and so skip the data-URL / file / HTTP pipelines (exercised
/// separately in `ImageDecoderTest`).
protocol ImageDecoderProtocol {
    /// Turns the `image_url.url` string of a content part into image bytes.
    /// - Parameter url: The URL string taken from the request.
    /// - Returns: The bytes of the resolved image.
    /// - Throws: Any error the conforming decoder raises while resolving.
    func resolve(url: String) async throws -> Data
}
|
|
61
|
-
|
|
62
|
-
/// Production conformer of `ImageDecoderProtocol`: a stateless pass-through
/// to the static `ImageDecoder.resolve(url:)`.
struct DefaultImageDecoder: ImageDecoderProtocol {
    /// Resolves `url` by delegating to `ImageDecoder`.
    /// - Throws: Whatever `ImageDecoder.resolve(url:)` throws, unchanged.
    func resolve(url: String) async throws -> Data {
        let bytes = try await ImageDecoder.resolve(url: url)
        return bytes
    }
}
|
|
67
|
-
|
|
68
|
-
/// Converts an OpenAI-style `messages` array into a `LlamaPromptInput`.
///
/// Each message is rendered to one string in which every media part is
/// replaced by the `<__media__>` marker. The bytes behind each marker are
/// collected, in declaration order, into `media`: image bytes come from the
/// injected image decoder, audio bytes are the base64-decoded payload still
/// in its WAV/MP3/FLAC envelope.
///
/// Audio is deliberately not converted to PCM here. Per the original notes,
/// `mtmd_helper_bitmap_init_from_buf` sniffs magic bytes (via miniaudio),
/// recognizes only WAV/MP3/FLAC, and fails silently on headerless PCM.
///
/// Audio data contract: `input_audio.data` must be standard base64
/// (RFC 4648 §4). URL-safe base64 (`-` / `_`) is rejected, matching OpenAI's
/// documented input format.
///
/// Spec reference: §8.1 (content-part shape), §8.2 (image translation),
/// §8.3 (audio translation), §8.5 (error mapping).
final class ContentPartsTranslator {
    /// Format names accepted in `input_audio.format`; anything else makes
    /// `translate` throw `unsupportedAudioFormat`. Original note: iOS has
    /// flac (via `AVAudioFile`) where Android has ogg.
    ///
    /// NOTE(review): `pcm16`, `m4a` and `aac` pass this gate although the
    /// pass-through notes say mtmd only recognizes WAV/MP3/FLAC — confirm
    /// those formats are handled downstream.
    static let supportedAudioFormats: [String] = ["pcm16", "wav", "mp3", "m4a", "aac", "flac"]

    /// Whether an mmproj was loaded; required before any image part is accepted.
    private let mmprojLoaded: Bool
    /// Whether the loaded model has an audio encoder; required for audio parts.
    private let modelHasAudioEncoder: Bool
    /// Collaborator that resolves `image_url.url` strings to image bytes.
    private let imageDecoder: ImageDecoderProtocol
    /// Stored but never invoked in this class: audio bytes are forwarded
    /// undecoded. Retained as an init parameter so existing call sites and
    /// tests keep compiling, and as a possible fallback for formats mtmd
    /// cannot decode itself.
    private let audioDecoder: (Data, AudioFormat) async throws -> Data

    /// Creates a translator.
    /// - Parameters:
    ///   - mmprojLoaded: Pass `true` when an mmproj is available for images.
    ///   - modelHasAudioEncoder: Pass `true` when the model accepts audio.
    ///   - imageDecoder: Image resolver; defaults to `DefaultImageDecoder()`.
    ///   - audioDecoder: Audio decode closure; defaults to
    ///     `AudioDecoder.decode(data:format:)`. Not called by `translate`.
    init(
        mmprojLoaded: Bool,
        modelHasAudioEncoder: Bool,
        imageDecoder: ImageDecoderProtocol = DefaultImageDecoder(),
        audioDecoder: @escaping (Data, AudioFormat) async throws -> Data = { try await AudioDecoder.decode(data: $0, format: $1) }
    ) {
        self.mmprojLoaded = mmprojLoaded
        self.modelHasAudioEncoder = modelHasAudioEncoder
        self.imageDecoder = imageDecoder
        self.audioDecoder = audioDecoder
    }

    /// Translates decoded-JSON chat messages into a `LlamaPromptInput`.
    ///
    /// Messages and their content parts are visited in order. A plain string
    /// `content` is handled as a single text part.
    /// - Parameter messages: One `[String: Any]` dictionary per message.
    /// - Returns: Rendered messages, ordered media payloads, and the
    ///   newline-joined text of all text parts.
    /// - Throws: `TranslatorError` for shape problems, missing capabilities,
    ///   unsupported audio formats, bad base64, or image resolution failures.
    func translate(messages: [[String: Any]]) async throws -> LlamaPromptInput {
        var rendered: [LlamaTranslatedMessage] = []
        var mediaBlobs: [Data] = []
        var textFragments: [String] = []

        for (messageIndex, message) in messages.enumerated() {
            guard let role = message["role"] as? String else {
                throw TranslatorError.malformedRequest("messages[\(messageIndex)] missing string 'role'")
            }
            let rawContent = message["content"]

            // Legacy shape: the whole content is one string.
            if let text = rawContent as? String {
                textFragments.append(text)
                rendered.append(LlamaTranslatedMessage(role: role, content: text))
                continue
            }

            guard let parts = rawContent as? [[String: Any]] else {
                throw TranslatorError.malformedRequest(
                    "messages[\(messageIndex)].content must be a string or array of content parts"
                )
            }

            // Text segments and markers of this message, in part order.
            var segments: [String] = []
            for (partIndex, part) in parts.enumerated() {
                let location = "messages[\(messageIndex)].content[\(partIndex)]"
                guard let kind = part["type"] as? String else {
                    throw TranslatorError.malformedRequest("\(location) missing string 'type'")
                }
                switch kind {
                case "text":
                    guard let text = part["text"] as? String else {
                        throw TranslatorError.malformedRequest("\(location) text part missing string 'text'")
                    }
                    textFragments.append(text)
                    segments.append(text)
                case "image_url":
                    let imageData = try await imageBytes(from: part, at: location)
                    mediaBlobs.append(imageData)
                    segments.append(MTMD_MEDIA_MARKER)
                case "input_audio":
                    let audioData = try audioBytes(from: part, at: location)
                    mediaBlobs.append(audioData)
                    segments.append(MTMD_MEDIA_MARKER)
                default:
                    throw TranslatorError.malformedRequest("unsupported content part type: \(kind)")
                }
            }

            // A single space between segments yields "before <__media__> after",
            // which the original notes describe as the canonical mtmd-cli shape.
            let joined = segments.joined(separator: " ")
            rendered.append(LlamaTranslatedMessage(role: role, content: joined))
        }

        return LlamaPromptInput(
            messagesWithMarkers: rendered,
            media: mediaBlobs,
            prompt: textFragments.joined(separator: "\n")
        )
    }

    /// Validates an `image_url` part and resolves it to image bytes.
    /// Checks run in this order: mmproj availability, part shape, resolution.
    /// Any resolver error is rethrown as `imageFetchFailed`.
    private func imageBytes(from part: [String: Any], at location: String) async throws -> Data {
        guard mmprojLoaded else {
            throw TranslatorError.noMmprojForImage
        }
        guard let url = (part["image_url"] as? [String: Any])?["url"] as? String else {
            throw TranslatorError.malformedRequest("\(location) image_url part missing image_url.url")
        }
        do {
            return try await imageDecoder.resolve(url: url)
        } catch {
            throw TranslatorError.imageFetchFailed(String(describing: error))
        }
    }

    /// Validates an `input_audio` part and returns its base64-decoded bytes,
    /// untouched beyond the base64 decode.
    /// Checks run in this order: audio-encoder availability, part shape,
    /// format gate, non-empty data, base64 validity.
    private func audioBytes(from part: [String: Any], at location: String) throws -> Data {
        guard modelHasAudioEncoder else {
            throw TranslatorError.audioWithoutAudioEncoder
        }
        guard let payload = part["input_audio"] as? [String: Any],
              let base64Text = payload["data"] as? String,
              let formatName = payload["format"] as? String else {
            throw TranslatorError.malformedRequest(
                "\(location) input_audio part missing input_audio.data or input_audio.format"
            )
        }

        // The string list is the primary gate. The `AudioFormat` lookup only
        // rejects something extra if the enum's raw values ever drift from
        // that list; both failures report the same error.
        let accepted = Self.supportedAudioFormats
        guard accepted.contains(formatName), AudioFormat(rawValue: formatName) != nil else {
            throw TranslatorError.unsupportedAudioFormat(formatName, supported: accepted)
        }

        if base64Text.isEmpty {
            throw TranslatorError.malformedRequest("input_audio.data is empty")
        }
        // Standard base64 only (RFC 4648 §4); URL-safe input fails this decode.
        guard let envelope = Data(base64Encoded: base64Text) else {
            throw TranslatorError.malformedRequest("input_audio.data is not valid base64")
        }
        // Forward the still-encoded audio: per the original notes, headerless
        // PCM makes mtmd's bitmap-init fail silently (mtmd error 52).
        return envelope
    }
}
|
|
1
|
+
import Foundation
|
|
2
|
+
|
|
3
|
+
/// Placeholder token written into rendered message text wherever an image or
/// audio part was removed, marking the splice point for that media item.
///
/// The literal is a hand-copied mirror of what `mtmd_default_marker()`
/// (declared in `tools/mtmd/mtmd.h`) returns; keeping it as a Swift constant
/// means this file does not need an FFI call to obtain it.
public let MTMD_MEDIA_MARKER: String = "<__media__>"
|
|
7
|
+
|
|
8
|
+
/// A single chat message after translation, in the shape expected by
/// `bridge.applyChatTemplate(...)`.
///
/// Every `image_url` / `input_audio` part of the source message has been
/// swapped for the `<__media__>` marker inside `content`. The bytes those
/// markers stand for are kept separately in `LlamaPromptInput.media`, ordered
/// the same way the markers are.
struct LlamaTranslatedMessage: Equatable {
    /// The message's `role` string, copied through unchanged.
    let role: String
    /// Rendered text of the message: text segments and media markers.
    let content: String
}
|
|
16
|
+
|
|
17
|
+
/// Result bundle returned by `ContentPartsTranslator.translate(messages:)`.
///
/// It carries what the llama.cpp handler passes on to
/// `bridge.applyChatTemplate(...)` and `bridge.completeMultimodalPrompt(...)`.
struct LlamaPromptInput: Equatable {
    /// One entry per source message, in source order, with each media part
    /// replaced by a `<__media__>` marker. Intended to be handed to
    /// `applyChatTemplate` as-is.
    let messagesWithMarkers: [LlamaTranslatedMessage]

    /// Every media payload from every message, in declaration order, i.e. the
    /// same order in which the markers appear in the rendered content. Holds
    /// resolved image bytes and base64-decoded audio bytes; the audio is still
    /// in its original container format, not raw samples.
    ///
    /// A single flat list is enough because mtmd's `tokenize` pairs markers
    /// with bitmaps by position and tells image from audio by magic bytes.
    let media: [Data]

    /// Legacy diagnostic field: all `text` parts concatenated. The handler no
    /// longer feeds this to the model (`messagesWithMarkers` together with
    /// `media` is the source of truth); it is retained for logging.
    let prompt: String
}
|
|
35
|
+
|
|
36
|
+
/// Typed failures thrown by `ContentPartsTranslator.translate(messages:)`.
///
/// The translator only throws the case; turning a case into an HTTP status
/// (spec §8.5) is the handler layer's job. The status and message quoted on
/// each case below are the intended mapping, not something enforced here.
enum TranslatorError: Error, Equatable {
    /// Intended mapping: 400,
    /// `Request includes an image but no mmproj was loaded. Set nativeMmprojPath when starting.`
    case noMmprojForImage

    /// Intended mapping: 400,
    /// `Loaded model has no native audio encoder. Use a multimodal model like Gemma 4 or Phi-4 Multimodal.`
    case audioWithoutAudioEncoder

    /// Intended mapping: 400,
    /// `Unsupported audio format: <fmt>. Supported on this platform: <list>.`
    /// The first payload is the rejected format, `supported` the accepted list.
    case unsupportedAudioFormat(String, supported: [String])

    /// Intended mapping: 400, `Audio decode failed: <reason>`.
    case audioDecodeFailed(String)

    /// Intended mapping: 502, `Failed to fetch image: <reason>`.
    case imageFetchFailed(String)

    /// Intended mapping: 400, `<reason>`. Covers request-shape problems such
    /// as a missing role or an unknown content-part type.
    case malformedRequest(String)
}
|
|
53
|
+
|
|
54
|
+
/// Abstraction over image resolution so the translator can be tested without
/// the real decoder.
///
/// Production code uses `DefaultImageDecoder`, which forwards to
/// `ImageDecoder.resolve(url:)`. Tests can plug in a stub that returns canned
/// bytes and so skip the data-URL / file / HTTP pipelines (exercised
/// separately in `ImageDecoderTest`).
protocol ImageDecoderProtocol {
    /// Turns the `image_url.url` string of a content part into image bytes.
    /// - Parameter url: The URL string taken from the request.
    /// - Returns: The bytes of the resolved image.
    /// - Throws: Any error the conforming decoder raises while resolving.
    func resolve(url: String) async throws -> Data
}
|
|
61
|
+
|
|
62
|
+
/// Production conformer of `ImageDecoderProtocol`: a stateless pass-through
/// to the static `ImageDecoder.resolve(url:)`.
struct DefaultImageDecoder: ImageDecoderProtocol {
    /// Resolves `url` by delegating to `ImageDecoder`.
    /// - Throws: Whatever `ImageDecoder.resolve(url:)` throws, unchanged.
    func resolve(url: String) async throws -> Data {
        let bytes = try await ImageDecoder.resolve(url: url)
        return bytes
    }
}
|
|
67
|
+
|
|
68
|
+
/// Converts an OpenAI-style `messages` array into a `LlamaPromptInput`.
///
/// Each message is rendered to one string in which every media part is
/// replaced by the `<__media__>` marker. The bytes behind each marker are
/// collected, in declaration order, into `media`: image bytes come from the
/// injected image decoder, audio bytes are the base64-decoded payload still
/// in its WAV/MP3/FLAC envelope.
///
/// Audio is deliberately not converted to PCM here. Per the original notes,
/// `mtmd_helper_bitmap_init_from_buf` sniffs magic bytes (via miniaudio),
/// recognizes only WAV/MP3/FLAC, and fails silently on headerless PCM.
///
/// Audio data contract: `input_audio.data` must be standard base64
/// (RFC 4648 §4). URL-safe base64 (`-` / `_`) is rejected, matching OpenAI's
/// documented input format.
///
/// Spec reference: §8.1 (content-part shape), §8.2 (image translation),
/// §8.3 (audio translation), §8.5 (error mapping).
final class ContentPartsTranslator {
    /// Format names accepted in `input_audio.format`; anything else makes
    /// `translate` throw `unsupportedAudioFormat`. Original note: iOS has
    /// flac (via `AVAudioFile`) where Android has ogg.
    ///
    /// NOTE(review): `pcm16`, `m4a` and `aac` pass this gate although the
    /// pass-through notes say mtmd only recognizes WAV/MP3/FLAC — confirm
    /// those formats are handled downstream.
    static let supportedAudioFormats: [String] = ["pcm16", "wav", "mp3", "m4a", "aac", "flac"]

    /// Whether an mmproj was loaded; required before any image part is accepted.
    private let mmprojLoaded: Bool
    /// Whether the loaded model has an audio encoder; required for audio parts.
    private let modelHasAudioEncoder: Bool
    /// Collaborator that resolves `image_url.url` strings to image bytes.
    private let imageDecoder: ImageDecoderProtocol
    /// Stored but never invoked in this class: audio bytes are forwarded
    /// undecoded. Retained as an init parameter so existing call sites and
    /// tests keep compiling, and as a possible fallback for formats mtmd
    /// cannot decode itself.
    private let audioDecoder: (Data, AudioFormat) async throws -> Data

    /// Creates a translator.
    /// - Parameters:
    ///   - mmprojLoaded: Pass `true` when an mmproj is available for images.
    ///   - modelHasAudioEncoder: Pass `true` when the model accepts audio.
    ///   - imageDecoder: Image resolver; defaults to `DefaultImageDecoder()`.
    ///   - audioDecoder: Audio decode closure; defaults to
    ///     `AudioDecoder.decode(data:format:)`. Not called by `translate`.
    init(
        mmprojLoaded: Bool,
        modelHasAudioEncoder: Bool,
        imageDecoder: ImageDecoderProtocol = DefaultImageDecoder(),
        audioDecoder: @escaping (Data, AudioFormat) async throws -> Data = { try await AudioDecoder.decode(data: $0, format: $1) }
    ) {
        self.mmprojLoaded = mmprojLoaded
        self.modelHasAudioEncoder = modelHasAudioEncoder
        self.imageDecoder = imageDecoder
        self.audioDecoder = audioDecoder
    }

    /// Translates decoded-JSON chat messages into a `LlamaPromptInput`.
    ///
    /// Messages and their content parts are visited in order. A plain string
    /// `content` is handled as a single text part.
    /// - Parameter messages: One `[String: Any]` dictionary per message.
    /// - Returns: Rendered messages, ordered media payloads, and the
    ///   newline-joined text of all text parts.
    /// - Throws: `TranslatorError` for shape problems, missing capabilities,
    ///   unsupported audio formats, bad base64, or image resolution failures.
    func translate(messages: [[String: Any]]) async throws -> LlamaPromptInput {
        var rendered: [LlamaTranslatedMessage] = []
        var mediaBlobs: [Data] = []
        var textFragments: [String] = []

        for (messageIndex, message) in messages.enumerated() {
            guard let role = message["role"] as? String else {
                throw TranslatorError.malformedRequest("messages[\(messageIndex)] missing string 'role'")
            }
            let rawContent = message["content"]

            // Legacy shape: the whole content is one string.
            if let text = rawContent as? String {
                textFragments.append(text)
                rendered.append(LlamaTranslatedMessage(role: role, content: text))
                continue
            }

            guard let parts = rawContent as? [[String: Any]] else {
                throw TranslatorError.malformedRequest(
                    "messages[\(messageIndex)].content must be a string or array of content parts"
                )
            }

            // Text segments and markers of this message, in part order.
            var segments: [String] = []
            for (partIndex, part) in parts.enumerated() {
                let location = "messages[\(messageIndex)].content[\(partIndex)]"
                guard let kind = part["type"] as? String else {
                    throw TranslatorError.malformedRequest("\(location) missing string 'type'")
                }
                switch kind {
                case "text":
                    guard let text = part["text"] as? String else {
                        throw TranslatorError.malformedRequest("\(location) text part missing string 'text'")
                    }
                    textFragments.append(text)
                    segments.append(text)
                case "image_url":
                    let imageData = try await imageBytes(from: part, at: location)
                    mediaBlobs.append(imageData)
                    segments.append(MTMD_MEDIA_MARKER)
                case "input_audio":
                    let audioData = try audioBytes(from: part, at: location)
                    mediaBlobs.append(audioData)
                    segments.append(MTMD_MEDIA_MARKER)
                default:
                    throw TranslatorError.malformedRequest("unsupported content part type: \(kind)")
                }
            }

            // A single space between segments yields "before <__media__> after",
            // which the original notes describe as the canonical mtmd-cli shape.
            let joined = segments.joined(separator: " ")
            rendered.append(LlamaTranslatedMessage(role: role, content: joined))
        }

        return LlamaPromptInput(
            messagesWithMarkers: rendered,
            media: mediaBlobs,
            prompt: textFragments.joined(separator: "\n")
        )
    }

    /// Validates an `image_url` part and resolves it to image bytes.
    /// Checks run in this order: mmproj availability, part shape, resolution.
    /// Any resolver error is rethrown as `imageFetchFailed`.
    private func imageBytes(from part: [String: Any], at location: String) async throws -> Data {
        guard mmprojLoaded else {
            throw TranslatorError.noMmprojForImage
        }
        guard let url = (part["image_url"] as? [String: Any])?["url"] as? String else {
            throw TranslatorError.malformedRequest("\(location) image_url part missing image_url.url")
        }
        do {
            return try await imageDecoder.resolve(url: url)
        } catch {
            throw TranslatorError.imageFetchFailed(String(describing: error))
        }
    }

    /// Validates an `input_audio` part and returns its base64-decoded bytes,
    /// untouched beyond the base64 decode.
    /// Checks run in this order: audio-encoder availability, part shape,
    /// format gate, non-empty data, base64 validity.
    private func audioBytes(from part: [String: Any], at location: String) throws -> Data {
        guard modelHasAudioEncoder else {
            throw TranslatorError.audioWithoutAudioEncoder
        }
        guard let payload = part["input_audio"] as? [String: Any],
              let base64Text = payload["data"] as? String,
              let formatName = payload["format"] as? String else {
            throw TranslatorError.malformedRequest(
                "\(location) input_audio part missing input_audio.data or input_audio.format"
            )
        }

        // The string list is the primary gate. The `AudioFormat` lookup only
        // rejects something extra if the enum's raw values ever drift from
        // that list; both failures report the same error.
        let accepted = Self.supportedAudioFormats
        guard accepted.contains(formatName), AudioFormat(rawValue: formatName) != nil else {
            throw TranslatorError.unsupportedAudioFormat(formatName, supported: accepted)
        }

        if base64Text.isEmpty {
            throw TranslatorError.malformedRequest("input_audio.data is empty")
        }
        // Standard base64 only (RFC 4648 §4); URL-safe input fails this decode.
        guard let envelope = Data(base64Encoded: base64Text) else {
            throw TranslatorError.malformedRequest("input_audio.data is not valid base64")
        }
        // Forward the still-encoded audio: per the original notes, headerless
        // PCM makes mtmd's bitmap-init fail silently (mtmd error 52).
        return envelope
    }
}
|