@dvai-bridge/ios-llama-core 4.0.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +341 -34
- package/Package.swift +71 -71
- package/ios/Sources/DVAILlamaCore/AudioDecoder.swift +112 -112
- package/ios/Sources/DVAILlamaCore/ContentPartsTranslator.swift +232 -232
- package/ios/Sources/DVAILlamaCore/ImageDecoder.swift +91 -91
- package/ios/Sources/DVAILlamaCore/LlamaCppBridgeProtocol.swift +59 -59
- package/ios/Sources/DVAILlamaCore/LlamaHandlers.swift +422 -422
- package/ios/Sources/DVAILlamaCore/ModelDownloader.swift +445 -445
- package/ios/Sources/DVAILlamaCore/PluginState.swift +158 -158
- package/ios/Sources/DVAILlamaCoreObjC/LlamaCppBridge.mm +649 -649
- package/ios/Sources/DVAILlamaCoreObjC/include/LlamaCppBridge.h +101 -101
- package/ios/Tests/DVAILlamaCoreTests/AudioDecoderTest.swift +46 -46
- package/ios/Tests/DVAILlamaCoreTests/ContentPartsTranslatorTest.swift +361 -361
- package/ios/Tests/DVAILlamaCoreTests/ImageDecoderTest.swift +139 -139
- package/ios/Tests/DVAILlamaCoreTests/LlamaCppBridgeTest.swift +131 -131
- package/ios/Tests/DVAILlamaCoreTests/LlamaHandlersTest.swift +515 -515
- package/ios/Tests/DVAILlamaCoreTests/ModelDownloaderTest.swift +89 -89
- package/ios/Tests/DVAILlamaCoreTests/PluginStateTest.swift +51 -51
- package/package.json +3 -3
- package/README.md +0 -199
|
@@ -1,361 +1,361 @@
|
|
|
1
|
-
import XCTest
|
|
2
|
-
@testable import DVAILlamaCore
|
|
3
|
-
|
|
4
|
-
final class ContentPartsTranslatorTest: XCTestCase {
|
|
5
|
-
// MARK: - Mocks
|
|
6
|
-
|
|
7
|
-
/// Image decoder that returns canned bytes per URL. Records every call so
|
|
8
|
-
/// tests can assert which URLs were passed in and in what order.
|
|
9
|
-
final class MockImageDecoder: ImageDecoderProtocol {
|
|
10
|
-
var responses: [String: Data] = [:]
|
|
11
|
-
var calls: [String] = []
|
|
12
|
-
func resolve(url: String) async throws -> Data {
|
|
13
|
-
calls.append(url)
|
|
14
|
-
if let bytes = responses[url] { return bytes }
|
|
15
|
-
return Data([0xDE, 0xAD, 0xBE, 0xEF])
|
|
16
|
-
}
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
/// Audio-decoder closure factory. Records each call's `(bytesIn, format)`
|
|
20
|
-
/// and returns canned PCM bytes.
|
|
21
|
-
final class AudioRecorder {
|
|
22
|
-
var calls: [(Data, AudioFormat)] = []
|
|
23
|
-
var pcmOut: Data = Data([0x11, 0x22, 0x33, 0x44])
|
|
24
|
-
func make() -> (Data, AudioFormat) async throws -> Data {
|
|
25
|
-
{ [unowned self] data, format in
|
|
26
|
-
self.calls.append((data, format))
|
|
27
|
-
return self.pcmOut
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
// MARK: - Fixture loader
|
|
33
|
-
|
|
34
|
-
/// Loads `transport-fixtures.json` from the repo-root `fixtures/` dir.
|
|
35
|
-
/// For `CHAT_REQUEST_AUDIO_PCM16` the `data` field carries the literal
|
|
36
|
-
/// `"<replaced-by-loader>"` placeholder; we substitute the base64 of the
|
|
37
|
-
/// PCM16 fixture file before returning.
|
|
38
|
-
private func loadFixture(_ key: String) throws -> [String: Any] {
|
|
39
|
-
let url = fixturesURL().appendingPathComponent("transport-fixtures.json")
|
|
40
|
-
let data = try Data(contentsOf: url)
|
|
41
|
-
guard var root = try JSONSerialization.jsonObject(with: data) as? [String: Any],
|
|
42
|
-
var fixture = root[key] as? [String: Any] else {
|
|
43
|
-
XCTFail("fixture \(key) missing or not an object")
|
|
44
|
-
return [:]
|
|
45
|
-
}
|
|
46
|
-
if key == "CHAT_REQUEST_AUDIO_PCM16" {
|
|
47
|
-
let pcmURL = fixturesURL().appendingPathComponent("audio").appendingPathComponent("pcm16-1s-16khz-mono.bin")
|
|
48
|
-
let pcmBytes = try Data(contentsOf: pcmURL)
|
|
49
|
-
let b64 = pcmBytes.base64EncodedString()
|
|
50
|
-
// Mutate messages[0].content[0].input_audio.data
|
|
51
|
-
if var messages = fixture["messages"] as? [[String: Any]],
|
|
52
|
-
var msg0 = messages.first,
|
|
53
|
-
var parts = msg0["content"] as? [[String: Any]],
|
|
54
|
-
var part0 = parts.first,
|
|
55
|
-
var audio = part0["input_audio"] as? [String: Any] {
|
|
56
|
-
audio["data"] = b64
|
|
57
|
-
part0["input_audio"] = audio
|
|
58
|
-
parts[0] = part0
|
|
59
|
-
msg0["content"] = parts
|
|
60
|
-
messages[0] = msg0
|
|
61
|
-
fixture["messages"] = messages
|
|
62
|
-
root[key] = fixture
|
|
63
|
-
} else {
|
|
64
|
-
XCTFail("CHAT_REQUEST_AUDIO_PCM16 fixture shape unexpected")
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
return fixture
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
private func messages(from fixture: [String: Any]) -> [[String: Any]] {
|
|
71
|
-
(fixture["messages"] as? [[String: Any]]) ?? []
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
private func fixturesURL() -> URL {
|
|
75
|
-
var dir = URL(fileURLWithPath: #file).deletingLastPathComponent()
|
|
76
|
-
while !FileManager.default.fileExists(atPath: dir.appendingPathComponent("fixtures").path) {
|
|
77
|
-
let parent = dir.deletingLastPathComponent()
|
|
78
|
-
if parent.path == dir.path {
|
|
79
|
-
fatalError("fixtures dir not found walking up from \(#file)")
|
|
80
|
-
}
|
|
81
|
-
dir = parent
|
|
82
|
-
}
|
|
83
|
-
return dir.appendingPathComponent("fixtures")
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
// MARK: - Happy paths (driven by transport-fixtures.json)
|
|
87
|
-
|
|
88
|
-
/// `CHAT_REQUEST_TEXT` — the legacy string-content shape produces a prompt
|
|
89
|
-
/// with the user text and no media collateral.
|
|
90
|
-
func testTextOnlyMessage() async throws {
|
|
91
|
-
let fixture = try loadFixture("CHAT_REQUEST_TEXT")
|
|
92
|
-
let translator = ContentPartsTranslator(mmprojLoaded: false, modelHasAudioEncoder: false)
|
|
93
|
-
let result = try await translator.translate(messages: messages(from: fixture))
|
|
94
|
-
XCTAssertEqual(result.prompt, "hi")
|
|
95
|
-
XCTAssertTrue(result.media.isEmpty)
|
|
96
|
-
XCTAssertEqual(result.messagesWithMarkers.count, 1)
|
|
97
|
-
XCTAssertEqual(result.messagesWithMarkers[0].role, "user")
|
|
98
|
-
XCTAssertEqual(result.messagesWithMarkers[0].content, "hi")
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
/// `CHAT_REQUEST_IMAGE` — text + data-URL image. The image part should be
|
|
102
|
-
/// resolved via the (mocked) ImageDecoder and the bytes appended to
|
|
103
|
-
/// `media`. The rendered content for that message has a single
|
|
104
|
-
/// `<__media__>` marker substituted in place of the image part.
|
|
105
|
-
func testTextPlusImage() async throws {
|
|
106
|
-
let fixture = try loadFixture("CHAT_REQUEST_IMAGE")
|
|
107
|
-
let mock = MockImageDecoder()
|
|
108
|
-
let cannedPng = Data([0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x99])
|
|
109
|
-
// Prefix-match any data: URL by snapping it after we observe it; here
|
|
110
|
-
// we just set a default in `responses` keyed off the actual URL once
|
|
111
|
-
// we know it from the fixture.
|
|
112
|
-
let urlFromFixture: String = {
|
|
113
|
-
let parts = (((fixture["messages"] as? [[String: Any]])?[0])?["content"] as? [[String: Any]]) ?? []
|
|
114
|
-
return ((parts.first(where: { ($0["type"] as? String) == "image_url" })?["image_url"] as? [String: Any])?["url"] as? String) ?? ""
|
|
115
|
-
}()
|
|
116
|
-
mock.responses[urlFromFixture] = cannedPng
|
|
117
|
-
|
|
118
|
-
let translator = ContentPartsTranslator(
|
|
119
|
-
mmprojLoaded: true,
|
|
120
|
-
modelHasAudioEncoder: false,
|
|
121
|
-
imageDecoder: mock
|
|
122
|
-
)
|
|
123
|
-
let result = try await translator.translate(messages: messages(from: fixture))
|
|
124
|
-
XCTAssertEqual(result.prompt, "What is in this image?")
|
|
125
|
-
XCTAssertEqual(result.media.count, 1)
|
|
126
|
-
XCTAssertEqual(result.media[0], cannedPng)
|
|
127
|
-
XCTAssertEqual(mock.calls, [urlFromFixture])
|
|
128
|
-
// Marker count in rendered content == media count.
|
|
129
|
-
let markerCount = result.messagesWithMarkers
|
|
130
|
-
.map { $0.content.components(separatedBy: MTMD_MEDIA_MARKER).count - 1 }
|
|
131
|
-
.reduce(0, +)
|
|
132
|
-
XCTAssertEqual(markerCount, 1)
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
/// `CHAT_REQUEST_AUDIO_PCM16` — base64-encoded audio + text. The base64
|
|
136
|
-
/// payload is decoded and the **raw bytes** land in `media` unchanged;
|
|
137
|
-
/// mtmd does its own format detection downstream via miniaudio, so the
|
|
138
|
-
/// translator no longer routes audio through `AudioDecoder`. The
|
|
139
|
-
/// `audioDecoder` collaborator is wired up but should not be invoked.
|
|
140
|
-
func testAudioPCM16PlusText() async throws {
|
|
141
|
-
let fixture = try loadFixture("CHAT_REQUEST_AUDIO_PCM16")
|
|
142
|
-
let recorder = AudioRecorder()
|
|
143
|
-
let translator = ContentPartsTranslator(
|
|
144
|
-
mmprojLoaded: false,
|
|
145
|
-
modelHasAudioEncoder: true,
|
|
146
|
-
audioDecoder: recorder.make()
|
|
147
|
-
)
|
|
148
|
-
let result = try await translator.translate(messages: messages(from: fixture))
|
|
149
|
-
XCTAssertEqual(result.prompt, "Transcribe this.")
|
|
150
|
-
XCTAssertEqual(result.media.count, 1)
|
|
151
|
-
// `media[0]` should be the raw base64-decoded bytes (i.e. the
|
|
152
|
-
// contents of the PCM fixture file as-is) — NOT the canned
|
|
153
|
-
// `recorder.pcmOut`, because the translator no longer routes audio
|
|
154
|
-
// through the decoder closure.
|
|
155
|
-
let pcmFile = try Data(contentsOf: fixturesURL().appendingPathComponent("audio").appendingPathComponent("pcm16-1s-16khz-mono.bin"))
|
|
156
|
-
XCTAssertEqual(result.media[0], pcmFile)
|
|
157
|
-
XCTAssertEqual(recorder.calls.count, 0, "audioDecoder must not be called on the production path; mtmd handles decode itself")
|
|
158
|
-
let markerCount = result.messagesWithMarkers
|
|
159
|
-
.map { $0.content.components(separatedBy: MTMD_MEDIA_MARKER).count - 1 }
|
|
160
|
-
.reduce(0, +)
|
|
161
|
-
XCTAssertEqual(markerCount, 1)
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
/// Interleaved `[text, image, text, audio, text]` → media list preserves
|
|
165
|
-
/// declaration order (image first, then audio); rendered content has
|
|
166
|
-
/// exactly two `<__media__>` markers in the right positions. After the
|
|
167
|
-
/// audio-path fix, `media[1]` is the raw base64-decoded audio bytes
|
|
168
|
-
/// (mtmd handles format detection downstream); the audio-decoder
|
|
169
|
-
/// collaborator must not be invoked.
|
|
170
|
-
func testInterleavedTextImageAudio() async throws {
|
|
171
|
-
let imageMock = MockImageDecoder()
|
|
172
|
-
let imageBytes = Data([0xAA, 0xBB, 0xCC])
|
|
173
|
-
imageMock.responses["data:image/png;base64,AAAA"] = imageBytes
|
|
174
|
-
let audioRecorder = AudioRecorder()
|
|
175
|
-
audioRecorder.pcmOut = Data([0x55, 0x66, 0x77])
|
|
176
|
-
let translator = ContentPartsTranslator(
|
|
177
|
-
mmprojLoaded: true,
|
|
178
|
-
modelHasAudioEncoder: true,
|
|
179
|
-
imageDecoder: imageMock,
|
|
180
|
-
audioDecoder: audioRecorder.make()
|
|
181
|
-
)
|
|
182
|
-
let messages: [[String: Any]] = [[
|
|
183
|
-
"role": "user",
|
|
184
|
-
"content": [
|
|
185
|
-
["type": "text", "text": "before"],
|
|
186
|
-
["type": "image_url", "image_url": ["url": "data:image/png;base64,AAAA"]] as [String: Any],
|
|
187
|
-
["type": "text", "text": "between"],
|
|
188
|
-
["type": "input_audio", "input_audio": ["data": "AAAA", "format": "pcm16"]] as [String: Any],
|
|
189
|
-
["type": "text", "text": "after"],
|
|
190
|
-
],
|
|
191
|
-
]]
|
|
192
|
-
let result = try await translator.translate(messages: messages)
|
|
193
|
-
XCTAssertEqual(result.media.count, 2)
|
|
194
|
-
XCTAssertEqual(result.media[0], imageBytes, "image must come first in declaration order")
|
|
195
|
-
// `"AAAA"` base64-decoded is three zero bytes — that's what mtmd sees.
|
|
196
|
-
XCTAssertEqual(result.media[1], Data([0x00, 0x00, 0x00]), "audio bytes are the raw base64-decoded payload")
|
|
197
|
-
XCTAssertEqual(audioRecorder.calls.count, 0, "audioDecoder must not be invoked on the production path")
|
|
198
|
-
XCTAssertEqual(result.messagesWithMarkers.count, 1)
|
|
199
|
-
let content = result.messagesWithMarkers[0].content
|
|
200
|
-
let markerCount = content.components(separatedBy: MTMD_MEDIA_MARKER).count - 1
|
|
201
|
-
XCTAssertEqual(markerCount, 2)
|
|
202
|
-
// First marker should appear after "before" and before "between";
|
|
203
|
-
// second after "between" and before "after".
|
|
204
|
-
let firstMarker = content.range(of: MTMD_MEDIA_MARKER)!
|
|
205
|
-
let secondMarker = content.range(of: MTMD_MEDIA_MARKER, range: firstMarker.upperBound..<content.endIndex)!
|
|
206
|
-
let beforeRange = content.range(of: "before")!
|
|
207
|
-
let betweenRange = content.range(of: "between")!
|
|
208
|
-
let afterRange = content.range(of: "after")!
|
|
209
|
-
XCTAssertLessThan(beforeRange.upperBound, firstMarker.lowerBound)
|
|
210
|
-
XCTAssertLessThan(firstMarker.upperBound, betweenRange.lowerBound)
|
|
211
|
-
XCTAssertLessThan(betweenRange.upperBound, secondMarker.lowerBound)
|
|
212
|
-
XCTAssertLessThan(secondMarker.upperBound, afterRange.lowerBound)
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
// MARK: - Negative paths
|
|
216
|
-
|
|
217
|
-
/// Image part with `mmprojLoaded == false` → `noMmprojForImage`. The
|
|
218
|
-
/// translator must throw before even consulting the image decoder.
|
|
219
|
-
func testImageWithoutMmprojThrows() async {
|
|
220
|
-
let messages: [[String: Any]] = [[
|
|
221
|
-
"role": "user",
|
|
222
|
-
"content": [
|
|
223
|
-
["type": "image_url", "image_url": ["url": "data:image/png;base64,AAAA"]]
|
|
224
|
-
],
|
|
225
|
-
]]
|
|
226
|
-
let mock = MockImageDecoder()
|
|
227
|
-
let translator = ContentPartsTranslator(
|
|
228
|
-
mmprojLoaded: false,
|
|
229
|
-
modelHasAudioEncoder: false,
|
|
230
|
-
imageDecoder: mock
|
|
231
|
-
)
|
|
232
|
-
do {
|
|
233
|
-
_ = try await translator.translate(messages: messages)
|
|
234
|
-
XCTFail("expected noMmprojForImage")
|
|
235
|
-
} catch TranslatorError.noMmprojForImage {
|
|
236
|
-
XCTAssertTrue(mock.calls.isEmpty, "translator should not invoke decoder when mmproj is missing")
|
|
237
|
-
} catch {
|
|
238
|
-
XCTFail("unexpected error: \(error)")
|
|
239
|
-
}
|
|
240
|
-
}
|
|
241
|
-
|
|
242
|
-
/// Audio part with `modelHasAudioEncoder == false` → `audioWithoutAudioEncoder`.
|
|
243
|
-
func testAudioWithoutEncoderThrows() async {
|
|
244
|
-
let messages: [[String: Any]] = [[
|
|
245
|
-
"role": "user",
|
|
246
|
-
"content": [
|
|
247
|
-
["type": "input_audio", "input_audio": ["data": "AAAA", "format": "pcm16"]]
|
|
248
|
-
],
|
|
249
|
-
]]
|
|
250
|
-
let translator = ContentPartsTranslator(mmprojLoaded: false, modelHasAudioEncoder: false)
|
|
251
|
-
do {
|
|
252
|
-
_ = try await translator.translate(messages: messages)
|
|
253
|
-
XCTFail("expected audioWithoutAudioEncoder")
|
|
254
|
-
} catch TranslatorError.audioWithoutAudioEncoder {
|
|
255
|
-
// expected
|
|
256
|
-
} catch {
|
|
257
|
-
XCTFail("unexpected error: \(error)")
|
|
258
|
-
}
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
/// Unsupported audio format (e.g. `vorbis`) → `unsupportedAudioFormat`
|
|
262
|
-
/// with the offending format echoed back and the supported list filled in.
|
|
263
|
-
func testUnsupportedAudioFormatThrows() async {
|
|
264
|
-
let messages: [[String: Any]] = [[
|
|
265
|
-
"role": "user",
|
|
266
|
-
"content": [
|
|
267
|
-
["type": "input_audio", "input_audio": ["data": "AAAA", "format": "vorbis"]]
|
|
268
|
-
],
|
|
269
|
-
]]
|
|
270
|
-
let translator = ContentPartsTranslator(mmprojLoaded: false, modelHasAudioEncoder: true)
|
|
271
|
-
do {
|
|
272
|
-
_ = try await translator.translate(messages: messages)
|
|
273
|
-
XCTFail("expected unsupportedAudioFormat")
|
|
274
|
-
} catch let TranslatorError.unsupportedAudioFormat(fmt, supported) {
|
|
275
|
-
XCTAssertEqual(fmt, "vorbis")
|
|
276
|
-
XCTAssertEqual(supported, ContentPartsTranslator.supportedAudioFormats)
|
|
277
|
-
XCTAssertTrue(supported.contains("flac"), "iOS supported list should include flac")
|
|
278
|
-
} catch {
|
|
279
|
-
XCTFail("unexpected error: \(error)")
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
|
|
283
|
-
/// Unknown content part type → `malformedRequest` with the offending type
|
|
284
|
-
/// echoed in the message.
|
|
285
|
-
func testUnknownContentPartTypeThrows() async {
|
|
286
|
-
let messages: [[String: Any]] = [[
|
|
287
|
-
"role": "user",
|
|
288
|
-
"content": [
|
|
289
|
-
["type": "video_url", "video_url": ["url": "https://example.com/v.mp4"]]
|
|
290
|
-
],
|
|
291
|
-
]]
|
|
292
|
-
let translator = ContentPartsTranslator(mmprojLoaded: true, modelHasAudioEncoder: true)
|
|
293
|
-
do {
|
|
294
|
-
_ = try await translator.translate(messages: messages)
|
|
295
|
-
XCTFail("expected malformedRequest")
|
|
296
|
-
} catch let TranslatorError.malformedRequest(reason) {
|
|
297
|
-
XCTAssertTrue(reason.contains("video_url"), "expected reason to mention offending type, got: \(reason)")
|
|
298
|
-
} catch {
|
|
299
|
-
XCTFail("unexpected error: \(error)")
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
/// Empty `input_audio.data` → `malformedRequest`. The audio decoder must
|
|
304
|
-
/// not be invoked — this is a request-shape error caught before decode.
|
|
305
|
-
func testEmptyAudioDataThrowsMalformedRequest() async {
|
|
306
|
-
let translator = ContentPartsTranslator(
|
|
307
|
-
mmprojLoaded: false,
|
|
308
|
-
modelHasAudioEncoder: true,
|
|
309
|
-
imageDecoder: MockImageDecoder(),
|
|
310
|
-
audioDecoder: { _, _ in
|
|
311
|
-
XCTFail("audio decoder should not be invoked for empty data")
|
|
312
|
-
return Data()
|
|
313
|
-
}
|
|
314
|
-
)
|
|
315
|
-
let messages: [[String: Any]] = [[
|
|
316
|
-
"role": "user",
|
|
317
|
-
"content": [[
|
|
318
|
-
"type": "input_audio",
|
|
319
|
-
"input_audio": ["data": "", "format": "pcm16"]
|
|
320
|
-
]]
|
|
321
|
-
]]
|
|
322
|
-
do {
|
|
323
|
-
_ = try await translator.translate(messages: messages)
|
|
324
|
-
XCTFail("Expected throw")
|
|
325
|
-
} catch TranslatorError.malformedRequest {
|
|
326
|
-
// OK
|
|
327
|
-
} catch {
|
|
328
|
-
XCTFail("Unexpected: \(error)")
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
/// Malformed base64 in `input_audio.data` → `malformedRequest` (not
|
|
333
|
-
/// `audioDecodeFailed`). The audio decoder never runs — this is a
|
|
334
|
-
/// pre-decode request-shape error.
|
|
335
|
-
func testMalformedBase64ThrowsMalformedRequest() async {
|
|
336
|
-
let translator = ContentPartsTranslator(
|
|
337
|
-
mmprojLoaded: false,
|
|
338
|
-
modelHasAudioEncoder: true,
|
|
339
|
-
imageDecoder: MockImageDecoder(),
|
|
340
|
-
audioDecoder: { _, _ in
|
|
341
|
-
XCTFail("audio decoder should not be invoked for invalid base64")
|
|
342
|
-
return Data()
|
|
343
|
-
}
|
|
344
|
-
)
|
|
345
|
-
let messages: [[String: Any]] = [[
|
|
346
|
-
"role": "user",
|
|
347
|
-
"content": [[
|
|
348
|
-
"type": "input_audio",
|
|
349
|
-
"input_audio": ["data": "!!!not-valid-base64!!!", "format": "pcm16"]
|
|
350
|
-
]]
|
|
351
|
-
]]
|
|
352
|
-
do {
|
|
353
|
-
_ = try await translator.translate(messages: messages)
|
|
354
|
-
XCTFail("Expected throw")
|
|
355
|
-
} catch TranslatorError.malformedRequest {
|
|
356
|
-
// OK
|
|
357
|
-
} catch {
|
|
358
|
-
XCTFail("Unexpected: \(error)")
|
|
359
|
-
}
|
|
360
|
-
}
|
|
361
|
-
}
|
|
1
|
+
import XCTest
|
|
2
|
+
@testable import DVAILlamaCore
|
|
3
|
+
|
|
4
|
+
final class ContentPartsTranslatorTest: XCTestCase {
|
|
5
|
+
// MARK: - Mocks
|
|
6
|
+
|
|
7
|
+
/// Image decoder that returns canned bytes per URL. Records every call so
|
|
8
|
+
/// tests can assert which URLs were passed in and in what order.
|
|
9
|
+
final class MockImageDecoder: ImageDecoderProtocol {
|
|
10
|
+
var responses: [String: Data] = [:]
|
|
11
|
+
var calls: [String] = []
|
|
12
|
+
func resolve(url: String) async throws -> Data {
|
|
13
|
+
calls.append(url)
|
|
14
|
+
if let bytes = responses[url] { return bytes }
|
|
15
|
+
return Data([0xDE, 0xAD, 0xBE, 0xEF])
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/// Audio-decoder closure factory. Records each call's `(bytesIn, format)`
|
|
20
|
+
/// and returns canned PCM bytes.
|
|
21
|
+
final class AudioRecorder {
|
|
22
|
+
var calls: [(Data, AudioFormat)] = []
|
|
23
|
+
var pcmOut: Data = Data([0x11, 0x22, 0x33, 0x44])
|
|
24
|
+
func make() -> (Data, AudioFormat) async throws -> Data {
|
|
25
|
+
{ [unowned self] data, format in
|
|
26
|
+
self.calls.append((data, format))
|
|
27
|
+
return self.pcmOut
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
// MARK: - Fixture loader
|
|
33
|
+
|
|
34
|
+
/// Loads `transport-fixtures.json` from the repo-root `fixtures/` dir.
|
|
35
|
+
/// For `CHAT_REQUEST_AUDIO_PCM16` the `data` field carries the literal
|
|
36
|
+
/// `"<replaced-by-loader>"` placeholder; we substitute the base64 of the
|
|
37
|
+
/// PCM16 fixture file before returning.
|
|
38
|
+
private func loadFixture(_ key: String) throws -> [String: Any] {
|
|
39
|
+
let url = fixturesURL().appendingPathComponent("transport-fixtures.json")
|
|
40
|
+
let data = try Data(contentsOf: url)
|
|
41
|
+
guard var root = try JSONSerialization.jsonObject(with: data) as? [String: Any],
|
|
42
|
+
var fixture = root[key] as? [String: Any] else {
|
|
43
|
+
XCTFail("fixture \(key) missing or not an object")
|
|
44
|
+
return [:]
|
|
45
|
+
}
|
|
46
|
+
if key == "CHAT_REQUEST_AUDIO_PCM16" {
|
|
47
|
+
let pcmURL = fixturesURL().appendingPathComponent("audio").appendingPathComponent("pcm16-1s-16khz-mono.bin")
|
|
48
|
+
let pcmBytes = try Data(contentsOf: pcmURL)
|
|
49
|
+
let b64 = pcmBytes.base64EncodedString()
|
|
50
|
+
// Mutate messages[0].content[0].input_audio.data
|
|
51
|
+
if var messages = fixture["messages"] as? [[String: Any]],
|
|
52
|
+
var msg0 = messages.first,
|
|
53
|
+
var parts = msg0["content"] as? [[String: Any]],
|
|
54
|
+
var part0 = parts.first,
|
|
55
|
+
var audio = part0["input_audio"] as? [String: Any] {
|
|
56
|
+
audio["data"] = b64
|
|
57
|
+
part0["input_audio"] = audio
|
|
58
|
+
parts[0] = part0
|
|
59
|
+
msg0["content"] = parts
|
|
60
|
+
messages[0] = msg0
|
|
61
|
+
fixture["messages"] = messages
|
|
62
|
+
root[key] = fixture
|
|
63
|
+
} else {
|
|
64
|
+
XCTFail("CHAT_REQUEST_AUDIO_PCM16 fixture shape unexpected")
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
return fixture
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
private func messages(from fixture: [String: Any]) -> [[String: Any]] {
|
|
71
|
+
(fixture["messages"] as? [[String: Any]]) ?? []
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
private func fixturesURL() -> URL {
|
|
75
|
+
var dir = URL(fileURLWithPath: #file).deletingLastPathComponent()
|
|
76
|
+
while !FileManager.default.fileExists(atPath: dir.appendingPathComponent("fixtures").path) {
|
|
77
|
+
let parent = dir.deletingLastPathComponent()
|
|
78
|
+
if parent.path == dir.path {
|
|
79
|
+
fatalError("fixtures dir not found walking up from \(#file)")
|
|
80
|
+
}
|
|
81
|
+
dir = parent
|
|
82
|
+
}
|
|
83
|
+
return dir.appendingPathComponent("fixtures")
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// MARK: - Happy paths (driven by transport-fixtures.json)
|
|
87
|
+
|
|
88
|
+
/// `CHAT_REQUEST_TEXT` — the legacy string-content shape produces a prompt
|
|
89
|
+
/// with the user text and no media collateral.
|
|
90
|
+
func testTextOnlyMessage() async throws {
|
|
91
|
+
let fixture = try loadFixture("CHAT_REQUEST_TEXT")
|
|
92
|
+
let translator = ContentPartsTranslator(mmprojLoaded: false, modelHasAudioEncoder: false)
|
|
93
|
+
let result = try await translator.translate(messages: messages(from: fixture))
|
|
94
|
+
XCTAssertEqual(result.prompt, "hi")
|
|
95
|
+
XCTAssertTrue(result.media.isEmpty)
|
|
96
|
+
XCTAssertEqual(result.messagesWithMarkers.count, 1)
|
|
97
|
+
XCTAssertEqual(result.messagesWithMarkers[0].role, "user")
|
|
98
|
+
XCTAssertEqual(result.messagesWithMarkers[0].content, "hi")
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/// `CHAT_REQUEST_IMAGE` — text + data-URL image. The image part should be
|
|
102
|
+
/// resolved via the (mocked) ImageDecoder and the bytes appended to
|
|
103
|
+
/// `media`. The rendered content for that message has a single
|
|
104
|
+
/// `<__media__>` marker substituted in place of the image part.
|
|
105
|
+
func testTextPlusImage() async throws {
|
|
106
|
+
let fixture = try loadFixture("CHAT_REQUEST_IMAGE")
|
|
107
|
+
let mock = MockImageDecoder()
|
|
108
|
+
let cannedPng = Data([0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x99])
|
|
109
|
+
// Prefix-match any data: URL by snapping it after we observe it; here
|
|
110
|
+
// we just set a default in `responses` keyed off the actual URL once
|
|
111
|
+
// we know it from the fixture.
|
|
112
|
+
let urlFromFixture: String = {
|
|
113
|
+
let parts = (((fixture["messages"] as? [[String: Any]])?[0])?["content"] as? [[String: Any]]) ?? []
|
|
114
|
+
return ((parts.first(where: { ($0["type"] as? String) == "image_url" })?["image_url"] as? [String: Any])?["url"] as? String) ?? ""
|
|
115
|
+
}()
|
|
116
|
+
mock.responses[urlFromFixture] = cannedPng
|
|
117
|
+
|
|
118
|
+
let translator = ContentPartsTranslator(
|
|
119
|
+
mmprojLoaded: true,
|
|
120
|
+
modelHasAudioEncoder: false,
|
|
121
|
+
imageDecoder: mock
|
|
122
|
+
)
|
|
123
|
+
let result = try await translator.translate(messages: messages(from: fixture))
|
|
124
|
+
XCTAssertEqual(result.prompt, "What is in this image?")
|
|
125
|
+
XCTAssertEqual(result.media.count, 1)
|
|
126
|
+
XCTAssertEqual(result.media[0], cannedPng)
|
|
127
|
+
XCTAssertEqual(mock.calls, [urlFromFixture])
|
|
128
|
+
// Marker count in rendered content == media count.
|
|
129
|
+
let markerCount = result.messagesWithMarkers
|
|
130
|
+
.map { $0.content.components(separatedBy: MTMD_MEDIA_MARKER).count - 1 }
|
|
131
|
+
.reduce(0, +)
|
|
132
|
+
XCTAssertEqual(markerCount, 1)
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/// `CHAT_REQUEST_AUDIO_PCM16` — base64-encoded audio + text. The base64
|
|
136
|
+
/// payload is decoded and the **raw bytes** land in `media` unchanged;
|
|
137
|
+
/// mtmd does its own format detection downstream via miniaudio, so the
|
|
138
|
+
/// translator no longer routes audio through `AudioDecoder`. The
|
|
139
|
+
/// `audioDecoder` collaborator is wired up but should not be invoked.
|
|
140
|
+
func testAudioPCM16PlusText() async throws {
|
|
141
|
+
let fixture = try loadFixture("CHAT_REQUEST_AUDIO_PCM16")
|
|
142
|
+
let recorder = AudioRecorder()
|
|
143
|
+
let translator = ContentPartsTranslator(
|
|
144
|
+
mmprojLoaded: false,
|
|
145
|
+
modelHasAudioEncoder: true,
|
|
146
|
+
audioDecoder: recorder.make()
|
|
147
|
+
)
|
|
148
|
+
let result = try await translator.translate(messages: messages(from: fixture))
|
|
149
|
+
XCTAssertEqual(result.prompt, "Transcribe this.")
|
|
150
|
+
XCTAssertEqual(result.media.count, 1)
|
|
151
|
+
// `media[0]` should be the raw base64-decoded bytes (i.e. the
|
|
152
|
+
// contents of the PCM fixture file as-is) — NOT the canned
|
|
153
|
+
// `recorder.pcmOut`, because the translator no longer routes audio
|
|
154
|
+
// through the decoder closure.
|
|
155
|
+
let pcmFile = try Data(contentsOf: fixturesURL().appendingPathComponent("audio").appendingPathComponent("pcm16-1s-16khz-mono.bin"))
|
|
156
|
+
XCTAssertEqual(result.media[0], pcmFile)
|
|
157
|
+
XCTAssertEqual(recorder.calls.count, 0, "audioDecoder must not be called on the production path; mtmd handles decode itself")
|
|
158
|
+
let markerCount = result.messagesWithMarkers
|
|
159
|
+
.map { $0.content.components(separatedBy: MTMD_MEDIA_MARKER).count - 1 }
|
|
160
|
+
.reduce(0, +)
|
|
161
|
+
XCTAssertEqual(markerCount, 1)
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
/// Interleaved `[text, image, text, audio, text]` → media list preserves
|
|
165
|
+
/// declaration order (image first, then audio); rendered content has
|
|
166
|
+
/// exactly two `<__media__>` markers in the right positions. After the
|
|
167
|
+
/// audio-path fix, `media[1]` is the raw base64-decoded audio bytes
|
|
168
|
+
/// (mtmd handles format detection downstream); the audio-decoder
|
|
169
|
+
/// collaborator must not be invoked.
|
|
170
|
+
func testInterleavedTextImageAudio() async throws {
|
|
171
|
+
let imageMock = MockImageDecoder()
|
|
172
|
+
let imageBytes = Data([0xAA, 0xBB, 0xCC])
|
|
173
|
+
imageMock.responses["data:image/png;base64,AAAA"] = imageBytes
|
|
174
|
+
let audioRecorder = AudioRecorder()
|
|
175
|
+
audioRecorder.pcmOut = Data([0x55, 0x66, 0x77])
|
|
176
|
+
let translator = ContentPartsTranslator(
|
|
177
|
+
mmprojLoaded: true,
|
|
178
|
+
modelHasAudioEncoder: true,
|
|
179
|
+
imageDecoder: imageMock,
|
|
180
|
+
audioDecoder: audioRecorder.make()
|
|
181
|
+
)
|
|
182
|
+
let messages: [[String: Any]] = [[
|
|
183
|
+
"role": "user",
|
|
184
|
+
"content": [
|
|
185
|
+
["type": "text", "text": "before"],
|
|
186
|
+
["type": "image_url", "image_url": ["url": "data:image/png;base64,AAAA"]] as [String: Any],
|
|
187
|
+
["type": "text", "text": "between"],
|
|
188
|
+
["type": "input_audio", "input_audio": ["data": "AAAA", "format": "pcm16"]] as [String: Any],
|
|
189
|
+
["type": "text", "text": "after"],
|
|
190
|
+
],
|
|
191
|
+
]]
|
|
192
|
+
let result = try await translator.translate(messages: messages)
|
|
193
|
+
XCTAssertEqual(result.media.count, 2)
|
|
194
|
+
XCTAssertEqual(result.media[0], imageBytes, "image must come first in declaration order")
|
|
195
|
+
// `"AAAA"` base64-decoded is three zero bytes — that's what mtmd sees.
|
|
196
|
+
XCTAssertEqual(result.media[1], Data([0x00, 0x00, 0x00]), "audio bytes are the raw base64-decoded payload")
|
|
197
|
+
XCTAssertEqual(audioRecorder.calls.count, 0, "audioDecoder must not be invoked on the production path")
|
|
198
|
+
XCTAssertEqual(result.messagesWithMarkers.count, 1)
|
|
199
|
+
let content = result.messagesWithMarkers[0].content
|
|
200
|
+
let markerCount = content.components(separatedBy: MTMD_MEDIA_MARKER).count - 1
|
|
201
|
+
XCTAssertEqual(markerCount, 2)
|
|
202
|
+
// First marker should appear after "before" and before "between";
|
|
203
|
+
// second after "between" and before "after".
|
|
204
|
+
let firstMarker = content.range(of: MTMD_MEDIA_MARKER)!
|
|
205
|
+
let secondMarker = content.range(of: MTMD_MEDIA_MARKER, range: firstMarker.upperBound..<content.endIndex)!
|
|
206
|
+
let beforeRange = content.range(of: "before")!
|
|
207
|
+
let betweenRange = content.range(of: "between")!
|
|
208
|
+
let afterRange = content.range(of: "after")!
|
|
209
|
+
XCTAssertLessThan(beforeRange.upperBound, firstMarker.lowerBound)
|
|
210
|
+
XCTAssertLessThan(firstMarker.upperBound, betweenRange.lowerBound)
|
|
211
|
+
XCTAssertLessThan(betweenRange.upperBound, secondMarker.lowerBound)
|
|
212
|
+
XCTAssertLessThan(secondMarker.upperBound, afterRange.lowerBound)
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
// MARK: - Negative paths
|
|
216
|
+
|
|
217
|
+
/// Image part with `mmprojLoaded == false` → `noMmprojForImage`. The
|
|
218
|
+
/// translator must throw before even consulting the image decoder.
|
|
219
|
+
func testImageWithoutMmprojThrows() async {
|
|
220
|
+
let messages: [[String: Any]] = [[
|
|
221
|
+
"role": "user",
|
|
222
|
+
"content": [
|
|
223
|
+
["type": "image_url", "image_url": ["url": "data:image/png;base64,AAAA"]]
|
|
224
|
+
],
|
|
225
|
+
]]
|
|
226
|
+
let mock = MockImageDecoder()
|
|
227
|
+
let translator = ContentPartsTranslator(
|
|
228
|
+
mmprojLoaded: false,
|
|
229
|
+
modelHasAudioEncoder: false,
|
|
230
|
+
imageDecoder: mock
|
|
231
|
+
)
|
|
232
|
+
do {
|
|
233
|
+
_ = try await translator.translate(messages: messages)
|
|
234
|
+
XCTFail("expected noMmprojForImage")
|
|
235
|
+
} catch TranslatorError.noMmprojForImage {
|
|
236
|
+
XCTAssertTrue(mock.calls.isEmpty, "translator should not invoke decoder when mmproj is missing")
|
|
237
|
+
} catch {
|
|
238
|
+
XCTFail("unexpected error: \(error)")
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
/// An `input_audio` content part translated while
/// `modelHasAudioEncoder == false` must fail with
/// `TranslatorError.audioWithoutAudioEncoder`.
func testAudioWithoutEncoderThrows() async {
    let translator = ContentPartsTranslator(mmprojLoaded: false, modelHasAudioEncoder: false)
    let messages: [[String: Any]] = [[
        "role": "user",
        "content": [
            ["type": "input_audio", "input_audio": ["data": "AAAA", "format": "pcm16"]]
        ],
    ]]

    do {
        _ = try await translator.translate(messages: messages)
        // Reaching this line means translation unexpectedly succeeded.
        XCTFail("expected audioWithoutAudioEncoder")
    } catch {
        switch error {
        case TranslatorError.audioWithoutAudioEncoder:
            // This is the outcome the test is looking for; nothing more to check.
            break
        default:
            XCTFail("unexpected error: \(error)")
        }
    }
}
|
|
260
|
+
|
|
261
|
+
/// An `input_audio` part whose `format` is `vorbis` must fail with
/// `TranslatorError.unsupportedAudioFormat`. The error payload is checked
/// to echo the rejected format and to carry the same list as
/// `ContentPartsTranslator.supportedAudioFormats`, which must include `flac`.
func testUnsupportedAudioFormatThrows() async {
    let translator = ContentPartsTranslator(mmprojLoaded: false, modelHasAudioEncoder: true)
    let messages: [[String: Any]] = [[
        "role": "user",
        "content": [
            ["type": "input_audio", "input_audio": ["data": "AAAA", "format": "vorbis"]]
        ],
    ]]

    do {
        _ = try await translator.translate(messages: messages)
        // Reaching this line means translation unexpectedly succeeded.
        XCTFail("expected unsupportedAudioFormat")
    } catch {
        switch error {
        case let TranslatorError.unsupportedAudioFormat(rejectedFormat, advertisedFormats):
            XCTAssertEqual(rejectedFormat, "vorbis")
            XCTAssertEqual(advertisedFormats, ContentPartsTranslator.supportedAudioFormats)
            XCTAssertTrue(advertisedFormats.contains("flac"), "iOS supported list should include flac")
        default:
            XCTFail("unexpected error: \(error)")
        }
    }
}
|
|
282
|
+
|
|
283
|
+
/// A content part with an unrecognised `type` (`video_url` here) must fail
/// with `TranslatorError.malformedRequest`, and the associated reason
/// string must mention the offending type name.
func testUnknownContentPartTypeThrows() async {
    let translator = ContentPartsTranslator(mmprojLoaded: true, modelHasAudioEncoder: true)
    let messages: [[String: Any]] = [[
        "role": "user",
        "content": [
            ["type": "video_url", "video_url": ["url": "https://example.com/v.mp4"]]
        ],
    ]]

    do {
        _ = try await translator.translate(messages: messages)
        // Reaching this line means translation unexpectedly succeeded.
        XCTFail("expected malformedRequest")
    } catch {
        switch error {
        case let TranslatorError.malformedRequest(detail):
            XCTAssertTrue(detail.contains("video_url"), "expected reason to mention offending type, got: \(detail)")
        default:
            XCTFail("unexpected error: \(error)")
        }
    }
}
|
|
302
|
+
|
|
303
|
+
/// An `input_audio` part whose `data` string is empty must fail with
/// `TranslatorError.malformedRequest`. The injected audio decoder closure
/// fails the test if it is ever called, so the rejection has to happen
/// before any decode is attempted.
func testEmptyAudioDataThrowsMalformedRequest() async {
    let messages: [[String: Any]] = [[
        "role": "user",
        "content": [[
            "type": "input_audio",
            "input_audio": ["data": "", "format": "pcm16"]
        ]]
    ]]
    let translator = ContentPartsTranslator(
        mmprojLoaded: false,
        modelHasAudioEncoder: true,
        imageDecoder: MockImageDecoder(),
        audioDecoder: { _, _ in
            // Tripwire: any invocation is itself a test failure.
            XCTFail("audio decoder should not be invoked for empty data")
            return Data()
        }
    )

    do {
        _ = try await translator.translate(messages: messages)
        XCTFail("Expected throw")
    } catch {
        // Only the request-shape error is acceptable; it ends the test quietly.
        if case TranslatorError.malformedRequest = error { return }
        XCTFail("Unexpected: \(error)")
    }
}
|
|
331
|
+
|
|
332
|
+
/// An `input_audio` part whose `data` is not valid base64 must fail with
/// `TranslatorError.malformedRequest` (rather than a decode-stage error
/// such as `audioDecodeFailed`). The injected audio decoder closure fails
/// the test if it is ever called, so the rejection has to happen before
/// any decode is attempted.
func testMalformedBase64ThrowsMalformedRequest() async {
    let messages: [[String: Any]] = [[
        "role": "user",
        "content": [[
            "type": "input_audio",
            "input_audio": ["data": "!!!not-valid-base64!!!", "format": "pcm16"]
        ]]
    ]]
    let translator = ContentPartsTranslator(
        mmprojLoaded: false,
        modelHasAudioEncoder: true,
        imageDecoder: MockImageDecoder(),
        audioDecoder: { _, _ in
            // Tripwire: any invocation is itself a test failure.
            XCTFail("audio decoder should not be invoked for invalid base64")
            return Data()
        }
    )

    do {
        _ = try await translator.translate(messages: messages)
        XCTFail("Expected throw")
    } catch {
        // Only the request-shape error is acceptable; it ends the test quietly.
        if case TranslatorError.malformedRequest = error { return }
        XCTFail("Unexpected: \(error)")
    }
}
|
|
361
|
+
}
|