@dvai-bridge/ios-llama-core 4.0.0 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,649 +1,649 @@
1
- #import "LlamaCppBridge.h"
2
- // Consumed via SPM .binaryTarget against build-apple/llama.xcframework
3
- // (built by scripts/mac-side-prepare-xcframework.sh). Framework
4
- // modulemap re-exports llama.h, ggml.h, ggml-alloc.h, ggml-backend.h,
5
- // ggml-metal.h, ggml-cpu.h, ggml-blas.h, gguf.h.
6
- #import <llama/llama.h>
7
- // Multimodal (mtmd) is shipped as a sibling binaryTarget --
8
- // build-apple/mtmd.xcframework. The framework's modulemap exposes
9
- // mtmd.h and mtmd-helper.h; ggml.h / llama.h come from the llama
10
- // framework imported above.
11
- #import <mtmd/mtmd.h>
12
- #import <mtmd/mtmd-helper.h>
13
- #import <Foundation/Foundation.h>
14
- #import <stdlib.h>
15
- #import <string.h>
16
-
17
- @implementation LlamaCppBridge {
18
- struct llama_model *_model;
19
- struct llama_context *_ctx;
20
- NSString *_currentModelPath;
21
- BOOL _embeddingMode;
22
- // Phase 2A Pass 2: real mtmd state.
23
- struct mtmd_context *_mtmdCtx;
24
- NSString *_currentMmprojPath;
25
- }
26
-
27
// Creates a bridge with nothing loaded: no model, no inference context and
// no multimodal projector.
- (instancetype)init {
    self = [super init];
    if (self == nil) {
        return nil;
    }
    // Native handles start out NULL so -unload / -unloadMmproj are no-ops.
    _model = NULL;
    _ctx = NULL;
    _mtmdCtx = NULL;
    // Bookkeeping that mirrors the handles above.
    _currentModelPath = nil;
    _currentMmprojPath = nil;
    _embeddingMode = NO;
    return self;
}
38
-
39
// Tears down every native resource this bridge still owns.
- (void)dealloc {
    // -unload frees the projector, the context and the model (in that order)
    // and tolerates any of them being absent.
    [self unload];
}
42
-
43
// YES only when both the model weights and an inference context exist.
- (BOOL)isLoaded {
    if (_model == NULL) {
        return NO;
    }
    return _ctx != NULL;
}
46
-
47
// Path recorded by the last successful -loadModelAtPath:..., or nil when no
// model is loaded.
- (NSString *)currentModelPath {
    NSString *loadedPath = _currentModelPath;
    return loadedPath;
}
50
-
51
// Loads the GGUF model at `path` and creates an inference context for it.
//
// Anything previously loaded (model, context, projector) is released before
// the new model is read. `mmprojPath` is accepted for interface symmetry but
// deliberately ignored: the projector is loaded on demand through
// -loadMmprojAtPath:error: once the main model is up.
//
// @param gpuLayers     Forwarded to llama_model_params.n_gpu_layers.
// @param contextSize   Context window in tokens, forwarded to
//                      llama_context_params.n_ctx. Negative values are
//                      rejected (they would wrap to a huge unsigned size).
// @param threads       Thread count for generation and batch processing.
//                      Values <= 0 keep llama.cpp's default thread counts.
// @param embeddingMode Enables embedding extraction on the context.
// @return YES on success. Argument errors (codes 1 and 4) leave the current
//         state untouched; load failures (codes 2 and 3) leave the bridge
//         unloaded. `error` uses the "DVAIBridgeLlama" domain.
- (BOOL)loadModelAtPath:(NSString *)path
             mmprojPath:(NSString *)mmprojPath
              gpuLayers:(int)gpuLayers
            contextSize:(int)contextSize
                threads:(int)threads
          embeddingMode:(BOOL)embeddingMode
                  error:(NSError **)error {
    if (path.length == 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:1
                                     userInfo:@{NSLocalizedDescriptionKey: @"empty model path"}];
        }
        return NO;
    }
    // Validate before -unload so a bad argument does not tear down a model
    // that is currently working.
    if (contextSize < 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:4
                                     userInfo:@{NSLocalizedDescriptionKey: @"contextSize must not be negative"}];
        }
        return NO;
    }

    [self unload];

    llama_backend_init();

    struct llama_model_params mp = llama_model_default_params();
    mp.n_gpu_layers = gpuLayers;
    // llama.cpp b8933: llama_load_model_from_file -> llama_model_load_from_file.
    _model = llama_model_load_from_file([path UTF8String], mp);
    if (_model == NULL) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:2
                                     userInfo:@{NSLocalizedDescriptionKey: @"llama_model_load_from_file failed"}];
        }
        return NO;
    }

    struct llama_context_params cp = llama_context_default_params();
    cp.n_ctx = (uint32_t)contextSize;
    if (threads > 0) {
        // Only override the defaults with a usable thread count.
        cp.n_threads = threads;
        cp.n_threads_batch = threads;
    }
    cp.embeddings = embeddingMode ? true : false;

    // llama.cpp b8933: llama_new_context_with_model -> llama_init_from_model.
    _ctx = llama_init_from_model(_model, cp);
    if (_ctx == NULL) {
        llama_model_free(_model);
        _model = NULL;
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:3
                                     userInfo:@{NSLocalizedDescriptionKey: @"llama_init_from_model failed"}];
        }
        return NO;
    }

    // Not auto-loaded here so that text-only flows keep their simple init
    // shape; see -loadMmprojAtPath:error:.
    (void)mmprojPath;

    _currentModelPath = [path copy];
    _embeddingMode = embeddingMode;
    return YES;
}
113
-
114
// Releases the projector, the inference context and the model, then resets
// the bookkeeping ivars. Safe to call when nothing is loaded.
- (void)unload {
    // The mtmd context holds a reference to the text model, so it has to be
    // released before the model itself.
    [self unloadMmproj];

    struct llama_context *oldCtx = _ctx;
    _ctx = NULL;
    if (oldCtx != NULL) {
        llama_free(oldCtx);
    }

    struct llama_model *oldModel = _model;
    _model = NULL;
    if (oldModel != NULL) {
        // llama.cpp b8933: llama_free_model -> llama_model_free.
        llama_model_free(oldModel);
    }

    _embeddingMode = NO;
    _currentModelPath = nil;
}
131
-
132
// Returns "llama.cpp " followed by the text from llama_print_system_info()
// (or nothing after the prefix when that call yields NULL).
- (NSString *)versionString {
    const char *systemInfo = llama_print_system_info();
    if (systemInfo == NULL) {
        systemInfo = "";
    }
    return [NSString stringWithFormat:@"llama.cpp %s", systemInfo];
}
136
-
137
- #pragma mark - Internal sampling helper
138
-
139
// Returns how many bytes at the end of `bytes` form the beginning of a UTF-8
// character whose remaining bytes have not arrived yet (0-3).
static NSUInteger DVAIIncompleteUTF8TailLength(const uint8_t *bytes, NSUInteger length) {
    const NSUInteger maxBack = length < 3 ? length : 3;
    for (NSUInteger back = 1; back <= maxBack; back++) {
        const uint8_t byte = bytes[length - back];
        if ((byte & 0xC0) == 0x80) {
            continue; // continuation byte: keep looking for its lead byte
        }
        NSUInteger expected = 1;
        if ((byte & 0xE0) == 0xC0) {
            expected = 2;
        } else if ((byte & 0xF0) == 0xE0) {
            expected = 3;
        } else if ((byte & 0xF8) == 0xF0) {
            expected = 4;
        }
        return expected > back ? back : 0;
    }
    return 0;
}

// Greedy-samples up to maxTokens tokens starting from the current KV-cache
// state and returns the generated text. Used by both completePrompt: and
// completeMultimodalPrompt:.
//
// Generation stops at the first end-of-generation token (EOS, end-of-turn,
// ...), when maxTokens is reached, or when llama_decode fails.
//
// Token pieces are collected as raw bytes and converted to text only at
// character boundaries, because a single character can be spread over
// several tokens; converting each piece on its own would drop such
// characters.
- (NSString *)sampleGreedyUpToMaxTokens:(int)maxTokens
                                  vocab:(const struct llama_vocab *)vocab {
    struct llama_sampler *chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    NSMutableString *result = [NSMutableString string];
    NSMutableData *pending = [NSMutableData data];

    for (int i = 0; i < maxTokens; i++) {
        // llama_sampler_sample() already accepts the token it returns, so no
        // separate llama_sampler_accept() call is made.
        llama_token tokenId = llama_sampler_sample(chain, _ctx, -1);
        if (llama_vocab_is_eog(vocab, tokenId)) break;

        char stackBuf[256];
        char *piece = stackBuf;
        int wrote = llama_token_to_piece(vocab, tokenId, stackBuf, (int)sizeof(stackBuf),
                                         /*lstrip=*/0, /*special=*/false);
        if (wrote < 0) {
            // A negative return is the (negated) size the piece needs.
            const int required = -wrote;
            piece = (char *)malloc((size_t)required);
            wrote = piece ? llama_token_to_piece(vocab, tokenId, piece, required,
                                                 /*lstrip=*/0, /*special=*/false)
                          : 0;
        }
        if (wrote > 0) {
            [pending appendBytes:piece length:(NSUInteger)wrote];
        }
        if (piece != stackBuf) {
            free(piece);
        }

        // Flush everything except a trailing, still-incomplete character.
        const NSUInteger tail = DVAIIncompleteUTF8TailLength((const uint8_t *)pending.bytes,
                                                             pending.length);
        const NSUInteger ready = pending.length - tail;
        if (ready > 0) {
            NSString *text = [[NSString alloc] initWithBytes:pending.bytes
                                                      length:ready
                                                    encoding:NSUTF8StringEncoding];
            // Bytes that are not valid UTF-8 are dropped rather than aborting
            // the whole generation.
            if (text != nil) {
                [result appendString:text];
            }
            [pending replaceBytesInRange:NSMakeRange(0, ready) withBytes:NULL length:0];
        }

        struct llama_batch nb = llama_batch_get_one(&tokenId, 1);
        if (llama_decode(_ctx, nb) != 0) break;
    }

    llama_sampler_free(chain);
    return result;
}
176
-
177
// Runs a text-only greedy completion of `prompt`.
//
// Every call is an independent completion: the context memory (KV cache) is
// cleared before the prompt is evaluated, so text from earlier calls does
// not leak into this one or slowly fill the context window.
//
// `temperature` and `topP` are accepted for interface compatibility but are
// not used -- sampling is always greedy.
//
// @return The generated text (possibly empty), or nil with `error` set
//         (codes 10-12, domain "DVAIBridgeLlama").
- (nullable NSString *)completePrompt:(NSString *)prompt
                            maxTokens:(int)maxTokens
                          temperature:(float)temperature
                                 topP:(float)topP
                                error:(NSError **)error {
    (void)temperature;
    (void)topP;

    if (!self.isLoaded) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:10
                                     userInfo:@{NSLocalizedDescriptionKey: @"Model not loaded"}];
        }
        return nil;
    }

    // A nil prompt (or a failed conversion) yields NULL; treat it as empty.
    const char *utf8 = [prompt UTF8String];
    const char *cprompt = utf8 ? utf8 : "";
    const int promptLen = (int)strlen(cprompt);

    // llama.cpp b8933: tokenize / token_to_piece / token_eos now take a vocab,
    // not a model. Fetch it once and reuse.
    const struct llama_vocab *vocab = llama_model_get_vocab(_model);

    // Probe: a negative return is the (negated) required token count.
    int probe = llama_tokenize(vocab, cprompt, promptLen,
                               NULL, 0, /*add_special=*/true, /*parse_special=*/false);
    int needed = probe < 0 ? -probe : probe;
    if (needed <= 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:11
                                     userInfo:@{NSLocalizedDescriptionKey: @"Tokenization produced no tokens"}];
        }
        return nil;
    }

    llama_token *tokens = (llama_token *)calloc((size_t)needed, sizeof(llama_token));
    if (tokens == NULL) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:11
                                     userInfo:@{NSLocalizedDescriptionKey: @"calloc failed"}];
        }
        return nil;
    }

    int actual = llama_tokenize(vocab, cprompt, promptLen,
                                tokens, needed, /*add_special=*/true, /*parse_special=*/false);
    if (actual <= 0) {
        free(tokens);
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:11
                                     userInfo:@{NSLocalizedDescriptionKey: @"Tokenization failed"}];
        }
        return nil;
    }

    if ((uint32_t)actual > llama_n_ctx(_ctx)) {
        free(tokens);
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:12
                                     userInfo:@{NSLocalizedDescriptionKey: @"Prompt does not fit in the context window"}];
        }
        return nil;
    }

    // Start from an empty context so positions begin at 0 for this prompt.
    llama_memory_t memory = llama_get_memory(_ctx);
    if (memory != NULL) {
        llama_memory_clear(memory, /*data=*/true);
    }

    // llama_decode accepts at most n_batch tokens per call, so feed the
    // prompt in slices. Positions are tracked by the context itself
    // (llama_batch_get_one takes only tokens and a count since b8933).
    int batchCap = (int)llama_n_batch(_ctx);
    if (batchCap <= 0) {
        batchCap = actual;
    }
    int decodeStatus = 0;
    for (int offset = 0; offset < actual && decodeStatus == 0; offset += batchCap) {
        const int remaining = actual - offset;
        const int count = remaining < batchCap ? remaining : batchCap;
        decodeStatus = llama_decode(_ctx, llama_batch_get_one(tokens + offset, count));
    }
    free(tokens);
    if (decodeStatus != 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:12
                                     userInfo:@{NSLocalizedDescriptionKey: @"Decode failed"}];
        }
        return nil;
    }

    return [self sampleGreedyUpToMaxTokens:maxTokens vocab:vocab];
}
253
-
254
// Computes an embedding vector for `text`.
//
// Requires a model loaded with embeddingMode:YES. The context memory is
// cleared first so each call embeds `text` on its own. A pooled,
// sequence-level vector is preferred; when the context produces none, the
// embedding of the last evaluated token is returned instead.
//
// @return llama_model_n_embd() floats boxed as NSNumber, or nil with `error`
//         set (codes 20-23, domain "DVAIBridgeLlama").
- (nullable NSArray<NSNumber *> *)embedding:(NSString *)text
                                      error:(NSError **)error {
    if (!self.isLoaded) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:20
                                     userInfo:@{NSLocalizedDescriptionKey: @"Model not loaded"}];
        }
        return nil;
    }
    // Without embeddings enabled on the context there is nothing to read
    // back, so fail before spending a decode on it.
    if (!_embeddingMode) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:23
                                     userInfo:@{NSLocalizedDescriptionKey: @"Model was not loaded in embedding mode"}];
        }
        return nil;
    }

    // A nil text (or a failed conversion) yields NULL; treat it as empty.
    const char *utf8 = [text UTF8String];
    const char *cText = utf8 ? utf8 : "";
    const int textLen = (int)strlen(cText);

    // llama.cpp b8933: tokenize takes a vocab, not a model.
    const struct llama_vocab *vocab = llama_model_get_vocab(_model);

    // Probe: a negative return is the (negated) required token count.
    int probe = llama_tokenize(vocab, cText, textLen,
                               NULL, 0, /*add_special=*/true, /*parse_special=*/false);
    int needed = probe < 0 ? -probe : probe;
    if (needed <= 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:21
                                     userInfo:@{NSLocalizedDescriptionKey: @"Tokenization produced no tokens"}];
        }
        return nil;
    }

    llama_token *tokens = (llama_token *)calloc((size_t)needed, sizeof(llama_token));
    if (tokens == NULL) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:21
                                     userInfo:@{NSLocalizedDescriptionKey: @"calloc failed"}];
        }
        return nil;
    }

    int actual = llama_tokenize(vocab, cText, textLen,
                                tokens, needed, /*add_special=*/true, /*parse_special=*/false);
    if (actual <= 0) {
        free(tokens);
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:21
                                     userInfo:@{NSLocalizedDescriptionKey: @"Tokenization failed"}];
        }
        return nil;
    }

    // Drop whatever an earlier call left in the context so this text is
    // embedded from position 0.
    llama_memory_t memory = llama_get_memory(_ctx);
    if (memory != NULL) {
        llama_memory_clear(memory, /*data=*/true);
    }

    // llama.cpp b8933: llama_batch_get_one is (tokens, n_tokens) only.
    struct llama_batch batch = llama_batch_get_one(tokens, actual);
    const int decodeStatus = llama_decode(_ctx, batch);
    free(tokens);
    if (decodeStatus != 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:22
                                     userInfo:@{NSLocalizedDescriptionKey: @"Decode failed"}];
        }
        return nil;
    }

    // llama.cpp b8933: llama_n_embd -> llama_model_n_embd.
    int n_embd = llama_model_n_embd(_model);
    if (n_embd <= 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:23
                                     userInfo:@{NSLocalizedDescriptionKey: @"llama_model_n_embd returned non-positive"}];
        }
        return nil;
    }

    const float *vec = llama_get_embeddings_seq(_ctx, 0);
    if (!vec) {
        // No pooled vector: fall back to the last output token. Index -1
        // addresses the last row; llama_get_embeddings() would point at the
        // first row of the output buffer instead.
        vec = llama_get_embeddings_ith(_ctx, -1);
    }
    if (!vec) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:23
                                     userInfo:@{NSLocalizedDescriptionKey: @"Embedding pointer null"}];
        }
        return nil;
    }

    NSMutableArray<NSNumber *> *result = [NSMutableArray arrayWithCapacity:(NSUInteger)n_embd];
    for (int i = 0; i < n_embd; i++) {
        [result addObject:@(vec[i])];
    }
    return result;
}
351
-
352
- #pragma mark - Multimodal (mtmd) — Phase 2A Pass 2
353
-
354
// YES while a multimodal projector (mtmd context) is held.
- (BOOL)isMmprojLoaded {
    if (_mtmdCtx == NULL) {
        return NO;
    }
    return YES;
}
357
-
358
// Convenience form of -loadMmprojAtPath:useGPU:error: that always asks for
// GPU use.
- (BOOL)loadMmprojAtPath:(NSString *)mmprojPath
                   error:(NSError **)error {
    const BOOL preferGPU = YES;
    return [self loadMmprojAtPath:mmprojPath useGPU:preferGPU error:error];
}
362
-
363
// Loads the multimodal projector at `mmprojPath` against the currently
// loaded text model, replacing any projector loaded before.
//
// @param useGPU Forwarded to mtmd_context_params.use_gpu.
// @return YES on success; otherwise NO with `error` set to code 30 (empty
//         path), 31 (no main model) or 32 (mtmd_init_from_file failed).
- (BOOL)loadMmprojAtPath:(NSString *)mmprojPath
                  useGPU:(BOOL)useGPU
                   error:(NSError **)error {
    NSInteger failureCode;
    NSString *failureText;

    if (mmprojPath.length == 0) {
        failureCode = 30;
        failureText = @"empty mmproj path";
    } else if (_model == NULL) {
        failureCode = 31;
        failureText = @"main model must be loaded before mmproj";
    } else {
        // Only one projector is held at a time.
        [self unloadMmproj];

        struct mtmd_context_params params = mtmd_context_params_default();
        params.use_gpu = useGPU ? true : false;
        _mtmdCtx = mtmd_init_from_file([mmprojPath UTF8String], _model, params);
        if (_mtmdCtx != NULL) {
            _currentMmprojPath = [mmprojPath copy];
            return YES;
        }
        failureCode = 32;
        failureText = @"mtmd_init_from_file failed (mmproj incompatible with model?)";
    }

    if (error) {
        *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                     code:failureCode
                                 userInfo:@{NSLocalizedDescriptionKey: failureText}];
    }
    return NO;
}
399
-
400
// Frees the multimodal projector, if any, and forgets its path.
- (void)unloadMmproj {
    struct mtmd_context *oldProjector = _mtmdCtx;
    _mtmdCtx = NULL;
    _currentMmprojPath = nil;
    if (oldProjector != NULL) {
        mtmd_free(oldProjector);
    }
}
407
-
408
// Reports mtmd_support_audio() for the loaded projector; NO when no
// projector is loaded.
- (BOOL)hasAudioEncoder {
    if (_mtmdCtx != NULL) {
        return mtmd_support_audio(_mtmdCtx);
    }
    return NO;
}
412
-
413
// Renders `messages` into a single prompt string using a chat template.
//
// Template resolution: `templateOverride` when non-empty, otherwise the
// template stored in the model, otherwise NULL is handed to
// llama_chat_apply_template.
//
// @param messages     Dictionaries with string "role" and "content" entries.
//                     A missing or non-string role becomes "user"; a missing
//                     or non-string content becomes "". Elements that are
//                     not dictionaries are treated as empty user messages.
// @param addAssistant Forwarded to llama_chat_apply_template.
// @return The rendered prompt, or nil with `error` set (codes 40-44, domain
//         "DVAIBridgeLlama").
- (nullable NSString *)applyChatTemplate:(nullable NSString *)templateOverride
                                messages:(NSArray<NSDictionary<NSString *, NSString *> *> *)messages
                            addAssistant:(BOOL)addAssistant
                                   error:(NSError **)error {
    if (!self.isLoaded) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:40
                                     userInfo:@{NSLocalizedDescriptionKey: @"Model not loaded"}];
        }
        return nil;
    }
    const NSUInteger n = messages.count;
    if (n == 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:43
                                     userInfo:@{NSLocalizedDescriptionKey: @"messages array is empty"}];
        }
        return nil;
    }

    struct llama_chat_message *chat =
        (struct llama_chat_message *)calloc(n, sizeof(struct llama_chat_message));
    if (!chat) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:44
                                     userInfo:@{NSLocalizedDescriptionKey: @"calloc failed"}];
        }
        return nil;
    }

    // From here on every exit goes through the single cleanup section below;
    // a failure is recorded in failureCode / failureText.
    NSInteger failureCode = 0;
    NSString *failureText = nil;
    NSString *result = nil;
    char *buf = NULL;

    // Each role/content is strdup()'d so the C strings stay valid for the
    // whole multi-step call, independent of NSString.UTF8String's lifetime.
    for (NSUInteger i = 0; i < n && failureCode == 0; i++) {
        id msg = messages[i];
        NSString *role = nil;
        NSString *content = nil;
        if ([msg isKindOfClass:[NSDictionary class]]) {
            role = msg[@"role"];
            content = msg[@"content"];
        }
        if (![role isKindOfClass:[NSString class]]) role = @"user";
        if (![content isKindOfClass:[NSString class]]) content = @"";

        const char *roleUTF8 = [role UTF8String];
        const char *contentUTF8 = [content UTF8String];
        chat[i].role = strdup(roleUTF8 ? roleUTF8 : "user");
        chat[i].content = strdup(contentUTF8 ? contentUTF8 : "");
        if (chat[i].role == NULL || chat[i].content == NULL) {
            failureCode = 44;
            failureText = @"strdup failed";
        }
    }

    if (failureCode == 0) {
        // Resolve template: explicit override > model's own > NULL.
        const char *tmpl = NULL;
        if (templateOverride.length > 0) {
            tmpl = [templateOverride UTF8String];
        } else {
            tmpl = llama_model_chat_template(_model, NULL);
        }

        // Probe size: with no buffer the call reports the bytes required
        // (positive) or a negative error code.
        const int needed = llama_chat_apply_template(tmpl, chat, n, addAssistant, NULL, 0);
        if (needed <= 0) {
            failureCode = 41;
            failureText = @"llama_chat_apply_template probe failed (model has no chat template and none provided?)";
        } else if ((buf = (char *)calloc((size_t)needed + 1, sizeof(char))) == NULL) {
            failureCode = 44;
            failureText = @"calloc failed";
        } else {
            int actual = llama_chat_apply_template(tmpl, chat, n, addAssistant, buf, needed + 1);
            // Never read past what was allocated, whatever length is reported.
            if (actual > needed) {
                actual = needed;
            }
            if (actual > 0) {
                result = [[NSString alloc] initWithBytes:buf
                                                  length:(NSUInteger)actual
                                                encoding:NSUTF8StringEncoding];
            }
            if (result == nil) {
                failureCode = 42;
                failureText = @"llama_chat_apply_template failed";
            }
        }
    }

    // Cleanup (free(NULL) is a no-op, so partially filled entries are fine).
    for (NSUInteger i = 0; i < n; i++) {
        free((void *)chat[i].role);
        free((void *)chat[i].content);
    }
    free(chat);
    free(buf);

    if (failureCode != 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:failureCode
                                     userInfo:@{NSLocalizedDescriptionKey: failureText}];
        }
        return nil;
    }
    return result;
}
521
-
522
// Runs a greedy completion of `prompt` together with the given media.
//
// `prompt` is expected to be chat-template output (no extra BOS is added)
// containing one media marker per element of `mediaInOrder`; markers are
// matched to media in order. Every call starts from an empty context: the
// KV cache is cleared before the chunks are evaluated at position 0.
//
// `temperature` and `topP` are accepted for interface compatibility but are
// not used -- sampling is always greedy.
//
// @return The generated text (possibly empty), or nil with `error` set
//         (codes 50-55, domain "DVAIBridgeLlama").
- (nullable NSString *)completeMultimodalPrompt:(NSString *)prompt
                                          media:(NSArray<NSData *> *)mediaInOrder
                                      maxTokens:(int)maxTokens
                                    temperature:(float)temperature
                                           topP:(float)topP
                                          error:(NSError **)error {
    (void)temperature;
    (void)topP;

    if (!self.isLoaded) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:50
                                     userInfo:@{NSLocalizedDescriptionKey: @"Model not loaded"}];
        }
        return nil;
    }
    if (_mtmdCtx == NULL) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:51
                                     userInfo:@{NSLocalizedDescriptionKey: @"mmproj not loaded"}];
        }
        return nil;
    }

    const NSUInteger nMedia = mediaInOrder.count;

    // 1. Build bitmaps in declaration order via
    //    mtmd_helper_bitmap_init_from_buf.
    mtmd_bitmap **bitmaps = NULL;
    if (nMedia > 0) {
        bitmaps = (mtmd_bitmap **)calloc(nMedia, sizeof(mtmd_bitmap *));
        if (!bitmaps) {
            if (error) {
                *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                             code:55
                                         userInfo:@{NSLocalizedDescriptionKey: @"calloc failed"}];
            }
            return nil;
        }
    }
    NSUInteger built = 0;
    NSString *bitmapFailure = nil;
    for (; built < nMedia; built++) {
        id item = mediaInOrder[built];
        if (![item isKindOfClass:[NSData class]]) {
            bitmapFailure = [NSString stringWithFormat:@"media[%lu] is not NSData",
                             (unsigned long)built];
            break;
        }
        NSData *bytes = item;
        bitmaps[built] = mtmd_helper_bitmap_init_from_buf(_mtmdCtx,
                                                          (const unsigned char *)bytes.bytes,
                                                          (size_t)bytes.length);
        if (bitmaps[built] == NULL) {
            bitmapFailure = [NSString stringWithFormat:@"mtmd_helper_bitmap_init_from_buf failed for media[%lu]",
                             (unsigned long)built];
            break;
        }
    }
    if (bitmapFailure != nil) {
        // Only entries before `built` were created successfully.
        for (NSUInteger j = 0; j < built; j++) mtmd_bitmap_free(bitmaps[j]);
        free(bitmaps);
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:52
                                     userInfo:@{NSLocalizedDescriptionKey: bitmapFailure}];
        }
        return nil;
    }

    // 2. Tokenize. mtmd_tokenize matches markers in the prompt against the
    //    bitmap array in order.
    mtmd_input_chunks *chunks = mtmd_input_chunks_init();
    if (!chunks) {
        for (NSUInteger i = 0; i < nMedia; i++) mtmd_bitmap_free(bitmaps[i]);
        free(bitmaps);
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:55
                                     userInfo:@{NSLocalizedDescriptionKey: @"mtmd_input_chunks_init failed"}];
        }
        return nil;
    }
    const char *promptUTF8 = [prompt UTF8String]; // NULL for a nil prompt
    struct mtmd_input_text input_text = {0};
    input_text.text = promptUTF8 ? promptUTF8 : "";
    // The chat template already added BOS; don't add it again.
    input_text.add_special = false;
    input_text.parse_special = true;
    int32_t tok_rc = mtmd_tokenize(_mtmdCtx, chunks, &input_text,
                                   (const mtmd_bitmap **)bitmaps, (size_t)nMedia);
    // Per mtmd.h: mtmd_tokenize copies what it needs out of bitmaps; safe to
    // free immediately after the call returns.
    for (NSUInteger i = 0; i < nMedia; i++) mtmd_bitmap_free(bitmaps[i]);
    free(bitmaps);
    if (tok_rc != 0) {
        mtmd_input_chunks_free(chunks);
        if (error) {
            NSString *msg = (tok_rc == 1)
                ? @"mtmd_tokenize: marker count does not match media count"
                : (tok_rc == 2)
                    ? @"mtmd_tokenize: image preprocessing error"
                    : [NSString stringWithFormat:@"mtmd_tokenize failed (rc=%d)", tok_rc];
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:53
                                     userInfo:@{NSLocalizedDescriptionKey: msg}];
        }
        return nil;
    }

    // 3. Eval all chunks. Evaluation starts at position 0, so anything a
    //    previous call left in the context has to go first.
    llama_memory_t memory = llama_get_memory(_ctx);
    if (memory != NULL) {
        llama_memory_clear(memory, /*data=*/true);
    }
    // Keep the historical 512-token slices, but never exceed what the
    // context was created to accept per decode call.
    int32_t evalBatch = 512;
    const uint32_t ctxBatch = llama_n_batch(_ctx);
    if (ctxBatch > 0 && ctxBatch < (uint32_t)evalBatch) {
        evalBatch = (int32_t)ctxBatch;
    }
    llama_pos new_n_past = 0;
    int32_t eval_rc = mtmd_helper_eval_chunks(_mtmdCtx, _ctx, chunks,
                                              /*n_past=*/0,
                                              /*seq_id=*/0,
                                              /*n_batch=*/evalBatch,
                                              /*logits_last=*/true,
                                              &new_n_past);
    mtmd_input_chunks_free(chunks);
    if (eval_rc != 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:54
                                     userInfo:@{NSLocalizedDescriptionKey:
                                                    [NSString stringWithFormat:@"mtmd_helper_eval_chunks failed (rc=%d)",
                                                     eval_rc]}];
        }
        return nil;
    }

    // 4. Sampling loop (greedy). The KV cache now reflects all evaled chunks;
    //    each sampled token is appended as a 1-token batch via the helper.
    const struct llama_vocab *vocab = llama_model_get_vocab(_model);
    return [self sampleGreedyUpToMaxTokens:maxTokens vocab:vocab];
}
648
-
649
- @end
1
+ #import "LlamaCppBridge.h"
2
+ // Consumed via SPM .binaryTarget against build-apple/llama.xcframework
3
+ // (built by scripts/mac-side-prepare-xcframework.sh). Framework
4
+ // modulemap re-exports llama.h, ggml.h, ggml-alloc.h, ggml-backend.h,
5
+ // ggml-metal.h, ggml-cpu.h, ggml-blas.h, gguf.h.
6
+ #import <llama/llama.h>
7
+ // Multimodal (mtmd) is shipped as a sibling binaryTarget --
8
+ // build-apple/mtmd.xcframework. The framework's modulemap exposes
9
+ // mtmd.h and mtmd-helper.h; ggml.h / llama.h come from the llama
10
+ // framework imported above.
11
+ #import <mtmd/mtmd.h>
12
+ #import <mtmd/mtmd-helper.h>
13
+ #import <Foundation/Foundation.h>
14
+ #import <stdlib.h>
15
+ #import <string.h>
16
+
17
+ @implementation LlamaCppBridge {
18
+ struct llama_model *_model;
19
+ struct llama_context *_ctx;
20
+ NSString *_currentModelPath;
21
+ BOOL _embeddingMode;
22
+ // Phase 2A Pass 2: real mtmd state.
23
+ struct mtmd_context *_mtmdCtx;
24
+ NSString *_currentMmprojPath;
25
+ }
26
+
27
// Creates a bridge with nothing loaded: no model, no inference context and
// no multimodal projector.
- (instancetype)init {
    self = [super init];
    if (self == nil) {
        return nil;
    }
    // Native handles start out NULL so -unload / -unloadMmproj are no-ops.
    _model = NULL;
    _ctx = NULL;
    _mtmdCtx = NULL;
    // Bookkeeping that mirrors the handles above.
    _currentModelPath = nil;
    _currentMmprojPath = nil;
    _embeddingMode = NO;
    return self;
}
38
+
39
// Tears down every native resource this bridge still owns.
- (void)dealloc {
    // -unload frees the projector, the context and the model (in that order)
    // and tolerates any of them being absent.
    [self unload];
}
42
+
43
// YES only when both the model weights and an inference context exist.
- (BOOL)isLoaded {
    if (_model == NULL) {
        return NO;
    }
    return _ctx != NULL;
}
46
+
47
// Path recorded by the last successful -loadModelAtPath:..., or nil when no
// model is loaded.
- (NSString *)currentModelPath {
    NSString *loadedPath = _currentModelPath;
    return loadedPath;
}
50
+
51
// Loads the GGUF model at `path` and creates an inference context for it.
//
// Anything previously loaded (model, context, projector) is released before
// the new model is read. `mmprojPath` is accepted for interface symmetry but
// deliberately ignored: the projector is loaded on demand through
// -loadMmprojAtPath:error: once the main model is up.
//
// @param gpuLayers     Forwarded to llama_model_params.n_gpu_layers.
// @param contextSize   Context window in tokens, forwarded to
//                      llama_context_params.n_ctx. Negative values are
//                      rejected (they would wrap to a huge unsigned size).
// @param threads       Thread count for generation and batch processing.
//                      Values <= 0 keep llama.cpp's default thread counts.
// @param embeddingMode Enables embedding extraction on the context.
// @return YES on success. Argument errors (codes 1 and 4) leave the current
//         state untouched; load failures (codes 2 and 3) leave the bridge
//         unloaded. `error` uses the "DVAIBridgeLlama" domain.
- (BOOL)loadModelAtPath:(NSString *)path
             mmprojPath:(NSString *)mmprojPath
              gpuLayers:(int)gpuLayers
            contextSize:(int)contextSize
                threads:(int)threads
          embeddingMode:(BOOL)embeddingMode
                  error:(NSError **)error {
    if (path.length == 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:1
                                     userInfo:@{NSLocalizedDescriptionKey: @"empty model path"}];
        }
        return NO;
    }
    // Validate before -unload so a bad argument does not tear down a model
    // that is currently working.
    if (contextSize < 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:4
                                     userInfo:@{NSLocalizedDescriptionKey: @"contextSize must not be negative"}];
        }
        return NO;
    }

    [self unload];

    llama_backend_init();

    struct llama_model_params mp = llama_model_default_params();
    mp.n_gpu_layers = gpuLayers;
    // llama.cpp b8933: llama_load_model_from_file -> llama_model_load_from_file.
    _model = llama_model_load_from_file([path UTF8String], mp);
    if (_model == NULL) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:2
                                     userInfo:@{NSLocalizedDescriptionKey: @"llama_model_load_from_file failed"}];
        }
        return NO;
    }

    struct llama_context_params cp = llama_context_default_params();
    cp.n_ctx = (uint32_t)contextSize;
    if (threads > 0) {
        // Only override the defaults with a usable thread count.
        cp.n_threads = threads;
        cp.n_threads_batch = threads;
    }
    cp.embeddings = embeddingMode ? true : false;

    // llama.cpp b8933: llama_new_context_with_model -> llama_init_from_model.
    _ctx = llama_init_from_model(_model, cp);
    if (_ctx == NULL) {
        llama_model_free(_model);
        _model = NULL;
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:3
                                     userInfo:@{NSLocalizedDescriptionKey: @"llama_init_from_model failed"}];
        }
        return NO;
    }

    // Not auto-loaded here so that text-only flows keep their simple init
    // shape; see -loadMmprojAtPath:error:.
    (void)mmprojPath;

    _currentModelPath = [path copy];
    _embeddingMode = embeddingMode;
    return YES;
}
113
+
114
// Releases the projector, the inference context and the model, then resets
// the bookkeeping ivars. Safe to call when nothing is loaded.
- (void)unload {
    // The mtmd context holds a reference to the text model, so it has to be
    // released before the model itself.
    [self unloadMmproj];

    struct llama_context *oldCtx = _ctx;
    _ctx = NULL;
    if (oldCtx != NULL) {
        llama_free(oldCtx);
    }

    struct llama_model *oldModel = _model;
    _model = NULL;
    if (oldModel != NULL) {
        // llama.cpp b8933: llama_free_model -> llama_model_free.
        llama_model_free(oldModel);
    }

    _embeddingMode = NO;
    _currentModelPath = nil;
}
131
+
132
// Returns "llama.cpp " followed by the text from llama_print_system_info()
// (or nothing after the prefix when that call yields NULL).
- (NSString *)versionString {
    const char *systemInfo = llama_print_system_info();
    if (systemInfo == NULL) {
        systemInfo = "";
    }
    return [NSString stringWithFormat:@"llama.cpp %s", systemInfo];
}
136
+
137
+ #pragma mark - Internal sampling helper
138
+
139
// Returns how many bytes at the end of `bytes` form the beginning of a UTF-8
// character whose remaining bytes have not arrived yet (0-3).
static NSUInteger DVAIIncompleteUTF8TailLength(const uint8_t *bytes, NSUInteger length) {
    const NSUInteger maxBack = length < 3 ? length : 3;
    for (NSUInteger back = 1; back <= maxBack; back++) {
        const uint8_t byte = bytes[length - back];
        if ((byte & 0xC0) == 0x80) {
            continue; // continuation byte: keep looking for its lead byte
        }
        NSUInteger expected = 1;
        if ((byte & 0xE0) == 0xC0) {
            expected = 2;
        } else if ((byte & 0xF0) == 0xE0) {
            expected = 3;
        } else if ((byte & 0xF8) == 0xF0) {
            expected = 4;
        }
        return expected > back ? back : 0;
    }
    return 0;
}

// Greedy-samples up to maxTokens tokens starting from the current KV-cache
// state and returns the generated text. Used by both completePrompt: and
// completeMultimodalPrompt:.
//
// Generation stops at the first end-of-generation token (EOS, end-of-turn,
// ...), when maxTokens is reached, or when llama_decode fails.
//
// Token pieces are collected as raw bytes and converted to text only at
// character boundaries, because a single character can be spread over
// several tokens; converting each piece on its own would drop such
// characters.
- (NSString *)sampleGreedyUpToMaxTokens:(int)maxTokens
                                  vocab:(const struct llama_vocab *)vocab {
    struct llama_sampler *chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    NSMutableString *result = [NSMutableString string];
    NSMutableData *pending = [NSMutableData data];

    for (int i = 0; i < maxTokens; i++) {
        // llama_sampler_sample() already accepts the token it returns, so no
        // separate llama_sampler_accept() call is made.
        llama_token tokenId = llama_sampler_sample(chain, _ctx, -1);
        if (llama_vocab_is_eog(vocab, tokenId)) break;

        char stackBuf[256];
        char *piece = stackBuf;
        int wrote = llama_token_to_piece(vocab, tokenId, stackBuf, (int)sizeof(stackBuf),
                                         /*lstrip=*/0, /*special=*/false);
        if (wrote < 0) {
            // A negative return is the (negated) size the piece needs.
            const int required = -wrote;
            piece = (char *)malloc((size_t)required);
            wrote = piece ? llama_token_to_piece(vocab, tokenId, piece, required,
                                                 /*lstrip=*/0, /*special=*/false)
                          : 0;
        }
        if (wrote > 0) {
            [pending appendBytes:piece length:(NSUInteger)wrote];
        }
        if (piece != stackBuf) {
            free(piece);
        }

        // Flush everything except a trailing, still-incomplete character.
        const NSUInteger tail = DVAIIncompleteUTF8TailLength((const uint8_t *)pending.bytes,
                                                             pending.length);
        const NSUInteger ready = pending.length - tail;
        if (ready > 0) {
            NSString *text = [[NSString alloc] initWithBytes:pending.bytes
                                                      length:ready
                                                    encoding:NSUTF8StringEncoding];
            // Bytes that are not valid UTF-8 are dropped rather than aborting
            // the whole generation.
            if (text != nil) {
                [result appendString:text];
            }
            [pending replaceBytesInRange:NSMakeRange(0, ready) withBytes:NULL length:0];
        }

        struct llama_batch nb = llama_batch_get_one(&tokenId, 1);
        if (llama_decode(_ctx, nb) != 0) break;
    }

    llama_sampler_free(chain);
    return result;
}
176
+
177
// Runs a text-only greedy completion of `prompt`.
//
// Every call is an independent completion: the context memory (KV cache) is
// cleared before the prompt is evaluated, so text from earlier calls does
// not leak into this one or slowly fill the context window.
//
// `temperature` and `topP` are accepted for interface compatibility but are
// not used -- sampling is always greedy.
//
// @return The generated text (possibly empty), or nil with `error` set
//         (codes 10-12, domain "DVAIBridgeLlama").
- (nullable NSString *)completePrompt:(NSString *)prompt
                            maxTokens:(int)maxTokens
                          temperature:(float)temperature
                                 topP:(float)topP
                                error:(NSError **)error {
    (void)temperature;
    (void)topP;

    if (!self.isLoaded) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:10
                                     userInfo:@{NSLocalizedDescriptionKey: @"Model not loaded"}];
        }
        return nil;
    }

    // A nil prompt (or a failed conversion) yields NULL; treat it as empty.
    const char *utf8 = [prompt UTF8String];
    const char *cprompt = utf8 ? utf8 : "";
    const int promptLen = (int)strlen(cprompt);

    // llama.cpp b8933: tokenize / token_to_piece / token_eos now take a vocab,
    // not a model. Fetch it once and reuse.
    const struct llama_vocab *vocab = llama_model_get_vocab(_model);

    // Probe: a negative return is the (negated) required token count.
    int probe = llama_tokenize(vocab, cprompt, promptLen,
                               NULL, 0, /*add_special=*/true, /*parse_special=*/false);
    int needed = probe < 0 ? -probe : probe;
    if (needed <= 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:11
                                     userInfo:@{NSLocalizedDescriptionKey: @"Tokenization produced no tokens"}];
        }
        return nil;
    }

    llama_token *tokens = (llama_token *)calloc((size_t)needed, sizeof(llama_token));
    if (tokens == NULL) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:11
                                     userInfo:@{NSLocalizedDescriptionKey: @"calloc failed"}];
        }
        return nil;
    }

    int actual = llama_tokenize(vocab, cprompt, promptLen,
                                tokens, needed, /*add_special=*/true, /*parse_special=*/false);
    if (actual <= 0) {
        free(tokens);
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:11
                                     userInfo:@{NSLocalizedDescriptionKey: @"Tokenization failed"}];
        }
        return nil;
    }

    if ((uint32_t)actual > llama_n_ctx(_ctx)) {
        free(tokens);
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:12
                                     userInfo:@{NSLocalizedDescriptionKey: @"Prompt does not fit in the context window"}];
        }
        return nil;
    }

    // Start from an empty context so positions begin at 0 for this prompt.
    llama_memory_t memory = llama_get_memory(_ctx);
    if (memory != NULL) {
        llama_memory_clear(memory, /*data=*/true);
    }

    // llama_decode accepts at most n_batch tokens per call, so feed the
    // prompt in slices. Positions are tracked by the context itself
    // (llama_batch_get_one takes only tokens and a count since b8933).
    int batchCap = (int)llama_n_batch(_ctx);
    if (batchCap <= 0) {
        batchCap = actual;
    }
    int decodeStatus = 0;
    for (int offset = 0; offset < actual && decodeStatus == 0; offset += batchCap) {
        const int remaining = actual - offset;
        const int count = remaining < batchCap ? remaining : batchCap;
        decodeStatus = llama_decode(_ctx, llama_batch_get_one(tokens + offset, count));
    }
    free(tokens);
    if (decodeStatus != 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:12
                                     userInfo:@{NSLocalizedDescriptionKey: @"Decode failed"}];
        }
        return nil;
    }

    return [self sampleGreedyUpToMaxTokens:maxTokens vocab:vocab];
}
253
+
254
// Computes an embedding vector for `text`.
//
// Requires a model loaded with embeddingMode:YES. The context memory is
// cleared first so each call embeds `text` on its own. A pooled,
// sequence-level vector is preferred; when the context produces none, the
// embedding of the last evaluated token is returned instead.
//
// @return llama_model_n_embd() floats boxed as NSNumber, or nil with `error`
//         set (codes 20-23, domain "DVAIBridgeLlama").
- (nullable NSArray<NSNumber *> *)embedding:(NSString *)text
                                      error:(NSError **)error {
    if (!self.isLoaded) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:20
                                     userInfo:@{NSLocalizedDescriptionKey: @"Model not loaded"}];
        }
        return nil;
    }
    // Without embeddings enabled on the context there is nothing to read
    // back, so fail before spending a decode on it.
    if (!_embeddingMode) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:23
                                     userInfo:@{NSLocalizedDescriptionKey: @"Model was not loaded in embedding mode"}];
        }
        return nil;
    }

    // A nil text (or a failed conversion) yields NULL; treat it as empty.
    const char *utf8 = [text UTF8String];
    const char *cText = utf8 ? utf8 : "";
    const int textLen = (int)strlen(cText);

    // llama.cpp b8933: tokenize takes a vocab, not a model.
    const struct llama_vocab *vocab = llama_model_get_vocab(_model);

    // Probe: a negative return is the (negated) required token count.
    int probe = llama_tokenize(vocab, cText, textLen,
                               NULL, 0, /*add_special=*/true, /*parse_special=*/false);
    int needed = probe < 0 ? -probe : probe;
    if (needed <= 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:21
                                     userInfo:@{NSLocalizedDescriptionKey: @"Tokenization produced no tokens"}];
        }
        return nil;
    }

    llama_token *tokens = (llama_token *)calloc((size_t)needed, sizeof(llama_token));
    if (tokens == NULL) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:21
                                     userInfo:@{NSLocalizedDescriptionKey: @"calloc failed"}];
        }
        return nil;
    }

    int actual = llama_tokenize(vocab, cText, textLen,
                                tokens, needed, /*add_special=*/true, /*parse_special=*/false);
    if (actual <= 0) {
        free(tokens);
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:21
                                     userInfo:@{NSLocalizedDescriptionKey: @"Tokenization failed"}];
        }
        return nil;
    }

    // Drop whatever an earlier call left in the context so this text is
    // embedded from position 0.
    llama_memory_t memory = llama_get_memory(_ctx);
    if (memory != NULL) {
        llama_memory_clear(memory, /*data=*/true);
    }

    // llama.cpp b8933: llama_batch_get_one is (tokens, n_tokens) only.
    struct llama_batch batch = llama_batch_get_one(tokens, actual);
    const int decodeStatus = llama_decode(_ctx, batch);
    free(tokens);
    if (decodeStatus != 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:22
                                     userInfo:@{NSLocalizedDescriptionKey: @"Decode failed"}];
        }
        return nil;
    }

    // llama.cpp b8933: llama_n_embd -> llama_model_n_embd.
    int n_embd = llama_model_n_embd(_model);
    if (n_embd <= 0) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:23
                                     userInfo:@{NSLocalizedDescriptionKey: @"llama_model_n_embd returned non-positive"}];
        }
        return nil;
    }

    const float *vec = llama_get_embeddings_seq(_ctx, 0);
    if (!vec) {
        // No pooled vector: fall back to the last output token. Index -1
        // addresses the last row; llama_get_embeddings() would point at the
        // first row of the output buffer instead.
        vec = llama_get_embeddings_ith(_ctx, -1);
    }
    if (!vec) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:23
                                     userInfo:@{NSLocalizedDescriptionKey: @"Embedding pointer null"}];
        }
        return nil;
    }

    NSMutableArray<NSNumber *> *result = [NSMutableArray arrayWithCapacity:(NSUInteger)n_embd];
    for (int i = 0; i < n_embd; i++) {
        [result addObject:@(vec[i])];
    }
    return result;
}
351
+
352
+ #pragma mark - Multimodal (mtmd) — Phase 2A Pass 2
353
+
354
/// YES while a multimodal projector context (`_mtmdCtx`) is held.
- (BOOL)isMmprojLoaded {
    return (_mtmdCtx == NULL) ? NO : YES;
}
357
+
358
/// Convenience overload: loads the mmproj file with GPU use enabled by
/// forwarding to -loadMmprojAtPath:useGPU:error:.
- (BOOL)loadMmprojAtPath:(NSString *)mmprojPath
                   error:(NSError **)error {
    const BOOL loaded = [self loadMmprojAtPath:mmprojPath
                                        useGPU:YES
                                         error:error];
    return loaded;
}
362
+
363
/// Loads a multimodal projector (mmproj) file and binds it to the loaded model.
///
/// Any previously loaded projector is released first (only once the arguments
/// have been validated).
///
/// @param mmprojPath Path of the mmproj file; must be non-empty.
/// @param useGPU     Forwarded to `mtmd_context_params.use_gpu`.
/// @param error      On failure, receives an NSError in the "DVAIBridgeLlama"
///                   domain: 30 = empty path, 31 = main model not loaded,
///                   32 = mtmd_init_from_file returned NULL.
/// @return YES when the projector context was created, NO otherwise.
- (BOOL)loadMmprojAtPath:(NSString *)mmprojPath
                  useGPU:(BOOL)useGPU
                   error:(NSError **)error {
    NSInteger failureCode = 0;
    NSString *failureReason = nil;

    if (mmprojPath.length == 0) {
        failureCode = 30;
        failureReason = @"empty mmproj path";
    } else if (_model == NULL) {
        failureCode = 31;
        failureReason = @"main model must be loaded before mmproj";
    } else {
        // Drop whatever projector is currently held before creating a new one.
        [self unloadMmproj];

        struct mtmd_context_params params = mtmd_context_params_default();
        params.use_gpu = useGPU ? true : false;
        _mtmdCtx = mtmd_init_from_file([mmprojPath UTF8String], _model, params);
        if (_mtmdCtx != NULL) {
            _currentMmprojPath = [mmprojPath copy];
            return YES;
        }
        failureCode = 32;
        failureReason = @"mtmd_init_from_file failed (mmproj incompatible with model?)";
    }

    if (error) {
        *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                     code:failureCode
                                 userInfo:@{NSLocalizedDescriptionKey: failureReason}];
    }
    return NO;
}
399
+
400
/// Releases the multimodal projector context, if any, and forgets its path.
/// Safe to call when nothing is loaded.
- (void)unloadMmproj {
    struct mtmd_context *loaded = _mtmdCtx;
    _mtmdCtx = NULL;
    _currentMmprojPath = nil;
    if (loaded == NULL) {
        return;
    }
    mtmd_free(loaded);
}
407
+
408
/// YES when a projector is loaded and `mtmd_support_audio` reports audio
/// support for it; NO when no projector is loaded.
- (BOOL)hasAudioEncoder {
    return (_mtmdCtx != NULL && mtmd_support_audio(_mtmdCtx)) ? YES : NO;
}
412
+
413
/// Renders `messages` into a single prompt string via
/// `llama_chat_apply_template`.
///
/// Template resolution: a non-empty `templateOverride` wins; otherwise the
/// template reported by `llama_model_chat_template(_model, NULL)` is used;
/// if neither exists NULL is passed and llama.cpp applies its own default.
///
/// @param templateOverride Optional template string; nil/empty means "use the
///                         model's own".
/// @param messages         Dictionaries with "role" and "content" keys. A
///                         missing or non-string role becomes "user"; missing
///                         or non-string content becomes "". Elements that are
///                         not dictionaries are treated the same way.
/// @param addAssistant     Forwarded as the `add_ass` flag of
///                         `llama_chat_apply_template`.
/// @param error            On failure, receives an NSError in the
///                         "DVAIBridgeLlama" domain: 40 = model not loaded,
///                         41 = size probe failed, 42 = rendering failed,
///                         43 = empty messages, 44 = allocation failure.
/// @return The rendered prompt, or nil on failure.
- (nullable NSString *)applyChatTemplate:(nullable NSString *)templateOverride
                                messages:(NSArray<NSDictionary<NSString *, NSString *> *> *)messages
                            addAssistant:(BOOL)addAssistant
                                   error:(NSError **)error {
    NSInteger failureCode = 0;
    NSString *failureMessage = nil;
    NSString *result = nil;
    struct llama_chat_message *chat = NULL;
    char *buf = NULL;
    const NSUInteger n = messages.count;

    // Single-exit structure: failures record a code/message and break out so
    // the C allocations are released in exactly one place below.
    do {
        if (!self.isLoaded) {
            failureCode = 40;
            failureMessage = @"Model not loaded";
            break;
        }
        if (n == 0) {
            failureCode = 43;
            failureMessage = @"messages array is empty";
            break;
        }

        // calloc zero-fills, so entries not yet populated hold NULL pointers
        // and the cleanup loop can free every slot unconditionally.
        chat = (struct llama_chat_message *)calloc(n, sizeof(struct llama_chat_message));
        if (chat == NULL) {
            failureCode = 44;
            failureMessage = @"calloc failed";
            break;
        }

        // Each role/content is strdup()'d so the C strings handed to llama.cpp
        // do not depend on the lifetime of NSString's UTF8String buffer.
        BOOL copied = YES;
        for (NSUInteger i = 0; i < n; i++) {
            id entry = messages[i];
            NSString *role = nil;
            NSString *content = nil;
            if ([entry isKindOfClass:[NSDictionary class]]) {
                NSDictionary *msg = (NSDictionary *)entry;
                role = msg[@"role"];
                content = msg[@"content"];
            }
            if (![role isKindOfClass:[NSString class]]) role = @"user";
            if (![content isKindOfClass:[NSString class]]) content = @"";

            // UTF8String is nullable; never pass NULL to strdup.
            const char *roleUTF8 = [role UTF8String];
            const char *contentUTF8 = [content UTF8String];
            chat[i].role = strdup(roleUTF8 ? roleUTF8 : "user");
            chat[i].content = strdup(contentUTF8 ? contentUTF8 : "");
            if (chat[i].role == NULL || chat[i].content == NULL) {
                copied = NO;
                break;
            }
        }
        if (!copied) {
            failureCode = 44;
            failureMessage = @"strdup failed";
            break;
        }

        // Resolve template: explicit override > model's own > NULL.
        const char *tmpl = NULL;
        if (templateOverride.length > 0) {
            tmpl = [templateOverride UTF8String];
        } else {
            const char *modelTmpl = llama_model_chat_template(_model, NULL);
            if (modelTmpl) tmpl = modelTmpl;
        }

        // Probe size: with no buffer the call returns the required byte count
        // (positive), or a non-positive value on failure.
        const int needed = llama_chat_apply_template(tmpl, chat, n, addAssistant, NULL, 0);
        if (needed <= 0) {
            failureCode = 41;
            failureMessage = @"llama_chat_apply_template probe failed (model has no chat template and none provided?)";
            break;
        }

        const int capacity = needed + 1;
        buf = (char *)calloc((size_t)capacity, sizeof(char));
        if (buf == NULL) {
            failureCode = 44;
            failureMessage = @"calloc failed";
            break;
        }

        const int actual = llama_chat_apply_template(tmpl, chat, n, addAssistant, buf, capacity);
        // Reject a reported length larger than the buffer so the NSString
        // initializer can never read past the allocation.
        if (actual > 0 && actual <= capacity) {
            result = [[NSString alloc] initWithBytes:buf
                                              length:(NSUInteger)actual
                                            encoding:NSUTF8StringEncoding];
        }
        if (result == nil) {
            failureCode = 42;
            failureMessage = @"llama_chat_apply_template failed";
            break;
        }
    } while (0);

    if (chat != NULL) {
        for (NSUInteger i = 0; i < n; i++) {
            free((void *)chat[i].role);
            free((void *)chat[i].content);
        }
        free(chat);
    }
    free(buf);  // free(NULL) is a no-op

    if (failureMessage != nil) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:failureCode
                                     userInfo:@{NSLocalizedDescriptionKey: failureMessage}];
        }
        return nil;
    }
    return result;
}
521
+
522
/// Frees `count` bitmap slots (NULL slots are skipped) and then the array
/// itself. Accepts a NULL array.
static void DVAIReleaseMtmdBitmaps(mtmd_bitmap **bitmaps, NSUInteger count) {
    if (bitmaps == NULL) {
        return;
    }
    for (NSUInteger i = 0; i < count; i++) {
        if (bitmaps[i] != NULL) {
            mtmd_bitmap_free(bitmaps[i]);
        }
    }
    free(bitmaps);
}

/// Runs a multimodal completion: builds mtmd bitmaps from `mediaInOrder`,
/// tokenizes `prompt` together with them, evaluates the resulting chunks on
/// `_ctx`, then delegates generation to -sampleGreedyUpToMaxTokens:vocab:.
///
/// @param prompt       Prompt text containing one media marker per media item.
///                     `nil` is treated as the empty string. It is tokenized
///                     with add_special=false / parse_special=true, i.e. the
///                     caller is expected to pass an already chat-templated
///                     prompt.
/// @param mediaInOrder Encoded media buffers, matched to markers in order.
/// @param maxTokens    Forwarded to the sampling helper.
/// @param temperature  Currently ignored.
/// @param topP         Currently ignored.
/// @param error        On failure, receives an NSError in the
///                     "DVAIBridgeLlama" domain: 50 = model not loaded,
///                     51 = mmproj not loaded, 52 = bitmap creation failed,
///                     53 = mtmd_tokenize failed, 54 = chunk evaluation
///                     failed, 55 = allocation failure.
/// @return The value returned by the sampling helper, or nil on failure.
- (nullable NSString *)completeMultimodalPrompt:(NSString *)prompt
                                          media:(NSArray<NSData *> *)mediaInOrder
                                      maxTokens:(int)maxTokens
                                    temperature:(float)temperature
                                           topP:(float)topP
                                          error:(NSError **)error {
    (void)temperature;
    (void)topP;

    NSInteger failureCode = 0;
    NSString *failureMessage = nil;
    NSString *result = nil;
    mtmd_bitmap **bitmaps = NULL;
    mtmd_input_chunks *chunks = NULL;
    const NSUInteger nMedia = mediaInOrder.count;

    // Single-exit structure: failures record a code/message and break out so
    // bitmaps/chunks are released in exactly one place below.
    do {
        if (!self.isLoaded) {
            failureCode = 50;
            failureMessage = @"Model not loaded";
            break;
        }
        if (_mtmdCtx == NULL) {
            failureCode = 51;
            failureMessage = @"mmproj not loaded";
            break;
        }

        // 1. Build bitmaps in declaration order. calloc zero-fills so unused
        //    slots stay NULL for the cleanup helper.
        if (nMedia > 0) {
            bitmaps = (mtmd_bitmap **)calloc(nMedia, sizeof(mtmd_bitmap *));
            if (bitmaps == NULL) {
                failureCode = 55;
                failureMessage = @"calloc failed";
                break;
            }
        }
        NSUInteger failedIndex = NSNotFound;
        for (NSUInteger i = 0; i < nMedia; i++) {
            id item = mediaInOrder[i];
            // Guard against non-NSData elements, which would otherwise raise
            // an unrecognized-selector exception on -bytes / -length.
            if ([item isKindOfClass:[NSData class]]) {
                NSData *bytes = (NSData *)item;
                bitmaps[i] = mtmd_helper_bitmap_init_from_buf(_mtmdCtx,
                                                              (const unsigned char *)bytes.bytes,
                                                              (size_t)bytes.length);
            }
            if (bitmaps[i] == NULL) {
                failedIndex = i;
                break;
            }
        }
        if (failedIndex != NSNotFound) {
            failureCode = 52;
            failureMessage = [NSString stringWithFormat:@"mtmd_helper_bitmap_init_from_buf failed for media[%lu]",
                                                        (unsigned long)failedIndex];
            break;
        }

        // 2. Tokenize. Markers in the prompt are matched against the bitmap
        //    array in order.
        chunks = mtmd_input_chunks_init();
        if (chunks == NULL) {
            failureCode = 55;
            failureMessage = @"mtmd_input_chunks_init failed";
            break;
        }
        struct mtmd_input_text input_text = {0};
        input_text.text = prompt ? [prompt UTF8String] : "";
        // The prompt is expected to carry its own BOS (from the chat
        // template), so no special tokens are added here.
        input_text.add_special = false;
        input_text.parse_special = true;
        const int32_t tok_rc = mtmd_tokenize(_mtmdCtx, chunks, &input_text,
                                             (const mtmd_bitmap **)bitmaps, (size_t)nMedia);
        // Release the decoded media right after tokenization instead of
        // holding it through evaluation.
        // NOTE(review): relies on mtmd_tokenize not retaining pointers into
        // the bitmaps -- confirm against mtmd.h.
        DVAIReleaseMtmdBitmaps(bitmaps, nMedia);
        bitmaps = NULL;
        if (tok_rc != 0) {
            failureCode = 53;
            if (tok_rc == 1) {
                failureMessage = @"mtmd_tokenize: marker count does not match media count";
            } else if (tok_rc == 2) {
                failureMessage = @"mtmd_tokenize: image preprocessing error";
            } else {
                failureMessage = [NSString stringWithFormat:@"mtmd_tokenize failed (rc=%d)", tok_rc];
            }
            break;
        }

        // 3. Eval all chunks. Keep the historical 512 batch size, but shrink
        //    it when the context was created with a smaller n_batch so the
        //    helper never submits a batch the context cannot accept.
        const uint32_t ctxBatch = llama_n_batch(_ctx);
        const int32_t nBatch = (ctxBatch > 0 && ctxBatch < 512) ? (int32_t)ctxBatch : 512;
        // NOTE(review): evaluation always starts at position 0 on seq 0; this
        // assumes the context holds no earlier state for that sequence --
        // confirm where the cache is reset.
        const llama_pos n_past = 0;
        llama_pos new_n_past = 0;
        const int32_t eval_rc = mtmd_helper_eval_chunks(_mtmdCtx, _ctx, chunks,
                                                        n_past,
                                                        /*seq_id=*/0,
                                                        /*n_batch=*/nBatch,
                                                        /*logits_last=*/true,
                                                        &new_n_past);
        if (eval_rc != 0) {
            failureCode = 54;
            failureMessage = [NSString stringWithFormat:@"mtmd_helper_eval_chunks failed (rc=%d)",
                                                        eval_rc];
            break;
        }
        mtmd_input_chunks_free(chunks);
        chunks = NULL;

        // 4. Sampling (greedy) continues from the state left by the eval.
        const struct llama_vocab *vocab = llama_model_get_vocab(_model);
        result = [self sampleGreedyUpToMaxTokens:maxTokens vocab:vocab];
    } while (0);

    DVAIReleaseMtmdBitmaps(bitmaps, nMedia);
    if (chunks != NULL) {
        mtmd_input_chunks_free(chunks);
    }

    if (failureMessage != nil) {
        if (error) {
            *error = [NSError errorWithDomain:@"DVAIBridgeLlama"
                                         code:failureCode
                                     userInfo:@{NSLocalizedDescriptionKey: failureMessage}];
        }
        return nil;
    }
    return result;
}
648
+
649
+ @end