@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
@@ -35,6 +35,7 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac
35
35
 
36
36
  enum ffn_op_type {
37
37
  FFN_GELU,
38
+ FFN_GELU_ERF,
38
39
  FFN_SILU,
39
40
  FFN_GELU_QUICK,
40
41
  };
@@ -165,6 +166,9 @@ enum patch_merge_type {
165
166
  };
166
167
 
167
168
  struct clip_hparams {
169
+ bool has_vision = false;
170
+ bool has_audio = false;
171
+
168
172
  int32_t image_size;
169
173
  int32_t patch_size;
170
174
  int32_t n_embd;
@@ -191,6 +195,10 @@ struct clip_hparams {
191
195
  int32_t attn_window_size = 0;
192
196
  int32_t n_wa_pattern = 0;
193
197
  int32_t spatial_merge_size = 0;
198
+
199
+ // audio
200
+ int32_t n_mel_bins = 0; // whisper preprocessor
201
+ int32_t proj_stack_factor = 0; // ultravox
194
202
  };
195
203
 
196
204
  struct clip_layer {
@@ -332,6 +340,14 @@ struct clip_vision_model {
332
340
  // pixtral
333
341
  ggml_tensor * token_embd_img_break = nullptr;
334
342
  ggml_tensor * mm_patch_merger_w = nullptr;
343
+
344
+ // ultravox / whisper encoder
345
+ ggml_tensor * conv1d_1_w = nullptr;
346
+ ggml_tensor * conv1d_1_b = nullptr;
347
+ ggml_tensor * conv1d_2_w = nullptr;
348
+ ggml_tensor * conv1d_2_b = nullptr;
349
+ ggml_tensor * mm_norm_pre_w = nullptr;
350
+ ggml_tensor * mm_norm_mid_w = nullptr;
335
351
  };
336
352
 
337
353
  struct clip_ctx {
@@ -359,9 +375,12 @@ struct clip_ctx {
359
375
  int max_nodes = 8192;
360
376
  ggml_backend_sched_ptr sched;
361
377
 
362
- clip_image_size load_image_size;
378
+ // for debugging
379
+ bool debug_graph = false;
380
+ std::vector<ggml_tensor *> debug_print_tensors;
363
381
 
364
382
  clip_ctx(clip_context_params & ctx_params) {
383
+ debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
365
384
  backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
366
385
  if (!backend_cpu) {
367
386
  throw std::runtime_error("failed to initialize CPU backend");
@@ -440,7 +459,7 @@ struct clip_graph {
440
459
  };
441
460
  ctx0_ptr.reset(ggml_init(params));
442
461
  ctx0 = ctx0_ptr.get();
443
- gf = ggml_new_graph(ctx0);
462
+ gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
444
463
  }
445
464
 
446
465
  ggml_cgraph * build_siglip() {
@@ -522,7 +541,7 @@ struct clip_graph {
522
541
  ggml_set_input(pos_w);
523
542
 
524
543
  auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
525
- return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta);
544
+ return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
526
545
  };
527
546
 
528
547
  ggml_tensor * inp = build_inp();
@@ -936,6 +955,101 @@ struct clip_graph {
936
955
  return gf;
937
956
  }
938
957
 
958
+ ggml_cgraph * build_llama4() {
959
+ GGML_ASSERT(model.class_embedding != nullptr);
960
+ GGML_ASSERT(model.position_embeddings != nullptr);
961
+
962
+ const int n_pos = n_patches + 1; // +1 for [CLS]
963
+
964
+ // 2D input positions
965
+ ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
966
+ ggml_set_name(pos_h, "pos_h");
967
+ ggml_set_input(pos_h);
968
+
969
+ ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
970
+ ggml_set_name(pos_w, "pos_w");
971
+ ggml_set_input(pos_w);
972
+
973
+ ggml_tensor * inp = build_inp_raw();
974
+
975
+ // Llama4UnfoldConvolution
976
+ {
977
+ ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
978
+ patch_size, patch_size, 3, n_embd);
979
+ inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
980
+ inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
981
+ inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
982
+ cb(inp, "patch_conv", -1);
983
+ }
984
+
985
+ // add CLS token
986
+ inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
987
+
988
+ // build ViT with 2D position embeddings
989
+ auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
990
+ // first half is X axis and second half is Y axis
991
+ // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
992
+ // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
993
+ return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
994
+ };
995
+ ggml_tensor * cur = build_vit(
996
+ inp, n_pos,
997
+ NORM_TYPE_NORMAL,
998
+ hparams.ffn_op,
999
+ model.position_embeddings,
1000
+ add_pos);
1001
+
1002
+ // remove CLS token
1003
+ cur = ggml_view_2d(ctx0, cur,
1004
+ n_embd, n_patches,
1005
+ ggml_row_size(cur->type, n_embd), 0);
1006
+
1007
+ // pixel shuffle
1008
+ // based on Llama4VisionPixelShuffleMLP
1009
+ // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
1010
+ {
1011
+ const int scale_factor = model.hparams.proj_scale_factor;
1012
+ const int bsz = 1; // batch size, always 1 for now since we don't support batching
1013
+ GGML_ASSERT(scale_factor > 0);
1014
+ GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
1015
+ cur = ggml_reshape_4d(ctx0, cur,
1016
+ n_embd * scale_factor,
1017
+ n_patches_x / scale_factor,
1018
+ n_patches_y,
1019
+ bsz);
1020
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
1021
+ cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
1022
+ n_embd * scale_factor * scale_factor,
1023
+ n_patches_x / scale_factor,
1024
+ n_patches_y / scale_factor,
1025
+ bsz);
1026
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
1027
+ // flatten to 2D
1028
+ cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
1029
+ n_embd * scale_factor * scale_factor,
1030
+ n_patches / scale_factor / scale_factor);
1031
+ cb(cur, "pixel_shuffle", -1);
1032
+ }
1033
+
1034
+ // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
1035
+ {
1036
+ cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
1037
+ cur = ggml_gelu(ctx0, cur);
1038
+ cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
1039
+ cur = ggml_gelu(ctx0, cur);
1040
+ cb(cur, "adapter_mlp", -1);
1041
+ }
1042
+
1043
+ // Llama4MultiModalProjector
1044
+ cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
1045
+ cb(cur, "projected", -1);
1046
+
1047
+ // build the graph
1048
+ ggml_build_forward_expand(gf, cur);
1049
+
1050
+ return gf;
1051
+ }
1052
+
939
1053
  // this graph is used by llava, granite and glm
940
1054
  // due to having embedding_stack (used by granite), we cannot reuse build_vit
941
1055
  ggml_cgraph * build_llava() {
@@ -1310,16 +1424,118 @@ struct clip_graph {
1310
1424
  return gf;
1311
1425
  }
1312
1426
 
1427
+ // whisper encoder with custom projector
1428
+ ggml_cgraph * build_whisper_enc() {
1429
+ const int n_frames = img.nx;
1430
+ const int n_pos = n_frames / 2;
1431
+ GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
1432
+
1433
+ ggml_tensor * inp = build_inp_raw(1);
1434
+
1435
+ // conv1d block
1436
+ {
1437
+ // convolution + gelu
1438
+ ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
1439
+ cur = ggml_add(ctx0, cur, model.conv1d_1_b);
1440
+
1441
+ cur = ggml_gelu_erf(ctx0, cur);
1442
+
1443
+ cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
1444
+ cur = ggml_add(ctx0, cur, model.conv1d_2_b);
1445
+
1446
+ cur = ggml_gelu_erf(ctx0, cur);
1447
+ // transpose
1448
+ inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
1449
+ cb(inp, "after_conv1d", -1);
1450
+ }
1451
+
1452
+ // sanity check (only check one layer, but it should be the same for all)
1453
+ GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
1454
+ GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
1455
+ GGML_ASSERT(model.layers[0].q_b);
1456
+ GGML_ASSERT(model.layers[0].v_b);
1457
+ GGML_ASSERT(!model.layers[0].k_b); // no bias for k
1458
+ GGML_ASSERT(model.post_ln_w && model.post_ln_b);
1459
+
1460
+ ggml_tensor * pos_embd_selected = ggml_view_2d(
1461
+ ctx0, model.position_embeddings,
1462
+ model.position_embeddings->ne[0], n_pos,
1463
+ model.position_embeddings->nb[1], 0
1464
+ );
1465
+ ggml_tensor * cur = build_vit(
1466
+ inp, n_pos,
1467
+ NORM_TYPE_NORMAL,
1468
+ hparams.ffn_op,
1469
+ pos_embd_selected,
1470
+ nullptr);
1471
+
1472
+ cb(cur, "after_transformer", -1);
1473
+
1474
+ // StackAudioFrames
1475
+ // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
1476
+ {
1477
+ int64_t stride = n_embd * hparams.proj_stack_factor;
1478
+ int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
1479
+ int64_t pad = padded_len - ggml_nelements(cur);
1480
+ if (pad > 0) {
1481
+ cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
1482
+ cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
1483
+ }
1484
+ cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
1485
+ ggml_row_size(cur->type, stride), 0);
1486
+ }
1487
+
1488
+ cb(cur, "after_stacked", -1);
1489
+
1490
+ // UltravoxProjector
1491
+ {
1492
+ // pre-norm
1493
+ cur = ggml_rms_norm(ctx0, cur, 1e-6);
1494
+ cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
1495
+
1496
+ // ffn in
1497
+ cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
1498
+
1499
+ // swiglu
1500
+ {
1501
+ int64_t split_point = cur->ne[0] / 2;
1502
+ ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
1503
+ ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
1504
+
1505
+ // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
1506
+ x1 = ggml_silu(ctx0, x1);
1507
+ cur = ggml_mul(ctx0, x0, x1);
1508
+ }
1509
+
1510
+ // mid-norm
1511
+ cur = ggml_rms_norm(ctx0, cur, 1e-6);
1512
+ cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
1513
+
1514
+ // ffn out
1515
+ cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
1516
+ }
1517
+
1518
+ cb(cur, "projected", -1);
1519
+
1520
+ ggml_build_forward_expand(gf, cur);
1521
+
1522
+ return gf;
1523
+ }
1524
+
1313
1525
  private:
1314
1526
  //
1315
1527
  // utility functions
1316
1528
  //
1317
1529
 
1318
- void cb(ggml_tensor * cur, const char * name, int il) const {
1319
- // TODO: implement this
1320
- GGML_UNUSED(cur);
1321
- GGML_UNUSED(name);
1322
- GGML_UNUSED(il);
1530
+ void cb(ggml_tensor * cur0, const char * name, int il) const {
1531
+ if (ctx->debug_graph) {
1532
+ ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
1533
+ std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
1534
+ ggml_set_name(cur, cur_name.c_str());
1535
+ ggml_set_output(cur);
1536
+ ggml_build_forward_expand(gf, cur);
1537
+ ctx->debug_print_tensors.push_back(cur);
1538
+ }
1323
1539
  }
1324
1540
 
1325
1541
  // build vision transformer (ViT) cgraph
@@ -1460,8 +1676,8 @@ private:
1460
1676
  return inp;
1461
1677
  }
1462
1678
 
1463
- ggml_tensor * build_inp_raw() {
1464
- ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
1679
+ ggml_tensor * build_inp_raw(int channels = 3) {
1680
+ ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
1465
1681
  ggml_set_name(inp_raw, "inp_raw");
1466
1682
  ggml_set_input(inp_raw);
1467
1683
  return inp_raw;
@@ -1539,6 +1755,11 @@ private:
1539
1755
  cur = ggml_gelu(ctx0, cur);
1540
1756
  cb(cur, "ffn_gelu", il);
1541
1757
  } break;
1758
+ case FFN_GELU_ERF:
1759
+ {
1760
+ cur = ggml_gelu_erf(ctx0, cur);
1761
+ cb(cur, "ggml_gelu_erf", il);
1762
+ } break;
1542
1763
  case FFN_GELU_QUICK:
1543
1764
  {
1544
1765
  cur = ggml_gelu_quick(ctx0, cur);
@@ -1630,9 +1851,10 @@ private:
1630
1851
  static ggml_tensor * build_rope_2d(
1631
1852
  ggml_context * ctx0,
1632
1853
  ggml_tensor * cur,
1633
- ggml_tensor * pos_h,
1634
- ggml_tensor * pos_w,
1635
- const float freq_base
1854
+ ggml_tensor * pos_a, // first half
1855
+ ggml_tensor * pos_b, // second half
1856
+ const float freq_base,
1857
+ const bool interleave_freq
1636
1858
  ) {
1637
1859
  const int64_t n_dim = cur->ne[0];
1638
1860
  const int64_t n_head = cur->ne[1];
@@ -1646,7 +1868,9 @@ private:
1646
1868
  // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
1647
1869
  // then for the second half, we use freq_scale to shift the inv_freq
1648
1870
  // ^ why? replace (2i) with (2i+1) in the above equation
1649
- const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
1871
+ const float freq_scale_odd = interleave_freq
1872
+ ? std::pow(freq_base, (float)-2/n_dim)
1873
+ : 1.0;
1650
1874
 
1651
1875
  // first half
1652
1876
  ggml_tensor * first;
@@ -1659,7 +1883,7 @@ private:
1659
1883
  first = ggml_rope_ext(
1660
1884
  ctx0,
1661
1885
  first,
1662
- pos_h, // positions
1886
+ pos_a, // positions
1663
1887
  nullptr, // freq factors
1664
1888
  n_dim/2, // n_dims
1665
1889
  0, 0, freq_base,
@@ -1679,7 +1903,7 @@ private:
1679
1903
  second = ggml_rope_ext(
1680
1904
  ctx0,
1681
1905
  second,
1682
- pos_w, // positions
1906
+ pos_b, // positions
1683
1907
  nullptr, // freq factors
1684
1908
  n_dim/2, // n_dims
1685
1909
  0, 0, freq_base,
@@ -1723,6 +1947,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
1723
1947
  {
1724
1948
  res = graph.build_internvl();
1725
1949
  } break;
1950
+ case PROJECTOR_TYPE_LLAMA4:
1951
+ {
1952
+ res = graph.build_llama4();
1953
+ } break;
1954
+ case PROJECTOR_TYPE_ULTRAVOX:
1955
+ {
1956
+ res = graph.build_whisper_enc();
1957
+ } break;
1726
1958
  default:
1727
1959
  {
1728
1960
  res = graph.build_llava();
@@ -1806,18 +2038,30 @@ struct clip_model_loader {
1806
2038
 
1807
2039
  // other hparams
1808
2040
  {
1809
- get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
1810
-
1811
- get_u32(KEY_N_EMBD, hparams.n_embd);
1812
- get_u32(KEY_N_HEAD, hparams.n_head);
1813
- get_u32(KEY_N_FF, hparams.n_ff);
1814
- get_u32(KEY_N_BLOCK, hparams.n_layer);
1815
- get_u32(KEY_PROJ_DIM, hparams.projection_dim);
1816
- get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
1817
- get_u32(KEY_IMAGE_SIZE, hparams.image_size);
1818
- get_u32(KEY_PATCH_SIZE, hparams.patch_size);
1819
- get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
1820
- get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
2041
+ get_bool(KEY_HAS_AUDIO_ENC, hparams.has_audio, false);
2042
+ get_bool(KEY_HAS_VISION_ENC, hparams.has_vision, false);
2043
+
2044
+ const char * prefix = hparams.has_vision ? "vision" : "audio";
2045
+ get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd);
2046
+ get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head);
2047
+ get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff);
2048
+ get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer);
2049
+ get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim);
2050
+ get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
2051
+
2052
+ if (hparams.has_vision) {
2053
+ get_u32(KEY_IMAGE_SIZE, hparams.image_size);
2054
+ get_u32(KEY_PATCH_SIZE, hparams.patch_size);
2055
+ get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
2056
+ get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
2057
+ get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
2058
+
2059
+ } else if (hparams.has_audio) {
2060
+ get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
2061
+
2062
+ } else {
2063
+ throw std::runtime_error(string_format("%s: neither vision nor audio encoder is present\n", __func__));
2064
+ }
1821
2065
 
1822
2066
  // default warmup value
1823
2067
  hparams.warmup_image_size = hparams.image_size;
@@ -1855,7 +2099,7 @@ struct clip_model_loader {
1855
2099
  }
1856
2100
  }
1857
2101
 
1858
- {
2102
+ if (hparams.has_vision) {
1859
2103
  int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
1860
2104
  int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
1861
2105
  GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
@@ -1926,24 +2170,56 @@ struct clip_model_loader {
1926
2170
  hparams.warmup_image_size = hparams.patch_size * 8;
1927
2171
  get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
1928
2172
  } break;
2173
+ case PROJECTOR_TYPE_LLAMA4:
2174
+ {
2175
+ hparams.rope_theta = 10000.0f;
2176
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
2177
+
2178
+ // borrowed from llava-1.6
2179
+ const int isize = hparams.image_size;
2180
+ hparams.image_grid_pinpoints = {
2181
+ isize, isize*2, // 336, 672
2182
+ isize*2, isize, // 672, 336
2183
+ isize*2, isize*2, // 672, 672
2184
+ isize*3, isize, // 1008, 336
2185
+ isize, isize*3, // 336, 1008
2186
+ };
2187
+ } break;
2188
+ case PROJECTOR_TYPE_ULTRAVOX:
2189
+ {
2190
+ get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
2191
+ if (hparams.n_mel_bins != 128) {
2192
+ throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
2193
+ }
2194
+ hparams.ffn_op = FFN_GELU_ERF;
2195
+ log_ffn_op = "gelu_erf"; // temporary solution for logging
2196
+ } break;
1929
2197
  default:
1930
2198
  break;
1931
2199
  }
1932
2200
 
1933
2201
  LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
2202
+ LOG_INF("%s: has_vision_encoder: %d\n", __func__, hparams.has_vision);
2203
+ LOG_INF("%s: has_audio_encoder: %d\n", __func__, hparams.has_audio);
1934
2204
  LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
1935
2205
  LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
1936
2206
  LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
1937
2207
  LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
2208
+ LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
1938
2209
  LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
1939
- LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
1940
- LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
1941
2210
  LOG_INF("\n");
1942
- LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
1943
- LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
1944
- LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
1945
- LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
1946
- LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
2211
+ if (hparams.has_vision) {
2212
+ LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
2213
+ LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
2214
+ LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
2215
+ LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
2216
+ LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
2217
+ LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
2218
+ } else if (hparams.has_audio) {
2219
+ LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
2220
+ LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
2221
+ }
2222
+ LOG_INF("\n");
1947
2223
  LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
1948
2224
  LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
1949
2225
  }
@@ -1954,6 +2230,9 @@ struct clip_model_loader {
1954
2230
  std::map<std::string, size_t> tensor_offset;
1955
2231
  std::vector<ggml_tensor *> tensors_to_load;
1956
2232
 
2233
+ // TODO @ngxson : support both audio and video in the future
2234
+ const char * prefix = hparams.has_audio ? "a" : "v";
2235
+
1957
2236
  // get offsets
1958
2237
  for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
1959
2238
  const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
@@ -1991,47 +2270,47 @@ struct clip_model_loader {
1991
2270
 
1992
2271
  vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
1993
2272
 
1994
- vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, "v", "weight"), false);
1995
- vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, "v", "bias"), false);
2273
+ vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
2274
+ vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
1996
2275
 
1997
- vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, "v", "weight"), false);
1998
- vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, "v", "bias"), false);
2276
+ vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
2277
+ vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
1999
2278
 
2000
2279
  vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
2001
2280
  vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
2002
2281
  vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
2003
2282
 
2004
- vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false);
2283
+ vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
2005
2284
 
2006
2285
  // layers
2007
2286
  vision_model.layers.resize(hparams.n_layer);
2008
2287
  for (int il = 0; il < hparams.n_layer; ++il) {
2009
2288
  auto & layer = vision_model.layers[il];
2010
- layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
2011
- layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
2012
- layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
2013
- layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
2014
- layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
2015
- layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
2016
- layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
2017
- layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
2018
- layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
2019
- layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
2020
-
2021
- layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
2022
- layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
2023
- layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
2024
- layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
2025
- layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
2026
- layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
2289
+ layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
2290
+ layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
2291
+ layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
2292
+ layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
2293
+ layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
2294
+ layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
2295
+ layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
2296
+ layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
2297
+ layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
2298
+ layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
2299
+
2300
+ layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
2301
+ layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
2302
+ layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
2303
+ layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
2304
+ layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
2305
+ layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
2027
2306
 
2028
2307
  // ffn
2029
- layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
2030
- layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
2031
- layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
2032
- layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false);
2033
- layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
2034
- layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
2308
+ layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
2309
+ layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
2310
+ layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
2311
+ layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
2312
+ layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
2313
+ layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
2035
2314
 
2036
2315
  // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
2037
2316
  // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
@@ -2173,6 +2452,17 @@ struct clip_model_loader {
2173
2452
  vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
2174
2453
  vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
2175
2454
  } break;
2455
+ case PROJECTOR_TYPE_ULTRAVOX:
2456
+ {
2457
+ vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
2458
+ vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
2459
+ vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
2460
+ vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
2461
+ vision_model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
2462
+ vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
2463
+ vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
2464
+ vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
2465
+ } break;
2176
2466
  case PROJECTOR_TYPE_INTERNVL:
2177
2467
  {
2178
2468
  vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -2182,6 +2472,12 @@ struct clip_model_loader {
2182
2472
  vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
2183
2473
  vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
2184
2474
  } break;
2475
+ case PROJECTOR_TYPE_LLAMA4:
2476
+ {
2477
+ vision_model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
2478
+ vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
2479
+ vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
2480
+ } break;
2185
2481
  default:
2186
2482
  GGML_ASSERT(false && "unknown projector type");
2187
2483
  }
@@ -2224,13 +2520,19 @@ struct clip_model_loader {
2224
2520
  }
2225
2521
 
2226
2522
  void alloc_compute_meta() {
2523
+ const auto & hparams = ctx_clip.vision_model.hparams;
2227
2524
  ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
2228
2525
 
2229
2526
  // create a fake batch
2230
2527
  clip_image_f32_batch batch;
2231
2528
  clip_image_f32_ptr img(clip_image_f32_init());
2232
- img->nx = ctx_clip.vision_model.hparams.warmup_image_size;
2233
- img->ny = ctx_clip.vision_model.hparams.warmup_image_size;
2529
+ if (hparams.has_vision) {
2530
+ img->nx = hparams.warmup_image_size;
2531
+ img->ny = hparams.warmup_image_size;
2532
+ } else {
2533
+ img->nx = 1024; // TODO @ngxson : use a better default
2534
+ img->ny = hparams.n_mel_bins;
2535
+ }
2234
2536
  img->buf.resize(img->nx * img->ny * 3);
2235
2537
  batch.entries.push_back(std::move(img));
2236
2538
 
@@ -2328,14 +2630,6 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
2328
2630
  return ctx_clip;
2329
2631
  }
2330
2632
 
2331
- void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
2332
- ctx_clip->load_image_size = *load_image_size; // copy
2333
- }
2334
-
2335
- struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
2336
- return &ctx_clip->load_image_size;
2337
- }
2338
-
2339
2633
  struct clip_image_size * clip_image_size_init() {
2340
2634
  struct clip_image_size * load_image_size = new struct clip_image_size();
2341
2635
  load_image_size->width = 448;
@@ -2849,7 +3143,7 @@ private:
2849
3143
 
2850
3144
  // used by llava 1.6 with custom list of pinpoints
2851
3145
  static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
2852
- std::vector<clip_image_size> possible_resolutions;
3146
+ std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
2853
3147
  for (size_t i = 0; i < pinpoints.size(); i += 2) {
2854
3148
  possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
2855
3149
  }
@@ -2916,12 +3210,6 @@ private:
2916
3210
  }
2917
3211
  };
2918
3212
 
2919
- // TODO @ngxson : decprecate the load_image_size singleton pattern
2920
- int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
2921
- const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
2922
- return inst.grid_size.width;
2923
- }
2924
-
2925
3213
  // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
2926
3214
  // res_imgs memory is being allocated here, previous allocations will be freed if found
2927
3215
  bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
@@ -2943,9 +3231,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
2943
3231
  normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
2944
3232
  res_imgs->entries.push_back(std::move(res));
2945
3233
  }
3234
+
3235
+ res_imgs->grid_x = inst.grid_size.width;
3236
+ res_imgs->grid_y = inst.grid_size.height;
2946
3237
  return true;
2947
- }
2948
- else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3238
+
3239
+ } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
2949
3240
  clip_image_u8 resized;
2950
3241
  auto patch_size = params.patch_size * 2;
2951
3242
  auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
@@ -2971,8 +3262,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
2971
3262
  normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
2972
3263
  res_imgs->entries.push_back(std::move(img_f32));
2973
3264
  return true;
2974
- }
2975
- else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
3265
+
3266
+ } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
2976
3267
  clip_image_u8 resized_image;
2977
3268
  auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
2978
3269
  image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
@@ -2980,6 +3271,22 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
2980
3271
  normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
2981
3272
  res_imgs->entries.push_back(std::move(img_f32));
2982
3273
  return true;
3274
+
3275
+ } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
3276
+ GGML_ASSERT(!params.image_grid_pinpoints.empty());
3277
+ auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3278
+ std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3279
+
3280
+ for (size_t i = 0; i < imgs.size(); ++i) {
3281
+ clip_image_f32_ptr res(clip_image_f32_init());
3282
+ normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
3283
+ res_imgs->entries.push_back(std::move(res));
3284
+ }
3285
+
3286
+ res_imgs->grid_x = inst.grid_size.width;
3287
+ res_imgs->grid_y = inst.grid_size.height;
3288
+ return true;
3289
+
2983
3290
  }
2984
3291
 
2985
3292
  // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@@ -3098,6 +3405,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
3098
3405
  const auto & params = ctx->vision_model.hparams;
3099
3406
 
3100
3407
  int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
3408
+ int scale_factor = ctx->vision_model.hparams.proj_scale_factor;
3101
3409
 
3102
3410
  if (ctx->proj_type == PROJECTOR_TYPE_LDP
3103
3411
  || ctx->proj_type == PROJECTOR_TYPE_LDPV2
@@ -3136,6 +3444,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
3136
3444
  int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
3137
3445
  int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
3138
3446
  n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
3447
+ } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
3448
+ n_patches /= (scale_factor * scale_factor);
3449
+ } else if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
3450
+ const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
3451
+ const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
3452
+ n_patches = n_len / proj_stack_factor / 2;
3139
3453
  }
3140
3454
 
3141
3455
  return n_patches;
@@ -3247,6 +3561,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3247
3561
  }
3248
3562
 
3249
3563
  // build the inference graph
3564
+ ctx->debug_print_tensors.clear();
3250
3565
  ggml_backend_sched_reset(ctx->sched.get());
3251
3566
  ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
3252
3567
  ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3261,8 +3576,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3261
3576
  const int patch_size = hparams.patch_size;
3262
3577
  const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
3263
3578
  const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
3264
- const int pos_w = ctx->load_image_size.width / patch_size;
3265
- const int pos_h = ctx->load_image_size.height / patch_size;
3579
+ const int pos_w = image_size_width / patch_size;
3580
+ const int pos_h = image_size_height / patch_size;
3266
3581
 
3267
3582
  const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
3268
3583
 
@@ -3292,7 +3607,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3292
3607
  };
3293
3608
 
3294
3609
  // set input pixel values
3295
- {
3610
+ if (!imgs.is_audio) {
3296
3611
  size_t nelem = 0;
3297
3612
  for (const auto & img : imgs.entries) {
3298
3613
  nelem += img->nx * img->ny * 3;
@@ -3329,6 +3644,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3329
3644
  }
3330
3645
  }
3331
3646
  set_input_f32("inp_raw", inp_raw);
3647
+
3648
+ } else {
3649
+ // audio input
3650
+ GGML_ASSERT(imgs.entries.size() == 1);
3651
+ const auto & mel_inp = imgs.entries[0];
3652
+ const int n_step = mel_inp->nx;
3653
+ const int n_mel = mel_inp->ny;
3654
+ std::vector<float> inp_raw(n_step * n_mel);
3655
+ std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
3656
+ set_input_f32("inp_raw", inp_raw);
3332
3657
  }
3333
3658
 
3334
3659
  // set input per projector
@@ -3525,9 +3850,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3525
3850
  case PROJECTOR_TYPE_GEMMA3:
3526
3851
  case PROJECTOR_TYPE_IDEFICS3:
3527
3852
  case PROJECTOR_TYPE_INTERNVL:
3853
+ case PROJECTOR_TYPE_ULTRAVOX:
3528
3854
  {
3529
3855
  // do nothing
3530
3856
  } break;
3857
+ case PROJECTOR_TYPE_LLAMA4:
3858
+ {
3859
+ // set the 2D positions
3860
+ int n_patches_per_col = image_size_width / patch_size;
3861
+ std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
3862
+ // last pos is always kept 0, it's for CLS
3863
+ // dimension H
3864
+ for (int i = 0; i < num_patches; i++) {
3865
+ pos_data[i] = (i / n_patches_per_col) + 1;
3866
+ }
3867
+ set_input_i32("pos_h", pos_data);
3868
+ // dimension W
3869
+ for (int i = 0; i < num_patches; i++) {
3870
+ pos_data[i] = (i % n_patches_per_col) + 1;
3871
+ }
3872
+ set_input_i32("pos_w", pos_data);
3873
+ } break;
3531
3874
  default:
3532
3875
  GGML_ABORT("Unknown projector type");
3533
3876
  }
@@ -3548,6 +3891,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3548
3891
  return false;
3549
3892
  }
3550
3893
 
3894
+ // print debug nodes
3895
+ if (ctx->debug_graph) {
3896
+ LOG_INF("\n\n---\n\n");
3897
+ LOG_INF("\n\nDebug graph:\n\n");
3898
+ for (ggml_tensor * t : ctx->debug_print_tensors) {
3899
+ std::vector<uint8_t> data(ggml_nbytes(t));
3900
+ ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
3901
+ print_tensor_shape(t);
3902
+ print_tensor_data(t, data.data(), 3);
3903
+ }
3904
+ }
3905
+
3551
3906
  // the last node is the embedding tensor
3552
3907
  ggml_tensor * embeddings = ggml_graph_node(gf, -1);
3553
3908
 
@@ -3594,8 +3949,12 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
3594
3949
  return ctx->vision_model.mm_input_proj_w->ne[0];
3595
3950
  case PROJECTOR_TYPE_IDEFICS3:
3596
3951
  return ctx->vision_model.projection->ne[1];
3952
+ case PROJECTOR_TYPE_ULTRAVOX:
3953
+ return ctx->vision_model.mm_2_w->ne[1];
3597
3954
  case PROJECTOR_TYPE_INTERNVL:
3598
3955
  return ctx->vision_model.mm_3_w->ne[1];
3956
+ case PROJECTOR_TYPE_LLAMA4:
3957
+ return ctx->vision_model.mm_model_proj->ne[1];
3599
3958
  default:
3600
3959
  GGML_ABORT("Unknown projector type");
3601
3960
  }
@@ -3624,6 +3983,14 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
3624
3983
  return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
3625
3984
  }
3626
3985
 
3986
+ bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
3987
+ return ctx->vision_model.hparams.has_vision;
3988
+ }
3989
+
3990
+ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
3991
+ return ctx->vision_model.hparams.has_audio;
3992
+ }
3993
+
3627
3994
  bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
3628
3995
  clip_image_f32 clip_img;
3629
3996
  clip_img.buf.resize(h * w * 3);
@@ -3644,3 +4011,14 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
3644
4011
  projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
3645
4012
  return ctx->proj_type;
3646
4013
  }
4014
+
4015
+ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
4016
+ clip_image_f32 * audio = new clip_image_f32;
4017
+ audio->nx = n_frames;
4018
+ audio->ny = n_mel;
4019
+ audio->buf.resize(n_frames * n_mel);
4020
+ std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
4021
+
4022
+ batch->entries.push_back(clip_image_f32_ptr(audio));
4023
+ batch->is_audio = true;
4024
+ }