@fugood/llama.node 0.3.13 → 0.3.14

This diff shows the changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
@@ -4,31 +4,12 @@
  // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
  #include "clip.h"
  #include "ggml.h"
+ #include "ggml-cpp.h"
  #include "ggml-cpu.h"
  #include "ggml-alloc.h"
  #include "ggml-backend.h"
  #include "gguf.h"

- //#ifdef GGML_USE_CUDA
- //#include "ggml-cuda.h"
- //#endif
- //
- //#ifdef GGML_USE_SYCL
- //#include "ggml-sycl.h"
- //#endif
- //
- //#ifdef GGML_USE_METAL
- //#include "ggml-metal.h"
- //#endif
- //
- //#ifdef GGML_USE_CANN
- //#include "ggml-cann.h"
- //#endif
- //
- //#ifdef GGML_USE_VULKAN
- //#include "ggml-vulkan.h"
- //#endif
-
  #define STB_IMAGE_IMPLEMENTATION
  #include "stb_image.h"

@@ -40,6 +21,7 @@
  #include <map>
  #include <regex>
  #include <stdexcept>
+ #include <unordered_set>
  #include <vector>
  #include <sstream>
  #include <cinttypes>
@@ -120,6 +102,7 @@ static std::string format(const char * fmt, ...) {
  #define KEY_IMAGE_MEAN "clip.vision.image_mean"
  #define KEY_IMAGE_STD "clip.vision.image_std"
  #define KEY_PROJ_TYPE "clip.projector_type"
+ #define KEY_FEATURE_LAYER "clip.vision.feature_layer"

  #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
  #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -153,6 +136,8 @@ static std::string format(const char * fmt, ...) {
  #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
  #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
  #define TN_IMAGE_NEWLINE "model.image_newline"
+ #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
+ #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3

  #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
  #define TN_MINICPMV_QUERY "resampler.query"
@@ -179,6 +164,7 @@ enum projector_type {
  PROJECTOR_TYPE_RESAMPLER,
  PROJECTOR_TYPE_GLM_EDGE,
  PROJECTOR_TYPE_MERGER,
+ PROJECTOR_TYPE_GEMMA3,
  PROJECTOR_TYPE_UNKNOWN,
  };

@@ -189,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
  { PROJECTOR_TYPE_RESAMPLER, "resampler"},
  { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
  { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
+ { PROJECTOR_TYPE_GEMMA3, "gemma3"},
  };


@@ -315,7 +302,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
  return kv.first;
  }
  }
- return PROJECTOR_TYPE_UNKNOWN;
+ throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
  }

  #ifdef CLIP_DEBUG_FUNCTIONS
@@ -444,8 +431,9 @@ struct clip_hparams {

  char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)

- int32_t image_grid_pinpoints[32];
+ std::vector<int32_t> image_grid_pinpoints;
  int32_t image_crop_resolution;
+ std::unordered_set<int32_t> vision_feature_layer;
  };

  struct clip_layer {
@@ -571,6 +559,10 @@ struct clip_vision_model {
  struct ggml_tensor * mm_model_ln_kv_b;
  struct ggml_tensor * mm_model_ln_post_w;
  struct ggml_tensor * mm_model_ln_post_b;
+
+ // gemma3
+ struct ggml_tensor * mm_input_proj_w;
+ struct ggml_tensor * mm_soft_emb_norm_w;
  };

  struct clip_ctx {
@@ -585,6 +577,7 @@ struct clip_ctx {
  struct clip_vision_model vision_model;
  projector_type proj_type = PROJECTOR_TYPE_MLP;

+ int32_t max_feature_layer; // unused in newer models like gemma3
  float image_mean[3];
  float image_std[3];
  bool use_gelu = false;
@@ -596,21 +589,209 @@
  bool has_post_norm = false;
  bool has_patch_bias = false;

- struct gguf_context * ctx_gguf;
- struct ggml_context * ctx_data;
+ struct gguf_context * ctx_gguf = nullptr;
+ struct ggml_context * ctx_data = nullptr;

  std::vector<uint8_t> buf_compute_meta;

- // memory buffers to evaluate the model
- ggml_backend_buffer_t params_buffer = NULL;
+ std::vector<ggml_backend_t> backend_ptrs;
+ std::vector<ggml_backend_buffer_type_t> backend_buft;
+
+ ggml_backend_t backend = nullptr;
+ ggml_backend_t backend_cpu = nullptr;
+ ggml_backend_buffer_t buf = nullptr;
+
+ ggml_backend_sched_ptr sched;
+
+ struct clip_image_size * load_image_size = nullptr;
+
+ clip_ctx(clip_context_params & ctx_params) {
+ backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+ backend = ctx_params.use_gpu
+ ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
+ : nullptr;
+
+ if (backend) {
+ LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
+ backend_ptrs.push_back(backend);
+ backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+ } else {
+ backend = backend_cpu;
+ LOG_INF("%s: CLIP using CPU backend\n", __func__);
+ }

- ggml_backend_t backend = NULL;
- ggml_gallocr_t compute_alloc = NULL;
+ backend_ptrs.push_back(backend_cpu);
+ backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));

- struct clip_image_size * load_image_size;
+ sched.reset(
+ ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+ );
+ }
+
+ ~clip_ctx() {
+ ggml_free(ctx_data);
+ gguf_free(ctx_gguf);
+ ggml_backend_buffer_free(buf);
+ ggml_backend_free(backend);
+ if (backend_cpu != backend) {
+ ggml_backend_free(backend_cpu);
+ }
+ }
  };

- static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
+ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+ const auto & model = ctx->vision_model;
+ const auto & hparams = model.hparams;
+
+ const int image_size = hparams.image_size;
+ int image_size_width = image_size;
+ int image_size_height = image_size;
+
+ const int patch_size = hparams.patch_size;
+ const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+ const int hidden_size = hparams.hidden_size;
+ const int n_head = hparams.n_head;
+ const int d_head = hidden_size / n_head;
+ const int n_layer = hparams.n_layer;
+ const float eps = hparams.eps;
+
+ GGML_ASSERT(imgs->size == 1); // batch_size == 1
+
+ struct ggml_init_params params = {
+ /*.mem_size =*/ ctx->buf_compute_meta.size(),
+ /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
+ /*.no_alloc =*/ true,
+ };
+
+ struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ // input raw
+ struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
+ ggml_set_name(inp_raw, "inp_raw");
+ ggml_set_input(inp_raw);
+
+ struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+ inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
+ inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+ inp = ggml_add(ctx0, inp, model.patch_bias);
+
+ // position embeddings
+ struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
+
+ // loop over layers
+ for (int il = 0; il < n_layer; il++) {
+ struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
+
+ // layernorm1
+ {
+ cur = ggml_norm(ctx0, cur, eps);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
+ }
+
+ // self-attention
+ {
+
+ struct ggml_tensor * Q =
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
+
+ Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
+ Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+
+ struct ggml_tensor * K =
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
+
+ K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
+ K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+
+ struct ggml_tensor * V =
+ ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
+
+ V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
+ V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
+ KQ = ggml_soft_max_inplace(ctx0, KQ);
+
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+ KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
+ KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+ cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
+ }
+
+ // attention output
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
+
+ // re-add the layer input, e.g., residual
+ cur = ggml_add(ctx0, cur, embeddings);
+
+ embeddings = cur; // embeddings = residual, cur = hidden_states
+
+ // layernorm2
+ {
+ cur = ggml_norm(ctx0, cur, eps);
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
+ }
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+ cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+
+ // siglip uses gelu
+ cur = ggml_gelu(ctx0, cur);
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
+ cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+
+ // residual 2
+ cur = ggml_add(ctx0, embeddings, cur);
+
+ embeddings = cur;
+ }
+
+ // post-layernorm
+ if (ctx->has_post_norm) {
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ ggml_set_name(embeddings, "post_ln");
+
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
+ }
+
+ if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+ const int batch_size = 1;
+ const int mm_tokens_per_image = 256; // default value for gemma3
+ const int tokens_per_side = sqrt(mm_tokens_per_image);
+ const int patches_per_image = sqrt(num_patches);
+ const int kernel_size = patches_per_image / tokens_per_side;
+
+ embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
+ embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
+
+ // doing a pool2d to reduce the number of output tokens to 256
+ embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
+ embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
+ embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
+
+ // apply norm before projection
+ embeddings = ggml_rms_norm(ctx0, embeddings, eps);
+ embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
+
+ // apply projection
+ embeddings = ggml_mul_mat(ctx0,
+ ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
+ embeddings);
+ }
+
+ // build the graph
+ ggml_build_forward_expand(gf, embeddings);
+
+ ggml_free(ctx0);
+
+ return gf;
+ }
+
+ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
  if (!ctx->has_vision_encoder) {
  LOG_ERR("This gguf file seems to have no vision encoder\n");
  return nullptr;
@@ -651,7 +832,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  const int hidden_size = hparams.hidden_size;
  const int n_head = hparams.n_head;
  const int d_head = hidden_size / n_head;
- int n_layer = hparams.n_layer;
  const float eps = hparams.eps;
  int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

@@ -752,13 +932,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
  }

+ std::vector<struct ggml_tensor *> embedding_stack;
+ const auto & vision_feature_layer = hparams.vision_feature_layer;
+
  // loop over layers
- if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
- n_layer += 1;
- }
- for (int il = 0; il < n_layer - 1; il++) {
+ for (int il = 0; il < ctx->max_feature_layer; il++) {
  struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states

+ // If this is an embedding feature layer, save the output.
+ // NOTE: 0 index here refers to the input to the encoder.
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+ embedding_stack.push_back(embeddings);
+ }
+
  //const size_t nb_q_w = model.layers[il].q_w->nb[0];

  // layernorm1
@@ -846,7 +1032,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  cur = ggml_add(ctx0, embeddings, cur);

  embeddings = cur;
-
  }

  // post-layernorm
@@ -857,6 +1042,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
  }

+ // final layer is a vision feature layer
+ if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
+ embedding_stack.push_back(embeddings);
+ }
+
+ // If feature layers are explicitly set, stack them (if we have multiple)
+ if (!embedding_stack.empty()) {
+ embeddings = embedding_stack[0];
+ for (size_t i = 1; i < embedding_stack.size(); i++) {
+ embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
+ }
+ }
+
  // llava projector
  if (ctx->has_llava_projector) {
  embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1139,7 +1337,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  } else {
  GGML_ABORT("fatel error");
  }
- } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+ }
+ else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
  embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);

  embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1161,8 +1360,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  return gf;
  }

+ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
+ if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+ return clip_image_build_graph_siglip(ctx, imgs);
+ } else {
+ // TODO: we should have one build_* function per model
+ return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
+ }
+ }
+
  // read and create ggml_context containing the tensors and their data
  struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+ return clip_init(fname, clip_context_params{
+ /* use_gpu */ true,
+ /* verbosity */ verbosity,
+ });
+ }
+
+ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
+ int verbosity = ctx_params.verbosity;
  struct ggml_context * meta = NULL;

  struct gguf_init_params params = {
@@ -1256,7 +1472,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  }
  }

- clip_ctx * new_clip = new clip_ctx{};
+ clip_ctx * new_clip = new clip_ctx(ctx_params);

  // update projector type
  {
@@ -1275,36 +1491,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  }
  }

- //#ifdef GGML_USE_CUDA
- // new_clip->backend = ggml_backend_cuda_init(0);
- // LOG_INF("%s: CLIP using CUDA backend\n", __func__);
- //#endif
- //
- //#ifdef GGML_USE_METAL
- // new_clip->backend = ggml_backend_metal_init();
- // LOG_INF("%s: CLIP using Metal backend\n", __func__);
- //#endif
- //
- //#ifdef GGML_USE_CANN
- // new_clip->backend = ggml_backend_cann_init(0);
- // LOG_INF("%s: CLIP using CANN backend\n", __func__);
- //#endif
- //
- //#ifdef GGML_USE_VULKAN
- // new_clip->backend = ggml_backend_vk_init(0);
- // LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
- //#endif
- //
- //#ifdef GGML_USE_SYCL
- // new_clip->backend = ggml_backend_sycl_init(0);
- // LOG_INF("%s: CLIP using SYCL backend\n", __func__);
- //#endif
-
- if (!new_clip->backend) {
- new_clip->backend = ggml_backend_cpu_init();
- LOG_INF("%s: CLIP using CPU backend\n", __func__);
- }
-
  // model size and capabilities
  {
  int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
@@ -1342,8 +1528,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  GGML_ASSERT(new_clip->has_vision_encoder);
  GGML_ASSERT(!new_clip->has_text_encoder);

- idx = get_key_idx(ctx, KEY_USE_GELU);
- new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
+ try {
+ idx = get_key_idx(ctx, KEY_USE_GELU);
+ new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
+ } catch (std::runtime_error & /*e*/) {
+ new_clip->use_gelu = false;
+ }

  try {
  idx = get_key_idx(ctx, KEY_USE_SILU);
@@ -1357,6 +1547,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
  LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
  LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
+ LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
  LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
  LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
  LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
@@ -1399,7 +1590,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  }

  // alloc memory and offload data
- new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+ ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
+ new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
+ ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  for (int i = 0; i < n_tensors; ++i) {
  const char * name = gguf_get_tensor_name(ctx, i);
  struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
@@ -1412,7 +1605,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  return nullptr;
  }
  int num_bytes = ggml_nbytes(cur);
- if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+ if (ggml_backend_buft_is_host(buft)) {
  // for the CPU and Metal backend, we can read directly into the tensor
  fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
  } else {
@@ -1443,14 +1636,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
  int n = gguf_get_arr_n(ctx, idx);
  const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
- for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
- hparams.image_grid_pinpoints[i] = pinpoints[i];
+ for (int i = 0; i < n; ++i) {
+ hparams.image_grid_pinpoints.push_back(pinpoints[i]);
  }
- if (n < 32)
- hparams.image_grid_pinpoints[n] = 0;
- } catch (std::runtime_error & /*e*/) {
- hparams.image_grid_pinpoints[0]=0;
- }
+ } catch (std::runtime_error & /*e*/) { }
+
+ // Load the vision feature layer indices if they are explicitly provided;
+ // if multiple vision feature layers are present, the values will be concatenated
+ // to form the final visual features.
+ // NOTE: gguf conversions should standardize the values of the vision feature layer to
+ // be non-negative, since we use -1 to mark values as unset here.
+ try {
+ int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
+ int n = gguf_get_arr_n(ctx, idx);
+
+ const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
+
+ for (int i = 0; i < n; ++i) {
+ hparams.vision_feature_layer.insert(vision_feature_layer[i]);
+ }
+ } catch (std::runtime_error & /*e*/) { }

  try {
  int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
@@ -1476,6 +1681,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  new_clip->image_std[i] = std_data[i];
  }

+ // Calculate the deepest feature layer based on hparams and projector type
+ new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
+
  if (verbosity >= 2) {
  LOG_INF("\n%s: vision model hparams\n", __func__);
  LOG_INF("image_size %d\n", hparams.image_size);
@@ -1489,8 +1697,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
  LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
  LOG_INF("v_image_grid_pinpoints: ");
- for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
- LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
+ for (const auto & pp : hparams.image_grid_pinpoints) {
+ LOG_INF("%d ", pp);
+ }
+ LOG_INF("\n");
+ LOG_INF("v_vision_feature_layer: ");
+ for (const auto & feature_layer: hparams.vision_feature_layer) {
+ LOG_INF("%d ", feature_layer);
  }
  LOG_INF("\n");
  LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
@@ -1528,11 +1741,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  }

  try {
- vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+ vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+ } catch(const std::exception& /*e*/) {
+ vision_model.patch_embeddings_0 = nullptr;
+ }
+
+ try {
  vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
  } catch(const std::exception& /*e*/) {
- LOG_ERR("%s: failed to load vision model tensors\n", __func__);
+ vision_model.position_embeddings = nullptr;
  }
+
  try {
  vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
  } catch(const std::exception& /*e*/) {
@@ -1643,6 +1862,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
  vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
  }
+ else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
+ vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
+ vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
+ }
  else {
  std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1678,14 +1901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  // measure mem requirement and allocate
  {
  new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
- new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
  clip_image_f32_batch batch;
  batch.size = 1;
  batch.data = nullptr;
  ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
- ggml_gallocr_reserve(new_clip->compute_alloc, gf);
- size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
- LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+ ggml_backend_sched_reserve(new_clip->sched.get(), gf);
+ for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = new_clip->backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
+ size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
+ if (size > 1) {
+ LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+ ggml_backend_buft_name(buft),
+ size / 1024.0 / 1024.0);
+ }
+ }
  }

  return new_clip;
@@ -1729,11 +1959,11 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
  }
  }

- static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
+ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
  img->nx = nx;
  img->ny = ny;
  img->buf.resize(3 * nx * ny);
- memcpy(img->buf.data(), data, img->buf.size());
+ memcpy(img->buf.data(), rgb_pixels, img->buf.size());
  }

  bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
@@ -1743,7 +1973,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
  LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
  return false;
  }
- build_clip_img_from_data(data, nx, ny, img);
+ clip_build_img_from_pixels(data, nx, ny, img);
  stbi_image_free(data);
  return true;
  }
@@ -1755,7 +1985,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
  LOG_ERR("%s: failed to decode image bytes\n", __func__);
  return false;
  }
- build_clip_img_from_data(data, nx, ny, img);
+ clip_build_img_from_pixels(data, nx, ny, img);
  stbi_image_free(data);
  return true;
  }
@@ -2177,7 +2407,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
  return true;
  }

- if (ctx->has_glm_projector) {
+ if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
  res_imgs->size = 1;
  res_imgs->data = new clip_image_f32[res_imgs->size];
  clip_image_u8 resized_image;
@@ -2235,10 +2465,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
  }
  }
  } else {
- if (params.image_grid_pinpoints[0] != 0) {
+ if (!params.image_grid_pinpoints.empty()) {
  // "spatial_unpad" with "anyres" processing for llava-1.6
  std::vector<std::pair<int, int>> possible_resolutions;
- for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+ for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
  possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
  }
  std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2366,12 +2596,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
  }

  void clip_free(clip_ctx * ctx) {
- ggml_free(ctx->ctx_data);
- gguf_free(ctx->ctx_gguf);
-
- ggml_backend_buffer_free(ctx->params_buffer);
- ggml_backend_free(ctx->backend);
- ggml_gallocr_free(ctx->compute_alloc);
  delete ctx;
  }

@@ -2404,7 +2628,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
  }

  const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
- return ctx->vision_model.hparams.image_grid_pinpoints;
+ if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
+ return &ctx->vision_model.hparams.image_grid_pinpoints.front();
+ }
+ return nullptr;
+ }
+
+ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
+ return ctx->vision_model.hparams.image_grid_pinpoints.size();
  }

  int clip_n_patches(const struct clip_ctx * ctx) {
@@ -2560,8 +2791,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  }

  // build the inference graph
+ ggml_backend_sched_reset(ctx->sched.get());
  ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
- ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
+ ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);

  // set inputs
  const auto & model = ctx->vision_model;
@@ -2700,6 +2932,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
  free(positions_data);
  }
+ else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+ // do nothing
+ }
  else {
  struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

@@ -2712,9 +2947,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

  if (!ctx->has_glm_projector) {
  struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+ // The patches vector is used to get rows to index into the embeds with;
+ // we should skip dim 0 only if we have CLS to avoid going out of bounds
+ // when retrieving the rows.
+ int patch_offset = ctx->has_class_embedding ? 1 : 0;
  int* patches_data = (int*)malloc(ggml_nbytes(patches));
  for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i + patch_offset;
  }
  ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
  free(patches_data);
@@ -2722,11 +2961,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  }
  }

- if (ggml_backend_is_cpu(ctx->backend)) {
- ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
- }
+ ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);

- ggml_backend_graph_compute(ctx->backend, gf);
+ auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
+ if (status != GGML_STATUS_SUCCESS) {
+ LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
+ return false;
+ }

  // the last node is the embedding tensor
  struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@@ -2906,6 +3147,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
  if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
  return ctx->vision_model.mm_1_b->ne[0];
  }
+ if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+ return ctx->vision_model.mm_input_proj_w->ne[0];
+ }

  std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
  throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2925,6 +3169,28 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
  return ctx->has_qwen2vl_merger;
  }

+ // Determine the number of encoder layers to iterate over
+ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
+ // Get the index of the second to last layer; this is the
+ // default for models that have a llava projector
+ const auto & hparams = ctx->vision_model.hparams;
+ int n_layer = hparams.n_layer - 1;
+ int deepest_feature_layer = -1;
+
+ // Handle other projectors; incrementing here indicates that we
+ // should use the last encoder layer for the vision features.
+ if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
+ n_layer += 1;
+ }
+
+ // If we set explicit vision feature layers, only go up to the deepest one
+ for (const auto & feature_layer : hparams.vision_feature_layer) {
+ if (feature_layer > deepest_feature_layer) {
+ deepest_feature_layer = feature_layer;
+ }
+ }
+ return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
+ }

  bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
  clip_image_f32 clip_img;