@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
|
@@ -4,31 +4,12 @@
|
|
|
4
4
|
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
|
5
5
|
#include "clip.h"
|
|
6
6
|
#include "ggml.h"
|
|
7
|
+
#include "ggml-cpp.h"
|
|
7
8
|
#include "ggml-cpu.h"
|
|
8
9
|
#include "ggml-alloc.h"
|
|
9
10
|
#include "ggml-backend.h"
|
|
10
11
|
#include "gguf.h"
|
|
11
12
|
|
|
12
|
-
//#ifdef GGML_USE_CUDA
|
|
13
|
-
//#include "ggml-cuda.h"
|
|
14
|
-
//#endif
|
|
15
|
-
//
|
|
16
|
-
//#ifdef GGML_USE_SYCL
|
|
17
|
-
//#include "ggml-sycl.h"
|
|
18
|
-
//#endif
|
|
19
|
-
//
|
|
20
|
-
//#ifdef GGML_USE_METAL
|
|
21
|
-
//#include "ggml-metal.h"
|
|
22
|
-
//#endif
|
|
23
|
-
//
|
|
24
|
-
//#ifdef GGML_USE_CANN
|
|
25
|
-
//#include "ggml-cann.h"
|
|
26
|
-
//#endif
|
|
27
|
-
//
|
|
28
|
-
//#ifdef GGML_USE_VULKAN
|
|
29
|
-
//#include "ggml-vulkan.h"
|
|
30
|
-
//#endif
|
|
31
|
-
|
|
32
13
|
#define STB_IMAGE_IMPLEMENTATION
|
|
33
14
|
#include "stb_image.h"
|
|
34
15
|
|
|
@@ -40,6 +21,7 @@
|
|
|
40
21
|
#include <map>
|
|
41
22
|
#include <regex>
|
|
42
23
|
#include <stdexcept>
|
|
24
|
+
#include <unordered_set>
|
|
43
25
|
#include <vector>
|
|
44
26
|
#include <sstream>
|
|
45
27
|
#include <cinttypes>
|
|
@@ -120,6 +102,7 @@ static std::string format(const char * fmt, ...) {
|
|
|
120
102
|
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
|
121
103
|
#define KEY_IMAGE_STD "clip.vision.image_std"
|
|
122
104
|
#define KEY_PROJ_TYPE "clip.projector_type"
|
|
105
|
+
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
|
123
106
|
|
|
124
107
|
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
|
125
108
|
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
|
@@ -153,6 +136,8 @@ static std::string format(const char * fmt, ...) {
|
|
|
153
136
|
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
|
154
137
|
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
|
155
138
|
#define TN_IMAGE_NEWLINE "model.image_newline"
|
|
139
|
+
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
|
140
|
+
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
|
156
141
|
|
|
157
142
|
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
|
158
143
|
#define TN_MINICPMV_QUERY "resampler.query"
|
|
@@ -179,6 +164,7 @@ enum projector_type {
|
|
|
179
164
|
PROJECTOR_TYPE_RESAMPLER,
|
|
180
165
|
PROJECTOR_TYPE_GLM_EDGE,
|
|
181
166
|
PROJECTOR_TYPE_MERGER,
|
|
167
|
+
PROJECTOR_TYPE_GEMMA3,
|
|
182
168
|
PROJECTOR_TYPE_UNKNOWN,
|
|
183
169
|
};
|
|
184
170
|
|
|
@@ -189,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|
|
189
175
|
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
|
190
176
|
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
|
|
191
177
|
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
|
|
178
|
+
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
|
192
179
|
};
|
|
193
180
|
|
|
194
181
|
|
|
@@ -315,7 +302,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
|
|
315
302
|
return kv.first;
|
|
316
303
|
}
|
|
317
304
|
}
|
|
318
|
-
|
|
305
|
+
throw std::runtime_error(format("Unknown projector type: %s", name.c_str()));
|
|
319
306
|
}
|
|
320
307
|
|
|
321
308
|
#ifdef CLIP_DEBUG_FUNCTIONS
|
|
@@ -444,8 +431,9 @@ struct clip_hparams {
|
|
|
444
431
|
|
|
445
432
|
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
|
|
446
433
|
|
|
447
|
-
int32_t image_grid_pinpoints
|
|
434
|
+
std::vector<int32_t> image_grid_pinpoints;
|
|
448
435
|
int32_t image_crop_resolution;
|
|
436
|
+
std::unordered_set<int32_t> vision_feature_layer;
|
|
449
437
|
};
|
|
450
438
|
|
|
451
439
|
struct clip_layer {
|
|
@@ -571,6 +559,10 @@ struct clip_vision_model {
|
|
|
571
559
|
struct ggml_tensor * mm_model_ln_kv_b;
|
|
572
560
|
struct ggml_tensor * mm_model_ln_post_w;
|
|
573
561
|
struct ggml_tensor * mm_model_ln_post_b;
|
|
562
|
+
|
|
563
|
+
// gemma3
|
|
564
|
+
struct ggml_tensor * mm_input_proj_w;
|
|
565
|
+
struct ggml_tensor * mm_soft_emb_norm_w;
|
|
574
566
|
};
|
|
575
567
|
|
|
576
568
|
struct clip_ctx {
|
|
@@ -585,6 +577,7 @@ struct clip_ctx {
|
|
|
585
577
|
struct clip_vision_model vision_model;
|
|
586
578
|
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
|
587
579
|
|
|
580
|
+
int32_t max_feature_layer; // unused in newer models like gemma3
|
|
588
581
|
float image_mean[3];
|
|
589
582
|
float image_std[3];
|
|
590
583
|
bool use_gelu = false;
|
|
@@ -596,21 +589,209 @@ struct clip_ctx {
|
|
|
596
589
|
bool has_post_norm = false;
|
|
597
590
|
bool has_patch_bias = false;
|
|
598
591
|
|
|
599
|
-
struct gguf_context * ctx_gguf;
|
|
600
|
-
struct ggml_context * ctx_data;
|
|
592
|
+
struct gguf_context * ctx_gguf = nullptr;
|
|
593
|
+
struct ggml_context * ctx_data = nullptr;
|
|
601
594
|
|
|
602
595
|
std::vector<uint8_t> buf_compute_meta;
|
|
603
596
|
|
|
604
|
-
|
|
605
|
-
|
|
597
|
+
std::vector<ggml_backend_t> backend_ptrs;
|
|
598
|
+
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
|
599
|
+
|
|
600
|
+
ggml_backend_t backend = nullptr;
|
|
601
|
+
ggml_backend_t backend_cpu = nullptr;
|
|
602
|
+
ggml_backend_buffer_t buf = nullptr;
|
|
603
|
+
|
|
604
|
+
ggml_backend_sched_ptr sched;
|
|
605
|
+
|
|
606
|
+
struct clip_image_size * load_image_size = nullptr;
|
|
607
|
+
|
|
608
|
+
clip_ctx(clip_context_params & ctx_params) {
|
|
609
|
+
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
|
610
|
+
backend = ctx_params.use_gpu
|
|
611
|
+
? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
|
|
612
|
+
: nullptr;
|
|
613
|
+
|
|
614
|
+
if (backend) {
|
|
615
|
+
LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
|
|
616
|
+
backend_ptrs.push_back(backend);
|
|
617
|
+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
|
618
|
+
} else {
|
|
619
|
+
backend = backend_cpu;
|
|
620
|
+
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
|
621
|
+
}
|
|
606
622
|
|
|
607
|
-
|
|
608
|
-
|
|
623
|
+
backend_ptrs.push_back(backend_cpu);
|
|
624
|
+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
|
|
609
625
|
|
|
610
|
-
|
|
626
|
+
sched.reset(
|
|
627
|
+
ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
|
|
628
|
+
);
|
|
629
|
+
}
|
|
630
|
+
|
|
631
|
+
~clip_ctx() {
|
|
632
|
+
ggml_free(ctx_data);
|
|
633
|
+
gguf_free(ctx_gguf);
|
|
634
|
+
ggml_backend_buffer_free(buf);
|
|
635
|
+
ggml_backend_free(backend);
|
|
636
|
+
if (backend_cpu != backend) {
|
|
637
|
+
ggml_backend_free(backend_cpu);
|
|
638
|
+
}
|
|
639
|
+
}
|
|
611
640
|
};
|
|
612
641
|
|
|
613
|
-
static ggml_cgraph *
|
|
642
|
+
static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
|
643
|
+
const auto & model = ctx->vision_model;
|
|
644
|
+
const auto & hparams = model.hparams;
|
|
645
|
+
|
|
646
|
+
const int image_size = hparams.image_size;
|
|
647
|
+
int image_size_width = image_size;
|
|
648
|
+
int image_size_height = image_size;
|
|
649
|
+
|
|
650
|
+
const int patch_size = hparams.patch_size;
|
|
651
|
+
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
|
652
|
+
const int hidden_size = hparams.hidden_size;
|
|
653
|
+
const int n_head = hparams.n_head;
|
|
654
|
+
const int d_head = hidden_size / n_head;
|
|
655
|
+
const int n_layer = hparams.n_layer;
|
|
656
|
+
const float eps = hparams.eps;
|
|
657
|
+
|
|
658
|
+
GGML_ASSERT(imgs->size == 1); // batch_size == 1
|
|
659
|
+
|
|
660
|
+
struct ggml_init_params params = {
|
|
661
|
+
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
|
662
|
+
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
|
663
|
+
/*.no_alloc =*/ true,
|
|
664
|
+
};
|
|
665
|
+
|
|
666
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
|
667
|
+
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
668
|
+
|
|
669
|
+
// input raw
|
|
670
|
+
struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3);
|
|
671
|
+
ggml_set_name(inp_raw, "inp_raw");
|
|
672
|
+
ggml_set_input(inp_raw);
|
|
673
|
+
|
|
674
|
+
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
|
675
|
+
inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
|
|
676
|
+
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
|
677
|
+
inp = ggml_add(ctx0, inp, model.patch_bias);
|
|
678
|
+
|
|
679
|
+
// position embeddings
|
|
680
|
+
struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings);
|
|
681
|
+
|
|
682
|
+
// loop over layers
|
|
683
|
+
for (int il = 0; il < n_layer; il++) {
|
|
684
|
+
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
|
685
|
+
|
|
686
|
+
// layernorm1
|
|
687
|
+
{
|
|
688
|
+
cur = ggml_norm(ctx0, cur, eps);
|
|
689
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b);
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
// self-attention
|
|
693
|
+
{
|
|
694
|
+
|
|
695
|
+
struct ggml_tensor * Q =
|
|
696
|
+
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
|
|
697
|
+
|
|
698
|
+
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches);
|
|
699
|
+
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
|
700
|
+
|
|
701
|
+
struct ggml_tensor * K =
|
|
702
|
+
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
|
|
703
|
+
|
|
704
|
+
K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches);
|
|
705
|
+
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
|
706
|
+
|
|
707
|
+
struct ggml_tensor * V =
|
|
708
|
+
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
|
|
709
|
+
|
|
710
|
+
V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches);
|
|
711
|
+
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
|
712
|
+
|
|
713
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
714
|
+
KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
|
|
715
|
+
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
|
716
|
+
|
|
717
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
|
718
|
+
KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head);
|
|
719
|
+
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
|
720
|
+
|
|
721
|
+
cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches);
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
// attention output
|
|
725
|
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
|
|
726
|
+
|
|
727
|
+
// re-add the layer input, e.g., residual
|
|
728
|
+
cur = ggml_add(ctx0, cur, embeddings);
|
|
729
|
+
|
|
730
|
+
embeddings = cur; // embeddings = residual, cur = hidden_states
|
|
731
|
+
|
|
732
|
+
// layernorm2
|
|
733
|
+
{
|
|
734
|
+
cur = ggml_norm(ctx0, cur, eps);
|
|
735
|
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
|
|
736
|
+
}
|
|
737
|
+
|
|
738
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
|
|
739
|
+
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
|
|
740
|
+
|
|
741
|
+
// siglip uses gelu
|
|
742
|
+
cur = ggml_gelu(ctx0, cur);
|
|
743
|
+
|
|
744
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
|
745
|
+
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
|
|
746
|
+
|
|
747
|
+
// residual 2
|
|
748
|
+
cur = ggml_add(ctx0, embeddings, cur);
|
|
749
|
+
|
|
750
|
+
embeddings = cur;
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
// post-layernorm
|
|
754
|
+
if (ctx->has_post_norm) {
|
|
755
|
+
embeddings = ggml_norm(ctx0, embeddings, eps);
|
|
756
|
+
ggml_set_name(embeddings, "post_ln");
|
|
757
|
+
|
|
758
|
+
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
762
|
+
const int batch_size = 1;
|
|
763
|
+
const int mm_tokens_per_image = 256; // default value for gemma3
|
|
764
|
+
const int tokens_per_side = sqrt(mm_tokens_per_image);
|
|
765
|
+
const int patches_per_image = sqrt(num_patches);
|
|
766
|
+
const int kernel_size = patches_per_image / tokens_per_side;
|
|
767
|
+
|
|
768
|
+
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
|
|
769
|
+
embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size);
|
|
770
|
+
|
|
771
|
+
// doing a pool2d to reduce the number of output tokens to 256
|
|
772
|
+
embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
|
|
773
|
+
embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size);
|
|
774
|
+
embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings));
|
|
775
|
+
|
|
776
|
+
// apply norm before projection
|
|
777
|
+
embeddings = ggml_rms_norm(ctx0, embeddings, eps);
|
|
778
|
+
embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w);
|
|
779
|
+
|
|
780
|
+
// apply projection
|
|
781
|
+
embeddings = ggml_mul_mat(ctx0,
|
|
782
|
+
ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
|
|
783
|
+
embeddings);
|
|
784
|
+
}
|
|
785
|
+
|
|
786
|
+
// build the graph
|
|
787
|
+
ggml_build_forward_expand(gf, embeddings);
|
|
788
|
+
|
|
789
|
+
ggml_free(ctx0);
|
|
790
|
+
|
|
791
|
+
return gf;
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
|
614
795
|
if (!ctx->has_vision_encoder) {
|
|
615
796
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
|
616
797
|
return nullptr;
|
|
@@ -651,7 +832,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
651
832
|
const int hidden_size = hparams.hidden_size;
|
|
652
833
|
const int n_head = hparams.n_head;
|
|
653
834
|
const int d_head = hidden_size / n_head;
|
|
654
|
-
int n_layer = hparams.n_layer;
|
|
655
835
|
const float eps = hparams.eps;
|
|
656
836
|
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
|
657
837
|
|
|
@@ -752,13 +932,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
752
932
|
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
|
753
933
|
}
|
|
754
934
|
|
|
935
|
+
std::vector<struct ggml_tensor *> embedding_stack;
|
|
936
|
+
const auto & vision_feature_layer = hparams.vision_feature_layer;
|
|
937
|
+
|
|
755
938
|
// loop over layers
|
|
756
|
-
|
|
757
|
-
n_layer += 1;
|
|
758
|
-
}
|
|
759
|
-
for (int il = 0; il < n_layer - 1; il++) {
|
|
939
|
+
for (int il = 0; il < ctx->max_feature_layer; il++) {
|
|
760
940
|
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
|
|
761
941
|
|
|
942
|
+
// If this is an embedding feature layer, save the output.
|
|
943
|
+
// NOTE: 0 index here refers to the input to the encoder.
|
|
944
|
+
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
|
|
945
|
+
embedding_stack.push_back(embeddings);
|
|
946
|
+
}
|
|
947
|
+
|
|
762
948
|
//const size_t nb_q_w = model.layers[il].q_w->nb[0];
|
|
763
949
|
|
|
764
950
|
// layernorm1
|
|
@@ -846,7 +1032,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
846
1032
|
cur = ggml_add(ctx0, embeddings, cur);
|
|
847
1033
|
|
|
848
1034
|
embeddings = cur;
|
|
849
|
-
|
|
850
1035
|
}
|
|
851
1036
|
|
|
852
1037
|
// post-layernorm
|
|
@@ -857,6 +1042,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
857
1042
|
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
|
|
858
1043
|
}
|
|
859
1044
|
|
|
1045
|
+
// final layer is a vision feature layer
|
|
1046
|
+
if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
|
|
1047
|
+
embedding_stack.push_back(embeddings);
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
// If feature layers are explicitly set, stack them (if we have multiple)
|
|
1051
|
+
if (!embedding_stack.empty()) {
|
|
1052
|
+
embeddings = embedding_stack[0];
|
|
1053
|
+
for (size_t i = 1; i < embedding_stack.size(); i++) {
|
|
1054
|
+
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
|
|
1055
|
+
}
|
|
1056
|
+
}
|
|
1057
|
+
|
|
860
1058
|
// llava projector
|
|
861
1059
|
if (ctx->has_llava_projector) {
|
|
862
1060
|
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
|
@@ -1139,7 +1337,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
1139
1337
|
} else {
|
|
1140
1338
|
GGML_ABORT("fatel error");
|
|
1141
1339
|
}
|
|
1142
|
-
}
|
|
1340
|
+
}
|
|
1341
|
+
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
|
1143
1342
|
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
|
|
1144
1343
|
|
|
1145
1344
|
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
|
@@ -1161,8 +1360,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|
|
1161
1360
|
return gf;
|
|
1162
1361
|
}
|
|
1163
1362
|
|
|
1363
|
+
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
|
1364
|
+
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
1365
|
+
return clip_image_build_graph_siglip(ctx, imgs);
|
|
1366
|
+
} else {
|
|
1367
|
+
// TODO: we should have one build_* function per model
|
|
1368
|
+
return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf);
|
|
1369
|
+
}
|
|
1370
|
+
}
|
|
1371
|
+
|
|
1164
1372
|
// read and create ggml_context containing the tensors and their data
|
|
1165
1373
|
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
1374
|
+
return clip_init(fname, clip_context_params{
|
|
1375
|
+
/* use_gpu */ true,
|
|
1376
|
+
/* verbosity */ verbosity,
|
|
1377
|
+
});
|
|
1378
|
+
}
|
|
1379
|
+
|
|
1380
|
+
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
|
|
1381
|
+
int verbosity = ctx_params.verbosity;
|
|
1166
1382
|
struct ggml_context * meta = NULL;
|
|
1167
1383
|
|
|
1168
1384
|
struct gguf_init_params params = {
|
|
@@ -1256,7 +1472,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1256
1472
|
}
|
|
1257
1473
|
}
|
|
1258
1474
|
|
|
1259
|
-
clip_ctx * new_clip = new clip_ctx
|
|
1475
|
+
clip_ctx * new_clip = new clip_ctx(ctx_params);
|
|
1260
1476
|
|
|
1261
1477
|
// update projector type
|
|
1262
1478
|
{
|
|
@@ -1275,36 +1491,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1275
1491
|
}
|
|
1276
1492
|
}
|
|
1277
1493
|
|
|
1278
|
-
//#ifdef GGML_USE_CUDA
|
|
1279
|
-
// new_clip->backend = ggml_backend_cuda_init(0);
|
|
1280
|
-
// LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
|
1281
|
-
//#endif
|
|
1282
|
-
//
|
|
1283
|
-
//#ifdef GGML_USE_METAL
|
|
1284
|
-
// new_clip->backend = ggml_backend_metal_init();
|
|
1285
|
-
// LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
|
1286
|
-
//#endif
|
|
1287
|
-
//
|
|
1288
|
-
//#ifdef GGML_USE_CANN
|
|
1289
|
-
// new_clip->backend = ggml_backend_cann_init(0);
|
|
1290
|
-
// LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
|
1291
|
-
//#endif
|
|
1292
|
-
//
|
|
1293
|
-
//#ifdef GGML_USE_VULKAN
|
|
1294
|
-
// new_clip->backend = ggml_backend_vk_init(0);
|
|
1295
|
-
// LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
|
1296
|
-
//#endif
|
|
1297
|
-
//
|
|
1298
|
-
//#ifdef GGML_USE_SYCL
|
|
1299
|
-
// new_clip->backend = ggml_backend_sycl_init(0);
|
|
1300
|
-
// LOG_INF("%s: CLIP using SYCL backend\n", __func__);
|
|
1301
|
-
//#endif
|
|
1302
|
-
|
|
1303
|
-
if (!new_clip->backend) {
|
|
1304
|
-
new_clip->backend = ggml_backend_cpu_init();
|
|
1305
|
-
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
|
1306
|
-
}
|
|
1307
|
-
|
|
1308
1494
|
// model size and capabilities
|
|
1309
1495
|
{
|
|
1310
1496
|
int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
|
|
@@ -1342,8 +1528,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1342
1528
|
GGML_ASSERT(new_clip->has_vision_encoder);
|
|
1343
1529
|
GGML_ASSERT(!new_clip->has_text_encoder);
|
|
1344
1530
|
|
|
1345
|
-
|
|
1346
|
-
|
|
1531
|
+
try {
|
|
1532
|
+
idx = get_key_idx(ctx, KEY_USE_GELU);
|
|
1533
|
+
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
|
1534
|
+
} catch (std::runtime_error & /*e*/) {
|
|
1535
|
+
new_clip->use_gelu = false;
|
|
1536
|
+
}
|
|
1347
1537
|
|
|
1348
1538
|
try {
|
|
1349
1539
|
idx = get_key_idx(ctx, KEY_USE_SILU);
|
|
@@ -1357,6 +1547,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1357
1547
|
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
|
1358
1548
|
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
|
1359
1549
|
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
|
1550
|
+
LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version);
|
|
1360
1551
|
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
|
|
1361
1552
|
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
|
1362
1553
|
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
|
@@ -1399,7 +1590,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1399
1590
|
}
|
|
1400
1591
|
|
|
1401
1592
|
// alloc memory and offload data
|
|
1402
|
-
|
|
1593
|
+
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend);
|
|
1594
|
+
new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft);
|
|
1595
|
+
ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
|
1403
1596
|
for (int i = 0; i < n_tensors; ++i) {
|
|
1404
1597
|
const char * name = gguf_get_tensor_name(ctx, i);
|
|
1405
1598
|
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
|
@@ -1412,7 +1605,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1412
1605
|
return nullptr;
|
|
1413
1606
|
}
|
|
1414
1607
|
int num_bytes = ggml_nbytes(cur);
|
|
1415
|
-
if (
|
|
1608
|
+
if (ggml_backend_buft_is_host(buft)) {
|
|
1416
1609
|
// for the CPU and Metal backend, we can read directly into the tensor
|
|
1417
1610
|
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
|
1418
1611
|
} else {
|
|
@@ -1443,14 +1636,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1443
1636
|
int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
|
|
1444
1637
|
int n = gguf_get_arr_n(ctx, idx);
|
|
1445
1638
|
const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
|
|
1446
|
-
for (int i = 0; i <
|
|
1447
|
-
hparams.image_grid_pinpoints
|
|
1639
|
+
for (int i = 0; i < n; ++i) {
|
|
1640
|
+
hparams.image_grid_pinpoints.push_back(pinpoints[i]);
|
|
1448
1641
|
}
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1642
|
+
} catch (std::runtime_error & /*e*/) { }
|
|
1643
|
+
|
|
1644
|
+
// Load the vision feature layer indices if they are explicitly provided;
|
|
1645
|
+
// if multiple vision feature layers are present, the values will be concatenated
|
|
1646
|
+
// to form the final visual features.
|
|
1647
|
+
// NOTE: gguf conversions should standardize the values of the vision feature layer to
|
|
1648
|
+
// be non-negative, since we use -1 to mark values as unset here.
|
|
1649
|
+
try {
|
|
1650
|
+
int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
|
|
1651
|
+
int n = gguf_get_arr_n(ctx, idx);
|
|
1652
|
+
|
|
1653
|
+
const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
|
|
1654
|
+
|
|
1655
|
+
for (int i = 0; i < n; ++i) {
|
|
1656
|
+
hparams.vision_feature_layer.insert(vision_feature_layer[i]);
|
|
1657
|
+
}
|
|
1658
|
+
} catch (std::runtime_error & /*e*/) { }
|
|
1454
1659
|
|
|
1455
1660
|
try {
|
|
1456
1661
|
int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
|
|
@@ -1476,6 +1681,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1476
1681
|
new_clip->image_std[i] = std_data[i];
|
|
1477
1682
|
}
|
|
1478
1683
|
|
|
1684
|
+
// Calculate the deepest feature layer based on hparams and projector type
|
|
1685
|
+
new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
|
|
1686
|
+
|
|
1479
1687
|
if (verbosity >= 2) {
|
|
1480
1688
|
LOG_INF("\n%s: vision model hparams\n", __func__);
|
|
1481
1689
|
LOG_INF("image_size %d\n", hparams.image_size);
|
|
@@ -1489,8 +1697,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1489
1697
|
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
|
1490
1698
|
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
|
1491
1699
|
LOG_INF("v_image_grid_pinpoints: ");
|
|
1492
|
-
for (
|
|
1493
|
-
LOG_INF("%d ",
|
|
1700
|
+
for (const auto & pp : hparams.image_grid_pinpoints) {
|
|
1701
|
+
LOG_INF("%d ", pp);
|
|
1702
|
+
}
|
|
1703
|
+
LOG_INF("\n");
|
|
1704
|
+
LOG_INF("v_vision_feature_layer: ");
|
|
1705
|
+
for (const auto & feature_layer: hparams.vision_feature_layer) {
|
|
1706
|
+
LOG_INF("%d ", feature_layer);
|
|
1494
1707
|
}
|
|
1495
1708
|
LOG_INF("\n");
|
|
1496
1709
|
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
|
@@ -1528,11 +1741,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1528
1741
|
}
|
|
1529
1742
|
|
|
1530
1743
|
try {
|
|
1531
|
-
vision_model.patch_embeddings_0
|
|
1744
|
+
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
|
1745
|
+
} catch(const std::exception& /*e*/) {
|
|
1746
|
+
vision_model.patch_embeddings_0 = nullptr;
|
|
1747
|
+
}
|
|
1748
|
+
|
|
1749
|
+
try {
|
|
1532
1750
|
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
|
1533
1751
|
} catch(const std::exception& /*e*/) {
|
|
1534
|
-
|
|
1752
|
+
vision_model.position_embeddings = nullptr;
|
|
1535
1753
|
}
|
|
1754
|
+
|
|
1536
1755
|
try {
|
|
1537
1756
|
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
|
|
1538
1757
|
} catch(const std::exception& /*e*/) {
|
|
@@ -1643,6 +1862,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1643
1862
|
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
|
1644
1863
|
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
|
1645
1864
|
}
|
|
1865
|
+
else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
1866
|
+
vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ);
|
|
1867
|
+
vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N);
|
|
1868
|
+
}
|
|
1646
1869
|
else {
|
|
1647
1870
|
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
|
1648
1871
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
|
@@ -1678,14 +1901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|
|
1678
1901
|
// measure mem requirement and allocate
|
|
1679
1902
|
{
|
|
1680
1903
|
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
|
1681
|
-
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
|
1682
1904
|
clip_image_f32_batch batch;
|
|
1683
1905
|
batch.size = 1;
|
|
1684
1906
|
batch.data = nullptr;
|
|
1685
1907
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
|
1686
|
-
|
|
1687
|
-
size_t
|
|
1688
|
-
|
|
1908
|
+
ggml_backend_sched_reserve(new_clip->sched.get(), gf);
|
|
1909
|
+
for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) {
|
|
1910
|
+
ggml_backend_t backend = new_clip->backend_ptrs[i];
|
|
1911
|
+
ggml_backend_buffer_type_t buft = new_clip->backend_buft[i];
|
|
1912
|
+
size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend);
|
|
1913
|
+
if (size > 1) {
|
|
1914
|
+
LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
|
1915
|
+
ggml_backend_buft_name(buft),
|
|
1916
|
+
size / 1024.0 / 1024.0);
|
|
1917
|
+
}
|
|
1918
|
+
}
|
|
1689
1919
|
}
|
|
1690
1920
|
|
|
1691
1921
|
return new_clip;
|
|
@@ -1729,11 +1959,11 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
|
|
|
1729
1959
|
}
|
|
1730
1960
|
}
|
|
1731
1961
|
|
|
1732
|
-
|
|
1962
|
+
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
|
|
1733
1963
|
img->nx = nx;
|
|
1734
1964
|
img->ny = ny;
|
|
1735
1965
|
img->buf.resize(3 * nx * ny);
|
|
1736
|
-
memcpy(img->buf.data(),
|
|
1966
|
+
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
|
|
1737
1967
|
}
|
|
1738
1968
|
|
|
1739
1969
|
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|
@@ -1743,7 +1973,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|
|
1743
1973
|
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
|
1744
1974
|
return false;
|
|
1745
1975
|
}
|
|
1746
|
-
|
|
1976
|
+
clip_build_img_from_pixels(data, nx, ny, img);
|
|
1747
1977
|
stbi_image_free(data);
|
|
1748
1978
|
return true;
|
|
1749
1979
|
}
|
|
@@ -1755,7 +1985,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|
|
1755
1985
|
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
|
1756
1986
|
return false;
|
|
1757
1987
|
}
|
|
1758
|
-
|
|
1988
|
+
clip_build_img_from_pixels(data, nx, ny, img);
|
|
1759
1989
|
stbi_image_free(data);
|
|
1760
1990
|
return true;
|
|
1761
1991
|
}
|
|
@@ -2177,7 +2407,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|
|
2177
2407
|
return true;
|
|
2178
2408
|
}
|
|
2179
2409
|
|
|
2180
|
-
if (ctx->has_glm_projector) {
|
|
2410
|
+
if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
2181
2411
|
res_imgs->size = 1;
|
|
2182
2412
|
res_imgs->data = new clip_image_f32[res_imgs->size];
|
|
2183
2413
|
clip_image_u8 resized_image;
|
|
@@ -2235,10 +2465,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|
|
2235
2465
|
}
|
|
2236
2466
|
}
|
|
2237
2467
|
} else {
|
|
2238
|
-
if (params.image_grid_pinpoints
|
|
2468
|
+
if (!params.image_grid_pinpoints.empty()) {
|
|
2239
2469
|
// "spatial_unpad" with "anyres" processing for llava-1.6
|
|
2240
2470
|
std::vector<std::pair<int, int>> possible_resolutions;
|
|
2241
|
-
for (
|
|
2471
|
+
for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
|
|
2242
2472
|
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
|
|
2243
2473
|
}
|
|
2244
2474
|
std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
|
|
@@ -2366,12 +2596,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
|
|
2366
2596
|
}
|
|
2367
2597
|
|
|
2368
2598
|
void clip_free(clip_ctx * ctx) {
|
|
2369
|
-
ggml_free(ctx->ctx_data);
|
|
2370
|
-
gguf_free(ctx->ctx_gguf);
|
|
2371
|
-
|
|
2372
|
-
ggml_backend_buffer_free(ctx->params_buffer);
|
|
2373
|
-
ggml_backend_free(ctx->backend);
|
|
2374
|
-
ggml_gallocr_free(ctx->compute_alloc);
|
|
2375
2599
|
delete ctx;
|
|
2376
2600
|
}
|
|
2377
2601
|
|
|
@@ -2404,7 +2628,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
|
|
|
2404
2628
|
}
|
|
2405
2629
|
|
|
2406
2630
|
const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
|
|
2407
|
-
|
|
2631
|
+
if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
|
|
2632
|
+
return &ctx->vision_model.hparams.image_grid_pinpoints.front();
|
|
2633
|
+
}
|
|
2634
|
+
return nullptr;
|
|
2635
|
+
}
|
|
2636
|
+
|
|
2637
|
+
size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
|
|
2638
|
+
return ctx->vision_model.hparams.image_grid_pinpoints.size();
|
|
2408
2639
|
}
|
|
2409
2640
|
|
|
2410
2641
|
int clip_n_patches(const struct clip_ctx * ctx) {
|
|
@@ -2560,8 +2791,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
2560
2791
|
}
|
|
2561
2792
|
|
|
2562
2793
|
// build the inference graph
|
|
2794
|
+
ggml_backend_sched_reset(ctx->sched.get());
|
|
2563
2795
|
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
|
2564
|
-
|
|
2796
|
+
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
|
|
2565
2797
|
|
|
2566
2798
|
// set inputs
|
|
2567
2799
|
const auto & model = ctx->vision_model;
|
|
@@ -2700,6 +2932,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
2700
2932
|
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
|
2701
2933
|
free(positions_data);
|
|
2702
2934
|
}
|
|
2935
|
+
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
2936
|
+
// do nothing
|
|
2937
|
+
}
|
|
2703
2938
|
else {
|
|
2704
2939
|
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
|
2705
2940
|
|
|
@@ -2712,9 +2947,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
2712
2947
|
|
|
2713
2948
|
if (!ctx->has_glm_projector) {
|
|
2714
2949
|
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
|
2950
|
+
// The patches vector is used to get rows to index into the embeds with;
|
|
2951
|
+
// we should skip dim 0 only if we have CLS to avoid going out of bounds
|
|
2952
|
+
// when retrieving the rows.
|
|
2953
|
+
int patch_offset = ctx->has_class_embedding ? 1 : 0;
|
|
2715
2954
|
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
|
2716
2955
|
for (int i = 0; i < num_patches; i++) {
|
|
2717
|
-
patches_data[i] = i +
|
|
2956
|
+
patches_data[i] = i + patch_offset;
|
|
2718
2957
|
}
|
|
2719
2958
|
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
|
2720
2959
|
free(patches_data);
|
|
@@ -2722,11 +2961,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|
|
2722
2961
|
}
|
|
2723
2962
|
}
|
|
2724
2963
|
|
|
2725
|
-
|
|
2726
|
-
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
|
2727
|
-
}
|
|
2964
|
+
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
|
|
2728
2965
|
|
|
2729
|
-
|
|
2966
|
+
auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
|
|
2967
|
+
if (status != GGML_STATUS_SUCCESS) {
|
|
2968
|
+
LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
|
|
2969
|
+
return false;
|
|
2970
|
+
}
|
|
2730
2971
|
|
|
2731
2972
|
// the last node is the embedding tensor
|
|
2732
2973
|
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
|
@@ -2906,6 +3147,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|
|
2906
3147
|
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
|
2907
3148
|
return ctx->vision_model.mm_1_b->ne[0];
|
|
2908
3149
|
}
|
|
3150
|
+
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
|
3151
|
+
return ctx->vision_model.mm_input_proj_w->ne[0];
|
|
3152
|
+
}
|
|
2909
3153
|
|
|
2910
3154
|
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
|
2911
3155
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
|
@@ -2925,6 +3169,28 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
|
|
2925
3169
|
return ctx->has_qwen2vl_merger;
|
|
2926
3170
|
}
|
|
2927
3171
|
|
|
3172
|
+
// Determine the number of encoder layers to iterate over
|
|
3173
|
+
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
|
|
3174
|
+
// Get the index of the second to last layer; this is the
|
|
3175
|
+
// default for models that have a llava projector
|
|
3176
|
+
const auto & hparams = ctx->vision_model.hparams;
|
|
3177
|
+
int n_layer = hparams.n_layer - 1;
|
|
3178
|
+
int deepest_feature_layer = -1;
|
|
3179
|
+
|
|
3180
|
+
// Handle other projectors; incrementing here indicates that we
|
|
3181
|
+
// should use the last encoder layer for the vision features.
|
|
3182
|
+
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
|
|
3183
|
+
n_layer += 1;
|
|
3184
|
+
}
|
|
3185
|
+
|
|
3186
|
+
// If we set explicit vision feature layers, only go up to the deepest one
|
|
3187
|
+
for (const auto & feature_layer : hparams.vision_feature_layer) {
|
|
3188
|
+
if (feature_layer > deepest_feature_layer) {
|
|
3189
|
+
deepest_feature_layer = feature_layer;
|
|
3190
|
+
}
|
|
3191
|
+
}
|
|
3192
|
+
return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
|
|
3193
|
+
}
|
|
2928
3194
|
|
|
2929
3195
|
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
|
2930
3196
|
clip_image_f32 clip_img;
|