@fugood/llama.node 0.3.9 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +47 -8
  19. package/lib/index.js +21 -1
  20. package/lib/index.ts +31 -1
  21. package/package.json +12 -3
  22. package/src/LlamaCompletionWorker.cpp +33 -6
  23. package/src/LlamaCompletionWorker.h +3 -1
  24. package/src/LlamaContext.cpp +336 -28
  25. package/src/LlamaContext.h +2 -0
  26. package/src/common.hpp +19 -2
  27. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  29. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  31. package/src/llama.cpp/CMakeLists.txt +10 -19
  32. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  33. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  34. package/src/llama.cpp/common/arg.cpp +66 -16
  35. package/src/llama.cpp/common/chat-template.hpp +515 -0
  36. package/src/llama.cpp/common/chat.cpp +966 -0
  37. package/src/llama.cpp/common/chat.hpp +52 -0
  38. package/src/llama.cpp/common/common.cpp +159 -36
  39. package/src/llama.cpp/common/common.h +56 -14
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  41. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  42. package/src/llama.cpp/common/llguidance.cpp +270 -0
  43. package/src/llama.cpp/common/log.cpp +1 -10
  44. package/src/llama.cpp/common/log.h +10 -0
  45. package/src/llama.cpp/common/minja.hpp +2868 -0
  46. package/src/llama.cpp/common/sampling.cpp +22 -1
  47. package/src/llama.cpp/common/sampling.h +3 -0
  48. package/src/llama.cpp/docs/build.md +54 -9
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  50. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  51. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  52. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  54. package/src/llama.cpp/examples/llava/clip.h +2 -0
  55. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  56. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  57. package/src/llama.cpp/examples/main/main.cpp +26 -25
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  59. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  60. package/src/llama.cpp/examples/run/run.cpp +224 -69
  61. package/src/llama.cpp/examples/server/server.cpp +252 -81
  62. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  63. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  64. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  65. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  66. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  67. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  68. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  71. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  73. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  74. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  75. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  77. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  82. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  83. package/src/llama.cpp/include/llama.h +14 -1
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  85. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  86. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  87. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  88. package/src/llama.cpp/src/llama-arch.h +3 -1
  89. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  90. package/src/llama.cpp/src/llama-chat.h +1 -0
  91. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  92. package/src/llama.cpp/src/llama-grammar.h +22 -1
  93. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  95. package/src/llama.cpp/src/llama-model.cpp +76 -6
  96. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  97. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  98. package/src/llama.cpp/src/llama.cpp +181 -123
  99. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  100. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  101. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  102. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  103. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  104. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  105. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  106. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
package/src/llama.cpp/common/sampling.cpp
@@ -151,9 +151,30 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

  lparams.no_perf = params.no_perf;

+ std::vector<const char *> trigger_words;
+ trigger_words.reserve(params.grammar_trigger_words.size());
+ for (const auto & str : params.grammar_trigger_words) {
+ trigger_words.push_back(str.word.c_str());
+ }
+
+ struct llama_sampler * grmr;
+ if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+ #ifdef LLAMA_USE_LLGUIDANCE
+ grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+ #else
+ GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+ #endif // LLAMA_USE_LLGUIDANCE
+ } else {
+ grmr = params.grammar_lazy
+ ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
+ trigger_words.data(), trigger_words.size(),
+ params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+ : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+ }
+
  auto * result = new common_sampler {
  /* .params = */ params,
- /* .grmr = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
+ /* .grmr = */ grmr,
  /* .chain = */ llama_sampler_chain_init(lparams),
  /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
  /* .cur = */ {},
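A note on the selection logic above: the grammar string itself picks the backend. Below is a minimal, self-contained sketch of that rule as shown in the hunk (nothing beyond the diff is assumed): grammars starting with `%llguidance` go to the LLGuidance sampler, everything else to the GBNF grammar sampler, lazily when `params.grammar_lazy` is set.

```cpp
#include <string>

// Sketch of the backend dispatch used in common_sampler_init() above:
// a grammar beginning with the 11-character prefix "%llguidance" selects
// the LLGuidance-based sampler; any other grammar goes through the regular
// GBNF grammar sampler (lazy or eager, depending on params.grammar_lazy).
static bool uses_llguidance(const std::string & grammar) {
    return grammar.compare(0, 11, "%llguidance") == 0;
}
```

The lazy path defers grammar enforcement until one of the configured trigger words or trigger tokens appears in the output, which is what the `trigger_words` and `params.grammar_trigger_tokens` arguments are for.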
package/src/llama.cpp/common/sampling.h
@@ -102,3 +102,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

  std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
  std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+
+ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+ const char * grammar_kind, const char * grammar_data);
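This declaration is backed by the new `common/llguidance.cpp` listed in the file summary and is only available when the corresponding build option is enabled (the abort message in the sampling.cpp hunk names `cmake -DLLAMA_LLGUIDANCE=ON`). A hedged sketch of a guarded call site, reusing only calls visible elsewhere in this diff; the wrapper function itself is illustrative:

```cpp
// Illustrative wrapper: prefer the LLGuidance-backed sampler when the build
// enables it, otherwise fall back to the plain GBNF grammar sampler used in
// common_sampler_init(). The "lark" grammar kind mirrors the call above.
static llama_sampler * init_constrained_sampler(const llama_vocab * vocab,
                                                const char * grammar_text) {
#ifdef LLAMA_USE_LLGUIDANCE
    return llama_sampler_init_llg(vocab, "lark", grammar_text);
#else
    return llama_sampler_init_grammar(vocab, grammar_text, "root");
#endif
}
```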
package/src/llama.cpp/docs/build.md
@@ -125,21 +125,66 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).

  ## CUDA

- This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
+ This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed.

- If you are using Fedora (using Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect.
+ #### Download directly from NVIDIA
+ You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).

- - Using `CMake`:

- ```bash
- cmake -B build -DGGML_CUDA=ON
- cmake --build build --config Release
- ```
+ #### Compile and run inside a Fedora Toolbox Container
+ We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
+
+ **Recommended for:**
+
+ - ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+ - Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+ - *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
+
+
+ ### Compilation
+ ```bash
+ cmake -B build -DGGML_CUDA=ON
+ cmake --build build --config Release
+ ```
+
+ ### Override Compute Capability Specifications
+
+ If `nvcc` cannot detect your gpu, you may get compile-warnings such as:
+ ```text
+ nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
+ ```

- The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+ To override the `native` GPU detection:
+
+ #### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute > Capability"](https://developer.nvidia.com/cuda-gpus).
+
+ ```text
+ GeForce RTX 4090 8.9
+ GeForce RTX 3080 Ti 8.6
+ GeForce RTX 3070 8.6
+ ```
+
+ #### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list.
+
+ ```bash
+ cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
+ ```
+
+ ### Runtime CUDA environmental variables
+
+ You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
+
+ ```bash
+ # Use `CUDA_VISIBLE_DEVICES` to hide the first compute device.
+ CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+ ```
+
+ ### Unified Memory

  The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.

+ ### Performance Tuning
+
  The following compilation options are also available to tweak performance:

  | Option | Legal values | Default | Description |
@@ -286,7 +331,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container.

  ```sh
  # Build the image
- docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
+ docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .

  # Then, use it:
  docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
package/src/llama.cpp/examples/export-lora/export-lora.cpp
@@ -345,8 +345,18 @@ struct lora_merge_ctx {
  gf = ggml_new_graph(ctx0);
  struct ggml_tensor * cur = inp_base;
  for (size_t i = 0; i < adapters.size(); ++i) {
- struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
- struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+ struct ggml_tensor * delta;
+ bool is_tok_embd = string_starts_with(name_base, "token_embd");
+ if (is_tok_embd) {
+ printf("%s : detected token embeddings tensor\n", __func__);
+ delta = ggml_mul_mat(ctx0,
+ ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
+ ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
+ } else {
+ delta = ggml_mul_mat(ctx0,
+ ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
+ ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+ }
  // scale
  const float alpha = adapters[i]->alpha;
  const float rank = (float) inp_b[i]->ne[0];
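For orientation, the `delta` assembled above is the usual LoRA update that later gets scaled and added onto the base tensor; assuming the conventional scaling built from the `alpha` and `rank` values read here (the scale expression itself is not shown in this hunk), the merged weight is:

```latex
W' \;=\; W \;+\; \frac{\alpha}{r}\, B A
```

The new `token_embd` branch only changes which operand order is handed to `ggml_mul_mat`; the need for the special case is inferred from the code shown here, the diff itself does not explain it.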
package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp
@@ -76,7 +76,7 @@ int main(int argc, char** argv) {
  grammar_str = buffer.str();
  }

- llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
+ llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
  if (grammar == nullptr) {
  fprintf(stdout, "Failed to initialize llama_grammar\n");
  return 1;
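The extra arguments line up with the lazy-grammar machinery added in `llama-grammar.cpp`/`llama-grammar.h` (see the file summary). Below is an annotated version of the updated call, with parameter roles inferred from `llama_sampler_init_grammar_lazy` in the sampling.cpp hunk; the comments are assumptions, not the upstream signature:

```cpp
// Annotated sketch of the new call; argument roles are inferred, not taken
// from the header, so treat the comments as assumptions.
llama_grammar * grammar = llama_grammar_init_impl(
    nullptr,              // vocab (not needed for offline validation)
    grammar_str.c_str(),  // GBNF source text
    "root",               // start rule
    false,                // lazy: false = enforce the grammar immediately
    nullptr, 0,           // trigger words + count (unused when not lazy)
    nullptr, 0);          // trigger tokens + count (unused when not lazy)
```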
package/src/llama.cpp/examples/llava/CMakeLists.txt
@@ -50,3 +50,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
  target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+ set(TARGET llama-llava-clip-quantize-cli)
+ add_executable(${TARGET} clip-quantize-cli.cpp)
+ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp (new file)
@@ -0,0 +1,59 @@
+ #include "arg.h"
+ #include "base64.hpp"
+ #include "log.h"
+ #include "common.h"
+ #include "sampling.h"
+ #include "clip.h"
+ #include "llava.h"
+ #include "llama.h"
+ #include "ggml.h"
+
+ static void print_usage(int argc, char ** argv) {
+ (void) argc;
+
+ fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
+ fprintf(stderr, " type = 2 - q4_0\n");
+ fprintf(stderr, " type = 3 - q4_1\n");
+ fprintf(stderr, " type = 6 - q5_0\n");
+ fprintf(stderr, " type = 7 - q5_1\n");
+ fprintf(stderr, " type = 8 - q8_0\n");
+ }
+
+ int main(int argc, char ** argv) {
+ if (argc != 4) {
+ print_usage(argc, argv);
+ return 1;
+ }
+
+ const std::string fname_inp = argv[1];
+ const std::string fname_out = argv[2];
+
+ const int itype = atoi(argv[3]);
+
+ const int64_t t_main_start_us = ggml_time_us();
+
+ int64_t t_quantize_us = 0;
+
+ // load the model
+ {
+ const int64_t t_start_us = ggml_time_us();
+
+ if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+ fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+ return 1;
+ }
+
+ t_quantize_us = ggml_time_us() - t_start_us;
+ }
+
+ // report timing
+ {
+ const int64_t t_main_end_us = ggml_time_us();
+
+ printf("\n");
+ printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
+ }
+
+ return 0;
+ }
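The `type` argument is handed straight to `clip_model_quantize()` and, per the clip.cpp hunk further down (`ggml_type type = static_cast<ggml_type>(itype);`), interpreted as a `ggml_type`, so the integers in the usage text line up with the ggml enum as sketched below. A typical invocation would look like `llama-llava-clip-quantize-cli mmproj-f32.gguf mmproj-q4_0.gguf 2` (file names illustrative).

```cpp
// Mapping of the CLI `type` argument to ggml quantization types, as listed
// in print_usage() above; the numeric values match the GGML_TYPE enum.
enum clip_quant_type {
    CLIP_QUANT_Q4_0 = 2, // GGML_TYPE_Q4_0
    CLIP_QUANT_Q4_1 = 3, // GGML_TYPE_Q4_1
    CLIP_QUANT_Q5_0 = 6, // GGML_TYPE_Q5_0
    CLIP_QUANT_Q5_1 = 7, // GGML_TYPE_Q5_1
    CLIP_QUANT_Q8_0 = 8, // GGML_TYPE_Q8_0
};
```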
package/src/llama.cpp/examples/llava/clip.cpp
@@ -102,6 +102,7 @@ static std::string format(const char * fmt, ...) {
  #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
  #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
  #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
+ #define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
  #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
  #define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
  #define KEY_USE_GELU "clip.use_gelu"
@@ -160,6 +161,15 @@ static std::string format(const char * fmt, ...) {
  #define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
  #define TN_MINICPMV_LN "resampler.ln_%s.%s"

+ #define TN_GLM_ADAPER_CONV "adapter.conv.%s"
+ #define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
+ #define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
+ #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
+ #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
+ #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
+ #define TN_GLM_BOI_W "adapter.boi"
+ #define TN_GLM_EOI_W "adapter.eoi"
+

  enum projector_type {
  PROJECTOR_TYPE_MLP,
@@ -167,6 +177,7 @@ enum projector_type {
  PROJECTOR_TYPE_LDP,
  PROJECTOR_TYPE_LDPV2,
  PROJECTOR_TYPE_RESAMPLER,
+ PROJECTOR_TYPE_GLM_EDGE,
  PROJECTOR_TYPE_MERGER,
  PROJECTOR_TYPE_UNKNOWN,
  };
@@ -176,6 +187,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
  { PROJECTOR_TYPE_LDP, "ldp" },
  { PROJECTOR_TYPE_LDPV2, "ldpv2"},
  { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+ { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
  { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
  };

@@ -500,6 +512,12 @@ struct clip_vision_model {
  struct ggml_tensor * mm_4_w = NULL;
  struct ggml_tensor * mm_4_b = NULL;

+ //GLMV-Edge projection
+ struct ggml_tensor * mm_model_adapter_conv_w;
+ struct ggml_tensor * mm_model_adapter_conv_b;
+ struct ggml_tensor * boi_w;
+ struct ggml_tensor * eoi_w;
+
  // MobileVLM projection
  struct ggml_tensor * mm_model_mlp_1_w;
  struct ggml_tensor * mm_model_mlp_1_b;
@@ -560,6 +578,7 @@ struct clip_ctx {
  bool has_vision_encoder = false;
  bool has_llava_projector = false;
  bool has_minicpmv_projector = false;
+ bool has_glm_projector = false;
  bool has_qwen2vl_merger = false;
  int minicpmv_version = 2;

@@ -638,7 +657,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

  const int batch_size = imgs->size;

- if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
+ if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
  GGML_ASSERT(batch_size == 1);
  }

@@ -718,6 +737,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  else if (ctx->minicpmv_version == 3) {
  pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
  }
+ else if (ctx->minicpmv_version == 4) {
+ pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+ }
  ggml_set_name(pos_embed, "pos_embed");
  ggml_set_input(pos_embed);
  }
@@ -731,8 +753,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  }

  // loop over layers
- if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
- // TODO: figure out why we doing thing in this way ???
+ if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
  n_layer += 1;
  }
  for (int il = 0; il < n_layer - 1; il++) {
@@ -1053,6 +1074,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  n_head = hidden_size/d_head;
  num_query = 64;
  }
+ else if (ctx->minicpmv_version == 4) {
+ hidden_size = 3584;
+ n_head = hidden_size/d_head;
+ num_query = 64;
+ }

  struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
  Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -1087,7 +1113,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
  GGML_ASSERT(false);
  }
  }
- else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+ // glm projector
+ else if (ctx->has_glm_projector) {
+ if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+ size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+ embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
+ embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+ embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+ embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
+ embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
+ embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+ //GLU
+ {
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+ embeddings = ggml_norm(ctx0, embeddings, eps);
+ embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+ embeddings = ggml_gelu_inplace(ctx0, embeddings);
+ struct ggml_tensor * x = embeddings;
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+ x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
+ embeddings = ggml_silu_inplace(ctx0, embeddings);
+ embeddings = ggml_mul(ctx0, embeddings,x);
+ embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+ }
+ } else {
+ GGML_ABORT("fatel error");
+ }
+ } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
  embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);

  embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1276,6 +1328,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
  }

+ idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ);
+ if (idx != -1) {
+ new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
+ }
+
  idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
  if (idx != -1) {
  new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
@@ -1300,6 +1357,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
  LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
  LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
+ LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
  LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
  LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
  }
@@ -1567,6 +1625,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
  vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
  vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
  }
+ else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+ vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
+ vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
+ vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
+ vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
+ vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
+ vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
+ vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
+ vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
+ vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
+ vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
+ }
  else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
  vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
  vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
@@ -2041,6 +2111,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
  images[images.size()-1].push_back(patch);
  }
  }
+ clip_image_u8_free(refine_image);
  }
  return images;
  }
@@ -2079,6 +2150,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
  clip_image_f32_free(res);
  }
  }
+ for (size_t i = 0; i < imgs.size(); ++i) {
+ for (size_t j = 0; j < imgs[i].size(); ++j) {
+ if (imgs[i][j] != nullptr) {
+ clip_image_u8_free(imgs[i][j]);
+ }
+ }
+ }
  return true;
  }
  else if (ctx->has_qwen2vl_merger) {
@@ -2099,6 +2177,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
  return true;
  }

+ if (ctx->has_glm_projector) {
+ res_imgs->size = 1;
+ res_imgs->data = new clip_image_f32[res_imgs->size];
+ clip_image_u8 resized_image;
+ int32_t sz=ctx->vision_model.hparams.image_size;
+ bicubic_resize(*img, resized_image,sz,sz);
+ clip_image_f32 * res = clip_image_f32_init();
+ //clip_image_save_to_bmp(resized_image, "resized.bmp");
+ normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
+ res_imgs->data[0] = *res;
+ clip_image_f32_free(res);
+ return true;
+ }
+
  bool pad_to_square = true;
  if (!ctx->has_vision_encoder) {
  LOG_ERR("This gguf file seems to have no vision encoder\n");
@@ -2284,7 +2376,8 @@ void clip_free(clip_ctx * ctx) {
  }

  size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
- return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+ int extra_tokens = ctx->has_glm_projector ? 2 : 0;
+ return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
  }

  size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -2326,7 +2419,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i

  int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);

- if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+ if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
  n_patches /= 4;
  } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
  if (ctx->minicpmv_version == 2) {
@@ -2335,6 +2428,9 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
  else if (ctx->minicpmv_version == 3) {
  n_patches = 64;
  }
+ else if (ctx->minicpmv_version == 4) {
+ n_patches = 64;
+ }
  } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
  int patch_size = params.patch_size * 2;
  int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
@@ -2456,6 +2552,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  if (ctx->has_minicpmv_projector) {
  GGML_ASSERT(batch_size == 1);
  }
+ if (ctx->has_glm_projector) {
+ GGML_ASSERT(batch_size == 1);
+ ggml_tensor * boi = ctx->vision_model.boi_w;
+ ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
+ vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
+ }

  // build the inference graph
  ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
@@ -2514,8 +2616,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
  struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
  int* positions_data = (int*)malloc(ggml_nbytes(positions));
- int bucket_coords_h[70];
- int bucket_coords_w[70];
+ int bucket_coords_h[1024];
+ int bucket_coords_w[1024];
  for (int i = 0; i < pos_h; i++){
  bucket_coords_h[i] = std::floor(70.0*i/pos_h);
  }
@@ -2543,6 +2645,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  else if (ctx->minicpmv_version == 3) {
  embed_dim = 3584;
  }
+ else if (ctx->minicpmv_version == 4) {
+ embed_dim = 3584;
+ }
  auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

  float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2605,7 +2710,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
  free(positions_data);

- {
+ if (!ctx->has_glm_projector) {
  struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
  int* patches_data = (int*)malloc(ggml_nbytes(patches));
  for (int i = 0; i < num_patches; i++) {
@@ -2629,14 +2734,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  // copy the embeddings to the location passed by the user
  ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

+ if (ctx->has_glm_projector) {
+ //eoi
+ ggml_tensor * eoi = ctx->vision_model.eoi_w;
+ int offset = ggml_nelements(embeddings);
+ ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
+ }
+
  return true;
  }

  bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
- ggml_type type = GGML_TYPE_Q4_1;
-
  assert(itype < GGML_TYPE_COUNT);
- type = static_cast<ggml_type>(itype);
+ ggml_type type = static_cast<ggml_type>(itype);

  auto * ctx_clip = clip_model_load(fname_inp, 2);

@@ -2689,8 +2799,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
  }
  }

- // quantize only 2D tensors
- quantize &= (ggml_n_dims(cur) == 2);
+ // quantize only 2D tensors and bigger than block size
+ quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);

  if (quantize) {
  new_type = type;
@@ -2786,6 +2896,12 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
  else if (ctx->minicpmv_version == 3) {
  return 3584;
  }
+ else if (ctx->minicpmv_version == 4) {
+ return 3584;
+ }
+ }
+ if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){
+ return ctx->vision_model.mm_model_mlp_3_w->ne[1];
  }
  if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
  return ctx->vision_model.mm_1_b->ne[0];
@@ -2802,6 +2918,9 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
  return 0;
  }

+ bool clip_is_glm(const struct clip_ctx * ctx) {
+ return ctx->has_glm_projector;
+ }
  bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
  return ctx->has_qwen2vl_merger;
  }
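Taken together, the GLM-Edge changes above mean callers must size the embedding buffer for two extra positions: `clip_embd_nbytes()` now adds room for the `boi`/`eoi` embeddings that `clip_image_batch_encode()` copies in before and after the image patches. A small sketch of the resulting sizing rule (the free-standing helper is illustrative, not an upstream function):

```cpp
#include <cstddef>

// Sketch of the buffer sizing implied by the updated clip_embd_nbytes():
// GLM-Edge models reserve two extra rows for the boi/eoi embeddings.
static size_t clip_embd_bytes_sketch(int n_patches, int n_mmproj_embd, bool has_glm_projector) {
    const int extra_tokens = has_glm_projector ? 2 : 0;
    return (size_t)(n_patches + extra_tokens) * (size_t)n_mmproj_embd * sizeof(float);
}
```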
package/src/llama.cpp/examples/llava/clip.h
@@ -93,6 +93,8 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);

  CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

+ CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
+
  #ifdef __cplusplus
  }
  #endif
package/src/llama.cpp/examples/llava/llava.cpp
@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
  return true;
  }

- static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
  int width = image->nx;
  int height = image->ny;
  int num_patches = (height / patch_size) * (width / patch_size);
@@ -277,13 +277,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
  encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
  }
  else {
- int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
- if (has_minicpmv_projector == 2) {
- encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
- }
- else if (has_minicpmv_projector == 3) {
- encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
- }
+ encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
  }

  if (!encoded) {
@@ -313,6 +307,23 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
  load_image_size->height = img->ny;
  clip_add_load_image_size(ctx_clip, load_image_size);
  LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+ delete[] img_res_v.data;
+ img_res_v.size = 0;
+ img_res_v.data = nullptr;
+ }
+ else if (clip_is_glm(ctx_clip)){
+ struct clip_image_size * load_image_size = clip_image_size_init();
+ load_image_size->width = img_res_v.data[0].nx;
+ load_image_size->height = img_res_v.data[0].ny;
+ clip_add_load_image_size(ctx_clip, load_image_size);
+
+ bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
+ int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
+ *n_img_pos = (pos * pos + 2);
+ if (!encoded){
+ LOG_ERR("Unable to encode image \n");
+ return false;
+ }
  }
  else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
  // flat / default llava-1.5 type embedding
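The `*n_img_pos = (pos * pos + 2)` computation in the GLM branch above follows from the clip.cpp changes earlier in this diff: the adapter convolution uses stride 2, so the patch grid is halved in each dimension (hence `n_patches /= 4` for `PROJECTOR_TYPE_GLM_EDGE`), and the two extra positions are the boi/eoi embeddings. A sketch of the same arithmetic as a standalone helper (illustrative, not an upstream function):

```cpp
// Number of image positions reported for a GLM-Edge projector, mirroring
// the expression in encode_image_with_clip(): a (size/patch/2)^2 grid of
// pooled patches plus the boi and eoi embeddings.
static int glm_n_img_pos_sketch(int image_size, int patch_size) {
    const int pos = image_size / patch_size / 2;
    return pos * pos + 2;
}
```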
@@ -398,6 +409,9 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
  if (clip_is_minicpmv(ctx_clip)) {
  num_max_patches = 10;
  }
+ if (clip_is_glm(ctx_clip)) {
+ num_max_patches = 1;
+ }
  float * image_embd;
  if (clip_is_qwen2vl(ctx_clip)) {
  // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.