@fugood/llama.node 0.3.9 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.js +2 -2
- package/lib/binding.ts +47 -8
- package/lib/index.js +21 -1
- package/lib/index.ts +31 -1
- package/package.json +12 -3
- package/src/LlamaCompletionWorker.cpp +33 -6
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +336 -28
- package/src/LlamaContext.h +2 -0
- package/src/common.hpp +19 -2
- package/src/llama.cpp/.github/workflows/build.yml +289 -107
- package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
- package/src/llama.cpp/.github/workflows/docker.yml +2 -1
- package/src/llama.cpp/.github/workflows/server.yml +25 -2
- package/src/llama.cpp/CMakeLists.txt +10 -19
- package/src/llama.cpp/cmake/build-info.cmake +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +32 -0
- package/src/llama.cpp/common/arg.cpp +66 -16
- package/src/llama.cpp/common/chat-template.hpp +515 -0
- package/src/llama.cpp/common/chat.cpp +966 -0
- package/src/llama.cpp/common/chat.hpp +52 -0
- package/src/llama.cpp/common/common.cpp +159 -36
- package/src/llama.cpp/common/common.h +56 -14
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
- package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
- package/src/llama.cpp/common/llguidance.cpp +270 -0
- package/src/llama.cpp/common/log.cpp +1 -10
- package/src/llama.cpp/common/log.h +10 -0
- package/src/llama.cpp/common/minja.hpp +2868 -0
- package/src/llama.cpp/common/sampling.cpp +22 -1
- package/src/llama.cpp/common/sampling.h +3 -0
- package/src/llama.cpp/docs/build.md +54 -9
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
- package/src/llama.cpp/examples/llava/clip.cpp +133 -14
- package/src/llama.cpp/examples/llava/clip.h +2 -0
- package/src/llama.cpp/examples/llava/llava.cpp +22 -8
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
- package/src/llama.cpp/examples/main/main.cpp +26 -25
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
- package/src/llama.cpp/examples/run/run.cpp +224 -69
- package/src/llama.cpp/examples/server/server.cpp +252 -81
- package/src/llama.cpp/examples/server/utils.hpp +73 -21
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
- package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
- package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +23 -13
- package/src/llama.cpp/include/llama.h +14 -1
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/src/llama-arch.cpp +7 -2
- package/src/llama.cpp/src/llama-arch.h +3 -1
- package/src/llama.cpp/src/llama-chat.cpp +11 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +86 -6
- package/src/llama.cpp/src/llama-grammar.h +22 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +76 -6
- package/src/llama.cpp/src/llama-sampling.cpp +47 -4
- package/src/llama.cpp/src/llama-vocab.cpp +10 -4
- package/src/llama.cpp/src/llama.cpp +181 -123
- package/src/llama.cpp/tests/CMakeLists.txt +4 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
- package/src/llama.cpp/tests/test-chat.cpp +607 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
@@ -151,9 +151,30 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
+    std::vector<const char *> trigger_words;
+    trigger_words.reserve(params.grammar_trigger_words.size());
+    for (const auto & str : params.grammar_trigger_words) {
+        trigger_words.push_back(str.word.c_str());
+    }
+
+    struct llama_sampler * grmr;
+    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+#ifdef LLAMA_USE_LLGUIDANCE
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+#else
+        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+#endif // LLAMA_USE_LLGUIDANCE
+    } else {
+        grmr = params.grammar_lazy
+            ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
+                  trigger_words.data(), trigger_words.size(),
+                  params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+            : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+    }
+
     auto * result = new common_sampler {
         /* .params = */ params,
-        /* .grmr = */
+        /* .grmr = */ grmr,
         /* .chain = */ llama_sampler_chain_init(lparams),
         /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur = */ {},
@@ -102,3 +102,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
 
 std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+    const char * grammar_kind, const char * grammar_data);
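For orientation (this example is not part of the diff), the lazy-grammar path added above can be exercised on its own roughly as follows. The GBNF grammar and the `<tool_call>` trigger word are illustrative placeholders; the call signature is the one used in `common_sampler_init` above.

```cpp
// Minimal sketch, assuming the grammar string and trigger word below are placeholders.
// A lazy grammar sampler only starts constraining output once one of the trigger
// words (or trigger tokens) has appeared in the generated text.
#include "llama.h"
#include <vector>

static llama_sampler * make_lazy_grammar_sampler(const llama_vocab * vocab) {
    const char * grammar = "root ::= \"{\" [^}]* \"}\"";        // placeholder GBNF
    std::vector<const char *> trigger_words  = { "<tool_call>" }; // placeholder trigger
    std::vector<llama_token>  trigger_tokens;                     // none in this sketch

    return llama_sampler_init_grammar_lazy(
        vocab, grammar, "root",
        trigger_words.data(), trigger_words.size(),
        trigger_tokens.data(), trigger_tokens.size());
}
```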
@@ -125,21 +125,66 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
 
 ## CUDA
 
-This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit
+This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed.
 
-
+#### Download directly from NVIDIA
+You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
 
-- Using `CMake`:
 
-
-
-
-
+#### Compile and run inside a Fedora Toolbox Container
+We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
+
+**Recommended for:**
+
+- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
+
+
+### Compilation
+```bash
+cmake -B build -DGGML_CUDA=ON
+cmake --build build --config Release
+```
+
+### Override Compute Capability Specifications
+
+If `nvcc` cannot detect your gpu, you may get compile-warnings such as:
+```text
+nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
+```
 
-
+To override the `native` GPU detection:
+
+#### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute > Capability"](https://developer.nvidia.com/cuda-gpus).
+
+```text
+GeForce RTX 4090 8.9
+GeForce RTX 3080 Ti 8.6
+GeForce RTX 3070 8.6
+```
+
+#### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list.
+
+```bash
+cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
+```
+
+### Runtime CUDA environmental variables
+
+You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
+
+```bash
+# Use `CUDA_VISIBLE_DEVICES` to hide the first compute device.
+CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
+```
+
+### Unified Memory
 
 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
 
+### Performance Tuning
+
 The following compilation options are also available to tweak performance:
 
 | Option | Legal values | Default | Description |
@@ -286,7 +331,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container.
 
 ```sh
 # Build the image
-docker build -t llama-cpp-vulkan -f .devops/
+docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
 
 # Then, use it:
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
@@ -345,8 +345,18 @@ struct lora_merge_ctx {
         gf = ggml_new_graph(ctx0);
         struct ggml_tensor * cur = inp_base;
         for (size_t i = 0; i < adapters.size(); ++i) {
-            struct ggml_tensor *
-
+            struct ggml_tensor * delta;
+            bool is_tok_embd = string_starts_with(name_base, "token_embd");
+            if (is_tok_embd) {
+                printf("%s : detected token embeddings tensor\n", __func__);
+                delta = ggml_mul_mat(ctx0,
+                    ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
+                    ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
+            } else {
+                delta = ggml_mul_mat(ctx0,
+                    ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
+                    ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+            }
             // scale
             const float alpha = adapters[i]->alpha;
             const float rank = (float) inp_b[i]->ne[0];
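For background (general LoRA convention, not taken from this diff): the `delta` built above is the usual low-rank adapter update, which the code then scales using the adapter's `alpha` and `rank` read just below the hunk. In the standard formulation the merged weight is:

```latex
% Standard LoRA merge, for reference; the token_embd branch above only changes
% how B·A is materialized for the embedding tensor's layout.
W' = W + \frac{\alpha}{r}\, B A, \qquad B \in \mathbb{R}^{m \times r},\ A \in \mathbb{R}^{r \times n}
```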
@@ -76,7 +76,7 @@ int main(int argc, char** argv) {
         grammar_str = buffer.str();
     }
 
-    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
     if (grammar == nullptr) {
         fprintf(stdout, "Failed to initialize llama_grammar\n");
         return 1;
@@ -50,3 +50,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-llava-clip-quantize-cli)
+add_executable(${TARGET} clip-quantize-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -0,0 +1,59 @@
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
+    fprintf(stderr, "  type = 2 - q4_0\n");
+    fprintf(stderr, "  type = 3 - q4_1\n");
+    fprintf(stderr, "  type = 6 - q5_0\n");
+    fprintf(stderr, "  type = 7 - q5_1\n");
+    fprintf(stderr, "  type = 8 - q8_0\n");
+}
+
+int main(int argc, char ** argv) {
+    if (argc != 4) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+
+    const int itype = atoi(argv[3]);
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    int64_t t_quantize_us = 0;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+
+        t_quantize_us = ggml_time_us() - t_start_us;
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
+        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
+    }
+
+    return 0;
+}
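For reference (not part of the diff): once the `llama-llava-clip-quantize-cli` target added in the CMake hunk above is built, the tool is invoked as its own usage text describes, e.g. `llama-llava-clip-quantize-cli mmproj-f32.gguf mmproj-q4_0.gguf 2`, where the file names are placeholders and `2` selects q4_0 per the type table printed by `print_usage`.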
@@ -102,6 +102,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
+#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU "clip.use_gelu"
@@ -160,6 +161,15 @@ static std::string format(const char * fmt, ...) {
 #define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
 #define TN_MINICPMV_LN "resampler.ln_%s.%s"
 
+#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
+#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
+#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
+#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
+#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
+#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
+#define TN_GLM_BOI_W "adapter.boi"
+#define TN_GLM_EOI_W "adapter.eoi"
+
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
@@ -167,6 +177,7 @@ enum projector_type {
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_GLM_EDGE,
     PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -176,6 +187,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
     { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
 };
 
@@ -500,6 +512,12 @@ struct clip_vision_model {
     struct ggml_tensor * mm_4_w = NULL;
     struct ggml_tensor * mm_4_b = NULL;
 
+    //GLMV-Edge projection
+    struct ggml_tensor * mm_model_adapter_conv_w;
+    struct ggml_tensor * mm_model_adapter_conv_b;
+    struct ggml_tensor * boi_w;
+    struct ggml_tensor * eoi_w;
+
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w;
     struct ggml_tensor * mm_model_mlp_1_b;
@@ -560,6 +578,7 @@ struct clip_ctx {
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
+    bool has_glm_projector = false;
     bool has_qwen2vl_merger = false;
     int minicpmv_version = 2;
 
@@ -638,7 +657,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
     const int batch_size = imgs->size;
 
-    if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
+    if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
         GGML_ASSERT(batch_size == 1);
     }
 
@@ -718,6 +737,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         else if (ctx->minicpmv_version == 3) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
         }
+        else if (ctx->minicpmv_version == 4) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
@@ -731,8 +753,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // loop over layers
-    if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
-        // TODO: figure out why we doing thing in this way ???
+    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
         n_layer += 1;
     }
     for (int il = 0; il < n_layer - 1; il++) {
@@ -1053,6 +1074,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 n_head = hidden_size/d_head;
                 num_query = 64;
             }
+            else if (ctx->minicpmv_version == 4) {
+                hidden_size = 3584;
+                n_head = hidden_size/d_head;
+                num_query = 64;
+            }
 
             struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
             Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -1087,7 +1113,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ASSERT(false);
         }
     }
-
+    // glm projector
+    else if (ctx->has_glm_projector) {
+        if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
+            embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+            embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+            embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
+            embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
+            embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+            //GLU
+            {
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+                embeddings = ggml_gelu_inplace(ctx0, embeddings);
+                struct ggml_tensor * x = embeddings;
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+                x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
+                embeddings = ggml_silu_inplace(ctx0, embeddings);
+                embeddings = ggml_mul(ctx0, embeddings,x);
+                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+            }
+        } else {
+            GGML_ABORT("fatel error");
+        }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
 
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1276,6 +1328,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
     }
 
+    idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ);
+    if (idx != -1) {
+        new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
+    }
+
     idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
     if (idx != -1) {
         new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
@@ -1300,6 +1357,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
         LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
         LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
+        LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
         LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
         LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
     }
@@ -1567,6 +1625,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
         vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
     }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
+        vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
+        vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
+        vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
+        vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
+        vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
+        vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
+        vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
+        vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
+        vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
+    }
     else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
         vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
         vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
@@ -2041,6 +2111,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
                 images[images.size()-1].push_back(patch);
             }
         }
+        clip_image_u8_free(refine_image);
     }
     return images;
 }
@@ -2079,6 +2150,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
                 clip_image_f32_free(res);
             }
         }
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                if (imgs[i][j] != nullptr) {
+                    clip_image_u8_free(imgs[i][j]);
+                }
+            }
+        }
         return true;
     }
     else if (ctx->has_qwen2vl_merger) {
@@ -2099,6 +2177,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         return true;
     }
 
+    if (ctx->has_glm_projector) {
+        res_imgs->size = 1;
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        clip_image_u8 resized_image;
+        int32_t sz=ctx->vision_model.hparams.image_size;
+        bicubic_resize(*img, resized_image,sz,sz);
+        clip_image_f32 * res = clip_image_f32_init();
+        //clip_image_save_to_bmp(resized_image, "resized.bmp");
+        normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
+        res_imgs->data[0] = *res;
+        clip_image_f32_free(res);
+        return true;
+    }
+
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         LOG_ERR("This gguf file seems to have no vision encoder\n");
@@ -2284,7 +2376,8 @@ void clip_free(clip_ctx * ctx) {
 }
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-
+    int extra_tokens = ctx->has_glm_projector ? 2 : 0;
+    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -2326,7 +2419,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
         n_patches /= 4;
     } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
         if (ctx->minicpmv_version == 2) {
@@ -2335,6 +2428,9 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
         else if (ctx->minicpmv_version == 3) {
             n_patches = 64;
         }
+        else if (ctx->minicpmv_version == 4) {
+            n_patches = 64;
+        }
     } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         int patch_size = params.patch_size * 2;
         int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
@@ -2456,6 +2552,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     if (ctx->has_minicpmv_projector) {
         GGML_ASSERT(batch_size == 1);
     }
+    if (ctx->has_glm_projector) {
+        GGML_ASSERT(batch_size == 1);
+        ggml_tensor * boi = ctx->vision_model.boi_w;
+        ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
+        vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
+    }
 
     // build the inference graph
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
@@ -2514,8 +2616,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
             struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
             int* positions_data = (int*)malloc(ggml_nbytes(positions));
-            int bucket_coords_h[
-            int bucket_coords_w[
+            int bucket_coords_h[1024];
+            int bucket_coords_w[1024];
             for (int i = 0; i < pos_h; i++){
                 bucket_coords_h[i] = std::floor(70.0*i/pos_h);
             }
@@ -2543,6 +2645,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             else if (ctx->minicpmv_version == 3) {
                 embed_dim = 3584;
             }
+            else if (ctx->minicpmv_version == 4) {
+                embed_dim = 3584;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2605,7 +2710,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
         free(positions_data);
 
-        {
+        if (!ctx->has_glm_projector) {
             struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
             int* patches_data = (int*)malloc(ggml_nbytes(patches));
             for (int i = 0; i < num_patches; i++) {
@@ -2629,14 +2734,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
 
+    if (ctx->has_glm_projector) {
+        //eoi
+        ggml_tensor * eoi = ctx->vision_model.eoi_w;
+        int offset = ggml_nelements(embeddings);
+        ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
+    }
+
     return true;
 }
 
 bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
-
     assert(itype < GGML_TYPE_COUNT);
-    type = static_cast<ggml_type>(itype);
+    ggml_type type = static_cast<ggml_type>(itype);
 
     auto * ctx_clip = clip_model_load(fname_inp, 2);
 
@@ -2689,8 +2799,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             }
         }
 
-        // quantize only 2D tensors
-        quantize &= (ggml_n_dims(cur) == 2);
+        // quantize only 2D tensors and bigger than block size
+        quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);
 
         if (quantize) {
             new_type = type;
@@ -2786,6 +2896,12 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 3) {
             return 3584;
         }
+        else if (ctx->minicpmv_version == 4) {
+            return 3584;
+        }
+    }
+    if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){
+        return ctx->vision_model.mm_model_mlp_3_w->ne[1];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         return ctx->vision_model.mm_1_b->ne[0];
@@ -2802,6 +2918,9 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
     return 0;
 }
 
+bool clip_is_glm(const struct clip_ctx * ctx) {
+    return ctx->has_glm_projector;
+}
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
@@ -93,6 +93,8 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
 
+CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
+
 #ifdef __cplusplus
 }
 #endif
@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     return true;
 }
 
-static clip_image_f32 *
+static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
     int width = image->nx;
     int height = image->ny;
     int num_patches = (height / patch_size) * (width / patch_size);
@@ -277,13 +277,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
         }
         else {
-
-            if (has_minicpmv_projector == 2) {
-                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-            }
-            else if (has_minicpmv_projector == 3) {
-                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-            }
+            encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
         }
 
         if (!encoded) {
@@ -313,6 +307,23 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         load_image_size->height = img->ny;
         clip_add_load_image_size(ctx_clip, load_image_size);
         LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        delete[] img_res_v.data;
+        img_res_v.size = 0;
+        img_res_v.data = nullptr;
+    }
+    else if (clip_is_glm(ctx_clip)){
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        load_image_size->width = img_res_v.data[0].nx;
+        load_image_size->height = img_res_v.data[0].ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
+        int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
+        *n_img_pos = (pos * pos + 2);
+        if (!encoded){
+            LOG_ERR("Unable to encode image \n");
+            return false;
+        }
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
@@ -398,6 +409,9 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     if (clip_is_minicpmv(ctx_clip)) {
         num_max_patches = 10;
     }
+    if (clip_is_glm(ctx_clip)) {
+        num_max_patches = 1;
+    }
     float * image_embd;
     if (clip_is_qwen2vl(ctx_clip)) {
         // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.