@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/llava/clip.h
@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
-CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
+struct clip_context_params {
+    bool use_gpu;
+    int verbosity;
+};
+
+// deprecated, use clip_init
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+
+CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
@@ -55,6 +62,7 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
 CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
@@ -73,6 +81,12 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 
+/**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
@@ -89,11 +103,13 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 
+CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
+
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
 
-CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 
 #ifdef __cplusplus
 }
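
Taken together, the clip.h changes fold the old clip_model_load/clip_model_load_cpu pair into a single clip_init entry point with explicit context parameters, and add a way to hand the encoder pixels decoded by another library. A minimal sketch of the new call pattern, assuming a valid mmproj path and an RGB buffer from elsewhere (everything here except the clip.h API itself is an illustrative placeholder):

#include "clip.h"

// Sketch only: exercises the new clip_init / clip_build_img_from_pixels API
// from the hunk above. mmproj_path, rgb, w and h are hypothetical inputs
// supplied by the caller.
static void demo_clip_init(const char * mmproj_path, const unsigned char * rgb, int w, int h) {
    struct clip_context_params cparams = {
        /* use_gpu   */ true,
        /* verbosity */ 1,
    };
    struct clip_ctx * ctx = clip_init(mmproj_path, cparams); // replaces clip_model_load{,_cpu}

    struct clip_image_u8 * img = clip_image_u8_init();
    // layout is RGBRGBRGB..., so the buffer must hold 3*w*h bytes
    clip_build_img_from_pixels(rgb, w, h, img);

    // ... preprocess and encode as before ...

    clip_image_u8_free(img);
    clip_free(ctx);
}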
package/src/llama.cpp/examples/llava/gemma3-cli.cpp (new file)
@@ -0,0 +1,341 @@
+#include "arg.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "stb_image.h"
+#include "llama.h"
+#include "ggml.h"
+#include "console.h"
+
+#include <vector>
+#include <limits.h>
+#include <inttypes.h>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+static bool g_is_generating = false;
+
+/**
+ * Please note that this is NOT a production-ready stuff.
+ * It is a playground for trying Gemma 3 vision capabilities.
+ * For contributors: please keep this code simple and easy to understand.
+ */
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG(
+        "Experimental CLI for using Gemma 3 vision model\n\n"
+        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
+        "  -m and --mmproj are required\n"
+        "  --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
+        argv[0]
+    );
+}
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+static void sigint_handler(int signo) {
+    if (signo == SIGINT) {
+        if (g_is_generating) {
+            g_is_generating = false;
+        } else {
+            console::cleanup();
+            LOG("\nInterrupted by user\n");
+            _exit(130);
+        }
+    }
+}
+#endif
+
+struct gemma3_context {
+    struct clip_ctx * ctx_clip = NULL;
+    common_init_result llama_init;
+
+    llama_model       * model;
+    llama_context     * lctx;
+    const llama_vocab * vocab;
+    llama_batch         batch;
+
+    int n_threads    = 1;
+    llama_pos n_past = 0;
+
+    gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
+        model = llama_init.model.get();
+        lctx = llama_init.context.get();
+        vocab = llama_model_get_vocab(model);
+        n_threads = params.cpuparams.n_threads;
+        batch = llama_batch_init(params.n_batch, 0, 1);
+        init_clip_model(params);
+    }
+
+    void init_clip_model(common_params & params) {
+        const char * clip_path = params.mmproj.c_str();
+        ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
+    }
+
+    ~gemma3_context() {
+        clip_free(ctx_clip);
+    }
+};
+
+struct decode_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens =*/ n_tokens,
+            /*tokens   =*/ nullptr,
+            /*embd     =*/ embd,
+            /*pos      =*/ pos.data(),
+            /*n_seq_id =*/ n_seq_id.data(),
+            /*seq_id   =*/ seq_ids.data(),
+            /*logits   =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
+static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
+    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
+    common_batch_clear(ctx.batch);
+    for (llama_token & t : tokens) {
+        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
+    }
+    if (logits_last) {
+        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
+    }
+    // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
+    if (llama_decode(ctx.lctx, ctx.batch)) {
+        LOG_ERR("Failed to decode text\n");
+        return 1;
+    }
+    return 0;
+}
+
+static int eval_image(gemma3_context & ctx, std::string & fname) {
+    std::vector<float> image_embd_v;
+    int n_embd = llama_model_n_embd(ctx.model);
+    int n_tokens = 256;
+    image_embd_v.resize(n_tokens * n_embd);
+
+    bool ok;
+    struct clip_image_u8 * img_u8 = clip_image_u8_init();
+    ok = clip_image_load_from_file(fname.c_str(), img_u8);
+    if (!ok) {
+        LOG_ERR("Unable to load image %s\n", fname.c_str());
+        clip_image_u8_free(img_u8);
+        return 2; // non-fatal error
+    }
+
+    clip_image_f32_batch batch_f32;
+    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
+    if (!ok) {
+        LOG_ERR("Unable to preprocess image\n");
+        clip_image_f32_batch_free(&batch_f32);
+        clip_image_u8_free(img_u8);
+        return 1;
+    }
+
+    int64_t t0 = ggml_time_ms();
+    LOG("Encoding image %s\n", fname.c_str());
+    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
+    if (!ok) {
+        LOG_ERR("Unable to encode image\n");
+        clip_image_f32_batch_free(&batch_f32);
+        clip_image_u8_free(img_u8);
+        return 1;
+    }
+    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+
+    clip_image_f32_batch_free(&batch_f32);
+    clip_image_u8_free(img_u8);
+
+    // decode image embeddings
+    int64_t t1 = ggml_time_ms();
+    eval_text(ctx, "<start_of_image>");
+    llama_set_causal_attn(ctx.lctx, false);
+    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
+    if (llama_decode(ctx.lctx, batch_img.batch)) {
+        LOG_ERR("failed to decode image\n");
+        return 1;
+    }
+    ctx.n_past += n_tokens;
+    llama_set_causal_attn(ctx.lctx, true);
+    eval_text(ctx, "<end_of_image>");
+    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+    return 0;
+}
+
+static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
+    for (int i = 0; i < n_predict; i++) {
+        if (i > n_predict || !g_is_generating) {
+            printf("\n");
+            break;
+        }
+
+        llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
+        common_sampler_accept(smpl, token_id, true);
+
+        if (llama_vocab_is_eog(ctx.vocab, token_id)) {
+            printf("\n");
+            break; // end of generation
+        }
+
+        printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
+        fflush(stdout);
+
+        // eval the token
+        common_batch_clear(ctx.batch);
+        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
+        if (llama_decode(ctx.lctx, ctx.batch)) {
+            LOG_ERR("failed to decode token\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    common_params params;
+    params.sampling.temp = 0.2; // lower temp by default for better quality
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+        return 1;
+    }
+
+    common_init();
+
+    if (params.mmproj.empty()) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    gemma3_context ctx(params);
+    printf("%s: %s\n", __func__, params.model.c_str());
+
+    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
+
+    struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
+    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
+
+    // ctrl+C handling
+    {
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = sigint_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+    }
+
+    if (eval_text(ctx, "<bos>")) {
+        return 1;
+    }
+
+    if (is_single_turn) {
+        g_is_generating = true;
+        if (eval_text(ctx, "<start_of_turn>user\n")) {
+            return 1;
+        }
+        for (auto & fname : params.image) {
+            if (eval_image(ctx, fname)) {
+                return 1;
+            }
+        }
+        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
+            return 1;
+        }
+        if (generate_response(ctx, smpl, n_predict)) {
+            return 1;
+        }
+
+    } else {
+        LOG("\n Running in chat mode, available commands:");
+        LOG("\n   /image <path>    load an image");
+        LOG("\n   /clear           clear the chat history");
+        LOG("\n   /quit or /exit   exit the program");
+        LOG("\n");
+
+        if (eval_text(ctx, "<start_of_turn>user\n")) {
+            return 1;
+        }
+
+        while (true) {
+            g_is_generating = false;
+            LOG("\n> ");
+            console::set_display(console::user_input);
+            std::string line;
+            console::readline(line, false);
+            console::set_display(console::reset);
+            line = string_strip(line);
+            if (line.empty()) {
+                continue;
+            }
+            if (line == "/quit" || line == "/exit") {
+                break;
+            }
+            if (line == "/clear") {
+                ctx.n_past = 0;
+                llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
+                LOG("Chat history cleared\n\n");
+                continue;
+            }
+            g_is_generating = true;
+            if (line.find("/image") == 0) {
+                std::string image = line.substr(7);
+                int res = eval_image(ctx, image);
+                if (res == 2) {
+                    continue; // image not found
+                }
+                if (res) {
+                    return 1;
+                }
+                continue;
+            }
+            if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
+                return 1;
+            }
+            if (generate_response(ctx, smpl, n_predict)) {
+                return 1;
+            }
+            if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
+                return 1;
+            }
+        }
+    }
+
+    return 0;
+}
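
The core of the new example is eval_image: CLIP encodes the image into 256 embedding vectors, llama_set_causal_attn(ctx, false) is flipped off so those 256 positions attend bidirectionally, the embeddings are decoded as a single batch bracketed by <start_of_image>/<end_of_image> text tokens, and causal attention is restored before text generation resumes. A hypothetical invocation (the binary name and file names are illustrative, not taken from this diff):

llama-gemma3-cli -m gemma-3-4b-it.gguf --mmproj mmproj-gemma3.gguf --image cat.jpg -p "Describe this image."

Leaving out --image and -p starts the interactive chat mode with the /image, /clear, and /quit commands listed above.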
package/src/llama.cpp/examples/llava/llava.cpp
@@ -353,9 +353,10 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
     const int32_t * image_grid = clip_image_grid(ctx_clip);
+    const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
 
     std::vector<std::pair<int, int>> grid_pinpoints;
-    for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
+    for (size_t i = 0; i < num_gridpoints; i += 2) {
         grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
     }
 
@@ -405,7 +406,8 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 }
 
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    int num_max_patches = 6;
+    // Granite vision uses up to 10 patches + base patch
+    int num_max_patches = 11;
     if (clip_is_minicpmv(ctx_clip)) {
         num_max_patches = 10;
     }
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp
@@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
-    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    struct clip_context_params clip_params = {
+        /* use_gpu */   params->n_gpu_layers != 0,
+        /* verbosity */ params->verbosity,
+    };
+    auto * ctx_clip = clip_init(clip_path, clip_params);
     return ctx_clip;
 }
 
@@ -148,19 +152,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
     process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
     eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
     if (num_image_embeds > 1) {
-        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
-        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
-        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
-            for (size_t j = 0; j < num_image_embeds_col; ++j) {
-                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
-                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
-                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
-                if (j == num_image_embeds_col - 1) {
-                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+        if (has_minicpmv_projector == 2) {
+            size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+            eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+            for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+                for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                    eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                    if (j == num_image_embeds_col - 1) {
+                        eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                    }
+                }
+            }
+            eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+        }
+        else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
+            size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+            for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+                for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                    eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+                    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                    eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+                    if (j == num_image_embeds_col - 1) {
+                        eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                    }
                 }
             }
         }
-        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
     }
     LOG_INF("%s: image token past: %d\n", __func__, n_past);
 }
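
For reference, the two branches wrap the extra (non-base) image embeddings in differently shaped markup. With E1, E2, ... standing for the evaluated embedding chunks of one slice row, the emitted sequences look roughly like this (reconstructed from the loops above, not quoted from the diff):

  projector version 2:     <slice><image>E1</image><image>E2</image>...\n...</slice>
  projector version 3/4:   <slice>E1</slice><slice>E2</slice>...\n...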
package/src/llama.cpp/common/ngram-cache.cpp
@@ -7,6 +7,7 @@
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <algorithm>
 
 struct ngram_data {
     bool active = false;