@fugood/llama.node 0.3.13 → 0.3.14
This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +60 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +112 -11
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +110 -67
- package/src/llama.cpp/examples/server/server.cpp +82 -87
- package/src/llama.cpp/examples/server/utils.hpp +94 -107
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +5 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +8 -3
- package/src/llama.cpp/include/llama.h +19 -5
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +69 -5
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +147 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/llava/clip.h

@@ -39,8 +39,15 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
-CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
+struct clip_context_params {
+    bool use_gpu;
+    int verbosity;
+};
+
+// deprecated, use clip_init
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+
+CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
@@ -55,6 +62,7 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
 CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
@@ -73,6 +81,12 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 
+/**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
+CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
@@ -89,11 +103,13 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 
+CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
+
 CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
 
-CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 
 #ifdef __cplusplus
 }
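The clip.h changes above consolidate model loading behind a single clip_init entry point whose options live in clip_context_params, keeping the old clip_model_load only as a deprecated shim. A minimal migration sketch, assuming a helper name and parameter values of our choosing (the same pattern appears in the minicpmv-cli.cpp hunk near the end of this diff):

    #include "clip.h"

    // Hypothetical helper: load a multimodal projector with the new API.
    static struct clip_ctx * load_mmproj(const char * path) {
        struct clip_context_params params = {
            /* use_gpu   */ true, // offload the vision tower when a GPU backend is available
            /* verbosity */ 1,
        };
        // Replaces the deprecated clip_model_load(path, /*verbosity=*/ 1).
        return clip_init(path, params);
    }

Callers that previously chose between clip_model_load and clip_model_load_cpu now express that choice through the use_gpu flag.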
package/src/llama.cpp/examples/llava/gemma3-cli.cpp (new file)

@@ -0,0 +1,341 @@
+#include "arg.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "stb_image.h"
+#include "llama.h"
+#include "ggml.h"
+#include "console.h"
+
+#include <vector>
+#include <limits.h>
+#include <inttypes.h>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+static bool g_is_generating = false;
+
+/**
+ * Please note that this is NOT a production-ready stuff.
+ * It is a playground for trying Gemma 3 vision capabilities.
+ * For contributors: please keep this code simple and easy to understand.
+ */
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG(
+        "Experimental CLI for using Gemma 3 vision model\n\n"
+        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
+        "  -m and --mmproj are required\n"
+        "  --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
+        argv[0]
+    );
+}
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+static void sigint_handler(int signo) {
+    if (signo == SIGINT) {
+        if (g_is_generating) {
+            g_is_generating = false;
+        } else {
+            console::cleanup();
+            LOG("\nInterrupted by user\n");
+            _exit(130);
+        }
+    }
+}
+#endif
+
+struct gemma3_context {
+    struct clip_ctx * ctx_clip = NULL;
+    common_init_result llama_init;
+
+    llama_model       * model;
+    llama_context     * lctx;
+    const llama_vocab * vocab;
+    llama_batch         batch;
+
+    int n_threads    = 1;
+    llama_pos n_past = 0;
+
+    gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
+        model = llama_init.model.get();
+        lctx = llama_init.context.get();
+        vocab = llama_model_get_vocab(model);
+        n_threads = params.cpuparams.n_threads;
+        batch = llama_batch_init(params.n_batch, 0, 1);
+        init_clip_model(params);
+    }
+
+    void init_clip_model(common_params & params) {
+        const char * clip_path = params.mmproj.c_str();
+        ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
+    }
+
+    ~gemma3_context() {
+        clip_free(ctx_clip);
+    }
+};
+
+struct decode_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
+static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
+    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
+    common_batch_clear(ctx.batch);
+    for (llama_token & t : tokens) {
+        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
+    }
+    if (logits_last) {
+        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
+    }
+    // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
+    if (llama_decode(ctx.lctx, ctx.batch)) {
+        LOG_ERR("Failed to decode text\n");
+        return 1;
+    }
+    return 0;
+}
+
+static int eval_image(gemma3_context & ctx, std::string & fname) {
+    std::vector<float> image_embd_v;
+    int n_embd = llama_model_n_embd(ctx.model);
+    int n_tokens = 256;
+    image_embd_v.resize(n_tokens * n_embd);
+
+    bool ok;
+    struct clip_image_u8 * img_u8 = clip_image_u8_init();
+    ok = clip_image_load_from_file(fname.c_str(), img_u8);
+    if (!ok) {
+        LOG_ERR("Unable to load image %s\n", fname.c_str());
+        clip_image_u8_free(img_u8);
+        return 2; // non-fatal error
+    }
+
+    clip_image_f32_batch batch_f32;
+    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
+    if (!ok) {
+        LOG_ERR("Unable to preprocess image\n");
+        clip_image_f32_batch_free(&batch_f32);
+        clip_image_u8_free(img_u8);
+        return 1;
+    }
+
+    int64_t t0 = ggml_time_ms();
+    LOG("Encoding image %s\n", fname.c_str());
+    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
+    if (!ok) {
+        LOG_ERR("Unable to encode image\n");
+        clip_image_f32_batch_free(&batch_f32);
+        clip_image_u8_free(img_u8);
+        return 1;
+    }
+    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+
+    clip_image_f32_batch_free(&batch_f32);
+    clip_image_u8_free(img_u8);
+
+    // decode image embeddings
+    int64_t t1 = ggml_time_ms();
+    eval_text(ctx, "<start_of_image>");
+    llama_set_causal_attn(ctx.lctx, false);
+    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
+    if (llama_decode(ctx.lctx, batch_img.batch)) {
+        LOG_ERR("failed to decode image\n");
+        return 1;
+    }
+    ctx.n_past += n_tokens;
+    llama_set_causal_attn(ctx.lctx, true);
+    eval_text(ctx, "<end_of_image>");
+    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+    return 0;
+}
+
+static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
+    for (int i = 0; i < n_predict; i++) {
+        if (i > n_predict || !g_is_generating) {
+            printf("\n");
+            break;
+        }
+
+        llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
+        common_sampler_accept(smpl, token_id, true);
+
+        if (llama_vocab_is_eog(ctx.vocab, token_id)) {
+            printf("\n");
+            break; // end of generation
+        }
+
+        printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
+        fflush(stdout);
+
+        // eval the token
+        common_batch_clear(ctx.batch);
+        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
+        if (llama_decode(ctx.lctx, ctx.batch)) {
+            LOG_ERR("failed to decode token\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    common_params params;
+    params.sampling.temp = 0.2; // lower temp by default for better quality
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+        return 1;
+    }
+
+    common_init();
+
+    if (params.mmproj.empty()) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    gemma3_context ctx(params);
+    printf("%s: %s\n", __func__, params.model.c_str());
+
+    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
+
+    struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
+    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
+
+    // ctrl+C handling
+    {
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = sigint_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+    }
+
+    if (eval_text(ctx, "<bos>")) {
+        return 1;
+    }
+
+    if (is_single_turn) {
+        g_is_generating = true;
+        if (eval_text(ctx, "<start_of_turn>user\n")) {
+            return 1;
+        }
+        for (auto & fname : params.image) {
+            if (eval_image(ctx, fname)) {
+                return 1;
+            }
+        }
+        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
+            return 1;
+        }
+        if (generate_response(ctx, smpl, n_predict)) {
+            return 1;
+        }
+
+    } else {
+        LOG("\n Running in chat mode, available commands:");
+        LOG("\n   /image <path>    load an image");
+        LOG("\n   /clear           clear the chat history");
+        LOG("\n   /quit or /exit   exit the program");
+        LOG("\n");
+
+        if (eval_text(ctx, "<start_of_turn>user\n")) {
+            return 1;
+        }
+
+        while (true) {
+            g_is_generating = false;
+            LOG("\n> ");
+            console::set_display(console::user_input);
+            std::string line;
+            console::readline(line, false);
+            console::set_display(console::reset);
+            line = string_strip(line);
+            if (line.empty()) {
+                continue;
+            }
+            if (line == "/quit" || line == "/exit") {
+                break;
+            }
+            if (line == "/clear") {
+                ctx.n_past = 0;
+                llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
+                LOG("Chat history cleared\n\n");
+                continue;
+            }
+            g_is_generating = true;
+            if (line.find("/image") == 0) {
+                std::string image = line.substr(7);
+                int res = eval_image(ctx, image);
+                if (res == 2) {
+                    continue; // image not found
+                }
+                if (res) {
+                    return 1;
+                }
+                continue;
+            }
+            if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
+                return 1;
+            }
+            if (generate_response(ctx, smpl, n_predict)) {
+                return 1;
+            }
+            if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
+                return 1;
+            }
+        }
+    }
+
+    return 0;
+}
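Per the usage string baked into show_additional_info, a single-turn run looks roughly like the following (the llama-gemma3-cli binary name is our assumption based on the new target in examples/llava/CMakeLists.txt, and the file names are placeholders):

    llama-gemma3-cli -m gemma-3-4b-it.gguf --mmproj mmproj-gemma3.gguf --image photo.jpg -p "describe the image in detail"

Leaving out --image and -p starts the interactive chat mode with the /image, /clear and /quit commands listed above. Note how eval_image injects each image: it evaluates <start_of_image>, disables causal attention while the 256 projected embeddings are decoded through decode_embd_batch, then restores causal attention and evaluates <end_of_image>.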
package/src/llama.cpp/examples/llava/llava.cpp

@@ -353,9 +353,10 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
 
     const int32_t * image_grid = clip_image_grid(ctx_clip);
+    const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
 
     std::vector<std::pair<int, int>> grid_pinpoints;
-    for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
+    for (size_t i = 0; i < num_gridpoints; i += 2) {
         grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
     }
 
@@ -405,7 +406,8 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 }
 
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    int num_max_patches = 6;
+    // Granite vision uses up to 10 patches + base patch
+    int num_max_patches = 11;
     if (clip_is_minicpmv(ctx_clip)) {
         num_max_patches = 10;
     }
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp

@@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) {
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
-    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    struct clip_context_params clip_params = {
+        /* use_gpu */   params->n_gpu_layers != 0,
+        /* verbosity */ params->verbosity,
+    };
+    auto * ctx_clip = clip_init(clip_path, clip_params);
     return ctx_clip;
 }
 
@@ -148,19 +152,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
     process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
     eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
     if (num_image_embeds > 1) {
-        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
-        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
-        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
-            for (size_t j = 0; j < num_image_embeds_col; ++j) {
-                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
-                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
-                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
-                if (j == num_image_embeds_col - 1) {
-                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+        if (has_minicpmv_projector == 2) {
+            size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+            eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+            for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+                for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                    eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                    if (j == num_image_embeds_col - 1) {
+                        eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                    }
+                }
+            }
+            eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+        }
+        else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) {
+            size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+            for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+                for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                    eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+                    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                    eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+                    if (j == num_image_embeds_col - 1) {
+                        eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                    }
                 }
             }
         }
-        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
     }
     LOG_INF("%s: image token past: %d\n", __func__, n_past);
 }