@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295

package/src/llama.cpp/examples/llava/gemma3-cli.cpp (deleted)
@@ -1,341 +0,0 @@
-#include "arg.h"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "stb_image.h"
-#include "llama.h"
-#include "ggml.h"
-#include "console.h"
-
-#include <vector>
-#include <limits.h>
-#include <inttypes.h>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-static bool g_is_generating = false;
-
-/**
- * Please note that this is NOT a production-ready stuff.
- * It is a playground for trying Gemma 3 vision capabilities.
- * For contributors: please keep this code simple and easy to understand.
- */
-
-static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG(
-        "Experimental CLI for using Gemma 3 vision model\n\n"
-        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
-        " -m and --mmproj are required\n"
-        " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
-        argv[0]
-    );
-}
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-static void sigint_handler(int signo) {
-    if (signo == SIGINT) {
-        if (g_is_generating) {
-            g_is_generating = false;
-        } else {
-            console::cleanup();
-            LOG("\nInterrupted by user\n");
-            _exit(130);
-        }
-    }
-}
-#endif
-
-struct gemma3_context {
-    struct clip_ctx * ctx_clip = NULL;
-    common_init_result llama_init;
-
-    llama_model * model;
-    llama_context * lctx;
-    const llama_vocab * vocab;
-    llama_batch batch;
-
-    int n_threads = 1;
-    llama_pos n_past = 0;
-
-    gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
-        model = llama_init.model.get();
-        lctx = llama_init.context.get();
-        vocab = llama_model_get_vocab(model);
-        n_threads = params.cpuparams.n_threads;
-        batch = llama_batch_init(params.n_batch, 0, 1);
-        init_clip_model(params);
-    }
-
-    void init_clip_model(common_params & params) {
-        const char * clip_path = params.mmproj.c_str();
-        ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
-    }
-
-    ~gemma3_context() {
-        clip_free(ctx_clip);
-    }
-};
-
-struct decode_embd_batch {
-    std::vector<llama_pos> pos;
-    std::vector<int32_t> n_seq_id;
-    std::vector<llama_seq_id> seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t> logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens =*/ nullptr,
-            /*embd =*/ embd,
-            /*pos =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id =*/ seq_ids.data(),
-            /*logits =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id [i] = seq_id_0.data();
-            batch.logits [i] = false;
-        }
-    }
-};
-
-static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
-    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
-    common_batch_clear(ctx.batch);
-    for (llama_token & t : tokens) {
-        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
-    }
-    if (logits_last) {
-        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
-    }
-    // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
-    if (llama_decode(ctx.lctx, ctx.batch)) {
-        LOG_ERR("Failed to decode text\n");
-        return 1;
-    }
-    return 0;
-}
-
-static int eval_image(gemma3_context & ctx, std::string & fname) {
-    std::vector<float> image_embd_v;
-    int n_embd = llama_model_n_embd(ctx.model);
-    int n_tokens = 256;
-    image_embd_v.resize(n_tokens * n_embd);
-
-    bool ok;
-    struct clip_image_u8 * img_u8 = clip_image_u8_init();
-    ok = clip_image_load_from_file(fname.c_str(), img_u8);
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname.c_str());
-        clip_image_u8_free(img_u8);
-        return 2; // non-fatal error
-    }
-
-    clip_image_f32_batch batch_f32;
-    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
-    if (!ok) {
-        LOG_ERR("Unable to preprocess image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-
-    int64_t t0 = ggml_time_ms();
-    LOG("Encoding image %s\n", fname.c_str());
-    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
-    if (!ok) {
-        LOG_ERR("Unable to encode image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-
-    clip_image_f32_batch_free(&batch_f32);
-    clip_image_u8_free(img_u8);
-
-    // decode image embeddings
-    int64_t t1 = ggml_time_ms();
-    eval_text(ctx, "<start_of_image>");
-    llama_set_causal_attn(ctx.lctx, false);
-    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
-    if (llama_decode(ctx.lctx, batch_img.batch)) {
-        LOG_ERR("failed to decode image\n");
-        return 1;
-    }
-    ctx.n_past += n_tokens;
-    llama_set_causal_attn(ctx.lctx, true);
-    eval_text(ctx, "<end_of_image>");
-    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
-    return 0;
-}
-
-static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
-    for (int i = 0; i < n_predict; i++) {
-        if (i > n_predict || !g_is_generating) {
-            printf("\n");
-            break;
-        }
-
-        llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
-        common_sampler_accept(smpl, token_id, true);
-
-        if (llama_vocab_is_eog(ctx.vocab, token_id)) {
-            printf("\n");
-            break; // end of generation
-        }
-
-        printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
-        fflush(stdout);
-
-        // eval the token
-        common_batch_clear(ctx.batch);
-        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
-        if (llama_decode(ctx.lctx, ctx.batch)) {
-            LOG_ERR("failed to decode token\n");
-            return 1;
-        }
-    }
-    return 0;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-    params.sampling.temp = 0.2; // lower temp by default for better quality
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.empty()) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    gemma3_context ctx(params);
-    printf("%s: %s\n", __func__, params.model.c_str());
-
-    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
-
-    struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
-    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
-
-    // ctrl+C handling
-    {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-        struct sigaction sigint_action;
-        sigint_action.sa_handler = sigint_handler;
-        sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0;
-        sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
-        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
-        };
-        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-#endif
-    }
-
-    if (eval_text(ctx, "<bos>")) {
-        return 1;
-    }
-
-    if (is_single_turn) {
-        g_is_generating = true;
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-        for (auto & fname : params.image) {
-            if (eval_image(ctx, fname)) {
-                return 1;
-            }
-        }
-        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
-            return 1;
-        }
-        if (generate_response(ctx, smpl, n_predict)) {
-            return 1;
-        }
-
-    } else {
-        LOG("\n Running in chat mode, available commands:");
-        LOG("\n /image <path> load an image");
-        LOG("\n /clear clear the chat history");
-        LOG("\n /quit or /exit exit the program");
-        LOG("\n");
-
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-
-        while (true) {
-            g_is_generating = false;
-            LOG("\n> ");
-            console::set_display(console::user_input);
-            std::string line;
-            console::readline(line, false);
-            console::set_display(console::reset);
-            line = string_strip(line);
-            if (line.empty()) {
-                continue;
-            }
-            if (line == "/quit" || line == "/exit") {
-                break;
-            }
-            if (line == "/clear") {
-                ctx.n_past = 0;
-                llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
-                LOG("Chat history cleared\n\n");
-                continue;
-            }
-            g_is_generating = true;
-            if (line.find("/image") == 0) {
-                std::string image = line.substr(7);
-                int res = eval_image(ctx, image);
-                if (res == 2) {
-                    continue; // image not found
-                }
-                if (res) {
-                    return 1;
-                }
-                continue;
-            }
-            if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
-                return 1;
-            }
-            if (generate_response(ctx, smpl, n_predict)) {
-                return 1;
-            }
-            if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
-                return 1;
-            }
-        }
-    }
-
-    return 0;
-}
package/src/llama.cpp/examples/llava/llava-cli.cpp (deleted)
@@ -1,332 +0,0 @@
-#include "arg.h"
-#include "base64.hpp"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "llava.h"
-#include "llama.h"
-#include "ggml.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <vector>
-
-static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
-    int N = (int) tokens.size();
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past);
-}
-
-static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
-    std::string str2 = str;
-    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
-    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
-    return true;
-}
-
-static const char * sample(struct common_sampler * smpl,
-                           struct llama_context * ctx_llama,
-                           int * n_past) {
-    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
-    common_sampler_accept(smpl, id, true);
-
-    const llama_model * model = llama_get_model(ctx_llama);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    static std::string ret;
-    if (llama_vocab_is_eog(vocab, id)) {
-        ret = "</s>";
-    } else {
-        ret = common_token_to_piece(ctx_llama, id);
-    }
-    eval_id(ctx_llama, id, n_past);
-    return ret.c_str();
-}
-
-static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
-static const char* IMG_BASE64_TAG_END = "\">";
-
-static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
-    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
-    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
-}
-
-static bool prompt_contains_image(const std::string& prompt) {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    return (begin != std::string::npos);
-}
-
-// replaces the base64 image tag in the prompt with `replacement`
-static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
-    size_t img_base64_str_start, img_base64_str_end;
-    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
-    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
-        return NULL;
-    }
-
-    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
-    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
-    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
-
-    auto required_bytes = base64::required_encode_size(base64_str.size());
-    auto img_bytes = std::vector<unsigned char>(required_bytes);
-    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
-
-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
-    if (!embed) {
-        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
-        return NULL;
-    }
-
-    return embed;
-}
-
-static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    if (begin == std::string::npos || end == std::string::npos) {
-        return prompt;
-    }
-    auto pre = prompt.substr(0, begin);
-    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
-    return pre + replacement + post;
-}
-
-struct llava_context {
-    struct clip_ctx * ctx_clip = NULL;
-    struct llama_context * ctx_llama = NULL;
-    struct llama_model * model = NULL;
-};
-
-static void print_usage(int, char ** argv) {
-    LOG("\n example usage:\n");
-    LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
-
-    // load and preprocess the image
-    llava_image_embed * embed = NULL;
-    auto prompt = params->prompt;
-    if (prompt_contains_image(prompt)) {
-        if (!params->image.empty()) {
-            LOG_INF("using base64 encoded image instead of command line image path\n");
-        }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
-        if (!embed) {
-            LOG_ERR("%s: can't load image from prompt\n", __func__);
-            return NULL;
-        }
-        params->prompt = remove_image_from_prompt(prompt);
-    } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
-        if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
-            return NULL;
-        }
-    }
-
-    return embed;
-}
-
-static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
-    int n_past = 0;
-
-    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-
-    std::string system_prompt, user_prompt;
-    size_t image_pos = prompt.find("<image>");
-    if (image_pos != std::string::npos) {
-        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-        system_prompt = prompt.substr(0, image_pos);
-        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-    } else {
-        // llava-1.5 native mode
-        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
-        user_prompt = prompt + "\nASSISTANT:";
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-    }
-
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
-    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
-
-    // generate the response
-
-    LOG("\n");
-
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
-    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }
-
-    std::string response = "";
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
-        response += tmp;
-        if (strcmp(tmp, "</s>") == 0) break;
-        if (strstr(tmp, "###")) break; // Yi-VL behavior
-        LOG("%s", tmp);
-        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
-        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
-        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
-
-        fflush(stdout);
-    }
-
-    common_sampler_free(smpl);
-    LOG("\n");
-}
-
-static struct llama_model * llava_init(common_params * params) {
-    llama_backend_init();
-    llama_numa_init(params->numa);
-
-    llama_model_params model_params = common_model_params_to_llama(*params);
-
-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
-    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
-        return NULL;
-    }
-    return model;
-}
-
-static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
-    llama_context_params ctx_params = common_context_params_to_llama(*params);
-    ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
-
-    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
-
-    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
-        return NULL;
-    }
-
-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
-
-    ctx_llava->ctx_llama = ctx_llama;
-    ctx_llava->ctx_clip = ctx_clip;
-    ctx_llava->model = model;
-    return ctx_llava;
-}
-
-static void llava_free(struct llava_context * ctx_llava) {
-    if (ctx_llava->ctx_clip) {
-        clip_free(ctx_llava->ctx_clip);
-        ctx_llava->ctx_clip = NULL;
-    }
-
-    llama_free(ctx_llava->ctx_llama);
-    llama_model_free(ctx_llava->model);
-    llama_backend_free();
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv);
-        return 1;
-    }
-
-    auto * model = llava_init(&params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
-        return 1;
-    }
-
-    if (prompt_contains_image(params.prompt)) {
-        auto * ctx_llava = llava_init_context(&params, model);
-
-        auto * image_embed = load_image(ctx_llava, &params, "");
-
-        // process the prompt
-        process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        llava_image_embed_free(image_embed);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
-    } else {
-        for (auto & image : params.image) {
-            auto * ctx_llava = llava_init_context(&params, model);
-
-            auto * image_embed = load_image(ctx_llava, &params, image);
-            if (!image_embed) {
-                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
-                return 1;
-            }
-
-            // process the prompt
-            process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-            llama_perf_context_print(ctx_llava->ctx_llama);
-            llava_image_embed_free(image_embed);
-            ctx_llava->model = NULL;
-            llava_free(ctx_llava);
-        }
-    }
-
-    llama_model_free(model);
-
-    return 0;
-}