@fugood/llama.node 0.3.15 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +243 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +14 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +161 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1544 -291
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/tests/test-grammar-llguidance.cpp:

@@ -2,7 +2,6 @@
 #undef NDEBUG
 #endif
 
-#include "unicode.h"
 #include "sampling.h"
 
 #include <cassert>
@@ -84,7 +83,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
 
         fprintf(stderr,
                 "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following "
-                "command: ./
+                "command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf "
                 "test-grammar-integration.string.txt\n\n");
     } else {
         fprintf(stdout, "✅︎\n");
@@ -1086,6 +1085,65 @@ static void test_json_schema() {
     });
 }
 
+static void one_hot(llama_token_data_array & tok_arr, llama_token selected) {
+    auto n_vocab = tok_arr.size;
+
+    tok_arr.selected = -1;
+    tok_arr.sorted = false;
+    for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
+        tok_arr.data[token_id].id = token_id;
+        tok_arr.data[token_id].logit = 0.0f;
+    }
+
+    tok_arr.data[selected].logit = 100.0f;
+}
+
+static void test_sampler_chain(void) {
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+    llama_sampler * sampler = llama_sampler_chain_init(sparams);
+
+    const auto grammar_data = R"(%llguidance {}
+start: /[A-Z ]*/)";
+
+    llama_sampler_chain_add(sampler, llama_sampler_init_llg(vocab, "lark", grammar_data));
+    llama_sampler_chain_add(sampler, llama_sampler_init_dist(42));
+
+    auto input = "ALL YOUR BASE ARE BELONG TO US";
+    auto tokens = common_tokenize(vocab, input, false, false);
+
+    auto n_vocab = llama_vocab_n_tokens(vocab);
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
+        cur.emplace_back(llama_token_data{ token_id, 0.0f, 0.0f });
+    }
+    auto tok_arr = llama_token_data_array{ cur.data(), cur.size(), -1, false };
+
+    for (const auto token : tokens) {
+        one_hot(tok_arr, token);
+
+        fprintf(stderr, "applying token: %d\n", token);
+        llama_sampler_apply(sampler, &tok_arr);
+
+        auto idx = tok_arr.selected;
+        fprintf(stderr, " -> %d %f\n", cur[idx].id, cur[idx].logit);
+        assert(cur[tok_arr.selected].id == token);
+        llama_sampler_accept(sampler, token);
+    }
+
+    auto tok_eos = llama_vocab_eot(vocab);
+    if (tok_eos == LLAMA_TOKEN_NULL) {
+        tok_eos = llama_vocab_eos(vocab);
+    }
+
+    one_hot(tok_arr, tok_eos);
+
+    llama_sampler_apply(sampler, &tok_arr);
+    assert(cur[tok_arr.selected].id == tok_eos);
+}
+
 int main(int argc, const char ** argv) {
     fprintf(stdout, "Running llguidance integration tests...\n");
 
@@ -1135,6 +1193,9 @@ int main(int argc, const char ** argv) {
     test_special_chars();
     test_quantifiers();
     test_json_schema();
+
+    test_sampler_chain();
+
     fprintf(stdout, "All tests passed.\n");
     return 0;
 }
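The hunks above (the `tests/test-grammar-llguidance.cpp +63 -2` entry in the file list) add a `test_sampler_chain()` case that drives the llguidance-backed grammar sampler directly: each expected token is forced through the chain via a one-hot logit array and must survive the grammar mask. For orientation, here is a minimal sketch of how the same two samplers are typically combined for actual generation. It is not part of this diff; it assumes a build with LLAMA_LLGUIDANCE enabled and an already-created `vocab`/`ctx` pair (placeholder names).

```cpp
// Sketch only: combines the llguidance sampler exercised by the new test with a
// seeded distribution sampler. Assumes LLAMA_LLGUIDANCE is enabled at build time and
// that a llama_vocab / llama_context already exist elsewhere.
#include "llama.h"
#include "sampling.h"   // common/sampling.h declares llama_sampler_init_llg(vocab, kind, data)

static llama_sampler * make_uppercase_chain(const llama_vocab * vocab) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // Same Lark grammar as the test: only uppercase letters and spaces are allowed.
    const char * grammar = "%llguidance {}\nstart: /[A-Z ]*/";
    llama_sampler_chain_add(chain, llama_sampler_init_llg(vocab, "lark", grammar));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(42)); // fixed seed

    return chain;
}

// After each llama_decode() call, llama_sampler_sample() applies the chain to the
// last logits, picks a token, and accepts it into the grammar state.
static llama_token sample_next(llama_sampler * chain, llama_context * ctx) {
    return llama_sampler_sample(chain, ctx, -1);
}
```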
package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp:

@@ -4,7 +4,7 @@
 
 #include "json-schema-to-grammar.h"
 
-#include "llama-grammar.h"
+#include "../src/llama-grammar.h"
 
 #include <cassert>
 #include <fstream>
@@ -597,6 +597,22 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
         )"""
     });
 
+    test({
+        SUCCESS,
+        "maxItems 0",
+        R"""({
+            "items": {
+                "type": "boolean"
+            },
+            "maxItems": 0
+        })""",
+        R"""(
+            boolean ::= ("true" | "false") space
+            root ::= "[" space "]" space
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )"""
+    });
+
     test({
         SUCCESS,
         "maxItems 1",
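The new `maxItems 0` case (part of `tests/test-json-schema-to-grammar.cpp +17 -1` in the file list) pins down that an array schema capped at zero items collapses to a grammar that accepts only an empty array. Below is a hedged sketch of invoking the same conversion through common's `json_schema_to_grammar()` helper; the include path and exact signature follow the llama.cpp tree bundled in this package and may differ in other checkouts.

```cpp
// Sketch only: feeds the schema from the new "maxItems 0" test case through the
// json_schema_to_grammar() helper declared in common/json-schema-to-grammar.h.
#include "json-schema-to-grammar.h"

#include <cstdio>
#include <string>

int main() {
    // An array schema that may hold at most zero items...
    auto schema = nlohmann::ordered_json::parse(R"""({
        "items": { "type": "boolean" },
        "maxItems": 0
    })""");

    // ...collapses to a grammar whose root only accepts "[ ]" (see the expected
    // GBNF in the test case above).
    std::string grammar = json_schema_to_grammar(schema);
    printf("%s\n", grammar.c_str());
    return 0;
}
```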
package/src/llama.cpp/examples/llava/gemma3-cli.cpp (deleted):

@@ -1,341 +0,0 @@
-#include "arg.h"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "stb_image.h"
-#include "llama.h"
-#include "ggml.h"
-#include "console.h"
-
-#include <vector>
-#include <limits.h>
-#include <inttypes.h>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-static bool g_is_generating = false;
-
-/**
- * Please note that this is NOT a production-ready stuff.
- * It is a playground for trying Gemma 3 vision capabilities.
- * For contributors: please keep this code simple and easy to understand.
- */
-
-static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG(
-        "Experimental CLI for using Gemma 3 vision model\n\n"
-        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
-        " -m and --mmproj are required\n"
-        " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
-        argv[0]
-    );
-}
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-static void sigint_handler(int signo) {
-    if (signo == SIGINT) {
-        if (g_is_generating) {
-            g_is_generating = false;
-        } else {
-            console::cleanup();
-            LOG("\nInterrupted by user\n");
-            _exit(130);
-        }
-    }
-}
-#endif
-
-struct gemma3_context {
-    struct clip_ctx * ctx_clip = NULL;
-    common_init_result llama_init;
-
-    llama_model * model;
-    llama_context * lctx;
-    const llama_vocab * vocab;
-    llama_batch batch;
-
-    int n_threads = 1;
-    llama_pos n_past = 0;
-
-    gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) {
-        model = llama_init.model.get();
-        lctx = llama_init.context.get();
-        vocab = llama_model_get_vocab(model);
-        n_threads = params.cpuparams.n_threads;
-        batch = llama_batch_init(params.n_batch, 0, 1);
-        init_clip_model(params);
-    }
-
-    void init_clip_model(common_params & params) {
-        const char * clip_path = params.mmproj.c_str();
-        ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
-    }
-
-    ~gemma3_context() {
-        clip_free(ctx_clip);
-    }
-};
-
-struct decode_embd_batch {
-    std::vector<llama_pos> pos;
-    std::vector<int32_t> n_seq_id;
-    std::vector<llama_seq_id> seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t> logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens =*/ nullptr,
-            /*embd =*/ embd,
-            /*pos =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id =*/ seq_ids.data(),
-            /*logits =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id [i] = seq_id_0.data();
-            batch.logits [i] = false;
-        }
-    }
-};
-
-static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
-    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
-    common_batch_clear(ctx.batch);
-    for (llama_token & t : tokens) {
-        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
-    }
-    if (logits_last) {
-        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
-    }
-    // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
-    if (llama_decode(ctx.lctx, ctx.batch)) {
-        LOG_ERR("Failed to decode text\n");
-        return 1;
-    }
-    return 0;
-}
-
-static int eval_image(gemma3_context & ctx, std::string & fname) {
-    std::vector<float> image_embd_v;
-    int n_embd = llama_model_n_embd(ctx.model);
-    int n_tokens = 256;
-    image_embd_v.resize(n_tokens * n_embd);
-
-    bool ok;
-    struct clip_image_u8 * img_u8 = clip_image_u8_init();
-    ok = clip_image_load_from_file(fname.c_str(), img_u8);
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname.c_str());
-        clip_image_u8_free(img_u8);
-        return 2; // non-fatal error
-    }
-
-    clip_image_f32_batch batch_f32;
-    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
-    if (!ok) {
-        LOG_ERR("Unable to preprocess image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-
-    int64_t t0 = ggml_time_ms();
-    LOG("Encoding image %s\n", fname.c_str());
-    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
-    if (!ok) {
-        LOG_ERR("Unable to encode image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-
-    clip_image_f32_batch_free(&batch_f32);
-    clip_image_u8_free(img_u8);
-
-    // decode image embeddings
-    int64_t t1 = ggml_time_ms();
-    eval_text(ctx, "<start_of_image>");
-    llama_set_causal_attn(ctx.lctx, false);
-    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
-    if (llama_decode(ctx.lctx, batch_img.batch)) {
-        LOG_ERR("failed to decode image\n");
-        return 1;
-    }
-    ctx.n_past += n_tokens;
-    llama_set_causal_attn(ctx.lctx, true);
-    eval_text(ctx, "<end_of_image>");
-    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
-    return 0;
-}
-
-static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
-    for (int i = 0; i < n_predict; i++) {
-        if (i > n_predict || !g_is_generating) {
-            printf("\n");
-            break;
-        }
-
-        llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
-        common_sampler_accept(smpl, token_id, true);
-
-        if (llama_vocab_is_eog(ctx.vocab, token_id)) {
-            printf("\n");
-            break; // end of generation
-        }
-
-        printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
-        fflush(stdout);
-
-        // eval the token
-        common_batch_clear(ctx.batch);
-        common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
-        if (llama_decode(ctx.lctx, ctx.batch)) {
-            LOG_ERR("failed to decode token\n");
-            return 1;
-        }
-    }
-    return 0;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-    params.sampling.temp = 0.2; // lower temp by default for better quality
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.empty()) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    gemma3_context ctx(params);
-    printf("%s: %s\n", __func__, params.model.c_str());
-
-    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
-
-    struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
-    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
-
-    // ctrl+C handling
-    {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-        struct sigaction sigint_action;
-        sigint_action.sa_handler = sigint_handler;
-        sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0;
-        sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
-        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
-        };
-        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-#endif
-    }
-
-    if (eval_text(ctx, "<bos>")) {
-        return 1;
-    }
-
-    if (is_single_turn) {
-        g_is_generating = true;
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-        for (auto & fname : params.image) {
-            if (eval_image(ctx, fname)) {
-                return 1;
-            }
-        }
-        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
-            return 1;
-        }
-        if (generate_response(ctx, smpl, n_predict)) {
-            return 1;
-        }
-
-    } else {
-        LOG("\n Running in chat mode, available commands:");
-        LOG("\n /image <path> load an image");
-        LOG("\n /clear clear the chat history");
-        LOG("\n /quit or /exit exit the program");
-        LOG("\n");
-
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-
-        while (true) {
-            g_is_generating = false;
-            LOG("\n> ");
-            console::set_display(console::user_input);
-            std::string line;
-            console::readline(line, false);
-            console::set_display(console::reset);
-            line = string_strip(line);
-            if (line.empty()) {
-                continue;
-            }
-            if (line == "/quit" || line == "/exit") {
-                break;
-            }
-            if (line == "/clear") {
-                ctx.n_past = 0;
-                llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
-                LOG("Chat history cleared\n\n");
-                continue;
-            }
-            g_is_generating = true;
-            if (line.find("/image") == 0) {
-                std::string image = line.substr(7);
-                int res = eval_image(ctx, image);
-                if (res == 2) {
-                    continue; // image not found
-                }
-                if (res) {
-                    return 1;
-                }
-                continue;
-            }
-            if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
-                return 1;
-            }
-            if (generate_response(ctx, smpl, n_predict)) {
-                return 1;
-            }
-            if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
-                return 1;
-            }
-        }
-    }
-
-    return 0;
-}
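The hunk above removes `examples/llava/gemma3-cli.cpp` outright; per the file list, it is superseded (together with the deleted `llava-cli.cpp` and `minicpmv-cli.cpp`) by the new unified multimodal tooling added in this release (`mtmd-cli.cpp`, `mtmd.cpp`, `mtmd.h`, plus `deprecation-warning.cpp`). The core technique the deleted file implemented is still worth noting: pre-computed image embeddings are pushed through `llama_decode()` as an embedding batch with causal attention temporarily disabled. The sketch below condenses the deleted `decode_embd_batch` helper; it is derived from that removed code, not from the new mtmd implementation, and assumes `ctx` and `embd` (n_tokens * n_embd floats from the vision encoder) already exist.

```cpp
// Sketch only, condensed from the deleted decode_embd_batch helper above.
#include "llama.h"

#include <vector>

static int decode_image_embd(llama_context * ctx, float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
    std::vector<llama_pos>      pos(n_tokens);
    std::vector<int32_t>        n_seq_id(n_tokens, 1);
    std::vector<llama_seq_id>   seq_id_0 = { seq_id };
    std::vector<llama_seq_id *> seq_ids(n_tokens + 1, seq_id_0.data());
    std::vector<int8_t>         logits(n_tokens, 0);   // no logits needed for image positions

    seq_ids[n_tokens] = nullptr;
    for (int32_t i = 0; i < n_tokens; i++) {
        pos[i] = pos_0 + i;
    }

    llama_batch batch = {
        /*n_tokens =*/ n_tokens,
        /*token    =*/ nullptr,      // no token ids: embeddings are passed directly
        /*embd     =*/ embd,
        /*pos      =*/ pos.data(),
        /*n_seq_id =*/ n_seq_id.data(),
        /*seq_id   =*/ seq_ids.data(),
        /*logits   =*/ logits.data(),
    };

    // Gemma 3 attends to image tokens bidirectionally, so causal masking is turned
    // off for this single decode call and restored afterwards.
    llama_set_causal_attn(ctx, false);
    const int ret = llama_decode(ctx, batch);
    llama_set_causal_attn(ctx, true);
    return ret;
}
```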