@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/src/llama-model.h

@@ -39,11 +39,14 @@ enum llm_type {
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
+    LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
@@ -61,6 +64,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -69,6 +73,7 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
@@ -83,7 +88,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
+    LLM_TYPE_17B_16E,  // llama4 Scout
+    LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };
 
 struct llama_layer_posnet {
@@ -167,6 +175,8 @@ struct llama_layer {
     struct ggml_tensor * wq_b = nullptr;
     struct ggml_tensor * wkv_a_mqa = nullptr;
     struct ggml_tensor * wkv_b = nullptr;
+    struct ggml_tensor * wk_b = nullptr;
+    struct ggml_tensor * wv_b = nullptr;
     struct ggml_tensor * wq_cross = nullptr;
     struct ggml_tensor * wk_cross = nullptr;
     struct ggml_tensor * wv_cross = nullptr;
@@ -380,6 +390,8 @@ struct llama_model {
 
     ggml_backend_buffer_type_t select_buft(int il) const;
 
+    bool has_tensor_overrides() const;
+
     const struct ggml_tensor * get_tensor(const char * name) const;
 
     // TODO: move this to new llm_arch_model_i interface
package/src/llama.cpp/src/llama-quant.cpp

@@ -10,6 +10,7 @@
 #include <cinttypes>
 #include <fstream>
 #include <mutex>
+#include <regex>
 #include <thread>
 #include <unordered_map>
 
@@ -47,8 +48,14 @@ struct quantize_state_impl {
     {}
 };
 
+// changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void llama_tensor_dequantize_impl(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
     if (output.size() < nelements) {
@@ -527,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
 
-    struct quantize_state_impl qs(model, params);
+    quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
         ftype = ml.ftype;
@@ -661,7 +668,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // populate the original tensors so we get an initial meta data
     for (const auto * it : tensors) {
         uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+        ggml_tensor * tensor = it->tensor;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());
         }
@@ -710,7 +717,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     new_ofstream(0);
     for (const auto * it : tensors) {
         const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
+        ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
             new_ofstream(weight.idx);
@@ -776,7 +783,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
-        enum ggml_type new_type;
+        ggml_type new_type;
         void * new_data;
         size_t new_size;
 
@@ -786,6 +793,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             // get more optimal quantization type based on the tensor shape, layer, etc.
             if (!params->pure && ggml_is_quantized(default_type)) {
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // unless the user specifies a type
+                if (params->tensor_types) {
+                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    for (const auto & [tname, qtype] : tensor_types) {
+                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
+                            if (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                            }
+                            new_type = qtype;
+                            break;
+                        }
+                    }
+                }
             }
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
@@ -910,8 +930,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // interface implementation
 //
 
-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
+llama_model_quantize_params llama_model_quantize_default_params() {
+    llama_model_quantize_params result = {
         /*.nthread =*/ 0,
         /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.output_tensor_type =*/ GGML_TYPE_COUNT,
@@ -923,6 +943,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.keep_split =*/ false,
         /*.imatrix =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
+        /*.tensor_type =*/ nullptr,
     };
 
     return result;
package/src/llama.cpp/src/llama-sampling.cpp

@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }
 
     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }
 
     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
 
@@ -1477,6 +1478,7 @@ static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sam
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
     auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
+    GGML_ASSERT(result);
 
     // copy the state
     {
@@ -1548,6 +1550,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
             /* .grammar_root = */ grammar_root,
             /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
         };
+        if (!ctx->grammar) {
+            delete ctx;
+            return nullptr;
+        }
     } else {
         *ctx = {
             /* .vocab = */ vocab,
package/src/llama.cpp/src/llama-vocab.cpp

@@ -342,6 +342,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_MPT:
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
             case LLAMA_VOCAB_PRE_TYPE_JAIS:
+            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
                 regex_exprs = {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
@@ -400,6 +401,20 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+                regex_exprs = {
+                    "\\p{N}+",
+                    "(?=(\\d{3})+(?!\\d))",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1491,7 +1506,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama3" ||
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
-                tokenizer_pre == "falcon3") {
+                tokenizer_pre == "falcon3" ||
+                tokenizer_pre == "pixtral") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             ignore_merges = true;
             add_bos = true;
@@ -1557,6 +1573,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
             clean_spaces = false;
         } else if (
+                tokenizer_pre == "glm4" ||
                 tokenizer_pre == "chatglm-bpe") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
             special_bos_id = LLAMA_TOKEN_NULL;
@@ -1601,9 +1618,22 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "megrez") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
         } else if (
-                tokenizer_pre == "gpt-4o") {
+                tokenizer_pre == "gpt-4o" ||
+                tokenizer_pre == "llama4") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "superbpe") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+            clean_spaces = false;
+        } else if (
+                tokenizer_pre == "trillion") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
+            clean_spaces = false;
+        } else if (
+                tokenizer_pre == "bailingmoe") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -1781,6 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<EOT>"
+                    || t.first == "_<EOT>"
                     || t.first == "<|end▁of▁sentence|>" // DeepSeek
                    ) {
                 special_eot_id = t.second;
@@ -1811,8 +1842,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                     || t.first == "<|fim_prefix|>" // Qwen
                     || t.first == "<fim-prefix>"
+                    || t.first == "<fim_prefix>" // Granite
                     || t.first == "<|fim▁begin|>" // DeepSeek
                     || t.first == "<PRE>"
+                    || t.first == "▁<PRE>" // CodeLlama
                    ) {
                 special_fim_pre_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1828,8 +1861,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                     || t.first == "<|fim_suffix|>" // Qwen
                     || t.first == "<fim-suffix>"
+                    || t.first == "<fim_suffix>" // Granite
                     || t.first == "<|fim▁hole|>" // DeepSeek
                     || t.first == "<SUF>"
+                    || t.first == "▁<SUF>" // CodeLlama
                    ) {
                 special_fim_suf_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1845,8 +1880,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                     || t.first == "<|fim_middle|>" // Qwen
                     || t.first == "<fim-middle>"
+                    || t.first == "<fim_middle>" // Granite
                     || t.first == "<|fim▁end|>" // DeepSeek
                     || t.first == "<MID>"
+                    || t.first == "▁<MID>" // CodeLlama
                    ) {
                 special_fim_mid_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1862,6 +1899,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (false
                     || t.first == "<|fim_pad|>" // Qwen
                     || t.first == "<fim-pad>"
+                    || t.first == "<fim_pad>" // Granite
                     || t.first == "<PAD>"
                    ) {
                 special_fim_pad_id = t.second;
@@ -1880,6 +1918,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|repo_name|>"
                     || t.first == "<fim-repo>"
                     || t.first == "<REPO>"
+                    || t.first == "<reponame>" // Granite
                    ) {
                 special_fim_rep_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1931,6 +1970,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|endoftext|>"
                     || t.first == "<|eom_id|>"
                     || t.first == "<EOT>"
+                    || t.first == "_<EOT>"
                    ) {
                 special_eog_ids.insert(t.second);
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2189,14 +2229,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
                 // find the first occurrence of a given special token in this fragment
                 //  passing offset argument only limit the "search area" but match coordinates
                 //  are still relative to the source full raw_text
-                auto match = raw_text.find(text, raw_text_base_offset);
+                //  string_view begins at pos 0 for the same reason
+                auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
 
                 // no occurrences found, stop processing this fragment for a given special token
                 if (match == std::string::npos) break;
 
-                // check if match is within bounds of offset <-> length
-                if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-
 #ifdef PRETOKENIZERDEBUG
                 LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
package/src/llama.cpp/src/llama.cpp

@@ -92,7 +92,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
package/src/llama.cpp/tests/CMakeLists.txt

@@ -1,5 +1,17 @@
 llama_add_compile_flags()
 
+function(llama_build source)
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_TARGET ${LLAMA_TEST_NAME})
+    else()
+        get_filename_component(TEST_TARGET ${source} NAME_WE)
+    endif()
+
+    add_executable(${TEST_TARGET} ${source})
+    target_link_libraries(${TEST_TARGET} PRIVATE common)
+    install(TARGETS ${TEST_TARGET} RUNTIME)
+endfunction()
+
 function(llama_test target)
     include(CMakeParseArguments)
     set(options)
@@ -36,7 +48,7 @@ endfunction()
 # - LABEL: label for the test (defaults to main)
 # - ARGS: arguments to pass to the test executable
 # - WORKING_DIRECTORY
-function(llama_target_and_test source)
+function(llama_build_and_test source)
     include(CMakeParseArguments)
     set(options)
     set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
@@ -58,6 +70,7 @@ function(llama_target_and_test source)
     add_executable(${TEST_TARGET} ${source} get-model.cpp)
     install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE common)
+
     add_test(
         NAME ${TEST_TARGET}
         WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
@@ -68,9 +81,7 @@ function(llama_target_and_test source)
 endfunction()
 
 # build test-tokenizer-0 target once and add many tests
-add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
-target_link_libraries(test-tokenizer-0 PRIVATE common)
-install(TARGETS test-tokenizer-0 RUNTIME)
+llama_build(test-tokenizer-0.cpp)
 
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
@@ -87,27 +98,27 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
 if (LLAMA_LLGUIDANCE)
-    llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+    llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
 endif ()
 
 if (NOT WIN32)
     # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
-    llama_target_and_test(test-sampling.cpp)
-    llama_target_and_test(test-grammar-parser.cpp)
-    llama_target_and_test(test-grammar-integration.cpp)
-    llama_target_and_test(test-llama-grammar.cpp)
-    llama_target_and_test(test-chat.cpp)
+    llama_build_and_test(test-sampling.cpp)
+    llama_build_and_test(test-grammar-parser.cpp)
+    llama_build_and_test(test-grammar-integration.cpp)
+    llama_build_and_test(test-llama-grammar.cpp)
+    llama_build_and_test(test-chat.cpp)
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-        llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+        llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
         target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
     endif()
 
+    llama_build(test-quantize-stats.cpp)
+    llama_build(test-gbnf-validator.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
-    add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
-    target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
-    install(TARGETS test-tokenizer-1-bpe RUNTIME)
+    llama_build(test-tokenizer-1-bpe.cpp)
 
     # TODO: disabled due to slowness
     #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
@@ -120,33 +131,35 @@ if (NOT WIN32)
     #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
     # build test-tokenizer-1-spm target once and add many tests
-    add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
-    target_link_libraries(test-tokenizer-1-spm PRIVATE common)
-    install(TARGETS test-tokenizer-1-spm RUNTIME)
+    llama_build(test-tokenizer-1-spm.cpp)
 
     llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
     #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
 
-    # llama_target_and_test(test-double-float.cpp) # SLOW
+    # llama_build_and_test(test-double-float.cpp) # SLOW
 endif()
 
-llama_target_and_test(test-log.cpp)
-llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-chat-template.cpp)
+llama_build_and_test(test-log.cpp)
+llama_build_and_test(test-chat-template.cpp)
+
+# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
+if (NOT WIN32)
+    llama_build_and_test(test-arg-parser.cpp)
+endif()
 
-# llama_target_and_test(test-opt.cpp) # SLOW
-llama_target_and_test(test-gguf.cpp)
-llama_target_and_test(test-backend-ops.cpp)
+# llama_build_and_test(test-opt.cpp) # SLOW
+llama_build_and_test(test-gguf.cpp)
+llama_build_and_test(test-backend-ops.cpp)
 
-llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
-llama_target_and_test(test-autorelease.cpp LABEL "model")
+llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_build_and_test(test-autorelease.cpp LABEL "model")
 
 if (NOT GGML_BACKEND_DL)
     # these tests use the backends directly and cannot be built with dynamic loading
-    llama_target_and_test(test-barrier.cpp)
-    llama_target_and_test(test-quantize-fns.cpp)
-    llama_target_and_test(test-quantize-perf.cpp)
-    llama_target_and_test(test-rope.cpp)
+    llama_build_and_test(test-barrier.cpp)
+    llama_build_and_test(test-quantize-fns.cpp)
+    llama_build_and_test(test-quantize-perf.cpp)
+    llama_build_and_test(test-rope.cpp)
 endif()
 
 
package/src/llama.cpp/tests/test-arg-parser.cpp

@@ -77,7 +77,7 @@ int main(void) {
 
     argv = {"binary_name", "-m", "model_file.gguf"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model == "model_file.gguf");
+    assert(params.model.path == "model_file.gguf");
 
     argv = {"binary_name", "-t", "1234"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
@@ -89,7 +89,7 @@ int main(void) {
 
     argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model == "abc.gguf");
+    assert(params.model.path == "abc.gguf");
     assert(params.n_predict == 6789);
     assert(params.n_batch == 9090);
 
@@ -112,7 +112,7 @@ int main(void) {
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model == "blah.gguf");
+    assert(params.model.path == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);
 
 
@@ -122,10 +122,57 @@ int main(void) {
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name", "-m", "overwritten.gguf"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model == "overwritten.gguf");
+    assert(params.model.path == "overwritten.gguf");
     assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
 
+    if (common_has_curl()) {
+        printf("test-arg-parser: test curl-related functions\n\n");
+        const char * GOOD_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/README.md";
+        const char * BAD_URL = "https://www.google.com/404";
+        const char * BIG_FILE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin";
+
+        {
+            printf("test-arg-parser: test good URL\n\n");
+            auto res = common_remote_get_content(GOOD_URL, {});
+            assert(res.first == 200);
+            assert(res.second.size() > 0);
+            std::string str(res.second.data(), res.second.size());
+            assert(str.find("llama.cpp") != std::string::npos);
+        }
+
+        {
+            printf("test-arg-parser: test bad URL\n\n");
+            auto res = common_remote_get_content(BAD_URL, {});
+            assert(res.first == 404);
+        }
+
+        {
+            printf("test-arg-parser: test max size error\n");
+            common_remote_params params;
+            params.max_size = 1;
+            try {
+                common_remote_get_content(GOOD_URL, params);
+                assert(false && "it should throw an error");
+            } catch (std::exception & e) {
+                printf("  expected error: %s\n\n", e.what());
+            }
+        }
+
+        {
+            printf("test-arg-parser: test timeout error\n");
+            common_remote_params params;
+            params.timeout = 1;
+            try {
+                common_remote_get_content(BIG_FILE, params);
+                assert(false && "it should throw an error");
+            } catch (std::exception & e) {
+                printf("  expected error: %s\n\n", e.what());
+            }
+        }
+    } else {
+        printf("test-arg-parser: no curl, skipping curl-related functions\n");
+    }
 
     printf("test-arg-parser: all tests OK\n\n");
 }