@fugood/llama.node 0.3.17 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/src/llama-model.h

@@ -36,6 +36,7 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
+    LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
@@ -75,6 +76,7 @@ enum llm_type {
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
+    LLM_TYPE_405B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
     LLM_TYPE_MEDIUM,
@@ -94,6 +96,8 @@ enum llm_type {
     LLM_TYPE_235B_A22B,
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
+
 struct llama_layer_posnet {
     // resnet
     struct ggml_tensor * norm1 = nullptr;
@@ -394,8 +398,11 @@ struct llama_model {
 
     const struct ggml_tensor * get_tensor(const char * name) const;
 
+    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+
+    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(
+    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(
package/src/llama.cpp/src/llama-quant.cpp

@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
     {}
 };
 
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         nthread = std::thread::hardware_concurrency();
     }
 
-    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     llama_model_kv_override * kv_overrides = nullptr;
     if (params->kv_overrides) {
-        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
 
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             // unless the user specifies a type
             if (params->tensor_types) {
                 const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                const std::string tensor_name(tensor->name);
                 for (const auto & [tname, qtype] : tensor_types) {
-                    if (std::regex pattern(tname); std::regex_search(
-                        if
-                            LLAMA_LOG_DEBUG("(overriding %s
+                    if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                        if (qtype != new_type) {
+                            LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                            new_type = qtype;
+                            break; // if two or more types are specified for the tensor, first match wins
                         }
-                        new_type = qtype;
-                        break;
                     }
                 }
             }
         }
+
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
         }
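
For context, the override loop in the hunk above applies user-supplied tensor-type overrides by regex-matching each pattern against the tensor name, with the first matching pattern winning. A minimal standalone sketch of that first-match-wins selection follows; the pattern and type names are illustrative only, not the package's API:

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // hypothetical (pattern, type-name) overrides, e.g. parsed from CLI flags
        std::vector<std::pair<std::string, std::string>> overrides = {
            {"ffn_down",     "q6_k"},
            {"attn_(q|k|v)", "q8_0"},
        };

        const std::string tensor_name = "blk.12.attn_q.weight";
        std::string chosen = "q4_k"; // default type chosen before overrides

        for (const auto & [pattern, type] : overrides) {
            if (std::regex_search(tensor_name, std::regex(pattern))) {
                chosen = type;
                break; // first matching pattern wins, as in the hunk above
            }
        }

        std::printf("%s -> %s\n", tensor_name.c_str(), chosen.c_str());
        return 0;
    }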
package/src/llama.cpp/src/llama-sampling.cpp

@@ -1750,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
 
+    if (ctx->n <= 0.0f || cur_p->size <= 1) {
+        return;
+    }
+
     // find max logit and calculate mean
     float max = cur_p->data[0].logit;
     float logits_sum = 0;
+    size_t valid_count = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-
-
+        // Only count non-negative infinity values
+        if (cur_p->data[i].logit != -INFINITY) {
+            if (cur_p->data[i].logit > max) {
+                max = cur_p->data[i].logit;
+            }
+            logits_sum += cur_p->data[i].logit;
+            valid_count++;
         }
-        logits_sum += cur_p->data[i].logit;
     }
-    float mean = logits_sum/
+    float mean = valid_count > 0 ? logits_sum/valid_count : 0;
 
     // calculate standard deviation
     float acc = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-
+        // Skip -infinity in std calculation
+        if (cur_p->data[i].logit != -INFINITY) {
+            acc += pow(cur_p->data[i].logit - mean, 2);
+        }
     }
-    float std = sqrt(acc/
+    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
 
     //apply mask
     for (size_t i = 0; i < cur_p->size; ++i) {
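
The sampler change above makes the top-n-sigma statistics skip logits that are already masked to -INFINITY. A rough standalone sketch of that masked mean/standard-deviation step, using the usual max - n*sigma cut-off; all names and values here are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const float n_sigma = 1.0f; // sampler parameter "n"
        std::vector<float> logits = {2.0f, 1.5f, -INFINITY, 0.5f, -INFINITY};

        // mean over finite logits only, mirroring the hunk above
        float max_logit = -INFINITY, sum = 0.0f;
        size_t valid = 0;
        for (float l : logits) {
            if (l != -INFINITY) {
                max_logit = std::max(max_logit, l);
                sum += l;
                valid++;
            }
        }
        const float mean = valid > 0 ? sum / valid : 0.0f;

        // standard deviation, again skipping -INFINITY entries
        float acc = 0.0f;
        for (float l : logits) {
            if (l != -INFINITY) {
                acc += (l - mean) * (l - mean);
            }
        }
        const float std_dev = valid > 0 ? std::sqrt(acc / valid) : 0.0f;

        // logits below max - n*sigma would be masked out by the sampler
        for (float l : logits) {
            std::printf("%8.2f %s\n", l, l < max_logit - n_sigma * std_dev ? "masked" : "kept");
        }
        return 0;
    }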
package/src/llama.cpp/src/llama-vocab.cpp

@@ -1,5 +1,7 @@
 #include "llama-vocab.h"
 
+#include "ggml.h"
+#include "gguf.h"
 #include "llama-impl.h"
 #include "llama-model-loader.h"
 
@@ -415,6 +417,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
             };
             break;
+        case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
+            regex_exprs = {
+                // original regex from tokenizer.json
+                // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
+                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+            };
+            break;
         default:
             // default regex for BPE tokenization pre-processing
             regex_exprs = {
@@ -1227,6 +1236,9 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
     uint32_t n_token_types = 0; // for BERT-style token types
 
+    std::string tokenizer_model;
+    std::string tokenizer_pre;
+
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
@@ -1362,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
     // determine vocab type
     {
-        std::string tokenizer_model;
-        std::string tokenizer_pre;
-
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
@@ -1459,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
-
+            const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+            GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
+            const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
             precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
@@ -1634,6 +1646,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "bailingmoe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "seed-coder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -2778,6 +2794,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
 }
 
+std::string llama_vocab::get_tokenizer_model() const {
+    return pimpl->tokenizer_model;
+}
+
+std::string llama_vocab::get_tokenizer_pre() const {
+    return pimpl->tokenizer_pre;
+}
+
 enum llama_vocab_type llama_vocab::get_type() const {
     return pimpl->type;
 }
@@ -3000,6 +3024,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
     return it->second;
 }
 
+std::vector<std::string> llama_vocab::get_bpe_merges() const {
+    std::vector<std::string> result(pimpl->bpe_ranks.size());
+
+    for (const auto & pair : pimpl->bpe_ranks) {
+        result[pair.second] = pair.first.first + " " + pair.first.second;
+    }
+
+    return result;
+}
+
+std::vector<char> llama_vocab::get_precompiled_charsmap() const {
+    return pimpl->precompiled_charsmap;
+}
+
 int32_t llama_vocab::tokenize(
         const char * text,
         int32_t text_len,
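
The new get_bpe_merges() above inverts the vocab's pair-to-rank map back into a rank-ordered list of "left right" merge strings. A simplified sketch of that inversion, with a plain std::map standing in for the internal container (it assumes ranks are dense and zero-based, which the vector sizing in the hunk implies):

    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // token-pair -> merge rank, standing in for the vocab's bpe_ranks
        std::map<std::pair<std::string, std::string>, int> bpe_ranks = {
            {{"h", "e"},   0},
            {{"he", "l"},  1},
            {{"hel", "l"}, 2},
        };

        // invert: index by rank, store "left right" strings (same shape as get_bpe_merges)
        std::vector<std::string> merges(bpe_ranks.size());
        for (const auto & [toks, rank] : bpe_ranks) {
            merges[rank] = toks.first + " " + toks.second;
        }

        for (const auto & m : merges) {
            std::printf("%s\n", m.c_str());
        }
        return 0;
    }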
package/src/llama.cpp/src/llama-vocab.h

@@ -21,6 +21,9 @@ struct llama_vocab {
 
     void load(llama_model_loader & ml, const LLM_KV & kv);
 
+    std::string get_tokenizer_model() const;
+    std::string get_tokenizer_pre() const;
+
     enum llama_vocab_type get_type() const;
     enum llama_vocab_pre_type get_pre_type() const;
 
@@ -80,6 +83,9 @@ struct llama_vocab {
     int max_token_len() const;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+    std::vector<std::string> get_bpe_merges() const;
+
+    std::vector<char> get_precompiled_charsmap() const;
 
     int32_t tokenize(
         const char * text,
package/src/llama.cpp/src/llama.cpp

@@ -4,6 +4,7 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
+#include "llama-model-saver.h"
 #include "llama-model.h"
 
 #include "ggml.h"
@@ -139,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
 
+    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+        return nullptr;
+    }
+
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -253,6 +259,13 @@ struct llama_model * llama_model_load_from_splits(
     return llama_model_load_from_file_impl(splits.front(), splits, params);
 }
 
+void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
+    llama_model_saver ms(*model);
+    ms.add_kv_from_model();
+    ms.add_tensors_from_model();
+    ms.save(path_model);
+}
+
 //
 // chat templates
 //
@@ -338,3 +351,4 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
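
The hunks above add an early error when no ggml backend is registered and a new llama_model_save_to_file() entry point. A rough usage sketch of the calling order this implies; it assumes the matching LLAMA_API declaration added to include/llama.h in this release (that hunk is not rendered here), while the surrounding calls are existing llama.cpp/ggml functions:

    #include "llama.h"
    #include "ggml-backend.h"

    int main(int argc, char ** argv) {
        if (argc < 3) {
            return 1;
        }

        // the new check above requires at least one registered backend before loading
        ggml_backend_load_all();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (model == nullptr) {
            return 1;
        }

        // round-trip the model through the save path added in this release
        llama_model_save_to_file(model, argv[2]);

        llama_model_free(model);
        return 0;
    }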
package/src/llama.cpp/tests/CMakeLists.txt

@@ -111,10 +111,13 @@ if (NOT WIN32)
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-        target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../
+        target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../tools/server)
+    endif()
+
+    if (NOT GGML_BACKEND_DL)
+        llama_build(test-quantize-stats.cpp)
     endif()
 
-    llama_build(test-quantize-stats.cpp)
     llama_build(test-gbnf-validator.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
@@ -141,6 +144,7 @@ endif()
 
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(test-chat-template.cpp)
+llama_build_and_test(test-regex-partial.cpp)
 
 # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
 if (NOT WIN32)
@@ -162,6 +166,10 @@ if (NOT GGML_BACKEND_DL)
     llama_build_and_test(test-rope.cpp)
 endif()
 
+# libmtmd
+set(LLAMA_TEST_NAME test-mtmd-c-api)
+llama_build_and_test(test-mtmd-c-api.c)
+target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
 
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)