@fugood/llama.node 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +29 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +17 -1
- package/src/LlamaContext.cpp +86 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446

package/src/llama.cpp/include/llama-cpp.h

@@ -0,0 +1,25 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include <memory>
+
+#include "llama.h"
+
+struct llama_model_deleter {
+    void operator()(llama_model * model) { llama_free_model(model); }
+};
+
+struct llama_context_deleter {
+    void operator()(llama_context * context) { llama_free(context); }
+};
+
+struct llama_sampler_deleter {
+    void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+};
+
+typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
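
The header above only wraps existing C-API handles in std::unique_ptr with custom deleters. A minimal usage sketch (illustrative only, not code shipped in this package; the model path is a placeholder):

// Sketch: RAII ownership of llama.cpp handles via the llama-cpp.h typedefs.
// The deleters call llama_free_model / llama_free / llama_sampler_free
// automatically when the pointers go out of scope.
#include "llama-cpp.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model_ptr model(llama_load_model_from_file("model.gguf", mparams)); // placeholder path
    if (!model) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context_ptr ctx(llama_new_context_with_model(model.get(), cparams));

    llama_sampler_ptr smpl(llama_sampler_init_greedy());

    // ... run the usual C API calls with model.get(), ctx.get(), smpl.get() ...

    return 0; // sampler, context, and model are freed here in reverse declaration order
}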

package/src/llama.cpp/include/llama.h

@@ -104,12 +104,15 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH  = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE        = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON     = 26,
+        LLAMA_VOCAB_PRE_TYPE_MINERVA       = 27,
     };
 
     enum llama_rope_type {
-        LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM = 0,
-        LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_NONE   = -1,
+        LLAMA_ROPE_TYPE_NORM   = 0,
+        LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+        LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
     };
 
     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -171,9 +174,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_4      = 33, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_8      = 34, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // except 1d tensors
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_4    = 33, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_8    = 34, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_8_8    = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0         = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
@@ -185,7 +188,8 @@ extern "C" {
         LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
         LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
         LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
-        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
+        LLAMA_ROPE_SCALING_TYPE_LONGROPE    = 3,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
     };
 
     enum llama_pooling_type {
@@ -272,6 +276,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+        ggml_backend_dev_t * devices;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -451,6 +458,7 @@ extern "C" {
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
     // - GGUF array values are not supported by these functions
 
     // Get metadata value as a string by key name
@@ -474,9 +482,6 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
-    // Get a llama model tensor
-    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
-
     // Returns true if the model contains an encoder that requires llama_encode() call
     LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
@@ -667,6 +672,9 @@ extern "C" {
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
     LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
 
+    // Check if the context supports KV cache shifting
+    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+
     //
     // State / sessions
     //
@@ -984,6 +992,9 @@ extern "C" {
                               char * buf,
                            int32_t   length);
 
+    // Get list of built-in chat templates
+    LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+
     //
     // Sampling API
     //
@@ -1125,16 +1136,12 @@ extern "C" {
             const char * grammar_str,
             const char * grammar_root);
 
+    /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
-                         int32_t   n_vocab,          // llama_n_vocab()
-                     llama_token   special_eos_id,   // llama_token_eos()
-                     llama_token   linefeed_id,      // llama_token_nl()
-                         int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
-                           float   penalty_repeat,   // 1.0 = disabled
-                           float   penalty_freq,     // 0.0 = disabled
-                           float   penalty_present,  // 0.0 = disabled
-                            bool   penalize_nl,      // consider newlines as a repeatable token
-                            bool   ignore_eos);      // ignore the end-of-sequence token
+                         int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
+                           float   penalty_repeat,   // 1.0 = disabled
+                           float   penalty_freq,     // 0.0 = disabled
+                           float   penalty_present); // 0.0 = disabled
 
     /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
     LLAMA_API struct llama_sampler * llama_sampler_init_dry(
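
A rough illustration of the slimmed-down penalties API above (a sketch only, with arbitrary parameter values; it is not code from this diff):

// Sketch: the penalties sampler now takes only the four penalty parameters.
// Per the NOTE above, narrow the candidate set (e.g. with top-k) before it runs.
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
llama_sampler_chain_add(chain, llama_sampler_init_penalties(
    /* penalty_last_n  */ 64,
    /* penalty_repeat  */ 1.1f,
    /* penalty_freq    */ 0.0f,
    /* penalty_present */ 0.0f));
llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

// ... sample with llama_sampler_sample(chain, ctx, -1) ...

llama_sampler_free(chain);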

package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp

@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+Hello
+__ggml_vocab_test__
+(
+__ggml_vocab_test__
+
+=
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+discards
+__ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__

package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out

@@ -0,0 +1,46 @@
+ 2550 204 18430 377
+ 597 2768 298 8564
+
+ 1437
+ 1437 1437
+ 1437 1437 1437
+ 50117
+ 50118
+ 50140
+ 50140 50118
+ 50117 50118
+ 31414 232
+ 20920 232
+ 31414 623
+ 20920 623
+ 20920 623 328
+ 31414 6 232 328
+ 20920 6 232 328
+ 42 16 8103 18164 27 4 49317
+ 605 40976 262 10109 18474 385 29 36807 6455
+ 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328
+ 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171
+ 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43
+ 31414
+ 20920
+ 1437 20920
+ 1437 1437 20920
+ 1437 1437 1437 20920
+ 1437 1437 1437 20920 50118 1437 1437 1437 20920
+ 36
+ 50118 5457
+ 108 3567
+ 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772
+ 32376 12846
+ 246
+ 3103
+ 25631
+ 46152
+ 3103 25631
+ 46152 3103
+ 46152 25631
+ 46152 46152
+ 46152 3103 25631
+ 347 1376 2023 12410 102 16376 1376 2023 6382 90
+ 9553 5954
+ 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574

package/src/llama.cpp/pocs/vdot/CMakeLists.txt

@@ -1,9 +1,9 @@
 set(TARGET llama-vdot)
 add_executable(${TARGET} vdot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-q8dot)
 add_executable(${TARGET} q8dot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/src/CMakeLists.txt

@@ -1,9 +1,4 @@
-# TODO: should not use this
-if (WIN32)
-    if (BUILD_SHARED_LIBS)
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
-endif()
+llama_add_compile_flags()
 
 #
 # libraries
@@ -23,7 +18,7 @@ add_library(llama
             )
 
 target_include_directories(llama PUBLIC . ../include)
-target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
+target_compile_features   (llama PUBLIC cxx_std_17) # don't bump
 
 target_link_libraries(llama PUBLIC ggml)
 

package/src/llama.cpp/src/llama-grammar.cpp

@@ -822,15 +822,11 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
     return grammar->stacks;
 }
 
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-        const uint32_t chr,
-        llama_grammar_stacks       & stacks_new) {
-    stacks_new.clear();
-    stacks_new.reserve(stacks.size());
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
+    llama_grammar_stacks stacks_new;
+    stacks_new.reserve(grammar->stacks.size());
 
-    for (const auto & stack : stacks) {
+    for (const auto & stack : grammar->stacks) {
         if (stack.empty()) {
             continue;
         }
@@ -844,9 +840,11 @@ void llama_grammar_accept(
             if (!llama_grammar_is_end_of_sequence(pos)) {
                 new_stack.push_back(pos);
             }
-            llama_grammar_advance_stack(rules, new_stack, stacks_new);
+            llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
         }
     }
+
+    grammar->stacks = std::move(stacks_new);
 }
 
 llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -1051,7 +1049,12 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, };
+    llama_grammar * result = new llama_grammar {
+        grammar.vocab,
+        grammar.rules,
+        grammar.stacks,
+        grammar.partial_utf8,
+    };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -1059,7 +1062,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
         for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
             for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
                 if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
-                     result->stacks[is][ie]  =  &result->rules[ir0][ir1];
+                    result->stacks[is][ie] = &result->rules[ir0][ir1];
                 }
             }
         }
@@ -1126,11 +1129,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
     const auto   decoded     = decode_utf8(piece, grammar.partial_utf8);
     const auto & code_points = decoded.first;
 
-    llama_grammar_stacks stacks_new;
-
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new);
-        grammar.stacks = std::move(stacks_new);
+        llama_grammar_accept(&grammar, *it);
     }
 
     grammar.partial_utf8 = decoded.second;

package/src/llama.cpp/src/llama-grammar.h

@@ -58,6 +58,7 @@ using llama_grammar_rules      = std::vector<llama_grammar_rule>;
 using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
 using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
 
+// TODO: remove, needed for tests atm
 const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
       llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
 
@@ -65,11 +66,7 @@ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
-void llama_grammar_accept(
-        const llama_grammar_rules  & rules,
-        const llama_grammar_stacks & stacks,
-        uint32_t chr,
-        llama_grammar_stacks & stacks_new);
+void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);
 
 std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
         const llama_grammar_rules & rules,

package/src/llama.cpp/src/llama-sampling.cpp

@@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool    penalize_nl;
-    const bool    ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_token token) {
         return;
     }
 
-    ctx->prev.push_back(token);
-}
-
-static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->token_count[token]++;
 
-    if (ctx->ignore_eos) {
-        assert(ctx->special_eos_id >= 0);
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
         }
     }
 
-    if ((ctx->penalty_last_n == 0) ||
-        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-        return;
-    }
-
-    bool nl_found = false;
-    size_t nl_idx = 0;
-    float nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
+    ctx->prev.push_back(token);
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-            nl_found = true;
-            nl_idx = ctx->linefeed_id;
-            nl_logit = cur_p->data[ctx->linefeed_id].logit;
-        } else {
-            // else, search for the linefeed token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->linefeed_id) {
-                    nl_found = true;
-                    nl_idx = i;
-                    nl_logit = cur_p->data[i].logit;
-                    break;
-                }
-            }
-        }
-    }
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
    }
 
-    // Create a frequency map to count occurrences of each token in last_tokens
-    // TODO: optimize this by maintaining the token count in the sampler context
-    using llama_token_cnt = std::unordered_map<llama_token, int>;
-    llama_token_cnt token_count;
+    assert(ctx->token_count == tmp);
+#endif
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
 
-    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-        token_count[ctx->prev.rat(i)]++;
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
     }
 
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
             continue;
         }
 
         const int count = token_iter->second;
 
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
         if (cur_p->data[i].logit <= 0) {
@@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-        ctx->n_vocab,
-        ctx->special_eos_id,
-        ctx->linefeed_id,
         ctx->penalty_last_n,
         ctx->penalty_repeat,
         ctx->penalty_freq,
-        ctx->penalty_present,
-        ctx->penalize_nl,
-        ctx->ignore_eos);
+        ctx->penalty_present);
 
     // copy the state
     {
@@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
-            /* .n_vocab         = */ n_vocab,
-            /* .special_eos_id  = */ special_eos_id,
-            /* .linefeed_id     = */ linefeed_id,
             /* .penalty_last_n  = */ penalty_last_n,
             /* .penalty_repeat  = */ penalty_repeat,
             /* .penalty_freq    = */ penalty_freq,
             /* .penalty_present = */ penalty_present,
-            /* .penalize_nl     = */ penalize_nl,
-            /* .ignore_eos      = */ ignore_eos,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
         },
     };
 }
@@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
-            size_t word_len = word.size(), str_len = str.size();
+            size_t word_len = word.size();
+            size_t str_len = str.size();
             size_t pos = -1;
             while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                 bool match = true;
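
The penalties refactor above replaces a per-apply rescan of the last-n window with a token count that is updated incrementally as tokens are accepted. A standalone sketch of that bookkeeping idea (illustrative only; it does not use llama.cpp types):

// Sketch: keep a frequency map in sync with a fixed-size sliding window of
// recent tokens, so the penalty pass can look counts up in O(1) instead of
// rescanning the window on every call.
#include <cstdint>
#include <deque>
#include <unordered_map>

struct penalty_window {
    size_t capacity;
    std::deque<int32_t> prev;                      // last-n tokens, oldest at the front
    std::unordered_map<int32_t, int> token_count;  // token -> occurrences within prev

    void accept(int32_t token) {
        if (capacity == 0) {
            return;
        }
        if (prev.size() >= capacity) {
            const int32_t old = prev.front();
            prev.pop_front();
            if (--token_count[old] == 0) {
                token_count.erase(old);
            }
        }
        prev.push_back(token);
        token_count[token]++;
    }

    int count(int32_t token) const {
        const auto it = token_count.find(token);
        return it == token_count.end() ? 0 : it->second;
    }
};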

package/src/llama.cpp/src/llama-vocab.cpp

@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) {  // finish previous word if any
@@ -1866,6 +1867,10 @@ int32_t llama_detokenize_impl(
         int32_t   text_len_max,
            bool   remove_special,
            bool   unparse_special) {
+    if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
+        return 0;
+    }
+
     GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
 
     int32_t avail = text_len_max;