@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/common/ngram-cache.cpp
@@ -2,10 +2,13 @@
 #include "common.h"
 #include "log.h"

+#include <cinttypes>
 #include <cstdint>
+#include <cstdio>
 #include <fstream>
+#include <thread>

-void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
         std::vector<llama_token> & inp, int nnew, bool print_progress) {
     const int64_t t_start_ms = ggml_time_ms();
     const int64_t inp_size = inp.size();
@@ -17,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
         const int64_t i_start = std::max(inp_size - nnew, ngram_size);
         for (int64_t i = i_start; i < inp_size; ++i) {
             const int64_t ngram_start = i - ngram_size;
-            llama_ngram ngram(&inp[ngram_start], ngram_size);
+            common_ngram ngram(&inp[ngram_start], ngram_size);
             const llama_token token = inp[i];

-            llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+            common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
             if (part_it == ngram_cache.end()) {
-                llama_ngram_cache_part part;
+                common_ngram_cache_part part;
                 part.emplace(token, 1);
                 ngram_cache.emplace(ngram, part);
             } else {
-                llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+                common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
                 if (token_count_it == part_it->second.end()) {
                     part_it->second.emplace(token, 1);
                 } else {
@@ -59,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
 constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};

 // Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
-    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
+    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
         return -1;
     }
-    const llama_ngram_cache_part part_static = part_static_it->second;
+    const common_ngram_cache_part part_static = part_static_it->second;

     int max_count_static = 0;
     int sum_count_static = 0;
@@ -92,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng

 // Try to draft a token from primary cache (context/dynamic), validate with static cache:
 static llama_token try_draft(
-    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {

     llama_token drafted_token = -1;

     for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-        const llama_ngram ngram_primary = ngrams_primary[i];
+        const common_ngram ngram_primary = ngrams_primary[i];

-        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
         if (part_primary_it == nc_primary.end()) {
             continue;
         }
-        const llama_ngram_cache_part part_primary = part_primary_it->second;
+        const common_ngram_cache_part part_primary = part_primary_it->second;

         int max_count_primary = 0;
         int max_count_static = 0;
@@ -114,7 +117,7 @@ static llama_token try_draft(
         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;

-            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);

             const int32_t count_primary = token_count_primary.second;
             const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@@ -139,9 +142,9 @@ static llama_token try_draft(
     return drafted_token;
 }

-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
 ) {
     GGML_ASSERT(draft.size() == 1);
     const int inp_size = inp.size();
@@ -154,21 +157,21 @@ void llama_ngram_cache_draft(
         llama_token drafted_token = -1;

         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-        llama_ngram ngram_static;
+        common_ngram ngram_static;
         for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
             ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
         }
-        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-        llama_ngram_cache_part part_static;
+        common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+        common_ngram_cache_part part_static;
         if (part_static_it != nc_static.end()) {
             part_static = part_static_it->second;
         }

         // cd = context + dynamic
-        std::vector<llama_ngram> ngrams_cd;
+        std::vector<common_ngram> ngrams_cd;
         for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
             const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-            llama_ngram ngram_cd;
+            common_ngram ngram_cd;
             for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                 ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
             }
@@ -193,16 +196,16 @@ void llama_ngram_cache_draft(
     }
 }

-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
     std::ofstream file_out(filename, std::ios::binary);
-    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
-        const llama_ngram ngram = item.first;
-        llama_ngram_cache_part token_counts = item.second;
+    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
+        const common_ngram ngram = item.first;
+        common_ngram_cache_part token_counts = item.second;
         GGML_ASSERT(!token_counts.empty());
         const int32_t ntokens = token_counts.size();
         GGML_ASSERT(ntokens > 0);

-        file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
         file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
         for (std::pair<llama_token, int32_t> item2 : token_counts) {
             const llama_token token = item2.first;
@@ -216,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen

 }

-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+common_ngram_cache common_ngram_cache_load(std::string & filename) {
     std::ifstream hashmap_file(filename, std::ios::binary);
     if (!hashmap_file) {
         throw std::ifstream::failure("Unable to open file " + filename);
     }
-    llama_ngram_cache ngram_cache;
+    common_ngram_cache ngram_cache;

-    llama_ngram ngram;
+    common_ngram ngram;
     int32_t ntokens;
     llama_token token;
     int32_t count;
@@ -232,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     char * ntokensc = reinterpret_cast<char*>(&ntokens);
     char * tokenc = reinterpret_cast<char*>(&token);
     char * countc = reinterpret_cast<char*>(&count);
-    while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
+    while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
         GGML_ASSERT(!hashmap_file.eof());
         GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
         GGML_ASSERT(ntokens > 0);
-        llama_ngram_cache_part token_counts;
+        common_ngram_cache_part token_counts;

         for (int i = 0; i < ntokens; ++i) {
             GGML_ASSERT(!hashmap_file.eof());
@@ -254,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     return ngram_cache;
 }

-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
-    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
-        const llama_ngram ngram = ngram_part.first;
-        llama_ngram_cache_part part = ngram_part.second;
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
+    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
+        const common_ngram ngram = ngram_part.first;
+        common_ngram_cache_part part = ngram_part.second;

-        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
         if (part_merged_it == ngram_cache_target.end()) {
             ngram_cache_target.emplace(ngram, part);
             continue;
@@ -270,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
             const int32_t count = token_count.second;
             GGML_ASSERT(count > 0);

-            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+            common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
             if (token_count_merged_it == part_merged_it->second.end()) {
                 part_merged_it->second.emplace(token, count);
                 continue;
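
The hunks above are a pure rename (llama_ngram_* to common_ngram_*); the update pass itself still just counts, for every n-gram window in the input, how often each token follows that window. Below is a minimal self-contained C++ sketch of that counting shape, using a toy token type and a plain std::map in place of the hashed std::unordered_map that ngram-cache.h (next file) defines:

#include <cstdint>
#include <map>
#include <vector>

using toy_token = int32_t; // stand-in for llama_token, illustration only

int main() {
    // Hypothetical token stream; the values are made up.
    const std::vector<toy_token> inp = {1, 2, 3, 1, 2, 3, 1, 2, 4};
    const size_t ngram_size = 2;

    // n-gram -> (following token -> count), the same shape as common_ngram_cache.
    std::map<std::vector<toy_token>, std::map<toy_token, int32_t>> cache;
    for (size_t i = ngram_size; i < inp.size(); ++i) {
        const std::vector<toy_token> ngram(inp.begin() + (i - ngram_size), inp.begin() + i);
        cache[ngram][inp[i]]++; // count the token observed right after this n-gram
    }

    // cache[{1, 2}] now holds {3: 2, 4: 1}: after "1 2" the stream continued
    // with "3" twice and with "4" once. try_draft() above turns such counts
    // into a draft token, gated by the min-sample-size/percent thresholds.
    return 0;
}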
package/src/llama.cpp/common/ngram-cache.h
@@ -12,22 +12,22 @@

 // Data structures to map n-grams to empirical token probabilities:

-struct llama_ngram {
+struct common_ngram {
     llama_token tokens[LLAMA_NGRAM_MAX];

-    llama_ngram() {
+    common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = -1;
         }
     }

-    llama_ngram(const llama_token * input, const int ngram_size) {
+    common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = i < ngram_size ? input[i] : -1;
         }
     }

-    bool operator==(const llama_ngram & other) const {
+    bool operator==(const common_ngram & other) const {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             if (tokens[i] != other.tokens[i]) {
                 return false;
@@ -37,28 +37,28 @@ struct llama_ngram {
     }
 };

-struct llama_token_hash_function {
+struct common_token_hash_function {
     size_t operator()(const llama_token token) const {
         // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
         return token * 11400714819323198485llu;
     }
 };

-struct llama_ngram_hash_function {
-    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+struct common_ngram_hash_function {
+    size_t operator()(const common_ngram & ngram) const {
+        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
         for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
+            hash ^= common_token_hash_function{}(ngram.tokens[i]);
         }
         return hash;
     }
 };

 // token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;

 // n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;


 // Update an ngram cache with tokens.
@@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
 //
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
-void llama_ngram_cache_update(
-    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+void common_ngram_cache_update(
+    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);

 // Try to draft tokens from ngram caches.
 // inp: the tokens generated so far.
@@ -81,21 +81,21 @@ void llama_ngram_cache_update(
 // nc_context: ngram cache based on current context.
 // nc_dynamic: ngram cache based on previous user generations.
 // nc_static: ngram cache generated from a large text corpus, used for validation.
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);

 // Save an ngram cache to a file.
 // ngram_cache: the ngram cache to save.
 // filename: the path under which to save the ngram cache.
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);

-// Load an ngram cache saved with llama_ngram_cache_save.
+// Load an ngram cache saved with common_ngram_cache_save.
 // filename: the path from which to load the ngram cache.
 // returns: an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+common_ngram_cache common_ngram_cache_load(std::string & filename);

 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
 // ngram_cache_add: the ngram cache to add to ngram_cache_target.
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
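
Taken together, the renamed declarations compose into the lookup-decoding flow used by the lookup examples listed above: build a cache over the current context, draft against the three cache tiers, and persist or merge the dynamic tier across runs. A minimal sketch against the signatures in this header; the token values, the n-gram bounds 1..4, and the cache path are placeholder choices for illustration, not values prescribed by the API:

#include "ngram-cache.h"

#include <string>
#include <vector>

int main() {
    std::vector<llama_token> inp = {1, 2, 3, 1, 2, 4, 1, 2, 3, 1, 2};

    // Context tier: rebuilt from the tokens generated so far
    // (nnew == inp.size() on the first call, only the new tokens later).
    common_ngram_cache nc_context;
    common_ngram_cache_update(nc_context, 1, 4, inp, (int) inp.size(), false);

    common_ngram_cache nc_dynamic; // carried across user generations
    common_ngram_cache nc_static;  // normally loaded from a corpus-wide file

    // The draft must be seeded with exactly one token (the implementation
    // asserts draft.size() == 1); drafted continuations are appended to it.
    std::vector<llama_token> draft = { inp.back() };
    common_ngram_cache_draft(inp, draft, 8, 1, 4, nc_context, nc_dynamic, nc_static);

    // Persist the context tier, then fold it into the dynamic tier.
    std::string filename = "ngrams.bin"; // placeholder path
    common_ngram_cache_save(nc_context, filename);

    common_ngram_cache loaded = common_ngram_cache_load(filename);
    common_ngram_cache_merge(nc_dynamic, loaded);
    return 0;
}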
|