@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/tests/test-sampling.cpp
@@ -10,181 +10,199 @@
 #include <string>
 #include <vector>
 
-static void dump(const llama_token_data_array *
-    for (size_t i = 0; i <
-        printf("%d: %f (%f)\n",
+static void dump(const llama_token_data_array * cur_p) {
+    for (size_t i = 0; i < cur_p->size; i++) {
+        printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
 }
 
-#define DUMP(
+#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
+
+#define APPLY(__cnstr, __cur_p) do { \
+    auto * cnstr = (__cnstr); \
+    llama_sampler_apply(cnstr, (__cur_p)); \
+    llama_sampler_free(cnstr); \
+} while(0)
 
 static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
     const size_t n_vocab = probs.size();
-
-
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array
-
-    DUMP(&
-
-    DUMP(&
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_top_k(k), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(
-    for (size_t i = 0; i <
-        GGML_ASSERT(fabs(
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
     }
 }
 
 static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
-
-
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array
-
-    DUMP(&
-
-    DUMP(&
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_top_p(p, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(
-    for (size_t i = 0; i <
-        GGML_ASSERT(fabs(
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
     const size_t n_vocab = probs.size();
-
-
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array
-    DUMP(&
-
-    DUMP(&
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(
-    for (size_t i = 0; i <
-        GGML_ASSERT(fabs(
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
-
-
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array
-    DUMP(&
-
-    DUMP(&
-
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_softmax(), &cur_p);
 
-    GGML_ASSERT(
-    for (size_t i = 0; i <
-        GGML_ASSERT(fabs(
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
-
-
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array
-    DUMP(&
-
-    DUMP(&
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_typical(p, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(
-    for (size_t i = 0; i <
-        GGML_ASSERT(fabs(
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
-static void
+static void test_penalties(
     const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
     const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
 ) {
     GGML_ASSERT(probs.size() == expected_probs.size());
 
     const size_t n_vocab = probs.size();
-
-
+
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+
+    auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
+
+    for (size_t i = 0; i < last_tokens.size(); i++) {
+        llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-
-
-
-
-
-    DUMP(&candidates_p);
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    APPLY(sampler, &cur_p);
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(
-    for (size_t i = 0; i <
-        GGML_ASSERT(fabs(
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
-static void test_sampler_queue(
-    const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
+static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    std::vector<llama_token_data>
-
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(token_id);
-
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }
 
-    llama_token_data_array
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
 
     llama_token min_token_id = 0;
     const llama_token max_token_id = n_vocab-1;
 
     for (auto s : samplers_sequence) {
         switch (s){
-            case 'k':
-            case 'f': GGML_ABORT("tail_free test not implemented");
-            case 'y': GGML_ABORT("typical test not implemented");
-            case 'p':
-            case 'm':
-            case 't': GGML_ABORT("temperature test not implemented");
-            default : GGML_ABORT("Unknown sampler");
+            case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break;
+            case 'f': GGML_ABORT("tail_free test not implemented");
+            case 'y': GGML_ABORT("typical test not implemented");
+            case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break;
+            case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break;
+            case 't': GGML_ABORT("temperature test not implemented");
+            default : GGML_ABORT("Unknown sampler");
         }
 
-
+        APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests
 
-        const int size =
+        const int size = cur_p.size;
 
         if (s == 'k') {
             const int expected_size = std::min(size, top_k);
             min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(
-            GGML_ASSERT(
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'p') {
             const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
             const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
@@ -206,8 +224,8 @@ static void test_sampler_queue(
             }
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(
-            GGML_ASSERT(
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'm') {
             int expected_size = ceilf((1.0f-min_p) * n_vocab);
             expected_size = std::max(expected_size, 1);
@@ -219,17 +237,56 @@ static void test_sampler_queue(
             min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(
-            GGML_ASSERT(
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else {
             GGML_ABORT("fatal error");
         }
     }
 
-    printf("Sampler queue %3s OK with n_vocab=%
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
            samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }
 
+static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
+    std::vector<llama_token_data> cur(data.size());
+    std::copy(data.begin(), data.end(), cur.begin());
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    llama_sampler_apply(cnstr, &cur_p);
+    llama_sampler_reset(cnstr);
+    const int64_t t_start = ggml_time_us();
+    for (int i = 0; i < n_iter; i++) {
+        std::copy(data.begin(), data.end(), cur.begin());
+        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+        llama_sampler_apply(cnstr, &cur_p);
+        llama_sampler_reset(cnstr);
+    }
+    const int64_t t_end = ggml_time_us();
+    llama_sampler_free(cnstr);
+    printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+}
+
+#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
+
+static void test_perf() {
+    const int n_vocab = 1 << 17;
+
+    std::vector<llama_token_data> data;
+
+    data.reserve(n_vocab);
+    for (int i = 0; i < n_vocab; i++) {
+        const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f);
+        data.emplace_back(llama_token_data{i, logit, 0.0f});
+    }
+
+    BENCH(llama_sampler_init_top_k (40), data, 32);
+    BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32);
+    BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32);
+    BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
+    BENCH(llama_sampler_init_typical (0.5f, 1), data, 32);
+    BENCH(llama_sampler_init_softmax (), data, 32);
+}
+
 int main(void) {
     ggml_time_init();
 
@@ -259,13 +316,13 @@ int main(void) {
     test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
     test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
 
-
-
-
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
 
-
-
-
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
 
     test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
     test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
@@ -297,5 +354,7 @@ int main(void) {
 
     printf("OK\n");
 
+    test_perf();
+
     return 0;
 }
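The hunks above track the llama_sampler API that ships with this llama.cpp sync (see also common/sampling.cpp and src/llama-sampling.cpp in the file list): candidate tokens are collected into a llama_token_data_array and then pushed through sampler objects created with llama_sampler_init_*, applied with llama_sampler_apply, and released with llama_sampler_free, which the test wraps in its APPLY() macro. The following is a minimal sketch of that same pattern outside the test harness; it assumes the llama.h header bundled under package/src/llama.cpp, and the vocabulary, probabilities, and top-k/top-p settings are made-up example values, not anything from the package.

// Minimal sketch (not part of the package): chain a few samplers over a candidate array.
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <vector>

static void apply_and_free(llama_sampler * smpl, llama_token_data_array * cur_p) {
    llama_sampler_apply(smpl, cur_p);  // the sampler filters/re-weights the array in place
    llama_sampler_free(smpl);          // one-shot use, mirroring the test's APPLY() macro
}

int main() {
    const std::vector<float> probs = { 0.1f, 0.2f, 0.3f, 0.4f }; // illustrative values

    std::vector<llama_token_data> cur;
    cur.reserve(probs.size());
    for (llama_token id = 0; id < (llama_token) probs.size(); id++) {
        cur.emplace_back(llama_token_data{ id, logf(probs[id]), 0.0f });
    }

    // { data, size, selected, sorted } -- same initializer shape used throughout the test
    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };

    apply_and_free(llama_sampler_init_softmax(),      &cur_p); // normalize and sort by logit
    apply_and_free(llama_sampler_init_top_k(2),       &cur_p); // keep the 2 most likely tokens
    apply_and_free(llama_sampler_init_top_p(0.9f, 1), &cur_p); // nucleus filter, keep at least 1

    for (size_t i = 0; i < cur_p.size; i++) {
        printf("%d: %f\n", cur_p.data[i].id, cur_p.data[i].p);
    }
    return 0;
}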
package/src/llama.cpp/tests/test-tokenizer-0.cpp
@@ -7,6 +7,7 @@
 #include <map>
 #include <vector>
 #include <fstream>
+#include <thread>
 
 //static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 //    static std::map<std::string, std::vector<llama_token>> _k_tests = {
@@ -194,45 +195,64 @@ int main(int argc, char **argv) {
 
     const bool add_special = false;
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    // multi-threaded tokenization
+    const int nthread = std::thread::hardware_concurrency();
+    std::vector<std::thread> threads(nthread);
+
+    for (int i = 0; i < nthread; i++) {
+        threads[i] = std::thread([&, i]() {
+            for (const auto & test_kv : k_tests) {
+                const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
+
+                // here only print the result of the first thread
+                // because the other threads are running the same tests
+                if (i != 0) {
+                    continue;
+                }
+
+                printf("\n");
+                printf("src: '%s'\n", test_kv.first.c_str());
+                printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+                printf("tok: ");
+                for (const auto & tok : res) {
+                    printf("%d ", tok);
+                }
+                printf("\n");
+
+                bool correct = res.size() == test_kv.second.size();
+                for (int i = 0; i < (int) res.size() && correct; ++i) {
+                    if (test_kv.second[i] != res[i]) {
+                        correct = false;
+                    }
+                }
+
+                if (!correct) {
+                    fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+                    fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                        llama_detokenize(ctx, res).c_str(),
+                        llama_detokenize(ctx, test_kv.second).c_str());
+                    fprintf(stderr, "%s : expected tokens: ", __func__);
+                    for (const auto & t : test_kv.second) {
+                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+                    fprintf(stderr, "%s : got tokens: ", __func__);
+                    for (const auto & t : res) {
+                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+
+                    success = false;
+                }
             }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens: ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
+        });
+    }
 
-
-
+    for (int i = 0; i < nthread; i++) {
+        threads[i].join();
     }
 
+    // single threaded tokenization
     if (!fname_text.empty()) {
         fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
 
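The new multi-threaded portion of the tokenizer test is a plain fan-out/join: every worker runs the identical tokenize-and-compare loop and only thread 0 prints, so concurrent tokenization is exercised without interleaved output. Below is a minimal, self-contained sketch of just that pattern; the llama-specific calls are stubbed out and run_checks is a hypothetical stand-in for the loop shown above.

// Minimal sketch (not part of the package): same-work-on-every-thread, only thread 0 reports.
#include <cstdio>
#include <thread>
#include <vector>

// hypothetical stand-in for the tokenize-and-compare loop in the real test
static bool run_checks() {
    return true;
}

int main() {
    const int nthread = std::thread::hardware_concurrency();
    std::vector<std::thread> threads(nthread);

    for (int i = 0; i < nthread; i++) {
        threads[i] = std::thread([i]() {
            const bool ok = run_checks(); // identical work on every thread
            if (i != 0) {
                return;                   // only the first thread prints
            }
            printf("thread 0: checks %s\n", ok ? "passed" : "failed");
        });
    }

    for (auto & t : threads) {
        t.join();
    }
    return 0;
}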
package/patches/llama.patch
DELETED
@@ -1,22 +0,0 @@
-diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
-index fa68360b..f9ff7b5d 100644
---- a/ggml/src/ggml-vulkan.cpp
-+++ b/ggml/src/ggml-vulkan.cpp
-@@ -617,9 +617,15 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
-        vk::PipelineCreateFlags(),
-        pipeline_shader_create_info,
-        pipeline->layout);
--    pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
-
--    device->pipelines.push_back(pipeline);
-+    try {
-+        pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
-+        device->pipelines.push_back(pipeline);
-+    } catch(vk::UnknownError const&) {
-+        VK_LOG_DEBUG("Failed to create pipeline " << name);
-+        ggml_vk_destroy_pipeline(device->device, pipeline);
-+        pipeline.reset();
-+    }
- }
-
- static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {