@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
|
@@ -2311,7 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
|
|
|
2311
2311
|
0x003000,
|
|
2312
2312
|
};
|
|
2313
2313
|
|
|
2314
|
-
// list is always in ascending order, to enable binary
|
|
2314
|
+
// list is always in ascending order, to enable binary search
|
|
2315
2315
|
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
|
|
2316
2316
|
{0x000041, 0x000061},
|
|
2317
2317
|
{0x000042, 0x000062},
|
|
@@ -3748,7 +3748,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
|
|
|
3748
3748
|
{0x01E921, 0x01E943},
|
|
3749
3749
|
};
|
|
3750
3750
|
|
|
3751
|
-
// list is always in ascending order, to enable binary
|
|
3751
|
+
// list is always in ascending order, to enable binary search
|
|
3752
3752
|
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
|
|
3753
3753
|
{0x000061, 0x000041},
|
|
3754
3754
|
{0x000062, 0x000042},
|
|
@@ -116,9 +116,8 @@ llama_target_and_test(test-sampling.cpp)
|
|
|
116
116
|
llama_target_and_test(test-chat-template.cpp)
|
|
117
117
|
|
|
118
118
|
llama_target_and_test(test-grammar-parser.cpp)
|
|
119
|
-
llama_target_and_test(test-llama-grammar.cpp)
|
|
120
119
|
llama_target_and_test(test-grammar-integration.cpp)
|
|
121
|
-
llama_target_and_test(test-
|
|
120
|
+
llama_target_and_test(test-llama-grammar.cpp)
|
|
122
121
|
llama_target_and_test(test-barrier.cpp)
|
|
123
122
|
# llama_target_and_test(test-opt.cpp) # SLOW
|
|
124
123
|
llama_target_and_test(test-backend-ops.cpp)
|
|
@@ -10,12 +10,12 @@
|
|
|
10
10
|
#include <cassert>
|
|
11
11
|
|
|
12
12
|
int main(void) {
|
|
13
|
-
|
|
13
|
+
common_params params;
|
|
14
14
|
|
|
15
15
|
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
|
|
16
16
|
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
|
|
17
17
|
try {
|
|
18
|
-
auto ctx_arg =
|
|
18
|
+
auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
|
|
19
19
|
std::unordered_set<std::string> seen_args;
|
|
20
20
|
std::unordered_set<std::string> seen_env_vars;
|
|
21
21
|
for (const auto & opt : ctx_arg.options) {
|
|
@@ -58,44 +58,44 @@ int main(void) {
|
|
|
58
58
|
|
|
59
59
|
// missing value
|
|
60
60
|
argv = {"binary_name", "-m"};
|
|
61
|
-
assert(false ==
|
|
61
|
+
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
62
62
|
|
|
63
63
|
// wrong value (int)
|
|
64
64
|
argv = {"binary_name", "-ngl", "hello"};
|
|
65
|
-
assert(false ==
|
|
65
|
+
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
66
66
|
|
|
67
67
|
// wrong value (enum)
|
|
68
68
|
argv = {"binary_name", "-sm", "hello"};
|
|
69
|
-
assert(false ==
|
|
69
|
+
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
70
70
|
|
|
71
71
|
// non-existence arg in specific example (--draft cannot be used outside llama-speculative)
|
|
72
72
|
argv = {"binary_name", "--draft", "123"};
|
|
73
|
-
assert(false ==
|
|
73
|
+
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
|
|
74
74
|
|
|
75
75
|
|
|
76
76
|
printf("test-arg-parser: test valid usage\n\n");
|
|
77
77
|
|
|
78
78
|
argv = {"binary_name", "-m", "model_file.gguf"};
|
|
79
|
-
assert(true ==
|
|
79
|
+
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
80
80
|
assert(params.model == "model_file.gguf");
|
|
81
81
|
|
|
82
82
|
argv = {"binary_name", "-t", "1234"};
|
|
83
|
-
assert(true ==
|
|
83
|
+
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
84
84
|
assert(params.cpuparams.n_threads == 1234);
|
|
85
85
|
|
|
86
86
|
argv = {"binary_name", "--verbose"};
|
|
87
|
-
assert(true ==
|
|
87
|
+
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
88
88
|
assert(params.verbosity > 1);
|
|
89
89
|
|
|
90
90
|
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
|
|
91
|
-
assert(true ==
|
|
91
|
+
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
92
92
|
assert(params.model == "abc.gguf");
|
|
93
93
|
assert(params.n_predict == 6789);
|
|
94
94
|
assert(params.n_batch == 9090);
|
|
95
95
|
|
|
96
96
|
// --draft cannot be used outside llama-speculative
|
|
97
97
|
argv = {"binary_name", "--draft", "123"};
|
|
98
|
-
assert(true ==
|
|
98
|
+
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
|
99
99
|
assert(params.n_draft == 123);
|
|
100
100
|
|
|
101
101
|
// skip this part on windows, because setenv is not supported
|
|
@@ -106,12 +106,12 @@ int main(void) {
|
|
|
106
106
|
|
|
107
107
|
setenv("LLAMA_ARG_THREADS", "blah", true);
|
|
108
108
|
argv = {"binary_name"};
|
|
109
|
-
assert(false ==
|
|
109
|
+
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
110
110
|
|
|
111
111
|
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
|
112
112
|
setenv("LLAMA_ARG_THREADS", "1010", true);
|
|
113
113
|
argv = {"binary_name"};
|
|
114
|
-
assert(true ==
|
|
114
|
+
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
115
115
|
assert(params.model == "blah.gguf");
|
|
116
116
|
assert(params.cpuparams.n_threads == 1010);
|
|
117
117
|
|
|
@@ -121,7 +121,7 @@ int main(void) {
|
|
|
121
121
|
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
|
122
122
|
setenv("LLAMA_ARG_THREADS", "1010", true);
|
|
123
123
|
argv = {"binary_name", "-m", "overwritten.gguf"};
|
|
124
|
-
assert(true ==
|
|
124
|
+
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
|
125
125
|
assert(params.model == "overwritten.gguf");
|
|
126
126
|
assert(params.cpuparams.n_threads == 1010);
|
|
127
127
|
#endif // _WIN32
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
#include <ggml.h>
|
|
19
|
+
#include <ggml-cpu.h>
|
|
19
20
|
#include <ggml-alloc.h>
|
|
20
21
|
#include <ggml-backend.h>
|
|
21
22
|
|
|
@@ -133,7 +134,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|
|
133
134
|
std::vector<uint8_t> buf(ggml_nbytes(t));
|
|
134
135
|
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
|
|
135
136
|
|
|
136
|
-
|
|
137
|
+
const auto * tt = ggml_get_type_traits(t->type);
|
|
137
138
|
size_t bs = ggml_blck_size(t->type);
|
|
138
139
|
std::vector<float> vq(ggml_blck_size(t->type));
|
|
139
140
|
bool quantized = ggml_is_quantized(t->type);
|
|
@@ -159,7 +160,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|
|
159
160
|
} else if (t->type == GGML_TYPE_I8) {
|
|
160
161
|
tv.push_back((float)*(int8_t *) &buf[i]);
|
|
161
162
|
} else if (quantized) {
|
|
162
|
-
tt
|
|
163
|
+
tt->to_float(&buf[i], vq.data(), bs);
|
|
163
164
|
tv.insert(tv.end(), vq.begin(), vq.end());
|
|
164
165
|
} else {
|
|
165
166
|
GGML_ABORT("fatal error");
|
|
@@ -680,6 +681,7 @@ struct test_case {
|
|
|
680
681
|
|
|
681
682
|
// run
|
|
682
683
|
int64_t total_time_us = 0;
|
|
684
|
+
int64_t total_mem = 0;
|
|
683
685
|
int total_runs = 0;
|
|
684
686
|
do {
|
|
685
687
|
int64_t start_time = ggml_time_us();
|
|
@@ -687,6 +689,7 @@ struct test_case {
|
|
|
687
689
|
int64_t end_time = ggml_time_us();
|
|
688
690
|
|
|
689
691
|
total_time_us += end_time - start_time;
|
|
692
|
+
total_mem += mem;
|
|
690
693
|
total_runs += n_runs;
|
|
691
694
|
} while (total_time_us < 1000*1000); // run for at least 1 second
|
|
692
695
|
|
|
@@ -716,7 +719,7 @@ struct test_case {
|
|
|
716
719
|
} else {
|
|
717
720
|
printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
|
|
718
721
|
op_size(out) / 1024,
|
|
719
|
-
|
|
722
|
+
total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
|
|
720
723
|
}
|
|
721
724
|
printf("\n");
|
|
722
725
|
|
|
@@ -808,11 +811,11 @@ struct test_case {
|
|
|
808
811
|
|
|
809
812
|
ggml_build_forward_expand(gf, out);
|
|
810
813
|
ggml_graph_cpy(gf, gb);
|
|
811
|
-
ggml_build_backward_expand(ctx,
|
|
814
|
+
ggml_build_backward_expand(ctx, ctx, gb, false);
|
|
812
815
|
if (expect.size() != 1 || expect[0] != 0.0f) {
|
|
813
816
|
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
|
|
814
817
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
815
|
-
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->
|
|
818
|
+
GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE);
|
|
816
819
|
}
|
|
817
820
|
}
|
|
818
821
|
|
|
@@ -859,7 +862,13 @@ struct test_case {
|
|
|
859
862
|
const char * bn = ggml_backend_name(backend);
|
|
860
863
|
const int64_t ne = ggml_nelements(t);
|
|
861
864
|
|
|
862
|
-
std::vector<float> ga
|
|
865
|
+
std::vector<float> ga;
|
|
866
|
+
struct ggml_tensor * grad = ggml_graph_get_grad(gb, t);
|
|
867
|
+
if (grad) {
|
|
868
|
+
ga = tensor_to_float(grad);
|
|
869
|
+
} else {
|
|
870
|
+
ga.resize(ne); // default value is 0.0f
|
|
871
|
+
}
|
|
863
872
|
|
|
864
873
|
for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
|
|
865
874
|
// check for nans
|
|
@@ -1613,8 +1622,8 @@ struct test_ssm_scan : public test_case {
|
|
|
1613
1622
|
}
|
|
1614
1623
|
};
|
|
1615
1624
|
|
|
1616
|
-
//
|
|
1617
|
-
struct
|
|
1625
|
+
// GGML_OP_RWKV_WKV6
|
|
1626
|
+
struct test_rwkv_wkv6 : public test_case {
|
|
1618
1627
|
const ggml_type type;
|
|
1619
1628
|
|
|
1620
1629
|
const int64_t head_count;
|
|
@@ -1626,7 +1635,7 @@ struct test_rwkv_wkv : public test_case {
|
|
|
1626
1635
|
return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
|
|
1627
1636
|
}
|
|
1628
1637
|
|
|
1629
|
-
|
|
1638
|
+
test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32,
|
|
1630
1639
|
int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
|
|
1631
1640
|
: type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
|
|
1632
1641
|
|
|
@@ -1638,7 +1647,7 @@ struct test_rwkv_wkv : public test_case {
|
|
|
1638
1647
|
ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
|
|
1639
1648
|
ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
|
|
1640
1649
|
ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
|
|
1641
|
-
ggml_tensor * out =
|
|
1650
|
+
ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s);
|
|
1642
1651
|
return out;
|
|
1643
1652
|
}
|
|
1644
1653
|
};
|
|
@@ -1650,11 +1659,12 @@ struct test_mul_mat : public test_case {
|
|
|
1650
1659
|
const int64_t m;
|
|
1651
1660
|
const int64_t n;
|
|
1652
1661
|
const int64_t k;
|
|
1653
|
-
const std::array<int64_t, 2> bs;
|
|
1654
|
-
const std::array<int64_t, 2> nr;
|
|
1662
|
+
const std::array<int64_t, 2> bs; // dims 3 and 4
|
|
1663
|
+
const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
|
|
1664
|
+
const std::array<int64_t, 4> per; // permutation of dimensions
|
|
1655
1665
|
|
|
1656
1666
|
std::string vars() override {
|
|
1657
|
-
return
|
|
1667
|
+
return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
|
|
1658
1668
|
}
|
|
1659
1669
|
|
|
1660
1670
|
double max_nmse_err() override {
|
|
@@ -1669,17 +1679,44 @@ struct test_mul_mat : public test_case {
|
|
|
1669
1679
|
test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
|
|
1670
1680
|
int64_t m = 32, int64_t n = 32, int64_t k = 32,
|
|
1671
1681
|
std::array<int64_t, 2> bs = {10, 10},
|
|
1672
|
-
std::array<int64_t, 2> nr = {2, 2}
|
|
1673
|
-
|
|
1682
|
+
std::array<int64_t, 2> nr = {2, 2},
|
|
1683
|
+
std::array<int64_t, 4> per = {0, 1, 2, 3})
|
|
1684
|
+
: type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
|
|
1674
1685
|
|
|
1675
1686
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
1676
1687
|
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
|
1677
|
-
ggml_tensor * a
|
|
1678
|
-
ggml_tensor * b
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1688
|
+
ggml_tensor * a;
|
|
1689
|
+
ggml_tensor * b;
|
|
1690
|
+
|
|
1691
|
+
const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
|
|
1692
|
+
if (npermuted > 0) {
|
|
1693
|
+
GGML_ASSERT(npermuted == 2);
|
|
1694
|
+
GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
|
|
1695
|
+
GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
|
|
1696
|
+
|
|
1697
|
+
// Create tensors with the permuted dimensions, then permute them back to the dimensions given by m,n,k.
|
|
1698
|
+
const int64_t ne_a[4] = {k, m, bs[0], bs[1]};
|
|
1699
|
+
const int64_t ne_b[4] = {k, n, bs[0]*nr[0], bs[1]*nr[1]};
|
|
1700
|
+
|
|
1701
|
+
a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
|
|
1702
|
+
b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
|
|
1703
|
+
ggml_set_param(ctx, a);
|
|
1704
|
+
ggml_set_param(ctx, b);
|
|
1705
|
+
ggml_set_name(a, "a");
|
|
1706
|
+
ggml_set_name(b, "b");
|
|
1707
|
+
|
|
1708
|
+
a = ggml_permute(ctx, a, per[0], per[1], per[2], per[3]);
|
|
1709
|
+
b = ggml_permute(ctx, b, per[0], per[1], per[2], per[3]);
|
|
1710
|
+
ggml_set_name(a, "a_permuted");
|
|
1711
|
+
ggml_set_name(b, "b_permuted");
|
|
1712
|
+
} else {
|
|
1713
|
+
a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
|
|
1714
|
+
b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
|
|
1715
|
+
ggml_set_param(ctx, a);
|
|
1716
|
+
ggml_set_param(ctx, b);
|
|
1717
|
+
ggml_set_name(a, "a");
|
|
1718
|
+
ggml_set_name(b, "b");
|
|
1719
|
+
}
|
|
1683
1720
|
|
|
1684
1721
|
ggml_tensor * out = ggml_mul_mat(ctx, a, b);
|
|
1685
1722
|
ggml_set_name(out, "out");
|
|
@@ -2469,6 +2506,35 @@ struct test_sum_rows : public test_case {
|
|
|
2469
2506
|
}
|
|
2470
2507
|
};
|
|
2471
2508
|
|
|
2509
|
+
// GGML_OP_MEAN
|
|
2510
|
+
struct test_mean : public test_case {
|
|
2511
|
+
const ggml_type type;
|
|
2512
|
+
const std::array<int64_t, 4> ne;
|
|
2513
|
+
|
|
2514
|
+
std::string vars() override {
|
|
2515
|
+
return VARS_TO_STR2(type, ne);
|
|
2516
|
+
}
|
|
2517
|
+
|
|
2518
|
+
test_mean(ggml_type type = GGML_TYPE_F32,
|
|
2519
|
+
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
2520
|
+
: type(type), ne(ne) {}
|
|
2521
|
+
|
|
2522
|
+
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
2523
|
+
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
|
2524
|
+
ggml_set_param(ctx, a);
|
|
2525
|
+
ggml_set_name(a, "a");
|
|
2526
|
+
|
|
2527
|
+
ggml_tensor * out = ggml_mean(ctx, a);
|
|
2528
|
+
ggml_set_name(out, "out");
|
|
2529
|
+
|
|
2530
|
+
return out;
|
|
2531
|
+
}
|
|
2532
|
+
|
|
2533
|
+
float grad_eps() override {
|
|
2534
|
+
return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
|
|
2535
|
+
}
|
|
2536
|
+
};
|
|
2537
|
+
|
|
2472
2538
|
// GGML_OP_UPSCALE
|
|
2473
2539
|
struct test_upscale : public test_case {
|
|
2474
2540
|
const ggml_type type;
|
|
@@ -2711,6 +2777,13 @@ struct test_flash_attn_ext : public test_case {
|
|
|
2711
2777
|
return 5e-4;
|
|
2712
2778
|
}
|
|
2713
2779
|
|
|
2780
|
+
uint64_t op_flops(ggml_tensor * t) override {
|
|
2781
|
+
GGML_UNUSED(t);
|
|
2782
|
+
// Just counting matmul costs:
|
|
2783
|
+
// Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
|
|
2784
|
+
return 2 * 2 * nh * nb * hs * kv;
|
|
2785
|
+
}
|
|
2786
|
+
|
|
2714
2787
|
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
|
|
2715
2788
|
bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
|
2716
2789
|
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
|
|
@@ -2796,24 +2869,14 @@ struct test_cross_entropy_loss : public test_case {
|
|
|
2796
2869
|
struct test_opt_step_adamw : public test_case {
|
|
2797
2870
|
const ggml_type type;
|
|
2798
2871
|
const std::array<int64_t, 4> ne;
|
|
2799
|
-
const float alpha;
|
|
2800
|
-
const float beta1;
|
|
2801
|
-
const float beta2;
|
|
2802
|
-
const float eps;
|
|
2803
|
-
const float wd;
|
|
2804
2872
|
|
|
2805
2873
|
std::string vars() override {
|
|
2806
|
-
return
|
|
2874
|
+
return VARS_TO_STR2(type, ne);
|
|
2807
2875
|
}
|
|
2808
2876
|
|
|
2809
2877
|
test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
|
|
2810
|
-
std::array<int64_t, 4> ne = {10, 5, 4, 3}
|
|
2811
|
-
|
|
2812
|
-
float beta1 = 0.9f,
|
|
2813
|
-
float beta2 = 0.999f,
|
|
2814
|
-
float eps = 1e-8f,
|
|
2815
|
-
float wd = 0.0f)
|
|
2816
|
-
: type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
|
|
2878
|
+
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
|
2879
|
+
: type(type), ne(ne) {}
|
|
2817
2880
|
|
|
2818
2881
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
2819
2882
|
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
|
@@ -2823,7 +2886,16 @@ struct test_opt_step_adamw : public test_case {
|
|
|
2823
2886
|
ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
|
2824
2887
|
ggml_set_name(grad, "grad");
|
|
2825
2888
|
|
|
2826
|
-
ggml_tensor *
|
|
2889
|
+
ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
|
2890
|
+
ggml_set_name(grad_m, "grad_m");
|
|
2891
|
+
|
|
2892
|
+
ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
|
2893
|
+
ggml_set_name(grad_v, "grad_v");
|
|
2894
|
+
|
|
2895
|
+
ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7);
|
|
2896
|
+
ggml_set_name(adamw_params, "adamw_params");
|
|
2897
|
+
|
|
2898
|
+
ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params);
|
|
2827
2899
|
ggml_set_name(out, "out");
|
|
2828
2900
|
|
|
2829
2901
|
return out;
|
|
@@ -2831,7 +2903,7 @@ struct test_opt_step_adamw : public test_case {
|
|
|
2831
2903
|
|
|
2832
2904
|
void initialize_tensors(ggml_context * ctx) override {
|
|
2833
2905
|
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
|
2834
|
-
init_tensor_uniform(t, 0.0f, 1.0f); // grad_v
|
|
2906
|
+
init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values.
|
|
2835
2907
|
}
|
|
2836
2908
|
}
|
|
2837
2909
|
|
|
@@ -3308,13 +3380,49 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
|
3308
3380
|
}
|
|
3309
3381
|
}
|
|
3310
3382
|
|
|
3311
|
-
|
|
3312
|
-
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
|
|
3313
|
-
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
|
|
3314
|
-
// test cases for 1D im2col
|
|
3383
|
+
// im2col 1D
|
|
3315
3384
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
|
3316
3385
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
|
3317
3386
|
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
|
3387
|
+
for (int s0 : {1, 3}) {
|
|
3388
|
+
for (int p0 : {0, 3}) {
|
|
3389
|
+
for (int d0 : {1, 3}) {
|
|
3390
|
+
test_cases.emplace_back(new test_im2col(
|
|
3391
|
+
GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 2, 2, 1}, {3, 2, 2, 1},
|
|
3392
|
+
s0, 0, p0, 0, d0, 0, false));
|
|
3393
|
+
}
|
|
3394
|
+
}
|
|
3395
|
+
}
|
|
3396
|
+
|
|
3397
|
+
// im2col 2D
|
|
3398
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
|
|
3399
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
|
|
3400
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
|
|
3401
|
+
for (int s0 : {1, 3}) {
|
|
3402
|
+
for (int s1 : {1, 3}) {
|
|
3403
|
+
for (int p0 : {0, 3}) {
|
|
3404
|
+
for (int p1 : {0, 3}) {
|
|
3405
|
+
for (int d0 : {1, 3}) {
|
|
3406
|
+
for (int d1 : {1, 3}) {
|
|
3407
|
+
test_cases.emplace_back(new test_im2col(
|
|
3408
|
+
GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 2, 2}, {3, 3, 2, 2},
|
|
3409
|
+
s0, s1, p0, p1, d0, d1, true));
|
|
3410
|
+
}
|
|
3411
|
+
}
|
|
3412
|
+
}
|
|
3413
|
+
}
|
|
3414
|
+
}
|
|
3415
|
+
}
|
|
3416
|
+
|
|
3417
|
+
// extra tests for im2col 2D
|
|
3418
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true));
|
|
3419
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true));
|
|
3420
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true));
|
|
3421
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 1024}, {3, 3, 2, 1024}, 1, 1, 1, 1, 1, 1, true));
|
|
3422
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2048}, {3, 3, 1, 2048}, 1, 1, 1, 1, 1, 1, true));
|
|
3423
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true));
|
|
3424
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
|
|
3425
|
+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
|
|
3318
3426
|
|
|
3319
3427
|
// sycl backend will limit task global_range < MAX_INT
|
|
3320
3428
|
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
|
@@ -3434,21 +3542,22 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
|
3434
3542
|
|
|
3435
3543
|
test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
|
|
3436
3544
|
|
|
3437
|
-
test_cases.emplace_back(new
|
|
3438
|
-
test_cases.emplace_back(new
|
|
3439
|
-
test_cases.emplace_back(new
|
|
3440
|
-
test_cases.emplace_back(new
|
|
3545
|
+
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
|
|
3546
|
+
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
|
|
3547
|
+
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
|
|
3548
|
+
test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
|
|
3441
3549
|
|
|
3442
3550
|
#if 1
|
|
3443
3551
|
for (ggml_type type_a : base_types) {
|
|
3444
3552
|
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
3445
|
-
|
|
3446
|
-
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,
|
|
3447
|
-
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,
|
|
3448
|
-
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,
|
|
3449
|
-
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,
|
|
3450
|
-
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,
|
|
3451
|
-
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,
|
|
3553
|
+
// test cases without permutation
|
|
3554
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
|
|
3555
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
|
|
3556
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
|
|
3557
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
|
|
3558
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
|
|
3559
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
|
|
3560
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
|
|
3452
3561
|
|
|
3453
3562
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
|
|
3454
3563
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
|
|
@@ -3457,6 +3566,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
|
3457
3566
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
|
|
3458
3567
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
|
|
3459
3568
|
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
|
|
3569
|
+
|
|
3570
|
+
// test cases with permutation
|
|
3571
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
|
|
3572
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
|
|
3573
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
|
|
3574
|
+
|
|
3575
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
|
|
3576
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
|
|
3577
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
|
|
3578
|
+
|
|
3579
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
|
|
3580
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
|
|
3581
|
+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
|
|
3460
3582
|
}
|
|
3461
3583
|
}
|
|
3462
3584
|
for (ggml_type type_a : other_types) {
|
|
@@ -3520,7 +3642,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
|
3520
3642
|
for (int n_mats : {4}) {
|
|
3521
3643
|
for (int n_used : {2}) {
|
|
3522
3644
|
for (bool b : {false}) {
|
|
3523
|
-
for (int n : {1}) {
|
|
3645
|
+
for (int n : {1, 32}) {
|
|
3524
3646
|
int m = 512;
|
|
3525
3647
|
int k = 256;
|
|
3526
3648
|
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
|
|
@@ -3647,6 +3769,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
|
3647
3769
|
|
|
3648
3770
|
test_cases.emplace_back(new test_sum());
|
|
3649
3771
|
test_cases.emplace_back(new test_sum_rows());
|
|
3772
|
+
test_cases.emplace_back(new test_mean());
|
|
3650
3773
|
test_cases.emplace_back(new test_upscale());
|
|
3651
3774
|
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
|
|
3652
3775
|
test_cases.emplace_back(new test_upscale_ext());
|
|
@@ -3666,7 +3789,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
|
3666
3789
|
for (int nh : { 32, }) {
|
|
3667
3790
|
for (int kv : { 512, 1024, }) {
|
|
3668
3791
|
for (int nb : { 1, 3, 32, 35, }) {
|
|
3669
|
-
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
|
3792
|
+
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
|
3670
3793
|
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
|
|
3671
3794
|
}
|
|
3672
3795
|
}
|
|
@@ -3678,9 +3801,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
|
3678
3801
|
}
|
|
3679
3802
|
|
|
3680
3803
|
test_cases.emplace_back(new test_cross_entropy_loss());
|
|
3681
|
-
|
|
3682
|
-
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
|
|
3683
|
-
}
|
|
3804
|
+
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
|
|
3684
3805
|
|
|
3685
3806
|
// these tests are disabled to save execution time, but they can be handy for debugging
|
|
3686
3807
|
#if 0
|
|
@@ -3700,6 +3821,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|
|
3700
3821
|
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
|
3701
3822
|
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
|
3702
3823
|
|
|
3824
|
+
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
|
|
3825
|
+
|
|
3703
3826
|
for (int bs : {1, 512}) {
|
|
3704
3827
|
for (ggml_type type_a : all_types) {
|
|
3705
3828
|
for (ggml_type type_b : {GGML_TYPE_F32}) {
|
|
@@ -3820,9 +3943,11 @@ int main(int argc, char ** argv) {
|
|
|
3820
3943
|
continue;
|
|
3821
3944
|
}
|
|
3822
3945
|
|
|
3823
|
-
|
|
3946
|
+
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
|
3947
|
+
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
|
3948
|
+
if (ggml_backend_set_n_threads_fn) {
|
|
3824
3949
|
// TODO: better value for n_threads
|
|
3825
|
-
|
|
3950
|
+
ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
|
|
3826
3951
|
}
|
|
3827
3952
|
|
|
3828
3953
|
printf(" Device description: %s\n", ggml_backend_dev_description(dev));
|
|
@@ -3846,6 +3971,8 @@ int main(int argc, char ** argv) {
|
|
|
3846
3971
|
ggml_backend_free(backend);
|
|
3847
3972
|
}
|
|
3848
3973
|
|
|
3974
|
+
ggml_quantize_free();
|
|
3975
|
+
|
|
3849
3976
|
printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
|
|
3850
3977
|
|
|
3851
3978
|
if (n_ok != ggml_backend_dev_count()) {
|
|
@@ -3853,8 +3980,6 @@ int main(int argc, char ** argv) {
|
|
|
3853
3980
|
return 1;
|
|
3854
3981
|
}
|
|
3855
3982
|
|
|
3856
|
-
ggml_quantize_free();
|
|
3857
|
-
|
|
3858
3983
|
printf("\033[1;32mOK\033[0m\n");
|
|
3859
3984
|
return 0;
|
|
3860
3985
|
}
|
|
@@ -65,6 +65,8 @@ int main(void) {
|
|
|
65
65
|
u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
|
|
66
66
|
// DeepSeek-V2
|
|
67
67
|
"{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
|
|
68
|
+
// ibm-granite/granite-3.0-8b-instruct
|
|
69
|
+
"{%- if tools %}\n {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n {%- for tool in tools %}\n {{- tool | tojson(indent=4) }}\n {%- if not loop.last %}\n {{- '\n\n' }}\n {%- endif %}\n {%- endfor %}\n {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'user' %}\n {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'tool_response' %}\n {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n {%- endif %}\n{%- endfor %}",
|
|
68
70
|
};
|
|
69
71
|
std::vector<std::string> expected_output = {
|
|
70
72
|
// teknium/OpenHermes-2.5-Mistral-7B
|
|
@@ -109,6 +111,8 @@ int main(void) {
|
|
|
109
111
|
u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
|
|
110
112
|
// DeepSeek-V2
|
|
111
113
|
u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant: I am an assistant <|end▁of▁sentence|>User: Another question\n\nAssistant:",
|
|
114
|
+
// ibm-granite/granite-3.0-8b-instruct
|
|
115
|
+
"<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>\n",
|
|
112
116
|
};
|
|
113
117
|
std::vector<char> formatted_chat(1024);
|
|
114
118
|
int32_t res;
|
|
@@ -140,11 +144,11 @@ int main(void) {
|
|
|
140
144
|
|
|
141
145
|
// test llama_chat_format_single for system message
|
|
142
146
|
printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
|
|
143
|
-
std::vector<
|
|
144
|
-
|
|
147
|
+
std::vector<common_chat_msg> chat2;
|
|
148
|
+
common_chat_msg sys_msg{"system", "You are a helpful assistant"};
|
|
145
149
|
|
|
146
150
|
auto fmt_sys = [&](std::string tmpl) {
|
|
147
|
-
auto output =
|
|
151
|
+
auto output = common_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
|
|
148
152
|
printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
|
|
149
153
|
printf("-------------------------\n");
|
|
150
154
|
return output;
|
|
@@ -160,10 +164,10 @@ int main(void) {
|
|
|
160
164
|
chat2.push_back({"system", "You are a helpful assistant"});
|
|
161
165
|
chat2.push_back({"user", "Hello"});
|
|
162
166
|
chat2.push_back({"assistant", "I am assistant"});
|
|
163
|
-
|
|
167
|
+
common_chat_msg new_msg{"user", "How are you"};
|
|
164
168
|
|
|
165
169
|
auto fmt_single = [&](std::string tmpl) {
|
|
166
|
-
auto output =
|
|
170
|
+
auto output = common_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
|
|
167
171
|
printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
|
|
168
172
|
printf("-------------------------\n");
|
|
169
173
|
return output;
|