@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
@@ -2311,7 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
2311
2311
  0x003000,
2312
2312
  };
2313
2313
 
2314
- // list is always in ascending order, to enable binary searh
2314
+ // list is always in ascending order, to enable binary search
2315
2315
  const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
2316
2316
  {0x000041, 0x000061},
2317
2317
  {0x000042, 0x000062},
@@ -3748,7 +3748,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
3748
3748
  {0x01E921, 0x01E943},
3749
3749
  };
3750
3750
 
3751
- // list is always in ascending order, to enable binary searh
3751
+ // list is always in ascending order, to enable binary search
3752
3752
  const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
3753
3753
  {0x000061, 0x000041},
3754
3754
  {0x000062, 0x000042},
@@ -116,9 +116,8 @@ llama_target_and_test(test-sampling.cpp)
116
116
  llama_target_and_test(test-chat-template.cpp)
117
117
 
118
118
  llama_target_and_test(test-grammar-parser.cpp)
119
- llama_target_and_test(test-llama-grammar.cpp)
120
119
  llama_target_and_test(test-grammar-integration.cpp)
121
- llama_target_and_test(test-grad0.cpp)
120
+ llama_target_and_test(test-llama-grammar.cpp)
122
121
  llama_target_and_test(test-barrier.cpp)
123
122
  # llama_target_and_test(test-opt.cpp) # SLOW
124
123
  llama_target_and_test(test-backend-ops.cpp)
@@ -10,12 +10,12 @@
10
10
  #include <cassert>
11
11
 
12
12
  int main(void) {
13
- gpt_params params;
13
+ common_params params;
14
14
 
15
15
  printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
16
16
  for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
17
17
  try {
18
- auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
18
+ auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
19
19
  std::unordered_set<std::string> seen_args;
20
20
  std::unordered_set<std::string> seen_env_vars;
21
21
  for (const auto & opt : ctx_arg.options) {
@@ -58,44 +58,44 @@ int main(void) {
58
58
 
59
59
  // missing value
60
60
  argv = {"binary_name", "-m"};
61
- assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
61
+ assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
62
62
 
63
63
  // wrong value (int)
64
64
  argv = {"binary_name", "-ngl", "hello"};
65
- assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
65
+ assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
66
66
 
67
67
  // wrong value (enum)
68
68
  argv = {"binary_name", "-sm", "hello"};
69
- assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
69
+ assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
70
70
 
71
71
  // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
72
72
  argv = {"binary_name", "--draft", "123"};
73
- assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
73
+ assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
74
74
 
75
75
 
76
76
  printf("test-arg-parser: test valid usage\n\n");
77
77
 
78
78
  argv = {"binary_name", "-m", "model_file.gguf"};
79
- assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
79
+ assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
80
80
  assert(params.model == "model_file.gguf");
81
81
 
82
82
  argv = {"binary_name", "-t", "1234"};
83
- assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
83
+ assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
84
84
  assert(params.cpuparams.n_threads == 1234);
85
85
 
86
86
  argv = {"binary_name", "--verbose"};
87
- assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
87
+ assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
88
88
  assert(params.verbosity > 1);
89
89
 
90
90
  argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
91
- assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
91
+ assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
92
92
  assert(params.model == "abc.gguf");
93
93
  assert(params.n_predict == 6789);
94
94
  assert(params.n_batch == 9090);
95
95
 
96
96
  // --draft cannot be used outside llama-speculative
97
97
  argv = {"binary_name", "--draft", "123"};
98
- assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
98
+ assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
99
99
  assert(params.n_draft == 123);
100
100
 
101
101
  // skip this part on windows, because setenv is not supported
@@ -106,12 +106,12 @@ int main(void) {
106
106
 
107
107
  setenv("LLAMA_ARG_THREADS", "blah", true);
108
108
  argv = {"binary_name"};
109
- assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
109
+ assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
110
110
 
111
111
  setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
112
112
  setenv("LLAMA_ARG_THREADS", "1010", true);
113
113
  argv = {"binary_name"};
114
- assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
114
+ assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
115
115
  assert(params.model == "blah.gguf");
116
116
  assert(params.cpuparams.n_threads == 1010);
117
117
 
@@ -121,7 +121,7 @@ int main(void) {
121
121
  setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
122
122
  setenv("LLAMA_ARG_THREADS", "1010", true);
123
123
  argv = {"binary_name", "-m", "overwritten.gguf"};
124
- assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
124
+ assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
125
125
  assert(params.model == "overwritten.gguf");
126
126
  assert(params.cpuparams.n_threads == 1010);
127
127
  #endif // _WIN32
@@ -16,6 +16,7 @@
16
16
 
17
17
 
18
18
  #include <ggml.h>
19
+ #include <ggml-cpu.h>
19
20
  #include <ggml-alloc.h>
20
21
  #include <ggml-backend.h>
21
22
 
@@ -133,7 +134,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
133
134
  std::vector<uint8_t> buf(ggml_nbytes(t));
134
135
  ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
135
136
 
136
- ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
137
+ const auto * tt = ggml_get_type_traits(t->type);
137
138
  size_t bs = ggml_blck_size(t->type);
138
139
  std::vector<float> vq(ggml_blck_size(t->type));
139
140
  bool quantized = ggml_is_quantized(t->type);
@@ -159,7 +160,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
159
160
  } else if (t->type == GGML_TYPE_I8) {
160
161
  tv.push_back((float)*(int8_t *) &buf[i]);
161
162
  } else if (quantized) {
162
- tt.to_float(&buf[i], vq.data(), bs);
163
+ tt->to_float(&buf[i], vq.data(), bs);
163
164
  tv.insert(tv.end(), vq.begin(), vq.end());
164
165
  } else {
165
166
  GGML_ABORT("fatal error");
@@ -680,6 +681,7 @@ struct test_case {
680
681
 
681
682
  // run
682
683
  int64_t total_time_us = 0;
684
+ int64_t total_mem = 0;
683
685
  int total_runs = 0;
684
686
  do {
685
687
  int64_t start_time = ggml_time_us();
@@ -687,6 +689,7 @@ struct test_case {
687
689
  int64_t end_time = ggml_time_us();
688
690
 
689
691
  total_time_us += end_time - start_time;
692
+ total_mem += mem;
690
693
  total_runs += n_runs;
691
694
  } while (total_time_us < 1000*1000); // run for at least 1 second
692
695
 
@@ -716,7 +719,7 @@ struct test_case {
716
719
  } else {
717
720
  printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
718
721
  op_size(out) / 1024,
719
- mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
722
+ total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
720
723
  }
721
724
  printf("\n");
722
725
 
@@ -808,11 +811,11 @@ struct test_case {
808
811
 
809
812
  ggml_build_forward_expand(gf, out);
810
813
  ggml_graph_cpy(gf, gb);
811
- ggml_build_backward_expand(ctx, gf, gb, false);
814
+ ggml_build_backward_expand(ctx, ctx, gb, false);
812
815
  if (expect.size() != 1 || expect[0] != 0.0f) {
813
816
  GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
814
817
  for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
815
- GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
818
+ GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE);
816
819
  }
817
820
  }
818
821
 
@@ -859,7 +862,13 @@ struct test_case {
859
862
  const char * bn = ggml_backend_name(backend);
860
863
  const int64_t ne = ggml_nelements(t);
861
864
 
862
- std::vector<float> ga = tensor_to_float(t->grad);
865
+ std::vector<float> ga;
866
+ struct ggml_tensor * grad = ggml_graph_get_grad(gb, t);
867
+ if (grad) {
868
+ ga = tensor_to_float(grad);
869
+ } else {
870
+ ga.resize(ne); // default value is 0.0f
871
+ }
863
872
 
864
873
  for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
865
874
  // check for nans
@@ -1613,8 +1622,8 @@ struct test_ssm_scan : public test_case {
1613
1622
  }
1614
1623
  };
1615
1624
 
1616
- // GGML_OP_RWKV_WKV
1617
- struct test_rwkv_wkv : public test_case {
1625
+ // GGML_OP_RWKV_WKV6
1626
+ struct test_rwkv_wkv6 : public test_case {
1618
1627
  const ggml_type type;
1619
1628
 
1620
1629
  const int64_t head_count;
@@ -1626,7 +1635,7 @@ struct test_rwkv_wkv : public test_case {
1626
1635
  return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
1627
1636
  }
1628
1637
 
1629
- test_rwkv_wkv(ggml_type type = GGML_TYPE_F32,
1638
+ test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32,
1630
1639
  int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
1631
1640
  : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
1632
1641
 
@@ -1638,7 +1647,7 @@ struct test_rwkv_wkv : public test_case {
1638
1647
  ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
1639
1648
  ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
1640
1649
  ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
1641
- ggml_tensor * out = ggml_rwkv_wkv(ctx, k, v, r, tf, td, s);
1650
+ ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s);
1642
1651
  return out;
1643
1652
  }
1644
1653
  };
@@ -1650,11 +1659,12 @@ struct test_mul_mat : public test_case {
1650
1659
  const int64_t m;
1651
1660
  const int64_t n;
1652
1661
  const int64_t k;
1653
- const std::array<int64_t, 2> bs; // dims 3 and 4
1654
- const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
1662
+ const std::array<int64_t, 2> bs; // dims 3 and 4
1663
+ const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
1664
+ const std::array<int64_t, 4> per; // permutation of dimensions
1655
1665
 
1656
1666
  std::string vars() override {
1657
- return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
1667
+ return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
1658
1668
  }
1659
1669
 
1660
1670
  double max_nmse_err() override {
@@ -1669,17 +1679,44 @@ struct test_mul_mat : public test_case {
1669
1679
  test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
1670
1680
  int64_t m = 32, int64_t n = 32, int64_t k = 32,
1671
1681
  std::array<int64_t, 2> bs = {10, 10},
1672
- std::array<int64_t, 2> nr = {2, 2})
1673
- : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
1682
+ std::array<int64_t, 2> nr = {2, 2},
1683
+ std::array<int64_t, 4> per = {0, 1, 2, 3})
1684
+ : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
1674
1685
 
1675
1686
  ggml_tensor * build_graph(ggml_context * ctx) override {
1676
1687
  // C^T = A * B^T: (k, m) * (k, n) => (m, n)
1677
- ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
1678
- ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
1679
- ggml_set_param(ctx, a);
1680
- ggml_set_param(ctx, b);
1681
- ggml_set_name(a, "a");
1682
- ggml_set_name(b, "b");
1688
+ ggml_tensor * a;
1689
+ ggml_tensor * b;
1690
+
1691
+ const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
1692
+ if (npermuted > 0) {
1693
+ GGML_ASSERT(npermuted == 2);
1694
+ GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
1695
+ GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
1696
+
1697
+ // Create tensors with the permuted dimensions, then permute them back to the dimensions given by m,n,k.
1698
+ const int64_t ne_a[4] = {k, m, bs[0], bs[1]};
1699
+ const int64_t ne_b[4] = {k, n, bs[0]*nr[0], bs[1]*nr[1]};
1700
+
1701
+ a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
1702
+ b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
1703
+ ggml_set_param(ctx, a);
1704
+ ggml_set_param(ctx, b);
1705
+ ggml_set_name(a, "a");
1706
+ ggml_set_name(b, "b");
1707
+
1708
+ a = ggml_permute(ctx, a, per[0], per[1], per[2], per[3]);
1709
+ b = ggml_permute(ctx, b, per[0], per[1], per[2], per[3]);
1710
+ ggml_set_name(a, "a_permuted");
1711
+ ggml_set_name(b, "b_permuted");
1712
+ } else {
1713
+ a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
1714
+ b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
1715
+ ggml_set_param(ctx, a);
1716
+ ggml_set_param(ctx, b);
1717
+ ggml_set_name(a, "a");
1718
+ ggml_set_name(b, "b");
1719
+ }
1683
1720
 
1684
1721
  ggml_tensor * out = ggml_mul_mat(ctx, a, b);
1685
1722
  ggml_set_name(out, "out");
@@ -2469,6 +2506,35 @@ struct test_sum_rows : public test_case {
2469
2506
  }
2470
2507
  };
2471
2508
 
2509
+ // GGML_OP_MEAN
2510
+ struct test_mean : public test_case {
2511
+ const ggml_type type;
2512
+ const std::array<int64_t, 4> ne;
2513
+
2514
+ std::string vars() override {
2515
+ return VARS_TO_STR2(type, ne);
2516
+ }
2517
+
2518
+ test_mean(ggml_type type = GGML_TYPE_F32,
2519
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
2520
+ : type(type), ne(ne) {}
2521
+
2522
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2523
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2524
+ ggml_set_param(ctx, a);
2525
+ ggml_set_name(a, "a");
2526
+
2527
+ ggml_tensor * out = ggml_mean(ctx, a);
2528
+ ggml_set_name(out, "out");
2529
+
2530
+ return out;
2531
+ }
2532
+
2533
+ float grad_eps() override {
2534
+ return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
2535
+ }
2536
+ };
2537
+
2472
2538
  // GGML_OP_UPSCALE
2473
2539
  struct test_upscale : public test_case {
2474
2540
  const ggml_type type;
@@ -2711,6 +2777,13 @@ struct test_flash_attn_ext : public test_case {
2711
2777
  return 5e-4;
2712
2778
  }
2713
2779
 
2780
+ uint64_t op_flops(ggml_tensor * t) override {
2781
+ GGML_UNUSED(t);
2782
+ // Just counting matmul costs:
2783
+ // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
2784
+ return 2 * 2 * nh * nb * hs * kv;
2785
+ }
2786
+
2714
2787
  test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
2715
2788
  bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
2716
2789
  : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
@@ -2796,24 +2869,14 @@ struct test_cross_entropy_loss : public test_case {
2796
2869
  struct test_opt_step_adamw : public test_case {
2797
2870
  const ggml_type type;
2798
2871
  const std::array<int64_t, 4> ne;
2799
- const float alpha;
2800
- const float beta1;
2801
- const float beta2;
2802
- const float eps;
2803
- const float wd;
2804
2872
 
2805
2873
  std::string vars() override {
2806
- return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
2874
+ return VARS_TO_STR2(type, ne);
2807
2875
  }
2808
2876
 
2809
2877
  test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
2810
- std::array<int64_t, 4> ne = {10, 5, 4, 3},
2811
- float alpha = 1e-3f,
2812
- float beta1 = 0.9f,
2813
- float beta2 = 0.999f,
2814
- float eps = 1e-8f,
2815
- float wd = 0.0f)
2816
- : type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
2878
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
2879
+ : type(type), ne(ne) {}
2817
2880
 
2818
2881
  ggml_tensor * build_graph(ggml_context * ctx) override {
2819
2882
  ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
@@ -2823,7 +2886,16 @@ struct test_opt_step_adamw : public test_case {
2823
2886
  ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2824
2887
  ggml_set_name(grad, "grad");
2825
2888
 
2826
- ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd);
2889
+ ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2890
+ ggml_set_name(grad_m, "grad_m");
2891
+
2892
+ ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2893
+ ggml_set_name(grad_v, "grad_v");
2894
+
2895
+ ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7);
2896
+ ggml_set_name(adamw_params, "adamw_params");
2897
+
2898
+ ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params);
2827
2899
  ggml_set_name(out, "out");
2828
2900
 
2829
2901
  return out;
@@ -2831,7 +2903,7 @@ struct test_opt_step_adamw : public test_case {
2831
2903
 
2832
2904
  void initialize_tensors(ggml_context * ctx) override {
2833
2905
  for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
2834
- init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
2906
+ init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values.
2835
2907
  }
2836
2908
  }
2837
2909
 
@@ -3308,13 +3380,49 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3308
3380
  }
3309
3381
  }
3310
3382
 
3311
- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
3312
- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
3313
- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
3314
- // test cases for 1D im2col
3383
+ // im2col 1D
3315
3384
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3316
3385
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3317
3386
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3387
+ for (int s0 : {1, 3}) {
3388
+ for (int p0 : {0, 3}) {
3389
+ for (int d0 : {1, 3}) {
3390
+ test_cases.emplace_back(new test_im2col(
3391
+ GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 2, 2, 1}, {3, 2, 2, 1},
3392
+ s0, 0, p0, 0, d0, 0, false));
3393
+ }
3394
+ }
3395
+ }
3396
+
3397
+ // im2col 2D
3398
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
3399
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
3400
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
3401
+ for (int s0 : {1, 3}) {
3402
+ for (int s1 : {1, 3}) {
3403
+ for (int p0 : {0, 3}) {
3404
+ for (int p1 : {0, 3}) {
3405
+ for (int d0 : {1, 3}) {
3406
+ for (int d1 : {1, 3}) {
3407
+ test_cases.emplace_back(new test_im2col(
3408
+ GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 2, 2}, {3, 3, 2, 2},
3409
+ s0, s1, p0, p1, d0, d1, true));
3410
+ }
3411
+ }
3412
+ }
3413
+ }
3414
+ }
3415
+ }
3416
+
3417
+ // extra tests for im2col 2D
3418
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true));
3419
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true));
3420
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true));
3421
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 1024}, {3, 3, 2, 1024}, 1, 1, 1, 1, 1, 1, true));
3422
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2048}, {3, 3, 1, 2048}, 1, 1, 1, 1, 1, 1, true));
3423
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true));
3424
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
3425
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
3318
3426
 
3319
3427
  // sycl backend will limit task global_range < MAX_INT
3320
3428
  // test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
@@ -3434,21 +3542,22 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3434
3542
 
3435
3543
  test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
3436
3544
 
3437
- test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 1, 1));
3438
- test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 1));
3439
- test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 4));
3440
- test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 128, 4));
3545
+ test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
3546
+ test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
3547
+ test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
3548
+ test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
3441
3549
 
3442
3550
  #if 1
3443
3551
  for (ggml_type type_a : base_types) {
3444
3552
  for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
3445
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
3446
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
3447
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
3448
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
3449
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
3450
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
3451
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
3553
+ // test cases without permutation
3554
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
3555
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
3556
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
3557
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
3558
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
3559
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
3560
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
3452
3561
 
3453
3562
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
3454
3563
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
@@ -3457,6 +3566,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3457
3566
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
3458
3567
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
3459
3568
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
3569
+
3570
+ // test cases with permutation
3571
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
3572
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
3573
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
3574
+
3575
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
3576
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
3577
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
3578
+
3579
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
3580
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
3581
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
3460
3582
  }
3461
3583
  }
3462
3584
  for (ggml_type type_a : other_types) {
@@ -3520,7 +3642,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3520
3642
  for (int n_mats : {4}) {
3521
3643
  for (int n_used : {2}) {
3522
3644
  for (bool b : {false}) {
3523
- for (int n : {1}) {
3645
+ for (int n : {1, 32}) {
3524
3646
  int m = 512;
3525
3647
  int k = 256;
3526
3648
  test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
@@ -3647,6 +3769,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3647
3769
 
3648
3770
  test_cases.emplace_back(new test_sum());
3649
3771
  test_cases.emplace_back(new test_sum_rows());
3772
+ test_cases.emplace_back(new test_mean());
3650
3773
  test_cases.emplace_back(new test_upscale());
3651
3774
  test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
3652
3775
  test_cases.emplace_back(new test_upscale_ext());
@@ -3666,7 +3789,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3666
3789
  for (int nh : { 32, }) {
3667
3790
  for (int kv : { 512, 1024, }) {
3668
3791
  for (int nb : { 1, 3, 32, 35, }) {
3669
- for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
3792
+ for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
3670
3793
  test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
3671
3794
  }
3672
3795
  }
@@ -3678,9 +3801,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3678
3801
  }
3679
3802
 
3680
3803
  test_cases.emplace_back(new test_cross_entropy_loss());
3681
- for (float wd : {0.0f, 1e-2f}) {
3682
- test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
3683
- }
3804
+ test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
3684
3805
 
3685
3806
  // these tests are disabled to save execution time, but they can be handy for debugging
3686
3807
  #if 0
@@ -3700,6 +3821,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
3700
3821
  test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
3701
3822
  test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
3702
3823
 
3824
+ test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
3825
+
3703
3826
  for (int bs : {1, 512}) {
3704
3827
  for (ggml_type type_a : all_types) {
3705
3828
  for (ggml_type type_b : {GGML_TYPE_F32}) {
@@ -3820,9 +3943,11 @@ int main(int argc, char ** argv) {
3820
3943
  continue;
3821
3944
  }
3822
3945
 
3823
- if (ggml_backend_is_cpu(backend)) {
3946
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
3947
+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
3948
+ if (ggml_backend_set_n_threads_fn) {
3824
3949
  // TODO: better value for n_threads
3825
- ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
3950
+ ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
3826
3951
  }
3827
3952
 
3828
3953
  printf(" Device description: %s\n", ggml_backend_dev_description(dev));
@@ -3846,6 +3971,8 @@ int main(int argc, char ** argv) {
3846
3971
  ggml_backend_free(backend);
3847
3972
  }
3848
3973
 
3974
+ ggml_quantize_free();
3975
+
3849
3976
  printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
3850
3977
 
3851
3978
  if (n_ok != ggml_backend_dev_count()) {
@@ -3853,8 +3980,6 @@ int main(int argc, char ** argv) {
3853
3980
  return 1;
3854
3981
  }
3855
3982
 
3856
- ggml_quantize_free();
3857
-
3858
3983
  printf("\033[1;32mOK\033[0m\n");
3859
3984
  return 0;
3860
3985
  }
@@ -1,4 +1,5 @@
1
1
  #include "ggml.h"
2
+ #include "ggml-cpu.h"
2
3
  #include "ggml-backend.h"
3
4
 
4
5
  #include <chrono>
@@ -65,6 +65,8 @@ int main(void) {
65
65
  u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
66
66
  // DeepSeek-V2
67
67
  "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
68
+ // ibm-granite/granite-3.0-8b-instruct
69
+ "{%- if tools %}\n {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n {%- for tool in tools %}\n {{- tool | tojson(indent=4) }}\n {%- if not loop.last %}\n {{- '\n\n' }}\n {%- endif %}\n {%- endfor %}\n {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'user' %}\n {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'tool_response' %}\n {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n {%- endif %}\n{%- endfor %}",
68
70
  };
69
71
  std::vector<std::string> expected_output = {
70
72
  // teknium/OpenHermes-2.5-Mistral-7B
@@ -109,6 +111,8 @@ int main(void) {
109
111
  u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
110
112
  // DeepSeek-V2
111
113
  u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant: I am an assistant <|end▁of▁sentence|>User: Another question\n\nAssistant:",
114
+ // ibm-granite/granite-3.0-8b-instruct
115
+ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>\n",
112
116
  };
113
117
  std::vector<char> formatted_chat(1024);
114
118
  int32_t res;
@@ -140,11 +144,11 @@ int main(void) {
140
144
 
141
145
  // test llama_chat_format_single for system message
142
146
  printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
143
- std::vector<llama_chat_msg> chat2;
144
- llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
147
+ std::vector<common_chat_msg> chat2;
148
+ common_chat_msg sys_msg{"system", "You are a helpful assistant"};
145
149
 
146
150
  auto fmt_sys = [&](std::string tmpl) {
147
- auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
151
+ auto output = common_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
148
152
  printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
149
153
  printf("-------------------------\n");
150
154
  return output;
@@ -160,10 +164,10 @@ int main(void) {
160
164
  chat2.push_back({"system", "You are a helpful assistant"});
161
165
  chat2.push_back({"user", "Hello"});
162
166
  chat2.push_back({"assistant", "I am assistant"});
163
- llama_chat_msg new_msg{"user", "How are you"};
167
+ common_chat_msg new_msg{"user", "How are you"};
164
168
 
165
169
  auto fmt_single = [&](std::string tmpl) {
166
- auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
170
+ auto output = common_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
167
171
  printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
168
172
  printf("-------------------------\n");
169
173
  return output;