@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
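
The hunks below appear to be from package/src/llama.cpp/tests/test-backend-ops.cpp (entry 170 in the list above, +1592 -289). The new header comments in the first hunk give a quick start for adding an op test: define a test_case subclass in section 2 of the file and instantiate it in section 3. As a rough illustration only — the op ggml_sqr and the registration point are placeholders assumed here, not taken from this diff — such a subclass could look like:

    // Illustrative sketch only: a new op test following the test_example pattern shown
    // in the diff below. ggml_sqr is a stand-in for the op under test, and the
    // registration comment at the end assumes the usual test list in section 3 of the file.
    struct test_my_op : public test_case {
        const ggml_type type;             // input tensor type
        const std::array<int64_t, 4> ne;  // input tensor shape

        std::string vars() override {
            return VARS_TO_STR2(type, ne); // printed when the test runs
        }

        test_my_op(ggml_type type = GGML_TYPE_F32,
                   std::array<int64_t, 4> ne = {10, 5, 4, 3})
            : type(type), ne(ne) {}

        ggml_tensor * build_graph(ggml_context * ctx) override {
            ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
            ggml_set_param(ctx, a); // only if a backward pass exists for the op
            ggml_set_name(a, "a");

            ggml_tensor * out = ggml_sqr(ctx, a); // the op under test (illustrative)
            ggml_set_name(out, "out");
            return out;
        }
    };

    // section 3 (assumed registration point):
    //     test_cases.emplace_back(new test_my_op());
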
@@ -1,3 +1,20 @@
+// This file defines tests for various GGML ops and backends.
+// For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent.
+// For the backward pass it asserts that the gradients from backpropagation are consistent
+// with the gradients obtained via the method of finite differences ("grad" mode, this is optional).
+// It is also possible to check the performance ("perf" mode).
+//
+// this file has three sections: Section 1 does general setup, section 2 defines the GGML ops to be tested,
+// and section 3 defines which tests to run.
+// Quick start for adding a new GGML op: Go to section 2 and create a struct that inherits from test_case,
+// then go to section 3 and add an instantiation of your struct.
+
+
+// ##############################
+// ## Section 1: General Setup ##
+// ##############################
+
+
 #include <ggml.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
@@ -5,7 +22,9 @@
 #include <algorithm>
 #include <array>
 #include <cfloat>
+#include <cstdint>
 #include <cstring>
+#include <cinttypes>
 #include <functional>
 #include <memory>
 #include <random>
@@ -13,64 +32,52 @@
 #include <stdlib.h>
 #include <string>
 #include <thread>
+#include <future>
 #include <vector>

-
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
-    // static RNG initialization (revisit if n_threads stops being constant)
-    static const size_t n_threads = std::thread::hardware_concurrency();
-    static std::vector<std::default_random_engine> generators = []() {
-        std::random_device rd;
-        std::vector<std::default_random_engine> vec;
-        vec.reserve(n_threads);
-        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
-        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
-        return vec;
-    }();
-
-    size_t size = ggml_nelements(tensor);
-    std::vector<float> data(size);
+    size_t nels = ggml_nelements(tensor);
+    std::vector<float> data(nels);
+    {
+        // parallel initialization
+        static const size_t n_threads = std::thread::hardware_concurrency();
+        // static RNG initialization (revisit if n_threads stops being constant)
+        static std::vector<std::default_random_engine> generators = []() {
+            std::random_device rd;
+            std::vector<std::default_random_engine> vec;
+            vec.reserve(n_threads);
+            //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+            for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+            return vec;
+        }();
+
+        auto init_thread = [&](size_t ith, size_t start, size_t end) {
+            std::uniform_real_distribution<float> distribution(min, max);
+            auto & gen = generators[ith];
+            for (size_t i = start; i < end; i++) {
+                data[i] = distribution(gen);
+            }
+        };

-    auto init_thread = [&](size_t ith, size_t start, size_t end) {
-        std::uniform_real_distribution<float> distribution(min, max);
-        for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generators[ith]);
+        std::vector<std::future<void>> tasks;
+        tasks.reserve(n_threads);
+        for (size_t i = 0; i < n_threads; i++) {
+            size_t start = i*nels/n_threads;
+            size_t end = (i+1)*nels/n_threads;
+            tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
         }
-    };
-
-    std::vector<std::thread> threads;
-    threads.reserve(n_threads);
-    for (size_t i = 0; i < n_threads; i++) {
-        size_t start = i*size/n_threads;
-        size_t end = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, i, start, end);
-    }
-    for (auto & t : threads) {
-        t.join();
-    }
-
-#if 0
-    const char * val_str = getenv("GGML_TEST_EPS");
-    float val = 1e-9f;
-    if (val_str != nullptr) {
-        val = std::stof(val_str);
-        printf("GGML_TEST_EPS=%e\n", val);
-    }
-
-    // test quantization with very small values that may result in nan scales due to division by zero
-    if (ggml_is_quantized(tensor->type)) {
-        for (int i = 0; i < 256; i++) {
-            data[i] = val;
+        for (auto & t : tasks) {
+            t.get();
         }
     }
-#endif

     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
+        ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float));
     } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
-        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
-        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+        GGML_ASSERT(nels % ggml_blck_size(tensor->type) == 0);
+
+        // dummy importance matrix
+        std::vector<float> imatrix(tensor->ne[0], 1.0f);
         const float * im = imatrix.data();
         if (!ggml_quantize_requires_imatrix(tensor->type)) {
             // when the imatrix is optional, we want to test both quantization with and without imatrix
@@ -80,19 +87,40 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             }
         }

-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
-        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
-        // TODO: other cases
-        //#pragma omp parallel for
-        //for (int i = 0; i < tensor->ne[1]; i++) {
-        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
-        //        i * tensor->ne[0], 1, tensor->ne[0], im);
-        //}
-
+        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, nels));
+        {
+            // parallel quantization by block
+            size_t blck_size = ggml_blck_size(tensor->type);
+            size_t n_blocks = nels / blck_size;
+
+            auto quantize_thread = [&](size_t start, size_t end) {
+                ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+                    start * blck_size, end - start, blck_size, im);
+            };
+
+            const size_t min_blocks_per_thread = 1;
+            const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
+                std::max<size_t>(1, n_blocks / min_blocks_per_thread));
+            std::vector<std::future<void>> tasks;
+            tasks.reserve(n_threads);
+            for (size_t i = 0; i < n_threads; i++) {
+                size_t start = i*n_blocks/n_threads;
+                size_t end = (i+1)*n_blocks/n_threads;
+                tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
+            }
+            for (auto & t : tasks) {
+                t.get();
+            }
+        }
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
         ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+    } else if (tensor->type == GGML_TYPE_I64) {
+        // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
+        const size_t nbytes_half = ggml_nbytes(tensor)/2;
+        ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
+        ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
     } else {
         GGML_ABORT("fatal error");
     }
@@ -122,6 +150,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                     tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                 } else if (t->type == GGML_TYPE_F32) {
                     tv.push_back(*(float *) &buf[i]);
+                } else if (t->type == GGML_TYPE_I64) {
+                    tv.push_back((float)*(int64_t *) &buf[i]);
                 } else if (t->type == GGML_TYPE_I32) {
                     tv.push_back((float)*(int32_t *) &buf[i]);
                 } else if (t->type == GGML_TYPE_I16) {
@@ -142,60 +172,6 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
     return tv;
 }

-/*
-static double cosine_similarity(const float * v1, const float * v2, size_t n) {
-    double dot = 0.0;
-    double mag1 = 0.0;
-    double mag2 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return -1.0f;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        dot += v1[i]*v2[i];
-        mag1 += v1[i]*v1[i];
-        mag2 += v2[i]*v2[i];
-    }
-
-    return dot/sqrt(mag1*mag2);
-}
-
-static float distance(const float * v1, const float * v2, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        d += (v1[i] - v2[i])*(v1[i] - v2[i]);
-    }
-
-    return sqrt(d);
-}
-
-static float vec_len(const float * v, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v[i])) {
-            continue;
-        }
-        d += v[i]*v[i];
-    }
-
-    return sqrt(d);
-}
-*/
-
 // normalized mean squared error = mse(a, b) / mse(a, 0)
 static double nmse(const float * a, const float * b, size_t n) {
     double mse_a_b = 0.0;
@@ -212,8 +188,40 @@ static double nmse(const float * a, const float * b, size_t n) {
     return mse_a_b / mse_a_0;
 }

+// maximum absolute asymmetry between a and b
+// asymmetry: (a - b) / (a + b)
+// This is more stable than relative error if one of the values fluctuates towards zero.
+// n: number of values to compare.
+// expected_vals: optional vector of expected values for a. If expected_vals is not empty, filter out all comparisons where
+// a does not match any of the expected values. Needed for noncontinuous gradients where the numerical calculation can fail.
+static double mean_abs_asymm(const float * a, const float * b, const size_t n, const std::vector<float> & expected_vals) {
+    double sum = 0.0f;
+
+    size_t nvalid = 0;
+    for (size_t i = 0; i < n; i++) {
+        if (!expected_vals.empty()) {
+            bool matches_any = false;
+            for (const float & ev : expected_vals) {
+                if (fabsf(a[i] - ev) < 1e-3f) {
+                    matches_any = true;
+                    break;
+                }
+            }
+            if (!matches_any) {
+                continue;
+            }
+        }
+
+        const float asymm = (a[i] - b[i]) / (a[i] + b[i]);
+
+        sum += fabsf(asymm);
+        nvalid++;
+    }
+
+    return sum/nvalid;
+}
+
 // utils for printing the variables of the test cases
-#define VAR_TO_STR(x) (#x "=" + var_to_str(x))

 template<typename T>
 static std::string var_to_str(const T & x) {
@@ -246,10 +254,6 @@ static std::string var_to_str(const std::array<T, N> & x) {
     return s;
 }

-//static std::string var_to_str(ggml_unary_op unary_op) {
-//    return ggml_unary_op_name(unary_op);
-//}
-
 static std::string var_to_str(ggml_type type) {
     return ggml_type_name(type);
 }
@@ -262,6 +266,8 @@ static std::string var_to_str(ggml_op_pool pool) {
     }
 }

+#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
+
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
 #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
 #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
@@ -295,6 +301,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 enum test_mode {
     MODE_TEST,
     MODE_PERF,
+    MODE_GRAD,
 };

 struct test_case {
@@ -314,6 +321,32 @@ struct test_case {
         return 1e-7;
     }

+    virtual double max_maa_err() {
+        return 1e-4;
+    }
+
+    virtual float grad_eps() {
+        return 1e-1f;
+    }
+
+    // If false, estimate gradient with 2 points, neglects 3rd order derivative and higher.
+    // If true, estimate gradient with 4 points, neglects 5th order derivative and higher.
+    virtual bool grad_precise() {
+        return false;
+    }
+
+    // Skip gradient checks if total number of gradients to be checked is larger than this (to speed up the tests).
+    virtual int64_t grad_nmax() {
+        return 10000;
+    }
+
+    // No effect if empty.
+    // If not empty, skip all gradient checks where the numerical result does not match any of the values.
+    // Needed for dealing with noncontinuous gradients (e.g. ReLU) where estimation using finite differences is unreliable.
+    virtual std::vector<float> grad_expect() {
+        return {};
+    }
+
     virtual void initialize_tensors(ggml_context * ctx) {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
             init_tensor_uniform(t);
@@ -331,7 +364,13 @@ struct test_case {
         return size;
     }

+    virtual uint64_t op_flops(ggml_tensor * t) {
+        GGML_UNUSED(t);
+        return 0;
+    }
+
     ggml_cgraph * gf = nullptr;
+    ggml_cgraph * gb = nullptr;

     static const int sentinel_size = 1024;

@@ -340,7 +379,7 @@ struct test_case {
     std::vector<ggml_tensor *> sentinels;

     void add_sentinel(ggml_context * ctx) {
-        if (mode == MODE_PERF) {
+        if (mode == MODE_PERF || mode == MODE_GRAD) {
             return;
         }
         ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
@@ -389,6 +428,7 @@ struct test_case {
             /* .no_alloc = */ true,
         };
         ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx);

         gf = ggml_new_graph(ctx);

@@ -439,7 +479,7 @@ struct test_case {

         // add sentinels as graph nodes so that they are checked in the callback
         for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
+            ggml_graph_add_node(gf, sentinel);
         }

         // randomize tensors
@@ -550,6 +590,7 @@ struct test_case {
             /* .no_alloc = */ true,
         };
         ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx);

         ggml_tensor * out = build_graph(ctx);

@@ -570,12 +611,11 @@ struct test_case {
         }

         // align while also leaving some margin for variations in parameters
-        int align = 20;
+        int align = 8;
         int last = (len + align - 1) / align * align;
         if (last - len < 5) {
             last += align;
         }
-        last = std::max(last, 60);
         printf("%*s", last - len, "");

         // allocate
@@ -596,11 +636,27 @@ struct test_case {
         // warmup run
         ggml_backend_graph_compute(backend, gf);

+        // determine number of runs
+        int n_runs;
+        if (op_flops(out) > 0) {
+            // based on flops
+            const uint64_t GFLOP = 1000 * 1000 * 1000;
+            const uint64_t target_flops_cpu = 8ULL * GFLOP;
+            const uint64_t target_flops_gpu = 100ULL * GFLOP;
+            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
+        } else {
+            // based on memory size
+            const size_t GB = 1ULL << 30;
+            const size_t target_size_cpu = 8 * GB;
+            const size_t target_size_gpu = 32 * GB;
+            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
+        }
+
         // duplicate the op
-        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
         for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
+            ggml_graph_add_node(gf, out);
         }

         // calculate memory
@@ -615,36 +671,330 @@ struct test_case {
615
671
  }
616
672
  return size;
617
673
  };
618
- for (int i = 0; i < gf->n_nodes; i++) {
619
- if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
674
+ for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
675
+ if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
620
676
  continue;
621
677
  }
622
- mem += tensor_op_size(gf->nodes[i]);
678
+ mem += tensor_op_size(ggml_graph_node(gf, i));
623
679
  }
624
680
 
625
681
  // run
626
- ggml_backend_synchronize(backend);
682
+ int64_t total_time_us = 0;
683
+ int total_runs = 0;
684
+ do {
685
+ int64_t start_time = ggml_time_us();
686
+ ggml_backend_graph_compute(backend, gf);
687
+ int64_t end_time = ggml_time_us();
688
+
689
+ total_time_us += end_time - start_time;
690
+ total_runs += n_runs;
691
+ } while (total_time_us < 1000*1000); // run for at least 1 second
692
+
693
+ printf(" %8d runs - %8.2f us/run - ",
694
+ total_runs,
695
+ (double)total_time_us / total_runs);
696
+
697
+ if (op_flops(out) > 0) {
698
+ double flops_per_sec = (op_flops(out) * total_runs) / (total_time_us / 1e6);
699
+ auto format_flops = [](double flops) -> std::string {
700
+ char buf[256];
701
+ if (flops >= 1e12) {
702
+ snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12);
703
+ } else if (flops >= 1e9) {
704
+ snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9);
705
+ } else if (flops >= 1e6) {
706
+ snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6);
707
+ } else {
708
+ snprintf(buf, sizeof(buf), "%6.2f KFLOP", flops / 1e3);
709
+ }
710
+ return buf;
711
+ };
712
+ printf("%s/run - \033[1;34m%sS\033[0m",
713
+ format_flops(op_flops(out)).c_str(),
714
+ format_flops(flops_per_sec).c_str());
715
+
716
+ } else {
717
+ printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
718
+ op_size(out) / 1024,
719
+ mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
720
+ }
721
+ printf("\n");
722
+
723
+ ggml_backend_buffer_free(buf);
724
+
725
+ ggml_free(ctx);
726
+
727
+ return true;
728
+ }
729
+
730
+ bool eval_grad(ggml_backend_t backend, const char * op_name) {
731
+ mode = MODE_GRAD;
732
+ const std::vector<float> expect = grad_expect();
733
+
734
+ ggml_init_params params = {
735
+ /* .mem_size = */ ggml_tensor_overhead()*128 + 2*ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true),
736
+ /* .mem_base = */ NULL,
737
+ /* .no_alloc = */ true,
738
+ };
739
+ ggml_context * ctx = ggml_init(params);
740
+ GGML_ASSERT(ctx);
741
+
742
+ gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
743
+ gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
744
+
745
+ ggml_tensor * out = build_graph(ctx);
746
+
747
+ if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) {
748
+ //printf(" %s: skipping\n", op_desc(out).c_str());
749
+ ggml_free(ctx);
750
+ return true;
751
+ }
752
+
753
+ printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
754
+ fflush(stdout);
755
+
756
+ if (out->type != GGML_TYPE_F32) {
757
+ ggml_free(ctx);
758
+ printf("not supported [%s->type != FP32]\n", out->name);
759
+ return true;
760
+ }
761
+
762
+ // check if the backend supports the ops
763
+ bool supported = true;
764
+ bool any_params = false;
765
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
766
+ if (!ggml_backend_supports_op(backend, t)) {
767
+ printf("not supported [%s] ", ggml_backend_name(backend));
768
+ supported = false;
769
+ break;
770
+ }
771
+ if ((t->flags & GGML_TENSOR_FLAG_PARAM)) {
772
+ any_params = true;
773
+ if (t->type != GGML_TYPE_F32) {
774
+ printf("not supported [%s->type != FP32] ", t->name);
775
+ supported = false;
776
+ break;
777
+ }
778
+ }
779
+ }
780
+ if (!any_params) {
781
+ printf("not supported [%s] \n", op_name);
782
+ supported = false;
783
+ }
784
+ if (!supported) {
785
+ printf("\n");
786
+ ggml_free(ctx);
787
+ return true;
788
+ }
789
+
790
+ int64_t ngrads = 0;
791
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
792
+ if (t->flags & GGML_TENSOR_FLAG_PARAM) {
793
+ ngrads += ggml_nelements(t);
794
+ }
795
+ }
796
+ if (ngrads > grad_nmax()) {
797
+ printf("skipping large tensors for speed \n");
798
+ ggml_free(ctx);
799
+ return true;
800
+ }
801
+
802
+
803
+ if (!ggml_is_scalar(out)) {
804
+ out = ggml_sum(ctx, out);
805
+ ggml_set_name(out, "sum_of_out");
806
+ }
807
+ ggml_set_loss(out);
808
+
809
+ ggml_build_forward_expand(gf, out);
810
+ ggml_graph_cpy(gf, gb);
811
+ ggml_build_backward_expand(ctx, gf, gb, false);
812
+ if (expect.size() != 1 || expect[0] != 0.0f) {
813
+ GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
814
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
815
+ GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
816
+ }
817
+ }
818
+
819
+ // TODO: refactor so that this check is only needed once
820
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
821
+ if (!ggml_backend_supports_op(backend, t)) {
822
+ printf("not supported [%s] ", ggml_backend_name(backend));
823
+ supported = false;
824
+ break;
825
+ }
826
+ if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) {
827
+ printf("not supported [%s->type != FP32] ", t->name);
828
+ supported = false;
829
+ break;
830
+ }
831
+ }
832
+ if (!supported) {
833
+ printf("\n");
834
+ ggml_free(ctx);
835
+ return true;
836
+ }
837
+
838
+ // allocate
839
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
840
+ if (buf == NULL) {
841
+ printf("failed to allocate tensors [%s] ", ggml_backend_name(backend));
842
+ ggml_free(ctx);
843
+ return false;
844
+ }
845
+
846
+
847
+ initialize_tensors(ctx); // Randomizes all tensors (including gradients).
848
+ ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise.
627
849
 
628
- int64_t start_time = ggml_time_us();
629
850
  ggml_backend_graph_compute(backend, gf);
630
- ggml_backend_synchronize(backend);
631
- int64_t end_time = ggml_time_us();
632
- double time_us = end_time - start_time;
851
+ ggml_backend_graph_compute(backend, gb);
852
+
853
+ bool ok = true;
854
+ for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
855
+ if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) {
856
+ continue;
857
+ }
858
+
859
+ const char * bn = ggml_backend_name(backend);
860
+ const int64_t ne = ggml_nelements(t);
861
+
862
+ std::vector<float> ga = tensor_to_float(t->grad);
863
+
864
+ for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
865
+ // check for nans
866
+ if (!std::isfinite(ga[i])) {
867
+ printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", ggml_op_desc(t), i, bn, ga[i]);
868
+ ok = false;
869
+ break;
870
+ }
871
+ }
872
+ if (!ok) {
873
+ break;
874
+ }
875
+
876
+ std::vector<float> gn(ne); // gradient numeric
877
+ GGML_ASSERT(ga.size() == gn.size());
878
+
879
+ std::vector<float> x0 = tensor_to_float(t); // original t data
880
+ GGML_ASSERT(ggml_is_scalar(out));
881
+ GGML_ASSERT(out->type == GGML_TYPE_F32);
882
+
883
+ const float eps = grad_eps();
884
+ for (int64_t i = 0; i < ne; ++i) {
885
+ const float xiu = x0[i] + 1.0f*eps; // x, index i, up
886
+ const float xiuh = x0[i] + 0.5f*eps; // x, index i, up half
887
+ const float xidh = x0[i] - 0.5f*eps; // x, index i, down half
888
+ const float xid = x0[i] - 1.0f*eps; // x, index i, down
889
+
890
+ float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh
891
+
892
+ ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float));
893
+ ggml_backend_graph_compute(backend, gf);
894
+ ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out));
895
+
896
+ ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float));
897
+ ggml_backend_graph_compute(backend, gf);
898
+ ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out));
899
+
900
+ if (grad_precise()) {
901
+ ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float));
902
+ ggml_backend_graph_compute(backend, gf);
903
+ ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out));
904
+
905
+ ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float));
906
+ ggml_backend_graph_compute(backend, gf);
907
+ ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out));
908
+
909
+ gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps);
910
+ } else {
911
+ gn[i] = (fu - fd) / (2.0f*eps);
912
+ }
633
913
 
634
- printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
635
- n_runs,
636
- time_us / n_runs,
637
- op_size(out) / 1024,
638
- mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
914
+ ggml_backend_tensor_set(t, x0.data(), 0, ggml_nbytes(t));
915
+ }
916
+
917
+ const double err = mean_abs_asymm(gn.data(), ga.data(), gn.size(), expect);
918
+ if (err > max_maa_err()) {
919
+ printf("[%s] MAA = %.9f > %.9f ", ggml_op_desc(t), err, max_maa_err());
920
+ ok = false;
921
+ break;
922
+ }
923
+ if (!ok) {
924
+ break;
925
+ }
926
+ }
927
+
928
+ if (!ok) {
929
+ printf("compare failed ");
930
+ }
639
931
 
640
932
  ggml_backend_buffer_free(buf);
641
933
 
642
934
  ggml_free(ctx);
643
935
 
644
- return true;
936
+ if (ok) {
937
+ printf("\033[1;32mOK\033[0m\n");
938
+ return true;
939
+ }
940
+
941
+ printf("\033[1;31mFAIL\033[0m\n");
942
+ return false;
943
+ }
944
+ };
945
+
946
+
947
+ // ###################################
948
+ // ## Section 2: GGML Op Defintions ##
949
+ // ###################################
950
+
951
+
952
+ // The following is an example showing the bare minimum for creating a test for a GGML op.
953
+
954
+ // GGML_OP_EXAMPLE
955
+ struct test_example : public test_case {
956
+ // Always define these 2 or variants thereof:
957
+ const ggml_type type; // The type of the input tensors.
958
+ const std::array<int64_t, 4> ne; // The shape of the input tensors.
959
+ // For some ops it's necessary to define multiple types or shapes for the inputs.
960
+ // Or they may need additional parameters.
961
+
962
+ // Put all parameters needed to fully define the test into one of the VARS_TO_STR macros.
963
+ // In most cases these are just the properties of the struct that you defined above.
964
+ // This is needed for info prints.
965
+ std::string vars() override {
966
+ return VARS_TO_STR2(type, ne);
967
+ }
968
+
969
+ // Define a constructor for the struct.
970
+ // In most cases it will be sufficient to have the same arguments as the struct has properties
971
+ // and just use initializer lists.
972
+ test_example(ggml_type type = GGML_TYPE_F32,
973
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
974
+ : type(type), ne(ne) {}
975
+
976
+ // Define how a simple GGML compute graph can be constructed for the new GGML op.
977
+ ggml_tensor * build_graph(ggml_context * ctx) override {
978
+ // Step 1: create input tensors that don't depend on any other tensors:
979
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
980
+ ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging.
981
+
982
+ ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
983
+ ggml_set_name(b, "b");
984
+
985
+ // Step 2: use the op that you want to test in the GGML compute graph.
986
+ ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition.
987
+ ggml_set_name(out, "out");
988
+
989
+ // Step 3: return the output tensor.
990
+ return out;
645
991
  }
992
+ // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
993
+ // immediately after you create the tensors.
994
+ // This is optional and only makes sense if a backward pass has actually been implemented for the new op.
646
995
  };
647
996
 
997
+
648
998
  // GGML_OP_UNARY
649
999
  struct test_unary : public test_case {
650
1000
  const ggml_unary_op op;
@@ -658,20 +1008,36 @@ struct test_unary : public test_case {
658
1008
 
659
1009
  test_unary(ggml_unary_op op,
660
1010
  ggml_type type = GGML_TYPE_F32,
661
- std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
1011
+ std::array<int64_t, 4> ne_a = {128, 2, 2, 2},
662
1012
  int v = 0)
663
1013
  : op(op), type(type), ne_a(ne_a), v(v) {}
664
1014
 
665
1015
  ggml_tensor * build_graph(ggml_context * ctx) override {
1016
+ const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG ||
1017
+ op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU;
1018
+
666
1019
  ggml_tensor * a;
667
1020
  if (v & 1) {
668
1021
  auto ne = ne_a; ne[0] *= 3;
669
1022
  a = ggml_new_tensor(ctx, type, 4, ne.data());
1023
+ if (grad_supported) {
1024
+ ggml_set_param(ctx, a);
1025
+ }
1026
+ ggml_set_name(a, "a");
1027
+
670
1028
  a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
1029
+ ggml_set_name(a, "view_of_a");
671
1030
  } else {
672
1031
  a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1032
+ if (grad_supported) {
1033
+ ggml_set_param(ctx, a);
1034
+ }
1035
+ ggml_set_name(a, "a");
673
1036
  }
1037
+
674
1038
  ggml_tensor * out = ggml_unary(ctx, a, op);
1039
+ ggml_set_name(out, "out");
1040
+
675
1041
  return out;
676
1042
  }
677
1043
 
@@ -681,6 +1047,24 @@ struct test_unary : public test_case {
681
1047
  init_tensor_uniform(t, -150.f, 150.f);
682
1048
  }
683
1049
  }
1050
+
1051
+ float grad_eps() override {
1052
+ return 15.0f;
1053
+ }
1054
+
1055
+ std::vector<float> grad_expect() override {
1056
+ if (op == GGML_UNARY_OP_ABS) {
1057
+ return {-1.0f, 1.0f};
1058
+ }
1059
+ if (op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_STEP) {
1060
+ return {0.0f};
1061
+ }
1062
+ if (op == GGML_UNARY_OP_RELU) {
1063
+ return {0.0f, 1.0f};
1064
+ }
1065
+ return {};
1066
+ }
1067
+
684
1068
  };
685
1069
 
686
1070
  // GGML_OP_GET_ROWS
@@ -701,11 +1085,24 @@ struct test_get_rows : public test_case {
701
1085
 
702
1086
  ggml_tensor * build_graph(ggml_context * ctx) override {
703
1087
  ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
1088
+ ggml_set_name(in, "in");
1089
+
704
1090
  ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
1091
+ ggml_set_name(rows, "rows");
705
1092
  if (v) {
706
1093
  rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
1094
+ ggml_set_name(rows, "view_of_rows");
1095
+ }
1096
+
1097
+ const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows);
1098
+ if (grad_supported) {
1099
+ ggml_set_param(ctx, in);
1100
+ // rows is a constant input -> no gradients
707
1101
  }
1102
+
708
1103
  ggml_tensor * out = ggml_get_rows(ctx, in, rows);
1104
+ ggml_set_name(out, "out");
1105
+
709
1106
  return out;
710
1107
  }
711
1108
 
@@ -726,29 +1123,101 @@ struct test_get_rows : public test_case {
726
1123
  }
727
1124
  };
728
1125
 
729
- // GGML_OP_REPEAT
730
- struct test_repeat : public test_case {
1126
+ // GGML_OP_ARGMAX
1127
+ struct test_argmax : public test_case {
731
1128
  const ggml_type type;
732
1129
  const std::array<int64_t, 4> ne;
733
- const std::array<int, 4> nr;
734
1130
 
735
1131
  std::string vars() override {
736
- return VARS_TO_STR3(type, ne, nr);
1132
+ return VARS_TO_STR2(type, ne);
737
1133
  }
738
1134
 
739
- size_t op_size(ggml_tensor * t) override {
740
- return ggml_nbytes(t) * 2;
1135
+ test_argmax(ggml_type type = GGML_TYPE_F32,
1136
+ std::array<int64_t, 4> ne = {10, 100, 1, 1})
1137
+ : type(type), ne(ne) {}
1138
+
1139
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1140
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1141
+ ggml_set_name(a, "a");
1142
+
1143
+ ggml_tensor * out = ggml_argmax(ctx, a);
1144
+ ggml_set_name(out, "out");
1145
+
1146
+ return out;
1147
+ }
1148
+
1149
+ double max_nmse_err() override {
1150
+ return 0.0;
1151
+ }
1152
+ };
1153
+
1154
+ // GGML_OP_COUNT_EQUAL
1155
+ struct test_count_equal : public test_case {
1156
+ const ggml_type type;
1157
+ const std::array<int64_t, 4> ne;
1158
+
1159
+ std::string vars() override {
1160
+ return VARS_TO_STR2(type, ne);
1161
+ }
1162
+
1163
+ test_count_equal(ggml_type type = GGML_TYPE_F32,
1164
+ std::array<int64_t, 4> ne = {4, 500, 1, 1})
1165
+ : type(type), ne(ne) {}
1166
+
1167
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1168
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1169
+ ggml_set_name(a, "a");
1170
+
1171
+ ggml_tensor * a_argmax = ggml_argmax(ctx, a);
1172
+ ggml_set_name(a_argmax, "a_argmax");
1173
+
1174
+ ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
1175
+ ggml_set_name(b, "b");
1176
+
1177
+ ggml_tensor * b_argmax = ggml_argmax(ctx, a);
1178
+ ggml_set_name(b_argmax, "b_argmax");
1179
+
1180
+ ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
1181
+ ggml_set_name(out, "out");
1182
+
1183
+ return out;
1184
+ }
1185
+
1186
+ double max_nmse_err() override {
1187
+ return 0.0;
1188
+ }
1189
+ };
1190
+
1191
+ // GGML_OP_REPEAT
1192
+ struct test_repeat : public test_case {
1193
+ const ggml_type type;
1194
+ const std::array<int64_t, 4> ne;
1195
+ const std::array<int, 4> nr;
1196
+
1197
+ std::string vars() override {
1198
+ return VARS_TO_STR3(type, ne, nr);
1199
+ }
1200
+
1201
+ size_t op_size(ggml_tensor * t) override {
1202
+ return ggml_nbytes(t) * 2;
741
1203
  }
742
1204
 
743
1205
  test_repeat(ggml_type type = GGML_TYPE_F32,
744
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
1206
+ std::array<int64_t, 4> ne = {10, 5, 4, 3},
745
1207
  std::array<int, 4> nr = {2, 2, 2, 2})
746
1208
  : type(type), ne(ne), nr(nr) {}
747
1209
 
748
1210
  ggml_tensor * build_graph(ggml_context * ctx) override {
749
1211
  ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
1212
+ ggml_set_name(target, "target");
1213
+
750
1214
  ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
1215
+ ggml_set_param(ctx, src);
1216
+ ggml_set_name(src, "src");
1217
+
751
1218
  ggml_tensor * out = ggml_repeat(ctx, src, target);
1219
+ ggml_set_name(out, "out");
1220
+
752
1221
  return out;
753
1222
  }
754
1223
  };
@@ -774,10 +1243,62 @@ struct test_dup : public test_case {
774
1243
 
775
1244
  ggml_tensor * build_graph(ggml_context * ctx) override {
776
1245
  ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
1246
+ ggml_set_param(ctx, src);
1247
+ ggml_set_name(src, "src");
1248
+
777
1249
  if (_use_permute) {
778
1250
  src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
1251
+ ggml_set_name(src, "src_permuted");
779
1252
  }
1253
+
780
1254
  ggml_tensor * out = ggml_dup(ctx, src);
1255
+ ggml_set_name(out, "out");
1256
+
1257
+ return out;
1258
+ }
1259
+ };
1260
+
1261
+ // GGML_OP_SET
1262
+ struct test_set : public test_case {
1263
+ const ggml_type type_src;
1264
+ const ggml_type type_dst;
1265
+ const std::array<int64_t, 4> ne;
1266
+ const int dim;
1267
+
1268
+ std::string vars() override {
1269
+ return VARS_TO_STR4(type_src, type_dst, ne, dim);
1270
+ }
1271
+
1272
+ size_t op_size(ggml_tensor * t) override {
1273
+ return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
1274
+ }
1275
+
1276
+ test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
1277
+ std::array<int64_t, 4> ne = {6, 5, 4, 3}, int dim = 1)
1278
+ : type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {}
1279
+
1280
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1281
+ ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
1282
+ ggml_set_param(ctx, src);
1283
+ ggml_set_name(src, "src");
1284
+
1285
+ auto ne_dst = ne;
1286
+ for (int i = 0; i < dim; ++i) {
1287
+ ne_dst[i] *= 2;
1288
+ }
1289
+ ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
1290
+ ggml_set_param(ctx, dst);
1291
+ ggml_set_name(dst, "dst");
1292
+
1293
+ size_t offset = 0;
1294
+ for (int i = 0; i < dim; ++i) {
1295
+ offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i];
1296
+ }
1297
+ ggml_tensor * out = ggml_set(ctx, dst, src,
1298
+ // The backward pass requires setting a contiguous region:
1299
+ src->nb[1], src->nb[2], src->nb[3], offset);
1300
+ ggml_set_name(out, "out");
1301
+
781
1302
  return out;
782
1303
  }
783
1304
  };
@@ -804,18 +1325,26 @@ struct test_cpy : public test_case {
804
1325
 
805
1326
  test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
806
1327
  std::array<int64_t, 4> ne = {10, 10, 10, 1},
807
- std::array<int64_t, 4> permute = {0, 0, 0, 0},
808
- bool _dst_use_permute = false)
1328
+ std::array<int64_t, 4> permute = {0, 0, 0, 0})
809
1329
  : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
810
1330
  _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
811
1331
 
812
1332
  ggml_tensor * build_graph(ggml_context * ctx) override {
813
1333
  ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
1334
+ ggml_set_param(ctx, src);
1335
+ ggml_set_name(src, "src");
1336
+
814
1337
  if (_src_use_permute) {
815
1338
  src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
1339
+ ggml_set_name(src, "src_permuted");
816
1340
  }
1341
+
817
1342
  ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
1343
+ ggml_set_name(dst, "dst");
1344
+
818
1345
  ggml_tensor * out = ggml_cpy(ctx, src, dst);
1346
+ ggml_set_name(out, "out");
1347
+
819
1348
  return out;
820
1349
  }
821
1350
  };
@@ -835,8 +1364,14 @@ struct test_cont : public test_case {
835
1364
 
836
1365
  ggml_tensor * build_graph(ggml_context * ctx) override {
837
1366
  ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
1367
+ ggml_set_param(ctx, src);
1368
+ ggml_set_name(src, "src");
1369
+
838
1370
  src = ggml_transpose(ctx, src);
1371
+ ggml_set_name(src, "src_transposed");
1372
+
839
1373
  ggml_tensor * out = ggml_cont(ctx, src);
1374
+ ggml_set_name(out, "out");
840
1375
 
841
1376
  return out;
842
1377
  }
@@ -867,21 +1402,79 @@ struct test_bin_bcast : public test_case {
867
1402
 
868
1403
  ggml_tensor * build_graph(ggml_context * ctx) override {
869
1404
  ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
1405
+ ggml_set_name(a, "a");
1406
+
870
1407
  ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
1408
+ ggml_set_name(b, "b");
1409
+
1410
+ // The backward pass supports broadcasting only for GGML_ADD:
1411
+ const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
1412
+ if (grad_supported) {
1413
+ ggml_set_param(ctx, a);
1414
+ ggml_set_param(ctx, b);
1415
+ }
1416
+
871
1417
  ggml_tensor * out = op(ctx, a, b);
1418
+ ggml_set_name(out, "out");
1419
+
872
1420
  return out;
873
1421
  }
874
1422
 
875
1423
  void initialize_tensors(ggml_context * ctx) override {
876
1424
  for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
877
- if (op == ggml_div) {
878
- // avoid division by zero
879
- init_tensor_uniform(t, 1.0f, 2.0f);
1425
+ if (op == ggml_mul || op == ggml_div) {
1426
+ // MUL and DIV have numerical issues around zero:
1427
+ init_tensor_uniform(t, 0.9f, 1.1f);
880
1428
  } else {
881
1429
  init_tensor_uniform(t);
882
1430
  }
883
1431
  }
884
1432
  }
1433
+
1434
+ float grad_eps() override {
1435
+ return 0.1f * (op == ggml_mul ? ne[0]*ne[1]*ne[2]*ne[3] : 1);
1436
+ }
1437
+
1438
+ bool grad_precise() override {
1439
+ return op == ggml_div;
1440
+ }
1441
+
1442
+ double max_maa_err() override {
1443
+ return op == ggml_add ? 1e-4 : 1e-3;
1444
+ }
1445
+ };
1446
+
1447
+ // GGML_OP_ADD1
1448
+ struct test_add1 : public test_case {
1449
+ const ggml_type type;
1450
+ const std::array<int64_t, 4> ne;
1451
+
1452
+ std::string vars() override {
1453
+ return VARS_TO_STR2(type, ne);
1454
+ }
1455
+
1456
+ test_add1(ggml_type type = GGML_TYPE_F32,
1457
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
1458
+ : type(type), ne(ne) {}
1459
+
1460
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1461
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1462
+ ggml_set_param(ctx, a);
1463
+ ggml_set_name(a, "a");
1464
+
1465
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
1466
+ // ggml_set_param(ctx, b); // TODO: implement
1467
+ ggml_set_name(b, "b");
1468
+
1469
+ ggml_tensor * out = ggml_add1(ctx, a, b);
1470
+ ggml_set_name(out, "out");
1471
+
1472
+ return out;
1473
+ }
1474
+
1475
+ float grad_eps() override {
1476
+ return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
1477
+ }
885
1478
  };
886
1479
 
887
1480
  // GGML_OP_SCALE
@@ -901,7 +1494,12 @@ struct test_scale : public test_case {
901
1494
 
902
1495
  ggml_tensor * build_graph(ggml_context * ctx) override {
903
1496
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1497
+ ggml_set_param(ctx, a);
1498
+ ggml_set_name(a, "a");
1499
+
904
1500
  ggml_tensor * out = ggml_scale(ctx, a, scale);
1501
+ ggml_set_name(out, "out");
1502
+
905
1503
  return out;
906
1504
  }
907
1505
  };
@@ -917,13 +1515,17 @@ struct test_norm : public test_case {
917
1515
  }
918
1516
 
919
1517
  test_norm(ggml_type type = GGML_TYPE_F32,
920
- std::array<int64_t, 4> ne = {64, 10, 10, 10},
1518
+ std::array<int64_t, 4> ne = {64, 5, 4, 3},
921
1519
  float eps = 1e-6f)
922
1520
  : type(type), ne(ne), eps(eps) {}
923
1521
 
924
1522
  ggml_tensor * build_graph(ggml_context * ctx) override {
925
1523
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1524
+ ggml_set_name(a, "a");
1525
+
926
1526
  ggml_tensor * out = ggml_norm(ctx, a, eps);
1527
+ ggml_set_name(out, "out");
1528
+
927
1529
  return out;
928
1530
  }
929
1531
  };
@@ -939,13 +1541,104 @@ struct test_rms_norm : public test_case {
939
1541
  }
940
1542
 
941
1543
  test_rms_norm(ggml_type type = GGML_TYPE_F32,
942
- std::array<int64_t, 4> ne = {64, 10, 10, 10},
1544
+ std::array<int64_t, 4> ne = {64, 5, 4, 3},
943
1545
  float eps = 1e-6f)
944
1546
  : type(type), ne(ne), eps(eps) {}
945
1547
 
946
1548
  ggml_tensor * build_graph(ggml_context * ctx) override {
947
1549
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1550
+ ggml_set_param(ctx, a);
1551
+ ggml_set_name(a, "a");
1552
+
948
1553
  ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
1554
+ ggml_set_name(out, "out");
1555
+
1556
+ return out;
1557
+ }
1558
+
1559
+ bool grad_precise() override {
1560
+ return true;
1561
+ }
1562
+ };
1563
+
1564
+ // GGML_OP_SSM_CONV
1565
+ struct test_ssm_conv : public test_case {
1566
+ const ggml_type type;
1567
+ const std::array<int64_t, 4> ne_a;
1568
+ const std::array<int64_t, 4> ne_b;
1569
+
1570
+ std::string vars() override {
1571
+ return VARS_TO_STR3(type, ne_a, ne_b);
1572
+ }
1573
+
1574
+ test_ssm_conv(ggml_type type = GGML_TYPE_F32,
1575
+ std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
1576
+ std::array<int64_t, 4> ne_b = {3, 3, 1, 1})
1577
+ : type(type), ne_a(ne_a), ne_b(ne_b) {}
1578
+
1579
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1580
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1581
+ ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
1582
+ ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
1583
+ return out;
1584
+ }
1585
+ };
1586
+
1587
+ // GGML_OP_SSM_SCAN
1588
+ struct test_ssm_scan : public test_case {
1589
+ const ggml_type type;
1590
+
1591
+ const int64_t d_state;
1592
+ const int64_t d_inner;
1593
+ const int64_t n_seq_tokens;
1594
+ const int64_t n_seqs;
1595
+
1596
+ std::string vars() override {
1597
+ return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs);
1598
+ }
1599
+
1600
+ test_ssm_scan(ggml_type type = GGML_TYPE_F32,
1601
+ int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
1602
+ : type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
1603
+
1604
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1605
+ ggml_tensor * s = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, n_seqs, 1 }.data());
1606
+ ggml_tensor * x = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
1607
+ ggml_tensor * dt = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
1608
+ ggml_tensor * A = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, 1 , 1 }.data());
1609
+ ggml_tensor * B = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
1610
+ ggml_tensor * C = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
1611
+ ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C);
1612
+ return out;
1613
+ }
1614
+ };
1615
+
1616
+ // GGML_OP_RWKV_WKV
1617
+ struct test_rwkv_wkv : public test_case {
1618
+ const ggml_type type;
1619
+
1620
+ const int64_t head_count;
1621
+ const int64_t head_size;
1622
+ const int64_t n_seq_tokens;
1623
+ const int64_t n_seqs;
1624
+
1625
+ std::string vars() override {
1626
+ return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
1627
+ }
1628
+
1629
+ test_rwkv_wkv(ggml_type type = GGML_TYPE_F32,
1630
+ int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
1631
+ : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
1632
+
1633
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1634
+ const int64_t n_tokens = n_seq_tokens * n_seqs;
1635
+ ggml_tensor * r = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
1636
+ ggml_tensor * k = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ head_size, 1, head_count, n_tokens }.data());
1637
+ ggml_tensor * v = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
1638
+ ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
1639
+ ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
1640
+ ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
1641
+ ggml_tensor * out = ggml_rwkv_wkv(ctx, k, v, r, tf, td, s);
949
1642
  return out;
950
1643
  }
951
1644
  };
@@ -968,13 +1661,9 @@ struct test_mul_mat : public test_case {
968
1661
  return 5e-4;
969
1662
  }
970
1663
 
971
- size_t op_size(ggml_tensor * t) override {
972
- size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
973
- size_t b = ggml_nbytes(t->src[1]) * m;
974
- size_t c = ggml_nbytes(t);
975
- return a + b + c;
976
-
1664
+ uint64_t op_flops(ggml_tensor * t) override {
977
1665
  GGML_UNUSED(t);
1666
+ return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
978
1667
  }
979
1668
 
980
1669
  test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
@@ -987,7 +1676,14 @@ struct test_mul_mat : public test_case {
987
1676
  // C^T = A * B^T: (k, m) * (k, n) => (m, n)
988
1677
  ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
989
1678
  ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
1679
+ ggml_set_param(ctx, a);
1680
+ ggml_set_param(ctx, b);
1681
+ ggml_set_name(a, "a");
1682
+ ggml_set_name(b, "b");
1683
+
990
1684
  ggml_tensor * out = ggml_mul_mat(ctx, a, b);
1685
+ ggml_set_name(out, "out");
1686
+
991
1687
  return out;
992
1688
  }
993
1689
  };
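Note: the op_flops() override above uses the conventional cost model for a matrix product. Each of the m*n output elements accumulates k multiply-add pairs (2 FLOPs each), and the count is scaled by the batch sizes bs and repeat ratios nr of the broadcasted b tensor. A minimal sketch mirroring that expression (illustrative only):

#include <cstdint>

// 2 FLOPs (one multiply, one add) per accumulated element of the m x n result,
// repeated for every batch produced by the batch sizes bs and repeat ratios nr.
static uint64_t mul_mat_flops(int64_t m, int64_t n, int64_t k,
                              const int64_t bs[2], const int64_t nr[2]) {
    return 2ULL * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
}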
@@ -1011,13 +1707,9 @@ struct test_mul_mat_id : public test_case {
1011
1707
  return 5e-4;
1012
1708
  }
1013
1709
 
1014
- size_t op_size(ggml_tensor * t) override {
1015
- size_t a = ggml_nbytes(t->src[2]) * n;
1016
- size_t b = ggml_nbytes(t->src[1]) * m;
1017
- size_t c = ggml_nbytes(t);
1018
- return a + b + c;
1019
-
1710
+ uint64_t op_flops(ggml_tensor * t) override {
1020
1711
  GGML_UNUSED(t);
1712
+ return 2 * m * k * n * n_used;
1021
1713
  }
1022
1714
 
1023
1715
  test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
@@ -1031,12 +1723,21 @@ struct test_mul_mat_id : public test_case {
1031
1723
  ggml_tensor * build_graph(ggml_context * ctx) override {
1032
1724
  // C^T = A * B^T: (k, m) * (k, n) => (m, n)
1033
1725
  ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
1726
+ ggml_set_name(as, "as");
1727
+
1034
1728
  ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
1729
+ ggml_set_name(ids, "ids");
1035
1730
  if (n_used != n_mats) {
1036
1731
  ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
1732
+ ggml_set_name(ids, "view_of_ids");
1037
1733
  }
1734
+
1038
1735
  ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
1736
+ ggml_set_name(b, "b");
1737
+
1039
1738
  ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
1739
+ ggml_set_name(out, "out");
1740
+
1040
1741
  return out;
1041
1742
  }
1042
1743
 
@@ -1062,8 +1763,157 @@ struct test_mul_mat_id : public test_case {
1062
1763
  }
1063
1764
  };
1064
1765
 
1065
- // GGML_OP_SQR
1066
- struct test_sqr : public test_case {
1766
+ // GGML_OP_OUT_PROD
1767
+ struct test_out_prod : public test_case {
1768
+ const ggml_type type_a;
1769
+ const ggml_type type_b;
1770
+ const int64_t m;
1771
+ const int64_t n;
1772
+ const int64_t k;
1773
+ const std::array<int64_t, 2> bs; // dims 3 and 4
1774
+ const bool trans_b;
1775
+
1776
+ std::string vars() override {
1777
+ return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b);
1778
+ }
1779
+
1780
+ double max_nmse_err() override {
1781
+ return 5e-4;
1782
+ }
1783
+
1784
+ test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
1785
+ int64_t m = 32, int64_t n = 32, int64_t k = 32,
1786
+ std::array<int64_t, 2> bs = {10, 10},
1787
+ bool trans_b = false)
1788
+ : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {}
1789
+
1790
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1791
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]);
1792
+ ggml_set_name(a, "a");
1793
+
1794
+ ggml_tensor * b;
1795
+ if (trans_b) {
1796
+ b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]);
1797
+ b = ggml_transpose(ctx, b);
1798
+ } else {
1799
+ b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]);
1800
+ }
1801
+ ggml_set_name(b, "b");
1802
+
1803
+ ggml_tensor * out = ggml_out_prod(ctx, a, b);
1804
+ ggml_set_name(out, "out");
1805
+
1806
+ return out;
1807
+ }
1808
+ };
1809
+
1810
+ // GGML_OP_SQR
1811
+ struct test_sqr : public test_case {
1812
+ const ggml_type type;
1813
+ const std::array<int64_t, 4> ne;
1814
+
1815
+ std::string vars() override {
1816
+ return VARS_TO_STR2(type, ne);
1817
+ }
1818
+
1819
+ test_sqr(ggml_type type = GGML_TYPE_F32,
1820
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
1821
+ : type(type), ne(ne) {}
1822
+
1823
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1824
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1825
+ ggml_set_param(ctx, a);
1826
+ ggml_set_name(a, "a");
1827
+
1828
+ ggml_tensor * out = ggml_sqr(ctx, a);
1829
+ ggml_set_name(out, "out");
1830
+
1831
+ return out;
1832
+ }
1833
+
1834
+ float grad_eps() override {
1835
+ return 0.1f * 0.25f*ne[0]*ne[1]*ne[2]*ne[3]; // 10% of expected value of sum.
1836
+ }
1837
+ };
1838
+
1839
+ // GGML_OP_SQRT
1840
+ struct test_sqrt : public test_case {
1841
+ const ggml_type type;
1842
+ const std::array<int64_t, 4> ne;
1843
+
1844
+ std::string vars() override {
1845
+ return VARS_TO_STR2(type, ne);
1846
+ }
1847
+
1848
+ test_sqrt(ggml_type type = GGML_TYPE_F32,
1849
+ std::array<int64_t, 4> ne = {10, 3, 3, 2})
1850
+ : type(type), ne(ne) {}
1851
+
1852
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1853
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1854
+ ggml_set_param(ctx, a);
1855
+ ggml_set_name(a, "a");
1856
+
1857
+ ggml_tensor * out = ggml_sqrt(ctx, a);
1858
+ ggml_set_name(out, "out");
1859
+
1860
+ return out;
1861
+ }
1862
+
1863
+ void initialize_tensors(ggml_context * ctx) override {
1864
+ // fill with positive values
1865
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1866
+ init_tensor_uniform(t, 50.0f, 100.0f);
1867
+ }
1868
+ }
1869
+
1870
+ float grad_eps() override {
1871
+ return 20.0f;
1872
+ }
1873
+
1874
+ bool grad_precise() override {
1875
+ return true;
1876
+ }
1877
+ };
1878
+
1879
+ // GGML_OP_LOG
1880
+ struct test_log : public test_case {
1881
+ const ggml_type type;
1882
+ const std::array<int64_t, 4> ne;
1883
+
1884
+ std::string vars() override {
1885
+ return VARS_TO_STR2(type, ne);
1886
+ }
1887
+
1888
+ test_log(ggml_type type = GGML_TYPE_F32,
1889
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
1890
+ : type(type), ne(ne) {}
1891
+
1892
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1893
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1894
+ ggml_set_param(ctx, a);
1895
+ ggml_set_name(a, "a");
1896
+
1897
+ ggml_tensor * out = ggml_log(ctx, a);
1898
+ ggml_set_name(out, "out");
1899
+
1900
+ return out;
1901
+ }
1902
+
1903
+ void initialize_tensors(ggml_context * ctx) override {
1904
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1905
+ // log(1) == 0, cluster values there to keep the sum low for better precision in the backward pass:
1906
+ init_tensor_uniform(t, 0.9f, 1.1f);
1907
+ }
1908
+ }
1909
+
1910
+ bool grad_precise() override {
1911
+ return true;
1912
+ }
1913
+ };
1914
+
1915
+ // GGML_OP_SIN
1916
+ struct test_sin : public test_case {
1067
1917
  const ggml_type type;
1068
1918
  const std::array<int64_t, 4> ne;
1069
1919
 
@@ -1071,19 +1921,42 @@ struct test_sqr : public test_case {
1071
1921
  return VARS_TO_STR2(type, ne);
1072
1922
  }
1073
1923
 
1074
- test_sqr(ggml_type type = GGML_TYPE_F32,
1075
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
1924
+ test_sin(ggml_type type = GGML_TYPE_F32,
1925
+ std::array<int64_t, 4> ne = {10, 2, 2, 2})
1076
1926
  : type(type), ne(ne) {}
1077
1927
 
1078
1928
  ggml_tensor * build_graph(ggml_context * ctx) override {
1079
1929
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1080
- ggml_tensor * out = ggml_sqr(ctx, a);
1930
+ ggml_set_param(ctx, a);
1931
+ ggml_set_name(a, "a");
1932
+
1933
+ ggml_tensor * out = ggml_sin(ctx, a);
1934
+ ggml_set_name(out, "out");
1935
+
1081
1936
  return out;
1082
1937
  }
1938
+
1939
+ void initialize_tensors(ggml_context * ctx) override {
1940
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1941
+ init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
1942
+ }
1943
+ }
1944
+
1945
+ double max_maa_err() override {
1946
+ return 1e-3;
1947
+ }
1948
+
1949
+ float grad_eps() override {
1950
+ return 0.2f;
1951
+ }
1952
+
1953
+ bool grad_precise() override {
1954
+ return true;
1955
+ }
1083
1956
  };
1084
1957
 
1085
- // GGML_OP_SQRT
1086
- struct test_sqrt : public test_case {
1958
+ // GGML_OP_COS
1959
+ struct test_cos : public test_case {
1087
1960
  const ggml_type type;
1088
1961
  const std::array<int64_t, 4> ne;
1089
1962
 
@@ -1091,22 +1964,38 @@ struct test_sqrt : public test_case {
1091
1964
  return VARS_TO_STR2(type, ne);
1092
1965
  }
1093
1966
 
1094
- test_sqrt(ggml_type type = GGML_TYPE_F32,
1095
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
1967
+ test_cos(ggml_type type = GGML_TYPE_F32,
1968
+ std::array<int64_t, 4> ne = {10, 2, 2, 2})
1096
1969
  : type(type), ne(ne) {}
1097
1970
 
1098
1971
  ggml_tensor * build_graph(ggml_context * ctx) override {
1099
1972
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1100
- ggml_tensor * out = ggml_sqrt(ctx, a);
1973
+ ggml_set_param(ctx, a);
1974
+ ggml_set_name(a, "a");
1975
+
1976
+ ggml_tensor * out = ggml_cos(ctx, a);
1977
+ ggml_set_name(out, "out");
1978
+
1101
1979
  return out;
1102
1980
  }
1103
1981
 
1104
1982
  void initialize_tensors(ggml_context * ctx) override {
1105
- // fill with positive values
1106
1983
  for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1107
- init_tensor_uniform(t, 0.0f, 100.0f);
1984
+ init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
1108
1985
  }
1109
1986
  }
1987
+
1988
+ double max_maa_err() override {
1989
+ return 1e-3;
1990
+ }
1991
+
1992
+ float grad_eps() override {
1993
+ return 0.2f;
1994
+ }
1995
+
1996
+ bool grad_precise() override {
1997
+ return true;
1998
+ }
1110
1999
  };
1111
2000
 
1112
2001
  // GGML_OP_CLAMP
@@ -1121,15 +2010,27 @@ struct test_clamp : public test_case {
1121
2010
  }
1122
2011
 
1123
2012
  test_clamp(ggml_type type = GGML_TYPE_F32,
1124
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
2013
+ std::array<int64_t, 4> ne = {10, 5, 4, 3},
1125
2014
  float min = -0.5f, float max = 0.5f)
1126
2015
  : type(type), ne(ne), min(min), max(max) {}
1127
2016
 
1128
2017
  ggml_tensor * build_graph(ggml_context * ctx) override {
1129
2018
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2019
+ ggml_set_name(a, "a");
2020
+
1130
2021
  ggml_tensor * out = ggml_clamp(ctx, a, min, max);
2022
+ ggml_set_name(out, "out");
2023
+
1131
2024
  return out;
1132
2025
  }
2026
+
2027
+ float grad_eps() override {
2028
+ return 1e-2f;
2029
+ }
2030
+
2031
+ std::vector<float> grad_expect() override {
2032
+ return {0.0f, 1.0f};
2033
+ }
1133
2034
  };
1134
2035
 
1135
2036
  // GGML_OP_DIAG_MASK_INF
@@ -1143,13 +2044,18 @@ struct test_diag_mask_inf : public test_case {
1143
2044
  }
1144
2045
 
1145
2046
  test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
1146
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
2047
+ std::array<int64_t, 4> ne = {10, 10, 3, 2},
1147
2048
  int n_past = 5)
1148
2049
  : type(type), ne(ne), n_past(n_past) {}
1149
2050
 
1150
2051
  ggml_tensor * build_graph(ggml_context * ctx) override {
1151
2052
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2053
+ ggml_set_param(ctx, a);
2054
+ ggml_set_name(a, "a");
2055
+
1152
2056
  ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
2057
+ ggml_set_name(out, "out");
2058
+
1153
2059
  return out;
1154
2060
  }
1155
2061
  };
@@ -1173,7 +2079,7 @@ struct test_soft_max : public test_case {
1173
2079
  }
1174
2080
 
1175
2081
  test_soft_max(ggml_type type = GGML_TYPE_F32,
1176
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
2082
+ std::array<int64_t, 4> ne = {10, 5, 4, 3},
1177
2083
  bool mask = false,
1178
2084
  float scale = 1.0f,
1179
2085
  float max_bias = 0.0f)
@@ -1181,13 +2087,24 @@ struct test_soft_max : public test_case {
1181
2087
 
1182
2088
  ggml_tensor * build_graph(ggml_context * ctx) override {
1183
2089
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2090
+ ggml_set_param(ctx, a);
2091
+ ggml_set_name(a, "a");
2092
+
1184
2093
  ggml_tensor * mask = nullptr;
1185
2094
  if (this->mask) {
1186
2095
  mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
2096
+ ggml_set_name(mask, "mask");
1187
2097
  }
2098
+
1188
2099
  ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
2100
+ ggml_set_name(out, "out");
2101
+
1189
2102
  return out;
1190
2103
  }
2104
+
2105
+ bool grad_precise() override {
2106
+ return true;
2107
+ }
1191
2108
  };
1192
2109
 
1193
2110
 
@@ -1209,7 +2126,7 @@ struct test_rope : public test_case {
1209
2126
  }
1210
2127
 
1211
2128
  test_rope(ggml_type type = GGML_TYPE_F32,
1212
- std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
2129
+ std::array<int64_t, 4> ne_a = {10, 5, 3, 1},
1213
2130
  int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
1214
2131
  : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
1215
2132
 
@@ -1218,13 +2135,29 @@ struct test_rope : public test_case {
1218
2135
  if (v & 1) {
1219
2136
  auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
1220
2137
  a = ggml_new_tensor(ctx, type, 4, ne.data());
2138
+ ggml_set_param(ctx, a);
2139
+ ggml_set_name(a, "a");
2140
+
1221
2141
  a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
2142
+ ggml_set_name(a, "view_of_a");
1222
2143
  } else {
1223
2144
  a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2145
+ ggml_set_param(ctx, a);
2146
+ ggml_set_name(a, "a");
1224
2147
  }
2148
+
1225
2149
  ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
1226
- ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
2150
+ ggml_set_name(pos, "pos");
2151
+
2152
+ ggml_tensor * freq = nullptr;
2153
+ if (ff) {
2154
+ freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2);
2155
+ ggml_set_name(freq, "freq");
2156
+ }
2157
+
1227
2158
  ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
2159
+ ggml_set_name(out, "out");
2160
+
1228
2161
  return out;
1229
2162
  }
1230
2163
 
@@ -1247,6 +2180,14 @@ struct test_rope : public test_case {
1247
2180
  }
1248
2181
  }
1249
2182
  }
2183
+
2184
+ double max_maa_err() override {
2185
+ return 1e-3;
2186
+ }
2187
+
2188
+ bool grad_precise() override {
2189
+ return true;
2190
+ }
1250
2191
  };
1251
2192
 
1252
2193
  // GGML_OP_POOL2D
@@ -1278,7 +2219,12 @@ struct test_pool2d : public test_case {
1278
2219
 
1279
2220
  ggml_tensor * build_graph(ggml_context * ctx) override {
1280
2221
  ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
2222
+ ggml_set_param(ctx, input);
2223
+ ggml_set_name(input, "input");
2224
+
1281
2225
  ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
2226
+ ggml_set_name(out, "out");
2227
+
1282
2228
  return out;
1283
2229
  }
1284
2230
  };
@@ -1303,8 +2249,14 @@ struct test_conv_transpose_1d : public test_case {
1303
2249
 
1304
2250
  ggml_tensor * build_graph(ggml_context * ctx) override {
1305
2251
  ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
2252
+ ggml_set_name(input, "input");
2253
+
1306
2254
  ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
2255
+ ggml_set_name(kernel, "kernel");
2256
+
1307
2257
  ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
2258
+ ggml_set_name(out, "out");
2259
+
1308
2260
  return out;
1309
2261
  }
1310
2262
  };
@@ -1343,8 +2295,15 @@ struct test_im2col : public test_case {
1343
2295
 
1344
2296
  ggml_tensor * build_graph(ggml_context * ctx) override {
1345
2297
  ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
2298
+ ggml_set_param(ctx, input);
2299
+ ggml_set_name(input, "input");
2300
+
1346
2301
  ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
2302
+ ggml_set_name(kernel, "kernel");
2303
+
1347
2304
  ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
2305
+ ggml_set_name(out, "out");
2306
+
1348
2307
  return out;
1349
2308
  }
1350
2309
  };
@@ -1362,8 +2321,8 @@ struct test_concat : public test_case {
1362
2321
  }
1363
2322
 
1364
2323
  test_concat(ggml_type type = GGML_TYPE_F32,
1365
- std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
1366
- int64_t ne_b_d = 10,
2324
+ std::array<int64_t, 4> ne_a = {10, 5, 5, 5},
2325
+ int64_t ne_b_d = 5,
1367
2326
  int dim = 2, int v = 0)
1368
2327
  : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
1369
2328
 
@@ -1374,19 +2333,30 @@ struct test_concat : public test_case {
1374
2333
  if (v & 1) {
1375
2334
  auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
1376
2335
  a = ggml_new_tensor(ctx, type, 4, ne.data());
2336
+ ggml_set_name(a, "a");
2337
+
1377
2338
  a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
2339
+ ggml_set_name(a, "view_of_a");
1378
2340
  } else {
1379
2341
  a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2342
+ ggml_set_name(a, "a");
1380
2343
  }
1381
2344
  ggml_tensor * b;
1382
2345
  if (v & 2) {
1383
2346
  auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
1384
2347
  b = ggml_new_tensor(ctx, type, 4, ne.data());
2348
+ ggml_set_name(b, "b");
2349
+
1385
2350
  b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
2351
+ ggml_set_name(b, "view_of_b");
1386
2352
  } else {
1387
2353
  b = ggml_new_tensor(ctx, type, 4, ne_b.data());
2354
+ ggml_set_name(b, "b");
1388
2355
  }
2356
+
1389
2357
  ggml_tensor * out = ggml_concat(ctx, a, b, dim);
2358
+ ggml_set_name(out, "out");
2359
+
1390
2360
  return out;
1391
2361
  }
1392
2362
  };
@@ -1408,7 +2378,11 @@ struct test_argsort : public test_case {
1408
2378
 
1409
2379
  ggml_tensor * build_graph(ggml_context * ctx) override {
1410
2380
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2381
+ ggml_set_name(a, "a");
2382
+
1411
2383
  ggml_tensor * out = ggml_argsort(ctx, a, order);
2384
+ ggml_set_name(out, "out");
2385
+
1412
2386
  return out;
1413
2387
  }
1414
2388
 
@@ -1441,6 +2415,35 @@ struct test_argsort : public test_case {
1441
2415
  }
1442
2416
  };
1443
2417
 
2418
+ // GGML_OP_SUM
2419
+ struct test_sum : public test_case {
2420
+ const ggml_type type;
2421
+ const std::array<int64_t, 4> ne;
2422
+
2423
+ std::string vars() override {
2424
+ return VARS_TO_STR2(type, ne);
2425
+ }
2426
+
2427
+ test_sum(ggml_type type = GGML_TYPE_F32,
2428
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
2429
+ : type(type), ne(ne) {}
2430
+
2431
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2432
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2433
+ ggml_set_param(ctx, a);
2434
+ ggml_set_name(a, "a");
2435
+
2436
+ ggml_tensor * out = ggml_sum(ctx, a);
2437
+ ggml_set_name(out, "out");
2438
+
2439
+ return out;
2440
+ }
2441
+
2442
+ float grad_eps() override {
2443
+ return 0.1f * sqrtf(ne[0]*ne[1]*ne[2]*ne[3]);
2444
+ }
2445
+ };
2446
+
1444
2447
  // GGML_OP_SUM_ROWS
1445
2448
  struct test_sum_rows : public test_case {
1446
2449
  const ggml_type type;
@@ -1451,12 +2454,17 @@ struct test_sum_rows : public test_case {
1451
2454
  }
1452
2455
 
1453
2456
  test_sum_rows(ggml_type type = GGML_TYPE_F32,
1454
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
2457
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
1455
2458
  : type(type), ne(ne) {}
1456
2459
 
1457
2460
  ggml_tensor * build_graph(ggml_context * ctx) override {
1458
2461
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2462
+ ggml_set_param(ctx, a);
2463
+ ggml_set_name(a, "a");
2464
+
1459
2465
  ggml_tensor * out = ggml_sum_rows(ctx, a);
2466
+ ggml_set_name(out, "out");
2467
+
1460
2468
  return out;
1461
2469
  }
1462
2470
  };
@@ -1479,8 +2487,16 @@ struct test_upscale : public test_case {
1479
2487
 
1480
2488
  ggml_tensor * build_graph(ggml_context * ctx) override {
1481
2489
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1482
- if (transpose) a = ggml_transpose(ctx, a);
2490
+ ggml_set_name(a, "a");
2491
+
2492
+ if (transpose) {
2493
+ a = ggml_transpose(ctx, a);
2494
+ ggml_set_name(a, "a_transposed");
2495
+ }
2496
+
1483
2497
  ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
2498
+ ggml_set_name(out, "out");
2499
+
1484
2500
  return out;
1485
2501
  }
1486
2502
  };
@@ -1502,7 +2518,11 @@ struct test_upscale_ext : public test_case {
1502
2518
 
1503
2519
  ggml_tensor * build_graph(ggml_context * ctx) override {
1504
2520
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2521
+ ggml_set_name(a, "a");
2522
+
1505
2523
  ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
2524
+ ggml_set_name(out, "out");
2525
+
1506
2526
  return out;
1507
2527
  }
1508
2528
  };
@@ -1512,6 +2532,7 @@ struct test_group_norm : public test_case {
1512
2532
  const ggml_type type;
1513
2533
  const std::array<int64_t, 4> ne;
1514
2534
  const int32_t num_groups;
2535
+ const float eps;
1515
2536
 
1516
2537
  std::string vars() override {
1517
2538
  return VARS_TO_STR3(type, ne, num_groups);
@@ -1519,12 +2540,17 @@ struct test_group_norm : public test_case {
1519
2540
 
1520
2541
  test_group_norm(ggml_type type = GGML_TYPE_F32,
1521
2542
  std::array<int64_t, 4> ne = {64, 64, 320, 1},
1522
- int32_t num_groups = 32)
1523
- : type(type), ne(ne), num_groups(num_groups) {}
2543
+ int32_t num_groups = 32,
2544
+ float eps = 1e-6f)
2545
+ : type(type), ne(ne), num_groups(num_groups), eps(eps) {}
1524
2546
 
1525
2547
  ggml_tensor * build_graph(ggml_context * ctx) override {
1526
2548
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1527
- ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
2549
+ ggml_set_name(a, "a");
2550
+
2551
+ ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
2552
+ ggml_set_name(out, "out");
2553
+
1528
2554
  return out;
1529
2555
  }
1530
2556
  };
@@ -1540,14 +2566,22 @@ struct test_acc : public test_case {
1540
2566
  }
1541
2567
 
1542
2568
  test_acc(ggml_type type = GGML_TYPE_F32,
1543
- std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
1544
- std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
2569
+ std::array<int64_t, 4> ne_a = {256, 17, 1, 1},
2570
+ std::array<int64_t, 4> ne_b = {256, 16, 1, 1})
1545
2571
  : type(type), ne_a(ne_a), ne_b(ne_b) {}
1546
2572
 
1547
2573
  ggml_tensor * build_graph(ggml_context * ctx) override {
1548
2574
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2575
+ ggml_set_param(ctx, a);
2576
+ ggml_set_name(a, "a");
2577
+
1549
2578
  ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
2579
+ ggml_set_param(ctx, b);
2580
+ ggml_set_name(b, "b");
2581
+
1550
2582
  ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
2583
+ ggml_set_name(out, "out");
2584
+
1551
2585
  return out;
1552
2586
  }
1553
2587
  };
@@ -1570,7 +2604,11 @@ struct test_pad : public test_case {
1570
2604
 
1571
2605
  ggml_tensor * build_graph(ggml_context * ctx) override {
1572
2606
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2607
+ ggml_set_name(a, "a");
2608
+
1573
2609
  ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
2610
+ ggml_set_name(out, "out");
2611
+
1574
2612
  return out;
1575
2613
  }
1576
2614
  };
@@ -1592,6 +2630,8 @@ struct test_arange : public test_case {
1592
2630
 
1593
2631
  ggml_tensor * build_graph(ggml_context * ctx) override {
1594
2632
  ggml_tensor * out = ggml_arange(ctx, start, stop, step);
2633
+ ggml_set_name(out, "out");
2634
+
1595
2635
  return out;
1596
2636
  }
1597
2637
  };
@@ -1614,7 +2654,11 @@ struct test_timestep_embedding : public test_case {
1614
2654
 
1615
2655
  ggml_tensor * build_graph(ggml_context * ctx) override {
1616
2656
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2657
+ ggml_set_name(a, "a");
2658
+
1617
2659
  ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
2660
+ ggml_set_name(out, "out");
2661
+
1618
2662
  return out;
1619
2663
  }
1620
2664
  };
@@ -1630,13 +2674,17 @@ struct test_leaky_relu : public test_case {
1630
2674
  }
1631
2675
 
1632
2676
  test_leaky_relu(ggml_type type = GGML_TYPE_F32,
1633
- std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
2677
+ std::array<int64_t, 4> ne_a = {10, 5, 4, 3},
1634
2678
  float negative_slope = 0.1f)
1635
2679
  : type(type), ne_a(ne_a), negative_slope(negative_slope) {}
1636
2680
 
1637
2681
  ggml_tensor * build_graph(ggml_context * ctx) override {
1638
2682
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2683
+ ggml_set_name(a, "a");
2684
+
1639
2685
  ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
2686
+ ggml_set_name(out, "out");
2687
+
1640
2688
  return out;
1641
2689
  }
1642
2690
  };
@@ -1651,30 +2699,145 @@ struct test_flash_attn_ext : public test_case {
1651
2699
  const bool mask; // use mask
1652
2700
 
1653
2701
  const float max_bias; // ALiBi
2702
+ const float logit_softcap; // Gemma 2
1654
2703
 
1655
2704
  const ggml_type type_KV;
1656
2705
 
1657
2706
  std::string vars() override {
1658
- return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV);
2707
+ return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
1659
2708
  }
1660
2709
 
1661
2710
  double max_nmse_err() override {
1662
2711
  return 5e-4;
1663
2712
  }
1664
2713
 
1665
- test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
1666
- : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {}
2714
+ test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
2715
+ bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
2716
+ : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
1667
2717
 
1668
2718
  ggml_tensor * build_graph(ggml_context * ctx) override {
1669
2719
  const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
1670
2720
 
1671
2721
  ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
2722
+ ggml_set_name(q, "q");
2723
+
1672
2724
  ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
2725
+ ggml_set_name(k, "k");
2726
+
1673
2727
  ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
1674
- ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
1675
- ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
2728
+ ggml_set_name(v, "v");
2729
+
2730
+ ggml_tensor * m = nullptr;
2731
+ if (mask) {
2732
+ m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
2733
+ ggml_set_name(m, "m");
2734
+ }
2735
+
2736
+ ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
2737
+ ggml_set_name(out, "out");
2738
+
2739
+ return out;
2740
+ }
2741
+
2742
+ bool grad_precise() override {
2743
+ return true;
2744
+ }
2745
+ };
2746
+
2747
+ // GGML_OP_CROSS_ENTROPY_LOSS
2748
+ struct test_cross_entropy_loss : public test_case {
2749
+ const ggml_type type;
2750
+ const std::array<int64_t, 4> ne;
2751
+
2752
+ std::string vars() override {
2753
+ return VARS_TO_STR2(type, ne);
2754
+ }
2755
+
2756
+ test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
2757
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
2758
+ : type(type), ne(ne) {}
2759
+
2760
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2761
+ ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
2762
+ ggml_set_param(ctx, logits);
2763
+ ggml_set_name(logits, "logits");
2764
+
2765
+ ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
2766
+ // The labels are assumed to be constant -> no gradients.
2767
+ ggml_set_name(labels, "labels");
2768
+
2769
+ // Ensure labels add up to 1:
2770
+ labels = ggml_soft_max(ctx, labels);
2771
+ ggml_set_name(labels, "labels_normalized");
2772
+
2773
+ ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
2774
+ ggml_set_name(out, "out");
2775
+
2776
+ return out;
2777
+ }
2778
+
2779
+ void initialize_tensors(ggml_context * ctx) override {
2780
+ // For larger abs. diffs between logits softmax is more linear, therefore more precise num. gradients.
2781
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
2782
+ init_tensor_uniform(t, -100.0f, 100.0f);
2783
+ }
2784
+ }
2785
+
2786
+ float grad_eps() override {
2787
+ return 1.0f;
2788
+ }
2789
+
2790
+ bool grad_precise() override {
2791
+ return true;
2792
+ }
2793
+ };
2794
+
2795
+ // GGML_OP_OPT_STEP_ADAMW
2796
+ struct test_opt_step_adamw : public test_case {
2797
+ const ggml_type type;
2798
+ const std::array<int64_t, 4> ne;
2799
+ const float alpha;
2800
+ const float beta1;
2801
+ const float beta2;
2802
+ const float eps;
2803
+ const float wd;
2804
+
2805
+ std::string vars() override {
2806
+ return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
2807
+ }
2808
+
2809
+ test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
2810
+ std::array<int64_t, 4> ne = {10, 5, 4, 3},
2811
+ float alpha = 1e-3f,
2812
+ float beta1 = 0.9f,
2813
+ float beta2 = 0.999f,
2814
+ float eps = 1e-8f,
2815
+ float wd = 0.0f)
2816
+ : type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
2817
+
2818
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2819
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2820
+ ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
2821
+ ggml_set_name(a, "a");
2822
+
2823
+ ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2824
+ ggml_set_name(grad, "grad");
2825
+
2826
+ ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd);
2827
+ ggml_set_name(out, "out");
2828
+
1676
2829
  return out;
1677
2830
  }
2831
+
2832
+ void initialize_tensors(ggml_context * ctx) override {
2833
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
2834
+ init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
2835
+ }
2836
+ }
2837
+
2838
+ bool grad_precise() override {
2839
+ return true;
2840
+ }
1678
2841
  };
1679
2842
 
1680
2843
  enum llm_norm_type {
@@ -2061,48 +3224,55 @@ struct test_falcon : public test_llm {
2061
3224
  }
2062
3225
  };
2063
3226
 
2064
- static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
2065
- std::vector<std::unique_ptr<test_case>> test_cases;
2066
- std::default_random_engine rng(0);
2067
3227
 
2068
- const ggml_type all_types[] = {
2069
- GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
2070
- GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
2071
- GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
2072
- GGML_TYPE_Q8_0,
2073
- GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
2074
- GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
2075
- GGML_TYPE_Q6_K,
2076
- GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
2077
- GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
2078
- GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2079
- };
3228
+ // ###########################################
3229
+ // ## Section 3: GGML Op Test Instantiation ##
3230
+ // ###########################################
3231
+ static const ggml_type all_types[] = {
3232
+ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
3233
+ GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
3234
+ GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
3235
+ GGML_TYPE_Q8_0,
3236
+ GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
3237
+ GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
3238
+ GGML_TYPE_Q6_K,
3239
+ // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
3240
+ GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
3241
+ GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
3242
+ GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
3243
+ };
2080
3244
 
2081
- const ggml_type base_types[] = {
2082
- GGML_TYPE_F32, GGML_TYPE_F16,
2083
- GGML_TYPE_Q4_0,
2084
- GGML_TYPE_Q4_K,
2085
- GGML_TYPE_IQ2_XXS
2086
- };
3245
+ static const ggml_type base_types[] = {
3246
+ GGML_TYPE_F32, GGML_TYPE_F16,
3247
+ GGML_TYPE_Q4_0,
3248
+ GGML_TYPE_Q4_K,
3249
+ GGML_TYPE_IQ2_XXS
3250
+ };
2087
3251
 
2088
- const ggml_type other_types[] = {
2089
- GGML_TYPE_Q4_1,
2090
- GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
2091
- GGML_TYPE_Q8_0,
2092
- GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
2093
- GGML_TYPE_Q5_K,
2094
- GGML_TYPE_Q6_K,
2095
- GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
2096
- GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
2097
- GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2098
- GGML_TYPE_BF16,
2099
- };
3252
+ static const ggml_type other_types[] = {
3253
+ GGML_TYPE_Q4_1,
3254
+ GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
3255
+ GGML_TYPE_Q8_0,
3256
+ GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
3257
+ GGML_TYPE_Q5_K,
3258
+ GGML_TYPE_Q6_K,
3259
+ // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
3260
+ GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
3261
+ GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
3262
+ GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
3263
+ GGML_TYPE_BF16,
3264
+ };
3265
+
3266
+ // Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
3267
+ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3268
+ std::vector<std::unique_ptr<test_case>> test_cases;
3269
+ std::default_random_engine rng(0);
2100
3270
 
2101
3271
  // unary ops
2102
3272
  for (int v : {0, 1}) {
2103
3273
  for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
2104
- test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v));
2105
- test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v));
3274
+ test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 2, 2, 2 }, v));
3275
+ test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 5, 7, 11, 13 }, v));
2106
3276
  }
2107
3277
  }
2108
3278
 
@@ -2138,8 +3308,20 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2138
3308
  }
2139
3309
  }
2140
3310
 
3311
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
2141
3312
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
2142
3313
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
3314
+ // test cases for 1D im2col
3315
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3316
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3317
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3318
+
3319
+ // sycl backend will limit task global_range < MAX_INT
3320
+ // test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
3321
+ // however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
3322
+ // these cases are verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
3323
+ // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
3324
+ // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
2143
3325
 
2144
3326
  test_cases.emplace_back(new test_conv_transpose_1d());
2145
3327
  test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
@@ -2150,14 +3332,18 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2150
3332
  test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
2151
3333
  test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
2152
3334
 
3335
+ test_cases.emplace_back(new test_argmax());
3336
+ test_cases.emplace_back(new test_count_equal());
2153
3337
 
2154
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
2155
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
2156
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
2157
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
2158
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
2159
- test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
2160
- test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
3338
+ for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
3339
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
3340
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
3341
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
3342
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1}));
3343
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2}));
3344
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
3345
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
3346
+ }
2161
3347
 
2162
3348
  test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
2163
3349
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
@@ -2167,8 +3353,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2167
3353
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
2168
3354
  test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
2169
3355
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
2170
- test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
2171
- test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
3356
+ test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
3357
+ test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
3358
+
3359
+ for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
3360
+ test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
3361
+ }
2172
3362
 
2173
3363
  for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2174
3364
  for (ggml_type type_dst : all_types) {
@@ -2183,6 +3373,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2183
3373
  }
2184
3374
 
2185
3375
  test_cases.emplace_back(new test_cont());
3376
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
3377
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
3378
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
3379
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
3380
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
3381
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
3382
+ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
3383
+ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
3384
+ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
2186
3385
 
2187
3386
  auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
2188
3387
  for (auto op : {ggml_add, ggml_mul, ggml_div}) {
@@ -2193,16 +3392,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2193
3392
  add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
2194
3393
  add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
2195
3394
  add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
2196
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
2197
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
2198
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
2199
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
2200
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
2201
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
2202
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
2203
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
2204
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
2205
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});
3395
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 1, 1}, {1, 1, 1, 1});
3396
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 1}, {1, 1, 1, 1});
3397
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1});
3398
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1});
3399
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1});
3400
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1});
3401
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2});
3402
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2});
3403
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2});
3404
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 2, 2, 2});
2206
3405
 
2207
3406
  // stable diffusion
2208
3407
  add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
@@ -2221,13 +3420,25 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2221
3420
  //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
2222
3421
  //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
2223
3422
 
3423
+ test_cases.emplace_back(new test_add1());
2224
3424
  test_cases.emplace_back(new test_scale());
2225
3425
 
2226
3426
  for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
2227
- test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
2228
- test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
3427
+ test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
3428
+ test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
2229
3429
  }
2230
3430
 
3431
+ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
3432
+ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
3433
+ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
3434
+
3435
+ test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
3436
+
3437
+ test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 1, 1));
3438
+ test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 1));
3439
+ test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 32, 4));
3440
+ test_cases.emplace_back(new test_rwkv_wkv(GGML_TYPE_F32, 32, 64, 128, 4));
3441
+
2231
3442
  #if 1
2232
3443
  for (ggml_type type_a : base_types) {
2233
3444
  for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
@@ -2248,6 +3459,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2248
3459
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
2249
3460
  }
2250
3461
  }
3462
+ for (ggml_type type_a : other_types) {
3463
+ for (ggml_type type_b : {GGML_TYPE_F32}) {
3464
+ if (ggml_blck_size(type_a) != 256) {
3465
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
3466
+ }
3467
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
3468
+ }
3469
+ }
2251
3470
  #else
2252
3471
  // m = a rows
2253
3472
  // n = b rows
@@ -2267,12 +3486,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2267
3486
  }
2268
3487
  #endif
2269
3488
 
2270
- for (ggml_type type_a : other_types) {
2271
- for (ggml_type type_b : {GGML_TYPE_F32}) {
2272
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
2273
- }
2274
- }
2275
-
2276
3489
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
2277
3490
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
2278
3491
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
@@ -2280,6 +3493,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2280
3493
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
2281
3494
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
2282
3495
 
3496
+ // sycl backend will limit task global_range < MAX_INT
3497
+ // test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
3498
+ // however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
3499
+ // this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
3500
+ // test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
3501
+
2283
3502
  for (ggml_type type_a : base_types) {
2284
3503
  for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
2285
3504
  for (int n_mats : {4, 8}) {
@@ -2312,13 +3531,37 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2312
3531
  }
2313
3532
  }
2314
3533
 
3534
+ for (ggml_type type_a : base_types) {
3535
+ for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
3536
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1, 1}));
3537
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
3538
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
3539
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
3540
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
3541
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
3542
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
3543
+
3544
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}));
3545
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}, true));
3546
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
3547
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
3548
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
3549
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
3550
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
3551
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
3552
+ }
3553
+ }
3554
+
2315
3555
  test_cases.emplace_back(new test_sqr());
2316
3556
  test_cases.emplace_back(new test_sqrt());
3557
+ test_cases.emplace_back(new test_log());
3558
+ test_cases.emplace_back(new test_sin());
3559
+ test_cases.emplace_back(new test_cos());
2317
3560
  test_cases.emplace_back(new test_clamp());
2318
3561
 
2319
- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
2320
- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5));
2321
- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
3562
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
3563
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5));
3564
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5));
2322
3565
 
2323
3566
  #if 0
2324
3567
  std::uniform_int_distribution<> dist_ne1(1, 50);
@@ -2362,23 +3605,23 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2362
3605
  for (float af : { 1.0f, 1.4245f }) {
2363
3606
  for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2364
3607
  for (bool ff : {false, true}) { // freq_factors
2365
- test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
3608
+ test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
2366
3609
 
2367
3610
  if (all) {
2368
- test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
2369
- test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
2370
- test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
3611
+ test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
3612
+ test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
3613
+ test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
2371
3614
  }
2372
3615
 
2373
3616
  if (all) {
2374
- test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
2375
- test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
2376
- test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
2377
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
2378
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
3617
+ test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
3618
+ test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
3619
+ test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
3620
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
3621
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
2379
3622
  }
2380
3623
 
2381
- test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
3624
+ test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
2382
3625
  }
2383
3626
  }
2384
3627
 
@@ -2402,6 +3645,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2402
3645
  test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
2403
3646
  }
2404
3647
 
3648
+ test_cases.emplace_back(new test_sum());
2405
3649
  test_cases.emplace_back(new test_sum_rows());
2406
3650
  test_cases.emplace_back(new test_upscale());
2407
3651
  test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
@@ -2417,11 +3661,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2417
3661
  for (bool mask : { true, false } ) {
2418
3662
  for (float max_bias : { 0.0f, 8.0f }) {
2419
3663
  if (!mask && max_bias > 0.0f) continue;
2420
- for (int nh : { 32, }) {
2421
- for (int kv : { 512, 1024, }) {
2422
- for (int nb : { 1, 2, 4, 8, }) {
2423
- for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
2424
- test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV));
3664
+ for (float logit_softcap : {0.0f, 10.0f}) {
3665
+ if (hs != 128 && logit_softcap != 0.0f) continue;
3666
+ for (int nh : { 32, }) {
3667
+ for (int kv : { 512, 1024, }) {
3668
+ for (int nb : { 1, 3, 32, 35, }) {
3669
+ for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
3670
+ test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
3671
+ }
2425
3672
  }
2426
3673
  }
2427
3674
  }
@@ -2430,6 +3677,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2430
3677
  }
2431
3678
  }
2432
3679
 
3680
+ test_cases.emplace_back(new test_cross_entropy_loss());
3681
+ for (float wd : {0.0f, 1e-2f}) {
3682
+ test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
3683
+ }
3684
+
2433
3685
  // these tests are disabled to save execution time, but they can be handy for debugging
2434
3686
  #if 0
2435
3687
  test_cases.emplace_back(new test_llama(1));
@@ -2438,8 +3690,30 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  test_cases.emplace_back(new test_falcon(2));
  #endif

- // run tests
+ return test_cases;
+ }
+
+ // Test cases for performance evaluation: should be representative of real-world use cases
+ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
+ std::vector<std::unique_ptr<test_case>> test_cases;
+
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
+
+ for (int bs : {1, 512}) {
+ for (ggml_type type_a : all_types) {
+ for (ggml_type type_b : {GGML_TYPE_F32}) {
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1}));
+ }
+ }
+ }
+
+ return test_cases;
+ }
+
+ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
  if (mode == MODE_TEST) {
+ auto test_cases = make_test_cases_eval();
  ggml_backend_t backend_cpu = ggml_backend_cpu_init();

  size_t n_ok = 0;
@@ -2455,7 +3729,21 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  return n_ok == test_cases.size();
  }

+ if (mode == MODE_GRAD) {
+ auto test_cases = make_test_cases_eval();
+ size_t n_ok = 0;
+ for (auto & test : test_cases) {
+ if (test->eval_grad(backend, op_name)) {
+ n_ok++;
+ }
+ }
+ printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
+
+ return n_ok == test_cases.size();
+ }
+
  if (mode == MODE_PERF) {
+ auto test_cases = make_test_cases_perf();
  for (auto & test : test_cases) {
  test->eval_perf(backend, op_name);
  }
@@ -2463,13 +3751,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  }

  GGML_ABORT("fatal error");
- return false;
  }

  static void usage(char ** argv) {
  printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
- printf(" valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
- printf(" op names are as given by ggml_op_desc()\n");
+ printf(" valid modes:\n");
+ printf(" - test (default, compare with CPU backend for correctness)\n");
+ printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
+ printf(" - perf (performance evaluation)\n");
+ printf(" op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc)\n");
  }

  int main(int argc, char ** argv) {
@@ -2482,6 +3772,8 @@ int main(int argc, char ** argv) {
  mode = MODE_TEST;
  } else if (strcmp(argv[i], "perf") == 0) {
  mode = MODE_PERF;
+ } else if (strcmp(argv[i], "grad") == 0) {
+ mode = MODE_GRAD;
  } else if (strcmp(argv[i], "-o") == 0) {
  if (i + 1 < argc) {
  op_name_filter = argv[++i];
@@ -2503,30 +3795,41 @@ int main(int argc, char ** argv) {
  }

  // enumerate backends
- printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
+ printf("Testing %zu devices\n\n", ggml_backend_dev_count());

  size_t n_ok = 0;

- for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
- printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);

- if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
+ printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev));
+
+ if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
  printf(" Skipping\n");
  n_ok++;
  continue;
  }

- ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
+ ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
  GGML_ASSERT(backend != NULL);

- if (backend_filter == NULL && ggml_backend_is_cpu(backend)) {
+ if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
  printf(" Skipping CPU backend\n");
  ggml_backend_free(backend);
  n_ok++;
  continue;
  }

- printf(" Backend name: %s\n", ggml_backend_name(backend));
+ if (ggml_backend_is_cpu(backend)) {
+ // TODO: better value for n_threads
+ ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
+ }
+
+ printf(" Device description: %s\n", ggml_backend_dev_description(dev));
+ size_t free, total; // NOLINT
+ ggml_backend_dev_memory(dev, &free, &total);
+ printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
+ printf("\n");

  bool ok = test_backend(backend, mode, op_name_filter);

@@ -2543,9 +3846,9 @@ int main(int argc, char ** argv) {
  ggml_backend_free(backend);
  }

- printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
+ printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());

- if (n_ok != ggml_backend_reg_get_count()) {
+ if (n_ok != ggml_backend_dev_count()) {
  printf("\033[1;31mFAIL\033[0m\n");
  return 1;
  }