@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/tests/test-backend-ops.cpp
@@ -0,0 +1,2266 @@
+ #include <ggml.h>
+ #include <ggml-alloc.h>
+ #include <ggml-backend.h>
+ #include <ggml-backend-impl.h>
+ #include <algorithm>
+ #include <array>
+ #include <cfloat>
+ #include <cstring>
+ #include <functional>
+ #include <memory>
+ #include <random>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string>
+ #include <thread>
+ #include <vector>
+
+ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
+ // static RNG initialization (revisit if n_threads stops being constant)
+ static const size_t n_threads = std::thread::hardware_concurrency();
+ static std::vector<std::default_random_engine> generators = []() {
+ std::random_device rd;
+ std::vector<std::default_random_engine> vec;
+ vec.reserve(n_threads);
+ //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+ for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+ return vec;
+ }();
+
+ size_t size = ggml_nelements(tensor);
+ std::vector<float> data(size);
+
+ auto init_thread = [&](size_t ith, size_t start, size_t end) {
+ std::uniform_real_distribution<float> distribution(min, max);
+ for (size_t i = start; i < end; i++) {
+ data[i] = distribution(generators[ith]);
+ }
+ };
+
+ std::vector<std::thread> threads;
+ threads.reserve(n_threads);
+ for (size_t i = 0; i < n_threads; i++) {
+ size_t start = i*size/n_threads;
+ size_t end = (i+1)*size/n_threads;
+ threads.emplace_back(init_thread, i, start, end);
+ }
+ for (auto & t : threads) {
+ t.join();
+ }
+
+ if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
+ ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
+ } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
+ GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
+ std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
+ std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+ const float * im = imatrix.data();
+ if (!ggml_quantize_requires_imatrix(tensor->type)) {
+ // when the imatrix is optional, we want to test both quantization with and without imatrix
+ // use one of the random numbers to decide
+ if (data[0] > 0.5f*(min + max)) {
+ im = nullptr;
+ }
+ }
+ ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
+ ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
+ } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
+ // This is going to create some weird integers though.
+ ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+ } else {
+ GGML_ASSERT(false);
+ }
+ }
+
+ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
+ std::vector<float> tv;
+ tv.reserve(ggml_nelements(t));
+
+ std::vector<uint8_t> buf(ggml_nbytes(t));
+ ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
+
+ ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
+ size_t bs = ggml_blck_size(t->type);
+ std::vector<float> vq(ggml_blck_size(t->type));
+ bool quantized = ggml_is_quantized(t->type);
+
+ // access elements by index to avoid gaps in views
+ for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
+ for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
+ for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
+ for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
+ size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
+ if (t->type == GGML_TYPE_F16) {
+ tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
+ } else if (t->type == GGML_TYPE_F32) {
+ tv.push_back(*(float *) &buf[i]);
+ } else if (t->type == GGML_TYPE_I32) {
+ tv.push_back((float)*(int32_t *) &buf[i]);
+ } else if (t->type == GGML_TYPE_I16) {
+ tv.push_back((float)*(int16_t *) &buf[i]);
+ } else if (t->type == GGML_TYPE_I8) {
+ tv.push_back((float)*(int8_t *) &buf[i]);
+ } else if (quantized) {
+ tt.to_float(&buf[i], vq.data(), bs);
+ tv.insert(tv.end(), vq.begin(), vq.end());
+ } else {
+ GGML_ASSERT(false);
+ }
+ }
+ }
+ }
+ }
+
+ return tv;
+ }
+
+ /*
+ static double cosine_similarity(const float * v1, const float * v2, size_t n) {
+ double dot = 0.0;
+ double mag1 = 0.0;
+ double mag2 = 0.0;
+
+ for (size_t i = 0; i < n; i++) {
+ if (std::isnan(v1[i]) || std::isnan(v2[i])) {
+ return -1.0f;
+ }
+ if (std::isinf(v1[i]) && std::isinf(v2[i])) {
+ continue;
+ }
+ dot += v1[i]*v2[i];
+ mag1 += v1[i]*v1[i];
+ mag2 += v2[i]*v2[i];
+ }
+
+ return dot/sqrt(mag1*mag2);
+ }
+
+ static float distance(const float * v1, const float * v2, size_t n) {
+ double d = 0.0;
+
+ for (size_t i = 0; i < n; i++) {
+ if (std::isnan(v1[i]) || std::isnan(v2[i])) {
+ return INFINITY;
+ }
+ if (std::isinf(v1[i]) && std::isinf(v2[i])) {
+ continue;
+ }
+ d += (v1[i] - v2[i])*(v1[i] - v2[i]);
+ }
+
+ return sqrt(d);
+ }
+
+ static float vec_len(const float * v, size_t n) {
+ double d = 0.0;
+
+ for (size_t i = 0; i < n; i++) {
+ if (std::isnan(v[i])) {
+ return INFINITY;
+ }
+ if (std::isinf(v[i])) {
+ continue;
+ }
+ d += v[i]*v[i];
+ }
+
+ return sqrt(d);
+ }
+ */
+
+ // normalized mean squared error = mse(a, b) / mse(a, 0)
+ static double nmse(const float * a, const float * b, size_t n) {
+ double mse_a_b = 0.0;
+ double mse_a_0 = 0.0;
+
+ for (size_t i = 0; i < n; i++) {
+ float a_i = a[i];
+ float b_i = b[i];
+
+ mse_a_b += (a_i - b_i) * (a_i - b_i);
+ mse_a_0 += a_i * a_i;
+ }
+
+ return mse_a_b / mse_a_0;
+ }
+
+ // utils for printing the variables of the test cases
+ #define VAR_TO_STR(x) (#x "=" + var_to_str(x))
+
+ template<typename T>
+ static std::string var_to_str(const T & x) {
+ return std::to_string(x);
+ }
+
+ template<typename T, size_t N>
+ static std::string var_to_str(const T (&x)[N]) {
+ std::string s = "[";
+ for (size_t i = 0; i < N; i++) {
+ if (i > 0) {
+ s += ",";
+ }
+ s += var_to_str(x[i]);
+ }
+ s += "]";
+ return s;
+ }
+
+ template<typename T, size_t N>
+ static std::string var_to_str(const std::array<T, N> & x) {
+ std::string s = "[";
+ for (size_t i = 0; i < N; i++) {
+ if (i > 0) {
+ s += ",";
+ }
+ s += var_to_str(x[i]);
+ }
+ s += "]";
+ return s;
+ }
+
+ //static std::string var_to_str(ggml_unary_op unary_op) {
+ // return ggml_unary_op_name(unary_op);
+ //}
+
+ static std::string var_to_str(ggml_type type) {
+ return ggml_type_name(type);
+ }
+
+ static std::string var_to_str(ggml_op_pool pool) {
+ switch (pool) {
+ case GGML_OP_POOL_AVG: return "avg";
+ case GGML_OP_POOL_MAX: return "max";
+ default: return std::to_string(pool);
+ }
+ }
+
+ #define VARS_TO_STR1(a) VAR_TO_STR(a)
+ #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
+ #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
+ #define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d)
+ #define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e)
+ #define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f)
+ #define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g)
+ #define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h)
+ #define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
+ #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
+ #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
+ #define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l)
+
+ #ifdef GGML_USE_SYCL
+ static bool inline _isinf(float f) {
+ return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
+ }
+ #else
+ static bool inline _isinf(float f) { return std::isinf(f); }
+ #endif
+
+ // accept FLT_MAX as infinity
+ static bool isinf_or_max(float f) {
+ return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
+ }
+
+ static bool ggml_is_view_op(enum ggml_op op) {
+ return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+ }
+
+ enum test_mode {
+ MODE_TEST,
+ MODE_PERF,
+ };
+
+ struct test_case {
+ virtual ~test_case() {}
+
+ virtual std::string op_desc(ggml_tensor * t) {
+ return ggml_op_desc(t);
+ }
+
+ virtual std::string vars() {
+ return "";
+ }
+
+ virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
+
+ virtual double max_nmse_err() {
+ return 1e-7;
+ }
+
+ virtual void initialize_tensors(ggml_context * ctx) {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ init_tensor_uniform(t);
+ }
+ }
+
+ virtual size_t op_size(ggml_tensor * t) {
+ size_t size = ggml_nbytes(t);
+ // add source tensors
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (t->src[i] != NULL) {
+ size += ggml_nbytes(t->src[i]);
+ }
+ }
+ return size;
+ }
+
+ ggml_cgraph * gf = nullptr;
+
+ static const int sentinel_size = 1024;
+
+ test_mode mode;
+
+ std::vector<ggml_tensor *> sentinels;
+
+ void add_sentinel(ggml_context * ctx) {
+ if (mode == MODE_PERF) {
+ return;
+ }
+ ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
+ ggml_format_name(sentinel, "sent_%zu", sentinels.size());
+ sentinels.push_back(sentinel);
+ }
+
+ // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
+
+ ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
+ ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
+ add_sentinel(ctx);
+ return t;
+ }
+
+ ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
+ ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
+ add_sentinel(ctx);
+ return t;
+ }
+
+ ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
+ ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
+ add_sentinel(ctx);
+ return t;
+ }
+
+ ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
+ ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
+ add_sentinel(ctx);
+ return t;
+ }
+
+ ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+ ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
+ add_sentinel(ctx);
+ return t;
+ }
+
+ bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
+ mode = MODE_TEST;
+
+ ggml_init_params params = {
+ /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
+ /* .mem_base = */ NULL,
+ /* .no_alloc = */ true,
+ };
+ ggml_context * ctx = ggml_init(params);
+
+ gf = ggml_new_graph(ctx);
+
+ // pre-graph sentinel
+ add_sentinel(ctx);
+
+ ggml_tensor * out = build_graph(ctx);
+
+ if (op_name != nullptr && op_desc(out) != op_name) {
+ //printf(" %s: skipping\n", op_desc(out).c_str());
+ ggml_free(ctx);
+ return true;
+ }
+
+ printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
+ fflush(stdout);
+
+ // check if the backends support the ops
+ bool supported = true;
+ for (ggml_backend_t backend : {backend1, backend2}) {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ if (!ggml_backend_supports_op(backend, t)) {
+ printf("not supported [%s] ", ggml_backend_name(backend));
+ supported = false;
+ break;
+ }
+ }
+ }
+ if (!supported) {
+ printf("\n");
+ ggml_free(ctx);
+ return true;
+ }
+
+ // post-graph sentinel
+ add_sentinel(ctx);
+
+ // allocate
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
+ if (buf == NULL) {
+ printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
+ ggml_free(ctx);
+ return false;
+ }
+
+ // build graph
+ ggml_build_forward_expand(gf, out);
+
+ // add sentinels as graph nodes so that they are checked in the callback
+ for (ggml_tensor * sentinel : sentinels) {
+ gf->nodes[gf->n_nodes++] = sentinel;
+ }
+
+ // randomize tensors
+ initialize_tensors(ctx);
+
+ // compare
+ struct callback_userdata {
+ bool ok;
+ double max_err;
+ ggml_backend_t backend1;
+ ggml_backend_t backend2;
+ };
+
+ callback_userdata ud {
+ true,
+ max_nmse_err(),
+ backend1,
+ backend2
+ };
+
+ auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
+ callback_userdata * ud = (callback_userdata *) user_data;
+ const char * bn1 = ggml_backend_name(ud->backend1);
+ const char * bn2 = ggml_backend_name(ud->backend2);
+
+ if (t1->op == GGML_OP_NONE) {
+ // sentinels must be unchanged
+ std::vector<uint8_t> t1_data(ggml_nbytes(t1));
+ std::vector<uint8_t> t2_data(ggml_nbytes(t2));
+ ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
+ ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
+
+ if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
+ printf("sentinel mismatch: %s ", t1->name);
+ ud->ok = false;
+ return true;
+ }
+ }
+
+ std::vector<float> f1 = tensor_to_float(t1);
+ std::vector<float> f2 = tensor_to_float(t2);
+
+ for (size_t i = 0; i < f1.size(); i++) {
+ // check for nans
+ if (std::isnan(f1[i]) || std::isnan(f2[i])) {
+ printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
+ ud->ok = false;
+ return true;
+ }
+ // check for infs: both must be inf of the same sign, or both must be finite
+ if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
+ if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
+ if (std::signbit(f1[i]) != std::signbit(f2[i])) {
+ printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
+ ud->ok = false;
+ return true;
+ }
+ } else {
+ printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
+ ud->ok = false;
+ return true;
+ }
+ }
+ }
+
+ double err = nmse(f1.data(), f2.data(), f1.size());
+ if (err > ud->max_err) {
+ printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
+ //for (int i = 0; i < (int) f1.size(); i++) {
+ // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
+ //}
+ //printf("\n");
+ //exit(1);
+ ud->ok = false;
+ }
+ return true;
+
+ GGML_UNUSED(index);
+ };
+
+ const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
+
+ if (!cmp_ok) {
+ printf("compare failed ");
+ }
+
+ ggml_backend_buffer_free(buf);
+
+ ggml_free(ctx);
+
+ if (ud.ok && cmp_ok) {
+ printf("\033[1;32mOK\033[0m\n");
+ return true;
+ }
+
+ printf("\033[1;31mFAIL\033[0m\n");
+ return false;
+ }
+
+ bool eval_perf(ggml_backend_t backend, const char * op_name) {
+ mode = MODE_PERF;
+
+ static const size_t graph_nodes = 8192;
+
+ ggml_init_params params = {
+ /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
+ /* .mem_base = */ NULL,
+ /* .no_alloc = */ true,
+ };
+ ggml_context * ctx = ggml_init(params);
+
+ ggml_tensor * out = build_graph(ctx);
+
+ if (op_name != nullptr && op_desc(out) != op_name) {
+ //printf(" %s: skipping\n", op_desc(out).c_str());
+ ggml_free(ctx);
+ return true;
+ }
+
+ int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
+ fflush(stdout);
+
+ // check if backends support op
+ if (!ggml_backend_supports_op(backend, out)) {
+ printf("not supported\n");
+ ggml_free(ctx);
+ return true;
+ }
+
+ // align while also leaving some margin for variations in parameters
+ int align = 20;
+ int last = (len + align - 1) / align * align;
+ if (last - len < 5) {
+ last += align;
+ }
+ last = std::max(last, 60);
+ printf("%*s", last - len, "");
+
+ // allocate
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+ if (buf == NULL) {
+ printf("failed to allocate tensors\n");
+ ggml_free(ctx);
+ return false;
+ }
+
+ // randomize tensors
+ initialize_tensors(ctx);
+
+ // build graph
+ ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
+ ggml_build_forward_expand(gf, out);
+
+ // warmup run
+ ggml_backend_graph_compute(backend, gf);
+
+ // duplicate the op
+ size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
+ int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
+ for (int i = 1; i < n_runs; i++) {
+ gf->nodes[gf->n_nodes++] = out;
+ }
+
+ // calculate memory
+ size_t mem = n_runs * op_size(out);
+ auto tensor_op_size = [](ggml_tensor * t) {
+ size_t size = ggml_nbytes(t);
+ // add source tensors
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (t->src[i] != NULL) {
+ size += ggml_nbytes(t->src[i]);
+ }
+ }
+ return size;
+ };
+ for (int i = 0; i < gf->n_nodes; i++) {
+ if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
+ continue;
+ }
+ mem += tensor_op_size(gf->nodes[i]);
+ }
+
+ // run
+ ggml_backend_synchronize(backend);
+
+ int64_t start_time = ggml_time_us();
+ ggml_backend_graph_compute(backend, gf);
+ ggml_backend_synchronize(backend);
+ int64_t end_time = ggml_time_us();
+ double time_us = end_time - start_time;
+
+ printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
+ n_runs,
+ time_us / n_runs,
+ op_size(out) / 1024,
+ mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
+
+ ggml_backend_buffer_free(buf);
+
+ ggml_free(ctx);
+
+ return true;
+ }
+ };
+
+ // GGML_OP_UNARY
+ struct test_unary : public test_case {
+ const ggml_unary_op op;
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+
+ std::string vars() override {
+ return VARS_TO_STR2(type, ne);
+ }
+
+ test_unary(ggml_unary_op op,
+ ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {128, 10, 10, 10})
+ : op(op), type(type), ne(ne) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * in = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_unary(ctx, in, op);
+ return out;
+ }
+
+ void initialize_tensors(ggml_context * ctx) override {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ // test extended range of values to check for NaNs in GELU
+ init_tensor_uniform(t, -150.f, 150.f);
+ }
+ }
+ };
+
+ // GGML_OP_GET_ROWS
+ struct test_get_rows : public test_case {
+ const ggml_type type;
+ const int n; // cols
+ const int m; // rows
+ const int r; // rows to get
+ const int b; // batch size
+ const bool v; // view (non-contiguous src1)
+
+ std::string vars() override {
+ return VARS_TO_STR6(type, n, m, r, b, v);
+ }
+
+ test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
+ : type(type), n(n), m(m), r(r), b(b), v(v) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
+ ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
+ if (v) {
+ rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
+ }
+ ggml_tensor * out = ggml_get_rows(ctx, in, rows);
+ return out;
+ }
+
+ void initialize_tensors(ggml_context * ctx) override {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ if (t->type == GGML_TYPE_I32) {
+ if (ggml_is_view_op(t->op)) { continue; }
+ // rows
+ std::vector<int> data(r*b);
+ for (int i = 0; i < r*b; i++) {
+ data[i] = rand() % m;
+ }
+ ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
+ } else {
+ init_tensor_uniform(t);
+ }
+ }
+ }
+ };
+
+ // GGML_OP_REPEAT
+ struct test_repeat : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ const std::array<int, 4> nr;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, nr);
+ }
+
+ size_t op_size(ggml_tensor * t) override {
+ return ggml_nbytes(t) * 2;
+ }
+
+ test_repeat(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 10},
+ std::array<int, 4> nr = {2, 2, 2, 2})
+ : type(type), ne(ne), nr(nr) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
+ ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_repeat(ctx, src, target);
+ return out;
+ }
+ };
+
+ // GGML_OP_DUP
+ struct test_dup : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ const std::array<int64_t, 4> permute;
+ bool _use_permute;
+
+ std::string vars() override {
+ std::string v = VARS_TO_STR2(type, ne);
+ if (_use_permute) v += "," + VAR_TO_STR(permute);
+ return v;
+ }
+
+ test_dup(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 1},
+ std::array<int64_t, 4> permute = {0, 0, 0, 0})
+ : type(type), ne(ne), permute(permute),
+ _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+ if (_use_permute) {
+ src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+ }
+ ggml_tensor * out = ggml_dup(ctx, src);
+ return out;
+ }
+ };
+
+ // GGML_OP_CPY
+ struct test_cpy : public test_case {
+ const ggml_type type_src;
+ const ggml_type type_dst;
+ const std::array<int64_t, 4> ne;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type_src, type_dst, ne);
+ }
+
+ size_t op_size(ggml_tensor * t) override {
+ return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
+ }
+
+ test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 1})
+ : type_src(type_src), type_dst(type_dst), ne(ne) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
+ ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
+ ggml_tensor * out = ggml_cpy(ctx, src, dst);
+ return out;
+ }
+ };
+
+ // GGML_OP_CONT
+ struct test_cont : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+
+ std::string vars() override {
+ return VARS_TO_STR2(type, ne);
+ }
+
+ test_cont(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 1})
+ : type(type), ne(ne) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+ src = ggml_transpose(ctx, src);
+ ggml_tensor * out = ggml_cont(ctx, src);
+
+ return out;
+ }
+ };
+
+ // GGML_OP_ADD
+ // GGML_OP_MUL
+ // GGML_OP_DIV
+ struct test_bin_bcast : public test_case {
+ using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
+ op_t op;
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ const std::array<int, 4> nr;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, nr);
+ }
+
+ size_t op_size(ggml_tensor * t) override {
+ return ggml_nbytes(t) * 3;
+ }
+
+ test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 1, 1},
+ std::array<int, 4> nr = {1, 2, 1, 1})
+ : op(op), type(type), ne(ne), nr(nr) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
+ ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = op(ctx, a, b);
+ return out;
+ }
+
+ void initialize_tensors(ggml_context * ctx) override {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ if (op == ggml_div) {
+ // avoid division by zero
+ init_tensor_uniform(t, 1.0f, 2.0f);
+ } else {
+ init_tensor_uniform(t);
+ }
+ }
+ }
+ };
+
+ // GGML_OP_SCALE
+ struct test_scale : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ float scale;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, scale);
+ }
+
+ test_scale(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 10},
+ float scale = 2.0f)
+ : type(type), ne(ne), scale(scale) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_scale(ctx, a, scale);
+ return out;
+ }
+ };
+
+ // GGML_OP_NORM
+ struct test_norm : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ float eps;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, eps);
+ }
+
+ test_norm(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {64, 10, 10, 10},
+ float eps = 1e-6f)
+ : type(type), ne(ne), eps(eps) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_norm(ctx, a, eps);
+ return out;
+ }
+ };
+
+ // GGML_OP_RMS_NORM
+ struct test_rms_norm : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ float eps;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, eps);
+ }
+
+ test_rms_norm(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {64, 10, 10, 10},
+ float eps = 1e-6f)
+ : type(type), ne(ne), eps(eps) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
+ return out;
+ }
+ };
+
+ // GGML_OP_MUL_MAT
+ struct test_mul_mat : public test_case {
+ const ggml_type type_a;
+ const ggml_type type_b;
+ const int64_t m;
+ const int64_t n;
+ const int64_t k;
+ const std::array<int64_t, 2> bs; // dims 3 and 4
+ const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
+
+ std::string vars() override {
+ return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
+ }
+
+ double max_nmse_err() override {
+ return 5e-4;
+ }
+
+ size_t op_size(ggml_tensor * t) override {
+ size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
+ size_t b = ggml_nbytes(t->src[1]) * m;
+ size_t c = ggml_nbytes(t);
+ return a + b + c;
+
+ GGML_UNUSED(t);
+ }
+
+ test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
+ int64_t m = 32, int64_t n = 32, int64_t k = 32,
+ std::array<int64_t, 2> bs = {10, 10},
+ std::array<int64_t, 2> nr = {2, 2})
+ : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ // C^T = A * B^T: (k, m) * (k, n) => (m, n)
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
+ ggml_tensor * out = ggml_mul_mat(ctx, a, b);
+ return out;
+ }
+ };
+
+ // GGML_OP_MUL_MAT_ID
+ struct test_mul_mat_id : public test_case {
+ const ggml_type type_a;
+ const ggml_type type_b;
+ const int n_mats;
+ const int n_used;
+ const bool b; // broadcast b matrix
+ const int64_t m;
+ const int64_t n;
+ const int64_t k;
+
+ std::string vars() override {
+ return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
+ }
+
+ double max_nmse_err() override {
+ return 5e-4;
+ }
+
+ size_t op_size(ggml_tensor * t) override {
+ size_t a = ggml_nbytes(t->src[2]) * n;
+ size_t b = ggml_nbytes(t->src[1]) * m;
+ size_t c = ggml_nbytes(t);
+ return a + b + c;
+
+ GGML_UNUSED(t);
+ }
+
+ test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
+ int n_mats = 8, int n_used = 2, bool b = false,
+ int64_t m = 32, int64_t n = 32, int64_t k = 32)
+ : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
+ m(m), n(n), k(k) {
+ GGML_ASSERT(n_used <= n_mats);
+ }
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ // C^T = A * B^T: (k, m) * (k, n) => (m, n)
+ ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
+ if (n_used != n_mats) {
+ ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
+ }
+ ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
+ ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
+ return out;
+ }
+
+ void initialize_tensors(ggml_context * ctx) override {
+ std::random_device rd;
+ std::default_random_engine rng(rd());
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ if (t->type == GGML_TYPE_I32) {
+ if (ggml_is_view_op(t->op)) { continue; }
+ // ids
+ for (int64_t r = 0; r < ggml_nrows(t); r++) {
+ std::vector<int32_t> data(t->ne[0]);
+ for (int i = 0; i < t->ne[0]; i++) {
+ data[i] = i % n_mats;
+ }
+ std::shuffle(data.begin(), data.end(), rng);
+ ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
+ }
+ } else {
+ init_tensor_uniform(t);
+ }
+ }
+ }
+ };
+
+ // GGML_OP_SQR
+ struct test_sqr : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+
+ std::string vars() override {
+ return VARS_TO_STR2(type, ne);
+ }
+
+ test_sqr(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 10})
+ : type(type), ne(ne) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_sqr(ctx, a);
+ return out;
+ }
+ };
+
+ // GGML_OP_CLAMP
+ struct test_clamp : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ float min;
+ float max;
+
+ std::string vars() override {
+ return VARS_TO_STR4(type, ne, min, max);
+ }
+
+ test_clamp(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 10},
+ float min = -0.5f, float max = 0.5f)
+ : type(type), ne(ne), min(min), max(max) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_clamp(ctx, a, min, max);
+ return out;
+ }
+ };
+
+ // GGML_OP_DIAG_MASK_INF
+ struct test_diag_mask_inf : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ const int n_past;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, n_past);
+ }
+
+ test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 10},
+ int n_past = 5)
+ : type(type), ne(ne), n_past(n_past) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
+ return out;
+ }
+ };
+
+ // GGML_OP_SOFT_MAX
+ struct test_soft_max : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ const bool mask;
+ const float scale;
+ const float max_bias;
+
+ std::string vars() override {
+ return VARS_TO_STR5(type, ne, mask, scale, max_bias);
+ }
+
+ test_soft_max(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 10},
+ bool mask = false,
+ float scale = 1.0f,
+ float max_bias = 0.0f)
+ : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * mask = nullptr;
+ if (this->mask) {
+ mask = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]);
+ }
+ ggml_tensor * pos = nullptr;
+ if (max_bias > 0.0f) {
+ pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
+ }
+ ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
+ return out;
+ }
+ };
+
+ // GGML_OP_ROPE
+ struct test_rope : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ int n_dims;
+ int mode;
+ int n_ctx;
+
+ std::string vars() override {
+ return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
+ }
+
+ test_rope(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 1},
+ int n_dims = 10, int mode = 0, int n_ctx = 512)
+ : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+ ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
+ return out;
+ }
+
+ void initialize_tensors(ggml_context * ctx) override {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ if (t->type == GGML_TYPE_I32) {
+ // pos
+ std::vector<int> data(ne[2]);
+ for (int i = 0; i < ne[2]; i++) {
+ data[i] = rand() % n_ctx;
+ }
+ ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
+ } else {
+ init_tensor_uniform(t);
+ }
+ }
+ }
+ };
+
+ // GGML_OP_POOL2D
+ struct test_pool2d : public test_case {
+ enum ggml_op_pool pool_type;
+ const ggml_type type_input;
+ const std::array<int64_t, 4> ne_input;
+ // kernel size
+ const int k0;
+ const int k1;
+ // stride
+ const int s0;
+ const int s1;
+ // padding
+ const int p0;
+ const int p1;
+
+ std::string vars() override {
+ return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
+ }
+
+ test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
+ ggml_type type_input = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
+ int k0 = 3, int k1 = 3,
+ int s0 = 1, int s1 = 1,
+ int p0 = 1, int p1 = 1)
+ : pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
+ ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
+ return out;
+ }
+ };
+
+ // GGML_OP_IM2COL
+ struct test_im2col : public test_case {
+ const ggml_type type_input;
+ const ggml_type type_kernel;
+ const ggml_type dst_type;
+ const std::array<int64_t, 4> ne_input;
+ const std::array<int64_t, 4> ne_kernel;
+ // stride
+ const int s0;
+ const int s1;
+ // padding
+ const int p0;
+ const int p1;
+ // dilatation
+ const int d0;
+ const int d1;
+ // mode
+ const bool is_2D;
+
+ std::string vars() override {
+ return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
+ }
+
+ test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
+ std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
+ int s0 = 1, int s1 = 1,
+ int p0 = 1, int p1 = 1,
+ int d0 = 1, int d1 = 1,
+ bool is_2D = true)
+ : type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
+ ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
+ ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
+ return out;
+ }
+ };
+
+ // GGML_OP_CONCAT
+ struct test_concat : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ const int64_t b_ne2;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, b_ne2);
+ }
+
+ test_concat(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 10},
+ int64_t b_ne2 = 10)
+ : type(type), ne(ne), b_ne2(b_ne2) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
+ ggml_tensor * out = ggml_concat(ctx, a, b);
+ return out;
+ }
+ };
+
+ // GGML_OP_ARGSORT
+ struct test_argsort : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ ggml_sort_order order;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, order);
+ }
+
+ test_argsort(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {16, 10, 10, 10},
+ ggml_sort_order order = GGML_SORT_ORDER_ASC)
+ : type(type), ne(ne), order(order) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_argsort(ctx, a, order);
+ return out;
+ }
+
+ void initialize_tensors(ggml_context * ctx) override {
+ std::random_device rd;
+ std::default_random_engine rng(rd());
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ if (t->type == GGML_TYPE_I32) {
+ // indices
+ std::vector<int> data(ggml_nelements(t));
+ for (int i = 0; i < ggml_nelements(t); i++) {
+ data[i] = rand();
+ }
+ std::shuffle(data.begin(), data.end(), rng);
+ ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
+ } else if (t->type == GGML_TYPE_F32) {
+ // initialize with unique values to avoid ties
+ for (int64_t r = 0; r < ggml_nrows(t); r++) {
+ std::vector<float> data(t->ne[0]);
+ for (int i = 0; i < t->ne[0]; i++) {
+ data[i] = i;
+ }
+ std::shuffle(data.begin(), data.end(), rng);
+ ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+ }
+ }
+ };
+
+ // GGML_OP_SUM_ROWS
+ struct test_sum_rows : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+
+ std::string vars() override {
+ return VARS_TO_STR2(type, ne);
+ }
+
+ test_sum_rows(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {10, 10, 10, 10})
+ : type(type), ne(ne) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_sum_rows(ctx, a);
+ return out;
+ }
+ };
+
+ // GGML_OP_UPSCALE
+ struct test_upscale : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ const int32_t scale_factor;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, scale_factor);
+ }
+
+ test_upscale(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {512, 512, 3, 1},
+ int32_t scale_factor = 2)
+ : type(type), ne(ne), scale_factor(scale_factor) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
+ return out;
+ }
+ };
+
+ // GGML_OP_GROUP_NORM
+ struct test_group_norm : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne;
+ const int32_t num_groups;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne, num_groups);
+ }
+
+ test_group_norm(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne = {64, 64, 320, 1},
+ int32_t num_groups = 32)
+ : type(type), ne(ne), num_groups(num_groups) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+ ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
+ return out;
+ }
+ };
+
+ // GGML_OP_ACC
+ struct test_acc : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne_a;
+ const std::array<int64_t, 4> ne_b;
+
+ std::string vars() override {
+ return VARS_TO_STR3(type, ne_a, ne_b);
+ }
+
+ test_acc(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
+ std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
+ : type(type), ne_a(ne_a), ne_b(ne_b) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+ ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+ ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
+ return out;
+ }
+ };
+
+ // GGML_OP_PAD
+ struct test_pad : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne_a;
+ const int pad_0;
+ const int pad_1;
+
+ std::string vars() override {
+ return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
+ }
+
+ test_pad(ggml_type type = GGML_TYPE_F32,
1401
+ std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
1402
+ int pad_0 = 1, int pad_1 = 1)
1403
+ : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
1404
+
1405
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1406
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1407
+ ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
1408
+ return out;
1409
+ }
1410
+ };
1411
+
1412
+ // GGML_OP_ARANGE
1413
+ struct test_arange : public test_case {
1414
+ const ggml_type type;
1415
+ const float start;
1416
+ const float stop;
1417
+ const float step;
1418
+
1419
+ std::string vars() override {
1420
+ return VARS_TO_STR4(type, start, stop, step);
1421
+ }
1422
+
1423
+ test_arange(ggml_type type = GGML_TYPE_F32,
1424
+ float start = 0.f, float stop = 10.f, float step = 1.f)
1425
+ : type(type), start(start), stop(stop), step(step) {}
1426
+
1427
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1428
+ ggml_tensor * out = ggml_arange(ctx, start, stop, step);
1429
+ return out;
1430
+ }
1431
+ };
1432
+
1433
+ // GGML_OP_TIMESTEP_EMBEDDING
1434
+ struct test_timestep_embedding : public test_case {
1435
+ const ggml_type type;
1436
+ const std::array<int64_t, 4> ne_a;
1437
+ const int dim;
1438
+ const int max_period;
1439
+
1440
+ std::string vars() override {
1441
+ return VARS_TO_STR4(type, ne_a, dim, max_period);
1442
+ }
1443
+
1444
+ test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
1445
+ std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
1446
+ int dim = 320, int max_period=10000)
1447
+ : type(type), ne_a(ne_a), dim(dim), max_period(max_period) {}
1448
+
1449
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1450
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1451
+ ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
1452
+ return out;
1453
+ }
1454
+ };
1455
+
1456
+ // GGML_OP_LEAKY_RELU
1457
+ struct test_leaky_relu : public test_case {
1458
+ const ggml_type type;
1459
+ const std::array<int64_t, 4> ne_a;
1460
+ const float negative_slope;
1461
+
1462
+ std::string vars() override {
1463
+ return VARS_TO_STR3(type, ne_a, negative_slope);
1464
+ }
1465
+
1466
+ test_leaky_relu(ggml_type type = GGML_TYPE_F32,
1467
+ std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
1468
+ float negative_slope = 0.1f)
1469
+ : type(type), ne_a(ne_a), negative_slope(negative_slope) {}
1470
+
1471
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1472
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1473
+ ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
1474
+ return out;
1475
+ }
1476
+ };
1477
+
1478
+ enum llm_norm_type {
1479
+ LLM_NORM,
1480
+ LLM_NORM_RMS,
1481
+ };
1482
+
1483
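+ // minimal hyperparameter set used by the test LLM graphs below (test_llama, test_falcon)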
+ struct llama_hparams {
1484
+ uint32_t n_vocab;
1485
+ uint32_t n_embd;
1486
+ uint32_t n_head;
1487
+ uint32_t n_head_kv;
1488
+ static constexpr uint32_t n_layer = 1;
1489
+ uint32_t n_rot;
1490
+ uint32_t n_embd_head; // dimension of values (d_v)
1491
+ uint32_t n_ff;
1492
+
1493
+ float f_norm_eps;
1494
+ float f_norm_rms_eps;
1495
+
1496
+ // cparams
1497
+ static constexpr uint32_t n_ctx = 512; // user-specified context size
1498
+ static constexpr uint32_t n_orig_ctx = n_ctx;
1499
+
1500
+ // batch
1501
+ int32_t n_tokens;
1502
+
1503
+ // llm_build_context
1504
+ static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx)
1505
+ static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
1506
+
1507
+ uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
1508
+ return n_embd_head * n_head_kv;
1509
+ }
1510
+ };
1511
+
1512
+ // LLM base class
1513
+ struct test_llm : public test_case {
1514
+ llama_hparams hp;
1515
+
1516
+ protected:
1517
+ test_llm(llama_hparams hp)
1518
+ : hp(std::move(hp)) {
1519
+ }
1520
+
1521
+ public:
1522
+ struct ggml_tensor * llm_build_norm(
1523
+ struct ggml_context * ctx,
1524
+ struct ggml_tensor * cur,
1525
+ struct ggml_tensor * mw,
1526
+ struct ggml_tensor * mb,
1527
+ llm_norm_type type) {
1528
+ switch (type) {
1529
+ case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break;
1530
+ case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
1531
+ }
1532
+ cur = ggml_mul(ctx, cur, mw);
1533
+ if (mb) {
1534
+ cur = ggml_add(ctx, cur, mb);
1535
+ }
1536
+ return cur;
1537
+ }
1538
+
1539
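+ // copy the current K/V tensors into views of the simulated KV cache (k_l / v_l)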
+ void llm_build_kv_store(
1540
+ struct ggml_context * ctx,
1541
+ struct ggml_tensor * k_l,
1542
+ struct ggml_tensor * v_l,
1543
+ struct ggml_tensor * k_cur,
1544
+ struct ggml_tensor * v_cur) {
1545
+ // compute the transposed [n_tokens, n_embd_gqa()] V matrix
1546
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
1547
+
1548
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
1549
+ (ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
1550
+
1551
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
1552
+ ( hp.n_ctx)*ggml_element_size(v_l),
1553
+ (hp.kv_head)*ggml_element_size(v_l));
1554
+
1555
+ // important: storing RoPE-ed version of K in the KV cache!
1556
+ ggml_cpy(ctx, k_cur, k_cache_view);
1557
+ ggml_cpy(ctx, v_cur_t, v_cache_view);
1558
+ }
1559
+
1560
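+ // attention over the cached K/V: softmax(Q·K^T * scale + mask) · V, followed by the output projection wo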
+ struct ggml_tensor * llm_build_kqv(
1561
+ struct ggml_context * ctx,
1562
+ struct ggml_tensor * k_l,
1563
+ struct ggml_tensor * v_l,
1564
+ struct ggml_tensor * q_cur,
1565
+ struct ggml_tensor * kq_mask,
1566
+ float kq_scale) {
1567
+ struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
1568
+
1569
+ struct ggml_tensor * k =
1570
+ ggml_view_3d(ctx, k_l,
1571
+ hp.n_embd_head, hp.n_kv, hp.n_head_kv,
1572
+ ggml_row_size(k_l->type, hp.n_embd_gqa()),
1573
+ ggml_row_size(k_l->type, hp.n_embd_head),
1574
+ 0);
1575
+
1576
+ struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
1577
+
1578
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);
1579
+
1580
+ // split cached V into n_head_kv heads
1581
+ struct ggml_tensor * v =
1582
+ ggml_view_3d(ctx, v_l,
1583
+ hp.n_kv, hp.n_embd_head, hp.n_head_kv,
1584
+ ggml_element_size(v_l)*hp.n_ctx,
1585
+ ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
1586
+ 0);
1587
+
1588
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
1589
+
1590
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
1591
+
1592
+ struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
1593
+
1594
+ struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
1595
+ cur = ggml_mul_mat(ctx, wo, cur);
1596
+
1597
+ return cur;
1598
+ }
1599
+
1600
+ void initialize_tensors(ggml_context * ctx) override {
1601
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1602
+ if (t->type == GGML_TYPE_I32) {
1603
+ // pos
1604
+ std::vector<int> data(hp.n_tokens);
1605
+ for (int i = 0; i < hp.n_tokens; i++) {
1606
+ data[i] = rand() % hp.n_ctx;
1607
+ }
1608
+ ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
1609
+ } else {
1610
+ init_tensor_uniform(t);
1611
+ }
1612
+ }
1613
+ }
1614
+ };
1615
+
1616
+ // Llama
1617
+ struct test_llama : public test_llm {
1618
+ static constexpr float freq_base = 10000.0f;
1619
+ static constexpr float freq_scale = 1.0f;
1620
+ static constexpr float ext_factor = 0.0f;
1621
+ static constexpr float attn_factor = 1.0f;
1622
+ static constexpr float beta_fast = 32.0f;
1623
+ static constexpr float beta_slow = 1.0f;
1624
+
1625
+ std::string op_desc(ggml_tensor * t) override {
1626
+ GGML_UNUSED(t);
1627
+ return "LLAMA";
1628
+ }
1629
+
1630
+ std::string vars() override {
1631
+ auto n_tokens = hp.n_tokens;
1632
+ return VARS_TO_STR1(n_tokens);
1633
+ }
1634
+
1635
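+ // maximum normalized MSE tolerated when comparing this graph against the CPU backend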
+ double max_nmse_err() override {
1636
+ return 2e-3;
1637
+ }
1638
+
1639
+ test_llama(int n_tokens = 1)
1640
+ : test_llm({
1641
+ /*n_vocab =*/ 32000,
1642
+ /*n_embd =*/ 3200,
1643
+ /*n_head =*/ 32,
1644
+ /*n_head_kv =*/ 32,
1645
+ /*n_rot =*/ 100,
1646
+ /*n_embd_head =*/ 100,
1647
+ /*n_ff =*/ 8640,
1648
+ /*f_norm_eps =*/ 0.f,
1649
+ /*f_norm_rms_eps =*/ 1e-5f,
1650
+ /*n_tokens =*/ n_tokens,
1651
+ }) {
1652
+ }
1653
+
1654
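+ // build a single LLaMA-style transformer layer (n_layer = 1) plus the final norm and lm_head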
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1655
+ struct ggml_tensor * cur;
1656
+ struct ggml_tensor * inpL;
1657
+
1658
+ inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
1659
+
1660
+ // inp_pos - contains the positions
1661
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
1662
+
1663
+ // KQ_mask (mask for 1 head; it will be broadcast to all heads)
1664
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
1665
+
1666
+ ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
1667
+ ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
1668
+
1669
+ for (uint32_t il = 0; il < hp.n_layer; ++il) {
1670
+ struct ggml_tensor * inpSA = inpL;
1671
+
1672
+ // norm
1673
+ ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1674
+ cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
1675
+
1676
+ // self-attention
1677
+ {
1678
+ ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
1679
+ ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
1680
+ ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
1681
+
1682
+ // compute Q and K and RoPE them
1683
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
1684
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
1685
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
1686
+
1687
+ Qcur = ggml_rope_custom(
1688
+ ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos,
1689
+ hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
1690
+ ext_factor, attn_factor, beta_fast, beta_slow
1691
+ );
1692
+
1693
+ Kcur = ggml_rope_custom(
1694
+ ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
1695
+ hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
1696
+ ext_factor, attn_factor, beta_fast, beta_slow
1697
+ );
1698
+
1699
+ llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
1700
+
1701
+ cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
1702
+ }
1703
+
1704
+ struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
1705
+
1706
+ // feed-forward network
1707
+ ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1708
+ cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
1709
+
1710
+ ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
1711
+ ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
1712
+ ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
1713
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
1714
+ cur = ggml_mul_mat(ctx, ffn_gate, cur);
1715
+ cur = ggml_silu(ctx, cur);
1716
+ cur = ggml_mul(ctx, cur, tmp);
1717
+ cur = ggml_mul_mat(ctx, ffn_down, cur);
1718
+
1719
+ cur = ggml_add(ctx, cur, ffn_inp);
1720
+
1721
+ // input for next layer
1722
+ inpL = cur;
1723
+ }
1724
+
1725
+ cur = inpL;
1726
+
1727
+ ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1728
+ cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
1729
+
1730
+ // lm_head
1731
+ ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
1732
+ cur = ggml_mul_mat(ctx, output, cur);
1733
+
1734
+ return cur;
1735
+ }
1736
+ };
1737
+
1738
+ // Falcon
1739
+ struct test_falcon : public test_llm {
1740
+ static constexpr float freq_base = 10000.0f;
1741
+ static constexpr float freq_scale = 1.0f;
1742
+ static constexpr float ext_factor = 0.0f;
1743
+ static constexpr float attn_factor = 1.0f;
1744
+ static constexpr float beta_fast = 32.0f;
1745
+ static constexpr float beta_slow = 1.0f;
1746
+
1747
+ std::string op_desc(ggml_tensor * t) override {
1748
+ GGML_UNUSED(t);
1749
+ return "FALCON";
1750
+ }
1751
+
1752
+ std::string vars() override {
1753
+ auto n_tokens = hp.n_tokens;
1754
+ return VARS_TO_STR1(n_tokens);
1755
+ }
1756
+
1757
+ double max_nmse_err() override {
1758
+ return 2e-3;
1759
+ }
1760
+
1761
+ test_falcon(int n_tokens = 1)
1762
+ : test_llm({
1763
+ /*n_vocab =*/ 32000,
1764
+ /*n_embd =*/ 3200,
1765
+ /*n_head =*/ 50,
1766
+ /*n_head_kv =*/ 1,
1767
+ /*n_rot =*/ 64,
1768
+ /*n_embd_head =*/ 64,
1769
+ /*n_ff =*/ 8640,
1770
+ /*f_norm_eps =*/ 1e-5f,
1771
+ /*f_norm_rms_eps =*/ 0.f,
1772
+ /*n_tokens =*/ n_tokens,
1773
+ }) {
1774
+ }
1775
+
1776
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1777
+ struct ggml_tensor * cur;
1778
+ struct ggml_tensor * inpL;
1779
+
1780
+ inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
1781
+
1782
+ // inp_pos - contains the positions
1783
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
1784
+
1785
+ // KQ_mask (mask for 1 head; it will be broadcast to all heads)
1786
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
1787
+
1788
+ ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
1789
+ ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
1790
+
1791
+ for (uint32_t il = 0; il < hp.n_layer; ++il) {
1792
+ // norm
1793
+ ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1794
+ ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1795
+ ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
1796
+
1797
+ // self-attention
1798
+ {
1799
+ cur = attn_norm;
1800
+
1801
+ ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
1802
+
1803
+ cur = ggml_mul_mat(ctx, wqkv, cur);
1804
+
1805
+ struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
1806
+ struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
1807
+ struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
1808
+
1809
+ Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens);
1810
+ Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
1811
+
1812
+ // mode = 2 selects NeoX-style RoPE
1813
+ Qcur = ggml_rope_custom(
1814
+ ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
1815
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
1816
+ );
1817
+
1818
+ Kcur = ggml_rope_custom(
1819
+ ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
1820
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
1821
+ );
1822
+
1823
+ llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
1824
+
1825
+ cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
1826
+ }
1827
+
1828
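+ // Falcon-style parallel residual: the FFN runs on attn_norm, and the attention output, FFN output and layer input are summed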
+ struct ggml_tensor * ffn_inp = cur;
1829
+
1830
+ // feed forward
1831
+ {
1832
+ ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
1833
+ ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
1834
+ cur = attn_norm;
1835
+ cur = ggml_mul_mat(ctx, ffn_up, cur);
1836
+ cur = ggml_gelu(ctx, cur);
1837
+ cur = ggml_mul_mat(ctx, ffn_down, cur);
1838
+ }
1839
+
1840
+ cur = ggml_add(ctx, cur, ffn_inp);
1841
+
1842
+ cur = ggml_add(ctx, cur, inpL);
1843
+
1844
+ // input for next layer
1845
+ inpL = cur;
1846
+ }
1847
+
1848
+ cur = inpL;
1849
+
1850
+ ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1851
+ ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1852
+ cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
1853
+
1854
+ // lm_head
1855
+ ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
1856
+ cur = ggml_mul_mat(ctx, output, cur);
1857
+
1858
+ return cur;
1859
+ }
1860
+ };
1861
+
1862
+ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
1863
+ std::vector<std::unique_ptr<test_case>> test_cases;
1864
+ std::default_random_engine rng(0);
1865
+
1866
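+ // full set of tensor types exercised by the get_rows and cpy tests below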
+ const ggml_type all_types[] = {
1867
+ GGML_TYPE_F32, GGML_TYPE_F16,
1868
+ GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
1869
+ GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
1870
+ GGML_TYPE_Q8_0,
1871
+ GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
1872
+ GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
1873
+ GGML_TYPE_Q6_K,
1874
+ GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
1875
+ GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
1876
+ GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
1877
+ };
1878
+
1879
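+ // representative types used for the larger mul_mat / mul_mat_id sweeps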
+ const ggml_type base_types[] = {
1880
+ GGML_TYPE_F32, GGML_TYPE_F16,
1881
+ GGML_TYPE_Q4_0,
1882
+ GGML_TYPE_Q4_K,
1883
+ GGML_TYPE_IQ2_XXS
1884
+ };
1885
+
1886
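+ // remaining types, each checked with a single small mul_mat / mul_mat_id case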
+ const ggml_type other_types[] = {
1887
+ GGML_TYPE_Q4_1,
1888
+ GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
1889
+ GGML_TYPE_Q8_0,
1890
+ GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
1891
+ GGML_TYPE_Q5_K,
1892
+ GGML_TYPE_Q6_K,
1893
+ GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
1894
+ GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
1895
+ GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
1896
+ };
1897
+
1898
+ // unary ops
1899
+ for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
1900
+ test_cases.emplace_back(new test_unary((ggml_unary_op) op));
1901
+ test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }));
1902
+ }
1903
+
1904
+ test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
1905
+ for (ggml_type type : all_types) {
1906
+ for (int b : {1, 7}) {
1907
+ for (bool v : {false, true}) {
1908
+ test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
1909
+ }
1910
+ }
1911
+ }
1912
+ for (int b : {1, 7}) {
1913
+ for (bool v : {false, true}) {
1914
+ test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
1915
+ }
1916
+ }
1917
+
1918
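+ // pool2d: sweep kernel size (k0/k1), stride (s0/s1) and padding (p0/p1)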
+ for (ggml_type type_input : {GGML_TYPE_F32}) {
1919
+ for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
1920
+ for (int k0 : {1, 3}) {
1921
+ for (int k1 : {1, 3}) {
1922
+ for (int s0 : {1, 2}) {
1923
+ for (int s1 : {1, 2}) {
1924
+ for (int p0 : {0, 1}) {
1925
+ for (int p1 : {0, 1}) {
1926
+ test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1));
1927
+ }
1928
+ }
1929
+ }
1930
+ }
1931
+ }
1932
+ }
1933
+ }
1934
+ }
1935
+
1936
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
1937
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
1938
+
1939
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
1940
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
1941
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
1942
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
1943
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
1944
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
1945
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
1946
+
1947
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
1948
+ test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
1949
+ test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
1950
+ test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
1951
+ test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
1952
+ test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
1953
+
1954
+ for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
1955
+ for (ggml_type type_dst : all_types) {
1956
+ test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
1957
+ }
1958
+ }
1959
+
1960
+ test_cases.emplace_back(new test_cont());
1961
+
1962
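+ // broadcasting binary ops: run each shape/broadcast-factor combination for add, mul and div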
+ auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
1963
+ for (auto op : {ggml_add, ggml_mul, ggml_div}) {
1964
+ test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
1965
+ }
1966
+ };
1967
+
1968
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
1969
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
1970
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
1971
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
1972
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
1973
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
1974
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
1975
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
1976
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
1977
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
1978
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
1979
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
1980
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});
1981
+
1982
+ // stable diffusion
1983
+ add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
1984
+ add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1});
1985
+ add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1});
1986
+ add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1});
1987
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1});
1988
+ add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1});
1989
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1});
1990
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1});
1991
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1});
1992
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1});
1993
+ add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
1994
+ add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
1995
+ add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
1996
+ //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
1997
+ //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
1998
+
1999
+ test_cases.emplace_back(new test_scale());
2000
+
2001
+ for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
2002
+ test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
2003
+ test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
2004
+ }
2005
+
2006
+ for (ggml_type type_a : base_types) {
2007
+ for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2008
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
2009
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
2010
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
2011
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
2012
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
2013
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
2014
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
2015
+
2016
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
2017
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
2018
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {2, 1}));
2019
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
2020
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
2021
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
2022
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
2023
+ }
2024
+ }
2025
+
2026
+ for (ggml_type type_a : other_types) {
2027
+ for (ggml_type type_b : {GGML_TYPE_F32}) {
2028
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
2029
+ }
2030
+ }
2031
+
2032
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
2033
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
2034
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
2035
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
2036
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
2037
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
2038
+
2039
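+ // mul_mat_id: vary the number of matrices (n_mats) and how many of them are used (n_used)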
+ for (ggml_type type_a : base_types) {
2040
+ for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
2041
+ for (int n_mats : {4, 8}) {
2042
+ for (int n_used : {1, 2, 4}) {
2043
+ for (bool b : {false, true}) {
2044
+ for (int n : {1, 32}) {
2045
+ int m = 512;
2046
+ int k = 256;
2047
+ test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
2048
+ }
2049
+ }
2050
+ }
2051
+ }
2052
+ }
2053
+ }
2054
+
2055
+ for (ggml_type type_a : other_types) {
2056
+ for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
2057
+ for (int n_mats : {4}) {
2058
+ for (int n_used : {2}) {
2059
+ for (bool b : {false}) {
2060
+ for (int n : {1}) {
2061
+ int m = 512;
2062
+ int k = 256;
2063
+ test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
2064
+ }
2065
+ }
2066
+ }
2067
+ }
2068
+ }
2069
+ }
2070
+
2071
+ test_cases.emplace_back(new test_sqr());
2072
+ test_cases.emplace_back(new test_clamp());
2073
+
2074
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
2075
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5));
2076
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
2077
+
2078
+ #if 0
2079
+ std::uniform_int_distribution<> dist_ne1(1, 50);
2080
+ int exponent = 1;
2081
+ while (exponent < (1 << 17)) {
2082
+ std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
2083
+
2084
+ for (int n = 0; n < 10; ++n) {
2085
+ int64_t ne0 = dist_ne0(rng);
2086
+ int64_t ne1 = dist_ne1(rng);
2087
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
2088
+ }
2089
+
2090
+ exponent <<= 1;
2091
+ }
2092
+ #endif
2093
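+ // soft_max: sweep mask on/off, scale and max_bias over a few sizes (including non-power-of-two)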
+ for (bool mask : {false, true}) {
2094
+ for (float max_bias : {0.0f, 8.0f}) {
2095
+ for (float scale : {1.0f, 0.1f}) {
2096
+ for (int64_t ne0 : {16, 1024}) {
2097
+ for (int64_t ne1 : {16, 1024}) {
2098
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
2099
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
2100
+ }
2101
+ }
2102
+ }
2103
+ }
2104
+ }
2105
+
2106
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
2107
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
2108
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 8.0f));
2109
+ test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
2110
+
2111
+ for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2112
+ test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
2113
+ test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512)); // llama 13B
2114
+ test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512)); // llama 30B
2115
+ test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512)); // llama 65B
2116
+ test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512)); // neox (falcon 7B)
2117
+ test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512)); // neox (falcon 7B)
2118
+ test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
2119
+ test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
2120
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm)
2121
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2)
2122
+ }
2123
+
2124
+ test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
2125
+ test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
2126
+
2127
+ for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
2128
+ test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
2129
+ test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
2130
+ test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
2131
+ }
2132
+
2133
+ test_cases.emplace_back(new test_sum_rows());
2134
+ test_cases.emplace_back(new test_upscale());
2135
+ test_cases.emplace_back(new test_group_norm());
2136
+ test_cases.emplace_back(new test_acc());
2137
+ test_cases.emplace_back(new test_pad());
2138
+ test_cases.emplace_back(new test_arange());
2139
+ test_cases.emplace_back(new test_timestep_embedding());
2140
+ test_cases.emplace_back(new test_leaky_relu());
2141
+
2142
+ // these tests are disabled to save execution time, but they can be handy for debugging
2143
+ #if 0
2144
+ test_cases.emplace_back(new test_llama(1));
2145
+ test_cases.emplace_back(new test_llama(2));
2146
+ test_cases.emplace_back(new test_falcon(1));
2147
+ test_cases.emplace_back(new test_falcon(2));
2148
+ #endif
2149
+
2150
+ // run tests
2151
+ if (mode == MODE_TEST) {
2152
+ ggml_backend_t backend_cpu = ggml_backend_cpu_init();
2153
+
2154
+ size_t n_ok = 0;
2155
+ for (auto & test : test_cases) {
2156
+ if (test->eval(backend, backend_cpu, op_name)) {
2157
+ n_ok++;
2158
+ }
2159
+ }
2160
+ printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
2161
+
2162
+ ggml_backend_free(backend_cpu);
2163
+
2164
+ return n_ok == test_cases.size();
2165
+ }
2166
+
2167
+ if (mode == MODE_PERF) {
2168
+ for (auto & test : test_cases) {
2169
+ test->eval_perf(backend, op_name);
2170
+ }
2171
+ return true;
2172
+ }
2173
+
2174
+ GGML_ASSERT(false);
2175
+ return false;
2176
+ }
2177
+
2178
+ static void usage(char ** argv) {
2179
+ printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
2180
+ printf(" valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
2181
+ printf(" op names are as given by ggml_op_desc()\n");
2182
+ }
2183
+
2184
+ int main(int argc, char ** argv) {
2185
+ test_mode mode = MODE_TEST;
2186
+ const char * op_name_filter = NULL;
2187
+ const char * backend_filter = NULL;
2188
+
2189
+ for (int i = 1; i < argc; i++) {
2190
+ if (strcmp(argv[i], "test") == 0) {
2191
+ mode = MODE_TEST;
2192
+ } else if (strcmp(argv[i], "perf") == 0) {
2193
+ mode = MODE_PERF;
2194
+ } else if (strcmp(argv[i], "-o") == 0) {
2195
+ if (i + 1 < argc) {
2196
+ op_name_filter = argv[++i];
2197
+ } else {
2198
+ usage(argv);
2199
+ return 1;
2200
+ }
2201
+ } else if (strcmp(argv[i], "-b") == 0) {
2202
+ if (i + 1 < argc) {
2203
+ backend_filter = argv[++i];
2204
+ } else {
2205
+ usage(argv);
2206
+ return 1;
2207
+ }
2208
+ } else {
2209
+ usage(argv);
2210
+ return 1;
2211
+ }
2212
+ }
2213
+
2214
+ // enumerate backends
2215
+ printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
2216
+
2217
+ size_t n_ok = 0;
2218
+
2219
+ for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
2220
+ printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
2221
+
2222
+ if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
2223
+ printf(" Skipping\n");
2224
+ n_ok++;
2225
+ continue;
2226
+ }
2227
+
2228
+ ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
2229
+ GGML_ASSERT(backend != NULL);
2230
+
2231
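+ // the CPU backend is the reference used for comparison, so it is only tested when requested explicitly with -b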
+ if (backend_filter == NULL && ggml_backend_is_cpu(backend)) {
2232
+ printf(" Skipping CPU backend\n");
2233
+ ggml_backend_free(backend);
2234
+ n_ok++;
2235
+ continue;
2236
+ }
2237
+
2238
+ printf(" Backend name: %s\n", ggml_backend_name(backend));
2239
+
2240
+ bool ok = test_backend(backend, mode, op_name_filter);
2241
+
2242
+ printf(" Backend %s: ", ggml_backend_name(backend));
2243
+ if (ok) {
2244
+ printf("\033[1;32mOK\033[0m\n");
2245
+ n_ok++;
2246
+ } else {
2247
+ printf("\033[1;31mFAIL\033[0m\n");
2248
+ }
2249
+
2250
+ printf("\n");
2251
+
2252
+ ggml_backend_free(backend);
2253
+ }
2254
+
2255
+ printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
2256
+
2257
+ if (n_ok != ggml_backend_reg_get_count()) {
2258
+ printf("\033[1;31mFAIL\033[0m\n");
2259
+ return 1;
2260
+ }
2261
+
2262
+ ggml_quantize_free();
2263
+
2264
+ printf("\033[1;32mOK\033[0m\n");
2265
+ return 0;
2266
+ }