@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -1,11 +1,31 @@
+// This file defines tests for various GGML ops and backends.
+// For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent.
+// For the backward pass it asserts that the gradients from backpropagation are consistent
+// with the gradients obtained via the method of finite differences ("grad" mode, this is optional).
+// It is also possible to check the performance ("perf" mode).
+//
+// this file has three sections: Section 1 does general setup, section 2 defines the GGML ops to be tested,
+// and section 3 defines which tests to run.
+// Quick start for adding a new GGML op: Go to section 2 and create a struct that inherits from test_case,
+// then go to section 3 and add an instantiation of your struct.
+
+
+// ##############################
+// ## Section 1: General Setup ##
+// ##############################
+
+
 #include <ggml.h>
+#include <ggml-cpu.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>

 #include <algorithm>
 #include <array>
 #include <cfloat>
+#include <cstdint>
 #include <cstring>
+#include <cinttypes>
 #include <functional>
 #include <memory>
 #include <random>
@@ -13,64 +33,52 @@
 #include <stdlib.h>
 #include <string>
 #include <thread>
+#include <future>
 #include <vector>

-
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
-    // static RNG initialization (revisit if n_threads stops being constant)
-    static const size_t n_threads = std::thread::hardware_concurrency();
-    static std::vector<std::default_random_engine> generators = []() {
-        std::random_device rd;
-        std::vector<std::default_random_engine> vec;
-        vec.reserve(n_threads);
-        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
-        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
-        return vec;
-    }();
-
-    size_t size = ggml_nelements(tensor);
-    std::vector<float> data(size);
+    size_t nels = ggml_nelements(tensor);
+    std::vector<float> data(nels);
+    {
+        // parallel initialization
+        static const size_t n_threads = std::thread::hardware_concurrency();
+        // static RNG initialization (revisit if n_threads stops being constant)
+        static std::vector<std::default_random_engine> generators = []() {
+            std::random_device rd;
+            std::vector<std::default_random_engine> vec;
+            vec.reserve(n_threads);
+            //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+            for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+            return vec;
+        }();
+
+        auto init_thread = [&](size_t ith, size_t start, size_t end) {
+            std::uniform_real_distribution<float> distribution(min, max);
+            auto & gen = generators[ith];
+            for (size_t i = start; i < end; i++) {
+                data[i] = distribution(gen);
+            }
+        };

-    auto init_thread = [&](size_t ith, size_t start, size_t end) {
-        std::uniform_real_distribution<float> distribution(min, max);
-        for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generators[ith]);
+        std::vector<std::future<void>> tasks;
+        tasks.reserve(n_threads);
+        for (size_t i = 0; i < n_threads; i++) {
+            size_t start = i*nels/n_threads;
+            size_t end = (i+1)*nels/n_threads;
+            tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
         }
-    };
-
-    std::vector<std::thread> threads;
-    threads.reserve(n_threads);
-    for (size_t i = 0; i < n_threads; i++) {
-        size_t start = i*size/n_threads;
-        size_t end = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, i, start, end);
-    }
-    for (auto & t : threads) {
-        t.join();
-    }
-
-#if 0
-    const char * val_str = getenv("GGML_TEST_EPS");
-    float val = 1e-9f;
-    if (val_str != nullptr) {
-        val = std::stof(val_str);
-        printf("GGML_TEST_EPS=%e\n", val);
-    }
-
-    // test quantization with very small values that may result in nan scales due to division by zero
-    if (ggml_is_quantized(tensor->type)) {
-        for (int i = 0; i < 256; i++) {
-            data[i] = val;
+        for (auto & t : tasks) {
+            t.get();
         }
     }
-#endif

     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
+        ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float));
     } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
-        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
-        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+        GGML_ASSERT(nels % ggml_blck_size(tensor->type) == 0);
+
+        // dummy importance matrix
+        std::vector<float> imatrix(tensor->ne[0], 1.0f);
         const float * im = imatrix.data();
         if (!ggml_quantize_requires_imatrix(tensor->type)) {
             // when the imatrix is optional, we want to test both quantization with and without imatrix
@@ -80,19 +88,40 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             }
         }

-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
-        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
-        // TODO: other cases
-        //#pragma omp parallel for
-        //for (int i = 0; i < tensor->ne[1]; i++) {
-        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
-        //        i * tensor->ne[0], 1, tensor->ne[0], im);
-        //}
-
+        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, nels));
+        {
+            // parallel quantization by block
+            size_t blck_size = ggml_blck_size(tensor->type);
+            size_t n_blocks = nels / blck_size;
+
+            auto quantize_thread = [&](size_t start, size_t end) {
+                ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+                    start * blck_size, end - start, blck_size, im);
+            };
+
+            const size_t min_blocks_per_thread = 1;
+            const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
+                std::max<size_t>(1, n_blocks / min_blocks_per_thread));
+            std::vector<std::future<void>> tasks;
+            tasks.reserve(n_threads);
+            for (size_t i = 0; i < n_threads; i++) {
+                size_t start = i*n_blocks/n_threads;
+                size_t end = (i+1)*n_blocks/n_threads;
+                tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
+            }
+            for (auto & t : tasks) {
+                t.get();
+            }
+        }
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
         ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+    } else if (tensor->type == GGML_TYPE_I64) {
+        // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful.
+        const size_t nbytes_half = ggml_nbytes(tensor)/2;
+        ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half);
+        ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half);
     } else {
         GGML_ABORT("fatal error");
     }
@@ -105,7 +134,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
     std::vector<uint8_t> buf(ggml_nbytes(t));
     ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));

-    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
+    const auto * tt = ggml_get_type_traits(t->type);
     size_t bs = ggml_blck_size(t->type);
     std::vector<float> vq(ggml_blck_size(t->type));
     bool quantized = ggml_is_quantized(t->type);
@@ -122,6 +151,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                 tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
             } else if (t->type == GGML_TYPE_F32) {
                 tv.push_back(*(float *) &buf[i]);
+            } else if (t->type == GGML_TYPE_I64) {
+                tv.push_back((float)*(int64_t *) &buf[i]);
             } else if (t->type == GGML_TYPE_I32) {
                 tv.push_back((float)*(int32_t *) &buf[i]);
             } else if (t->type == GGML_TYPE_I16) {
@@ -129,7 +160,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
             } else if (t->type == GGML_TYPE_I8) {
                 tv.push_back((float)*(int8_t *) &buf[i]);
             } else if (quantized) {
-                tt.to_float(&buf[i], vq.data(), bs);
+                tt->to_float(&buf[i], vq.data(), bs);
                 tv.insert(tv.end(), vq.begin(), vq.end());
             } else {
                 GGML_ABORT("fatal error");
@@ -142,60 +173,6 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
     return tv;
 }

-/*
-static double cosine_similarity(const float * v1, const float * v2, size_t n) {
-    double dot = 0.0;
-    double mag1 = 0.0;
-    double mag2 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return -1.0f;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        dot += v1[i]*v2[i];
-        mag1 += v1[i]*v1[i];
-        mag2 += v2[i]*v2[i];
-    }
-
-    return dot/sqrt(mag1*mag2);
-}
-
-static float distance(const float * v1, const float * v2, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        d += (v1[i] - v2[i])*(v1[i] - v2[i]);
-    }
-
-    return sqrt(d);
-}
-
-static float vec_len(const float * v, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v[i])) {
-            continue;
-        }
-        d += v[i]*v[i];
-    }
-
-    return sqrt(d);
-}
-*/
-
 // normalized mean squared error = mse(a, b) / mse(a, 0)
 static double nmse(const float * a, const float * b, size_t n) {
     double mse_a_b = 0.0;
@@ -212,8 +189,40 @@ static double nmse(const float * a, const float * b, size_t n) {
     return mse_a_b / mse_a_0;
 }

+// maximum absolute asymmetry between a and b
+// asymmetry: (a - b) / (a + b)
+// This is more stable than relative error if one of the values fluctuates towards zero.
+// n: number of values to compare.
+// expected_vals: optional vector of expected values for a. If expected_vals is not empty, filter out all comparisons where
+//     a does not match any of the expected values. Needed for noncontinuous gradients where the numerical calculation can fail.
+static double mean_abs_asymm(const float * a, const float * b, const size_t n, const std::vector<float> & expected_vals) {
+    double sum = 0.0f;
+
+    size_t nvalid = 0;
+    for (size_t i = 0; i < n; i++) {
+        if (!expected_vals.empty()) {
+            bool matches_any = false;
+            for (const float & ev : expected_vals) {
+                if (fabsf(a[i] - ev) < 1e-3f) {
+                    matches_any = true;
+                    break;
+                }
+            }
+            if (!matches_any) {
+                continue;
+            }
+        }
+
+        const float asymm = (a[i] - b[i]) / (a[i] + b[i]);
+
+        sum += fabsf(asymm);
+        nvalid++;
+    }
+
+    return sum/nvalid;
+}
+
 // utils for printing the variables of the test cases
-#define VAR_TO_STR(x) (#x "=" + var_to_str(x))

 template<typename T>
 static std::string var_to_str(const T & x) {
@@ -246,10 +255,6 @@ static std::string var_to_str(const std::array<T, N> & x) {
     return s;
 }

-//static std::string var_to_str(ggml_unary_op unary_op) {
-//    return ggml_unary_op_name(unary_op);
-//}
-
 static std::string var_to_str(ggml_type type) {
     return ggml_type_name(type);
 }
@@ -262,6 +267,8 @@ static std::string var_to_str(ggml_op_pool pool) {
     }
 }

+#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
+
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
 #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
 #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
@@ -295,6 +302,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 enum test_mode {
     MODE_TEST,
     MODE_PERF,
+    MODE_GRAD,
 };

 struct test_case {
@@ -314,6 +322,32 @@ struct test_case {
         return 1e-7;
     }

+    virtual double max_maa_err() {
+        return 1e-4;
+    }
+
+    virtual float grad_eps() {
+        return 1e-1f;
+    }
+
+    // If false, estimate gradient with 2 points, neglects 3rd order derivative and higher.
+    // If true, estimate gradient with 4 points, neglects 5th order derivative and higher.
+    virtual bool grad_precise() {
+        return false;
+    }
+
+    // Skip gradient checks if total number of gradients to be checked is larger than this (to speed up the tests).
+    virtual int64_t grad_nmax() {
+        return 10000;
+    }
+
+    // No effect if empty.
+    // If not empty, skip all gradient checks where the numerical result does not match any of the values.
+    // Needed for dealing with noncontinuous gradients (e.g. ReLU) where estimation using finite differences is unreliable.
+    virtual std::vector<float> grad_expect() {
+        return {};
+    }
+
     virtual void initialize_tensors(ggml_context * ctx) {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
             init_tensor_uniform(t);
@@ -331,7 +365,13 @@ struct test_case {
         return size;
     }

+    virtual uint64_t op_flops(ggml_tensor * t) {
+        GGML_UNUSED(t);
+        return 0;
+    }
+
     ggml_cgraph * gf = nullptr;
+    ggml_cgraph * gb = nullptr;

     static const int sentinel_size = 1024;

@@ -340,7 +380,7 @@ struct test_case {
     std::vector<ggml_tensor *> sentinels;

     void add_sentinel(ggml_context * ctx) {
-        if (mode == MODE_PERF) {
+        if (mode == MODE_PERF || mode == MODE_GRAD) {
             return;
         }
         ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
@@ -389,6 +429,7 @@ struct test_case {
             /* .no_alloc = */ true,
         };
         ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx);

         gf = ggml_new_graph(ctx);

@@ -439,7 +480,7 @@ struct test_case {

         // add sentinels as graph nodes so that they are checked in the callback
         for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
+            ggml_graph_add_node(gf, sentinel);
         }

         // randomize tensors
@@ -550,6 +591,7 @@ struct test_case {
             /* .no_alloc = */ true,
         };
         ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx);

         ggml_tensor * out = build_graph(ctx);

@@ -570,12 +612,11 @@ struct test_case {
         }

         // align while also leaving some margin for variations in parameters
-        int align = 20;
+        int align = 8;
         int last = (len + align - 1) / align * align;
         if (last - len < 5) {
             last += align;
         }
-        last = std::max(last, 60);
         printf("%*s", last - len, "");

         // allocate
@@ -596,11 +637,27 @@
         // warmup run
         ggml_backend_graph_compute(backend, gf);

+        // determine number of runs
+        int n_runs;
+        if (op_flops(out) > 0) {
+            // based on flops
+            const uint64_t GFLOP = 1000 * 1000 * 1000;
+            const uint64_t target_flops_cpu = 8ULL * GFLOP;
+            const uint64_t target_flops_gpu = 100ULL * GFLOP;
+            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
+        } else {
+            // based on memory size
+            const size_t GB = 1ULL << 30;
+            const size_t target_size_cpu = 8 * GB;
+            const size_t target_size_gpu = 32 * GB;
+            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
+        }
+
         // duplicate the op
-        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
         for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
+            ggml_graph_add_node(gf, out);
         }

         // calculate memory
@@ -615,36 +672,338 @@ struct test_case {
             }
             return size;
         };
-        for (int i = 0; i < gf->n_nodes; i++) {
-            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
                 continue;
             }
-            mem += tensor_op_size(gf->nodes[i]);
+            mem += tensor_op_size(ggml_graph_node(gf, i));
         }

         // run
-        ggml_backend_synchronize(backend);
+        int64_t total_time_us = 0;
+        int64_t total_mem = 0;
+        int total_runs = 0;
+        do {
+            int64_t start_time = ggml_time_us();
+            ggml_backend_graph_compute(backend, gf);
+            int64_t end_time = ggml_time_us();
+
+            total_time_us += end_time - start_time;
+            total_mem += mem;
+            total_runs += n_runs;
+        } while (total_time_us < 1000*1000); // run for at least 1 second
+
+        printf(" %8d runs - %8.2f us/run - ",
+            total_runs,
+            (double)total_time_us / total_runs);
+
+        if (op_flops(out) > 0) {
+            double flops_per_sec = (op_flops(out) * total_runs) / (total_time_us / 1e6);
+            auto format_flops = [](double flops) -> std::string {
+                char buf[256];
+                if (flops >= 1e12) {
+                    snprintf(buf, sizeof(buf), "%6.2f TFLOP", flops / 1e12);
+                } else if (flops >= 1e9) {
+                    snprintf(buf, sizeof(buf), "%6.2f GFLOP", flops / 1e9);
+                } else if (flops >= 1e6) {
+                    snprintf(buf, sizeof(buf), "%6.2f MFLOP", flops / 1e6);
+                } else {
+                    snprintf(buf, sizeof(buf), "%6.2f KFLOP", flops / 1e3);
+                }
+                return buf;
+            };
+            printf("%s/run - \033[1;34m%sS\033[0m",
+                format_flops(op_flops(out)).c_str(),
+                format_flops(flops_per_sec).c_str());
+
+        } else {
+            printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
+                op_size(out) / 1024,
+                total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
+        }
+        printf("\n");
+
+        ggml_backend_buffer_free(buf);
+
+        ggml_free(ctx);
+
+        return true;
+    }
+
+    bool eval_grad(ggml_backend_t backend, const char * op_name) {
+        mode = MODE_GRAD;
+        const std::vector<float> expect = grad_expect();
+
+        ggml_init_params params = {
+            /* .mem_size = */ ggml_tensor_overhead()*128 + 2*ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true),
+            /* .mem_base = */ NULL,
+            /* .no_alloc = */ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        GGML_ASSERT(ctx);
+
+        gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
+        gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
+
+        ggml_tensor * out = build_graph(ctx);
+
+        if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) {
+            //printf(" %s: skipping\n", op_desc(out).c_str());
+            ggml_free(ctx);
+            return true;
+        }
+
+        printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
+        fflush(stdout);
+
+        if (out->type != GGML_TYPE_F32) {
+            ggml_free(ctx);
+            printf("not supported [%s->type != FP32]\n", out->name);
+            return true;
+        }
+
+        // check if the backend supports the ops
+        bool supported = true;
+        bool any_params = false;
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (!ggml_backend_supports_op(backend, t)) {
+                printf("not supported [%s] ", ggml_backend_name(backend));
+                supported = false;
+                break;
+            }
+            if ((t->flags & GGML_TENSOR_FLAG_PARAM)) {
+                any_params = true;
+                if (t->type != GGML_TYPE_F32) {
+                    printf("not supported [%s->type != FP32] ", t->name);
+                    supported = false;
+                    break;
+                }
+            }
+        }
+        if (!any_params) {
+            printf("not supported [%s] \n", op_name);
+            supported = false;
+        }
+        if (!supported) {
+            printf("\n");
+            ggml_free(ctx);
+            return true;
+        }
+
+        int64_t ngrads = 0;
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->flags & GGML_TENSOR_FLAG_PARAM) {
+                ngrads += ggml_nelements(t);
+            }
+        }
+        if (ngrads > grad_nmax()) {
+            printf("skipping large tensors for speed \n");
+            ggml_free(ctx);
+            return true;
+        }
+
+
+        if (!ggml_is_scalar(out)) {
+            out = ggml_sum(ctx, out);
+            ggml_set_name(out, "sum_of_out");
+        }
+        ggml_set_loss(out);
+
+        ggml_build_forward_expand(gf, out);
+        ggml_graph_cpy(gf, gb);
+        ggml_build_backward_expand(ctx, ctx, gb, false);
+        if (expect.size() != 1 || expect[0] != 0.0f) {
+            GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+                GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE);
+            }
+        }
+
+        // TODO: refactor so that this check is only needed once
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (!ggml_backend_supports_op(backend, t)) {
+                printf("not supported [%s] ", ggml_backend_name(backend));
+                supported = false;
+                break;
+            }
+            if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) {
+                printf("not supported [%s->type != FP32] ", t->name);
+                supported = false;
+                break;
+            }
+        }
+        if (!supported) {
+            printf("\n");
+            ggml_free(ctx);
+            return true;
+        }
+
+        // allocate
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        if (buf == NULL) {
+            printf("failed to allocate tensors [%s] ", ggml_backend_name(backend));
+            ggml_free(ctx);
+            return false;
+        }
+
+
+        initialize_tensors(ctx); // Randomizes all tensors (including gradients).
+        ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise.

-        int64_t start_time = ggml_time_us();
         ggml_backend_graph_compute(backend, gf);
-        ggml_backend_synchronize(backend);
-        int64_t end_time = ggml_time_us();
-        double time_us = end_time - start_time;
+        ggml_backend_graph_compute(backend, gb);
+
+        bool ok = true;
+        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+            if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) {
+                continue;
+            }
+
+            const char * bn = ggml_backend_name(backend);
+            const int64_t ne = ggml_nelements(t);
+
+            std::vector<float> ga;
+            struct ggml_tensor * grad = ggml_graph_get_grad(gb, t);
+            if (grad) {
+                ga = tensor_to_float(grad);
+            } else {
+                ga.resize(ne); // default value is 0.0f
+            }
+
+            for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
+                // check for nans
+                if (!std::isfinite(ga[i])) {
+                    printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", ggml_op_desc(t), i, bn, ga[i]);
+                    ok = false;
+                    break;
+                }
+            }
+            if (!ok) {
+                break;
+            }
+
+            std::vector<float> gn(ne); // gradient numeric
+            GGML_ASSERT(ga.size() == gn.size());
+
+            std::vector<float> x0 = tensor_to_float(t); // original t data
+            GGML_ASSERT(ggml_is_scalar(out));
+            GGML_ASSERT(out->type == GGML_TYPE_F32);
+
+            const float eps = grad_eps();
+            for (int64_t i = 0; i < ne; ++i) {
+                const float xiu = x0[i] + 1.0f*eps; // x, index i, up
+                const float xiuh = x0[i] + 0.5f*eps; // x, index i, up half
+                const float xidh = x0[i] - 0.5f*eps; // x, index i, down half
+                const float xid = x0[i] - 1.0f*eps; // x, index i, down
+
+                float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh
+
+                ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float));
+                ggml_backend_graph_compute(backend, gf);
+                ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out));
+
+                ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float));
+                ggml_backend_graph_compute(backend, gf);
+                ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out));
+
+                if (grad_precise()) {
+                    ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float));
+                    ggml_backend_graph_compute(backend, gf);
+                    ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out));
+
+                    ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float));
+                    ggml_backend_graph_compute(backend, gf);
+                    ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out));
+
+                    gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps);
+                } else {
+                    gn[i] = (fu - fd) / (2.0f*eps);
+                }
+
+                ggml_backend_tensor_set(t, x0.data(), 0, ggml_nbytes(t));
+            }
+
+            const double err = mean_abs_asymm(gn.data(), ga.data(), gn.size(), expect);
+            if (err > max_maa_err()) {
+                printf("[%s] MAA = %.9f > %.9f ", ggml_op_desc(t), err, max_maa_err());
+                ok = false;
+                break;
+            }
+            if (!ok) {
+                break;
+            }
+        }

-        printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
-            n_runs,
-            time_us / n_runs,
-            op_size(out) / 1024,
-            mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
+        if (!ok) {
+            printf("compare failed ");
+        }

         ggml_backend_buffer_free(buf);

         ggml_free(ctx);

-        return true;
+        if (ok) {
+            printf("\033[1;32mOK\033[0m\n");
+            return true;
+        }
+
+        printf("\033[1;31mFAIL\033[0m\n");
+        return false;
+    }
+};
+
+
+// ###################################
+// ## Section 2: GGML Op Defintions ##
+// ###################################
+
+
+// The following is an example showing the bare minimum for creating a test for a GGML op.
+
+// GGML_OP_EXAMPLE
+struct test_example : public test_case {
+    // Always define these 2 or variants thereof:
+    const ggml_type type; // The type of the input tensors.
+    const std::array<int64_t, 4> ne; // The shape of the input tensors.
+    // For some ops it's necessary to define multiple types or shapes for the inputs.
+    // Or they may need additional parameters.
+
+    // Put all parameters needed to fully define the test into one of the VARS_TO_STR macros.
+    // In most cases these are just the properties of the struct that you defined above.
+    // This is needed for info prints.
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    // Define a constructor for the struct.
+    // In most cases it will be sufficient to have the same arguments as the struct has properties
+    // and just use initializer lists.
+    test_example(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    // Define how a simple GGML compute graph can be constructed for the new GGML op.
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // Step 1: create input tensors that don't depend on any other tensors:
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging.
+
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(b, "b");
+
+        // Step 2: use the op that you want to test in the GGML compute graph.
+        ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition.
+        ggml_set_name(out, "out");
+
+        // Step 3: return the output tensor.
+        return out;
     }
+    // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
+    // immediately after you create the tensors.
+    // This is optional and only makes sense if a backward pass has actually been implemented for the new op.
 };

+
 // GGML_OP_UNARY
 struct test_unary : public test_case {
     const ggml_unary_op op;
@@ -658,20 +1017,36 @@ struct test_unary : public test_case {

     test_unary(ggml_unary_op op,
             ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
+            std::array<int64_t, 4> ne_a = {128, 2, 2, 2},
             int v = 0)
         : op(op), type(type), ne_a(ne_a), v(v) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
+        const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG ||
+            op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU;
+
         ggml_tensor * a;
         if (v & 1) {
             auto ne = ne_a; ne[0] *= 3;
             a = ggml_new_tensor(ctx, type, 4, ne.data());
+            if (grad_supported) {
+                ggml_set_param(ctx, a);
+            }
+            ggml_set_name(a, "a");
+
             a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view_of_a");
         } else {
             a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+            if (grad_supported) {
+                ggml_set_param(ctx, a);
+            }
+            ggml_set_name(a, "a");
         }
+
         ggml_tensor * out = ggml_unary(ctx, a, op);
+        ggml_set_name(out, "out");
+
         return out;
     }

@@ -681,6 +1056,24 @@ struct test_unary : public test_case {
             init_tensor_uniform(t, -150.f, 150.f);
         }
     }
+
+    float grad_eps() override {
+        return 15.0f;
+    }
+
+    std::vector<float> grad_expect() override {
+        if (op == GGML_UNARY_OP_ABS) {
+            return {-1.0f, 1.0f};
+        }
+        if (op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_STEP) {
+            return {0.0f};
+        }
+        if (op == GGML_UNARY_OP_RELU) {
+            return {0.0f, 1.0f};
+        }
+        return {};
+    }
+
 };

 // GGML_OP_GET_ROWS
@@ -701,11 +1094,24 @@ struct test_get_rows : public test_case {

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
+        ggml_set_name(in, "in");
+
         ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
+        ggml_set_name(rows, "rows");
         if (v) {
             rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
+            ggml_set_name(rows, "view_of_rows");
+        }
+
+        const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows);
+        if (grad_supported) {
+            ggml_set_param(ctx, in);
+            // rows is a constant input -> no gradients
         }
+
         ggml_tensor * out = ggml_get_rows(ctx, in, rows);
+        ggml_set_name(out, "out");
+
         return out;
     }

@@ -726,14 +1132,79 @@
     }
 };

-// GGML_OP_REPEAT
-struct test_repeat : public test_case {
+// GGML_OP_ARGMAX
+struct test_argmax : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
-    const std::array<int, 4> nr;

     std::string vars() override {
-        return VARS_TO_STR3(type, ne, nr);
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_argmax(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 100, 1, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_argmax(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    double max_nmse_err() override {
+        return 0.0;
+    }
+};
+
+// GGML_OP_COUNT_EQUAL
+struct test_count_equal : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_count_equal(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {4, 500, 1, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * a_argmax = ggml_argmax(ctx, a);
+        ggml_set_name(a_argmax, "a_argmax");
+
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(b, "b");
+
+        ggml_tensor * b_argmax = ggml_argmax(ctx, a);
+        ggml_set_name(b_argmax, "b_argmax");
+
+        ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    double max_nmse_err() override {
+        return 0.0;
+    }
+};
+
+// GGML_OP_REPEAT
+struct test_repeat : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const std::array<int, 4> nr;
+
+    std::string vars() override {
+        return VARS_TO_STR3(type, ne, nr);
     }

     size_t op_size(ggml_tensor * t) override {
@@ -741,14 +1212,21 @@ struct test_repeat : public test_case {
     }

     test_repeat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
             std::array<int, 4> nr = {2, 2, 2, 2})
         : type(type), ne(ne), nr(nr) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
+        ggml_set_name(target, "target");
+
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
         ggml_tensor * out = ggml_repeat(ctx, src, target);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -774,10 +1252,62 @@ struct test_dup : public test_case {

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
         if (_use_permute) {
             src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+            ggml_set_name(src, "src_permuted");
         }
+
         ggml_tensor * out = ggml_dup(ctx, src);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_SET
+struct test_set : public test_case {
+    const ggml_type type_src;
+    const ggml_type type_dst;
+    const std::array<int64_t, 4> ne;
+    const int dim;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type_src, type_dst, ne, dim);
+    }
+
+    size_t op_size(ggml_tensor * t) override {
+        return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
+    }
+
+    test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {6, 5, 4, 3}, int dim = 1)
+        : type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
+        auto ne_dst = ne;
+        for (int i = 0; i < dim; ++i) {
+            ne_dst[i] *= 2;
+        }
+        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
+        ggml_set_param(ctx, dst);
+        ggml_set_name(dst, "dst");
+
+        size_t offset = 0;
+        for (int i = 0; i < dim; ++i) {
+            offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i];
+        }
+        ggml_tensor * out = ggml_set(ctx, dst, src,
+            // The backward pass requires setting a contiguous region:
+            src->nb[1], src->nb[2], src->nb[3], offset);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -804,18 +1334,26 @@ struct test_cpy : public test_case {

     test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            std::array<int64_t, 4> permute = {0, 0, 0, 0},
-            bool _dst_use_permute = false)
+            std::array<int64_t, 4> permute = {0, 0, 0, 0})
         : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
           _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
         if (_src_use_permute) {
             src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
+            ggml_set_name(src, "src_permuted");
         }
+
         ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
+        ggml_set_name(dst, "dst");
+
         ggml_tensor * out = ggml_cpy(ctx, src, dst);
+        ggml_set_name(out, "out");
+
         return out;
     }
 };
@@ -835,8 +1373,14 @@ struct test_cont : public test_case {

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(ctx, src);
+        ggml_set_name(src, "src");
+
         src = ggml_transpose(ctx, src);
+        ggml_set_name(src, "src_transposed");
+
         ggml_tensor * out = ggml_cont(ctx, src);
+        ggml_set_name(out, "out");

         return out;
     }
@@ -867,21 +1411,79 @@ struct test_bin_bcast : public test_case {
867
1411
 
868
1412
  ggml_tensor * build_graph(ggml_context * ctx) override {
869
1413
  ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
1414
+ ggml_set_name(a, "a");
1415
+
870
1416
  ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
1417
+ ggml_set_name(b, "b");
1418
+
1419
+ // The backward pass supports broadcasting only for GGML_ADD:
1420
+ const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
1421
+ if (grad_supported) {
1422
+ ggml_set_param(ctx, a);
1423
+ ggml_set_param(ctx, b);
1424
+ }
1425
+
871
1426
  ggml_tensor * out = op(ctx, a, b);
1427
+ ggml_set_name(out, "out");
1428
+
872
1429
  return out;
873
1430
  }
874
1431
 
875
1432
  void initialize_tensors(ggml_context * ctx) override {
876
1433
  for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
877
- if (op == ggml_div) {
878
- // avoid division by zero
879
- init_tensor_uniform(t, 1.0f, 2.0f);
1434
+ if (op == ggml_mul || op == ggml_div) {
1435
+ // MUL and DIV have numerical issues around zero:
1436
+ init_tensor_uniform(t, 0.9f, 1.1f);
880
1437
  } else {
881
1438
  init_tensor_uniform(t);
882
1439
  }
883
1440
  }
884
1441
  }
1442
+
1443
+ float grad_eps() override {
1444
+ return 0.1f * (op == ggml_mul ? ne[0]*ne[1]*ne[2]*ne[3] : 1);
1445
+ }
1446
+
1447
+ bool grad_precise() override {
1448
+ return op == ggml_div;
1449
+ }
1450
+
1451
+ double max_maa_err() override {
1452
+ return op == ggml_add ? 1e-4 : 1e-3;
1453
+ }
1454
+ };
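
The narrowed init range (0.9f to 1.1f) exists because the gradient checks in this file compare analytic gradients against central finite differences, and MUL/DIV outputs lose precision where operands approach zero. A standalone sketch of such a check for f(a) = a*b (illustrative only, not code from the diff):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float a = 1.05f, b = 0.95f; // near 1, mirroring the test's init range
        const float eps = 0.1f;           // plays the role of grad_eps

        // Central finite difference approximation of d(a*b)/da:
        const float numeric  = ((a + eps) * b - (a - eps) * b) / (2.0f * eps);
        const float analytic = b; // exact derivative of a*b with respect to a

        std::printf("numeric %.6f vs analytic %.6f, abs err %.2e\n",
                    numeric, analytic, std::fabs(numeric - analytic));
        return 0;
    }
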
1455
+
1456
+ // GGML_OP_ADD1
1457
+ struct test_add1 : public test_case {
1458
+ const ggml_type type;
1459
+ const std::array<int64_t, 4> ne;
1460
+
1461
+ std::string vars() override {
1462
+ return VARS_TO_STR2(type, ne);
1463
+ }
1464
+
1465
+ test_add1(ggml_type type = GGML_TYPE_F32,
1466
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
1467
+ : type(type), ne(ne) {}
1468
+
1469
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1470
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1471
+ ggml_set_param(ctx, a);
1472
+ ggml_set_name(a, "a");
1473
+
1474
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
1475
+ // ggml_set_param(ctx, b); // TODO: implement
1476
+ ggml_set_name(b, "b");
1477
+
1478
+ ggml_tensor * out = ggml_add1(ctx, a, b);
1479
+ ggml_set_name(out, "out");
1480
+
1481
+ return out;
1482
+ }
1483
+
1484
+ float grad_eps() override {
1485
+ return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
1486
+ }
885
1487
  };
886
1488
 
887
1489
  // GGML_OP_SCALE
@@ -901,7 +1503,12 @@ struct test_scale : public test_case {
901
1503
 
902
1504
  ggml_tensor * build_graph(ggml_context * ctx) override {
903
1505
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1506
+ ggml_set_param(ctx, a);
1507
+ ggml_set_name(a, "a");
1508
+
904
1509
  ggml_tensor * out = ggml_scale(ctx, a, scale);
1510
+ ggml_set_name(out, "out");
1511
+
905
1512
  return out;
906
1513
  }
907
1514
  };
@@ -917,13 +1524,17 @@ struct test_norm : public test_case {
917
1524
  }
918
1525
 
919
1526
  test_norm(ggml_type type = GGML_TYPE_F32,
920
- std::array<int64_t, 4> ne = {64, 10, 10, 10},
1527
+ std::array<int64_t, 4> ne = {64, 5, 4, 3},
921
1528
  float eps = 1e-6f)
922
1529
  : type(type), ne(ne), eps(eps) {}
923
1530
 
924
1531
  ggml_tensor * build_graph(ggml_context * ctx) override {
925
1532
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1533
+ ggml_set_name(a, "a");
1534
+
926
1535
  ggml_tensor * out = ggml_norm(ctx, a, eps);
1536
+ ggml_set_name(out, "out");
1537
+
927
1538
  return out;
928
1539
  }
929
1540
  };
@@ -939,13 +1550,104 @@ struct test_rms_norm : public test_case {
939
1550
  }
940
1551
 
941
1552
  test_rms_norm(ggml_type type = GGML_TYPE_F32,
942
- std::array<int64_t, 4> ne = {64, 10, 10, 10},
1553
+ std::array<int64_t, 4> ne = {64, 5, 4, 3},
943
1554
  float eps = 1e-6f)
944
1555
  : type(type), ne(ne), eps(eps) {}
945
1556
 
946
1557
  ggml_tensor * build_graph(ggml_context * ctx) override {
947
1558
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1559
+ ggml_set_param(ctx, a);
1560
+ ggml_set_name(a, "a");
1561
+
948
1562
  ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
1563
+ ggml_set_name(out, "out");
1564
+
1565
+ return out;
1566
+ }
1567
+
1568
+ bool grad_precise() override {
1569
+ return true;
1570
+ }
1571
+ };
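
For readers unfamiliar with the op under test: RMS norm scales each row by the reciprocal root mean square of its elements. A reference implementation over a single row, matching the usual definition (plain C++ sketch, not the ggml kernel):

    #include <cmath>
    #include <cstddef>

    // y[i] = x[i] / sqrt(mean(x^2) + eps)
    void rms_norm_row(const float * x, float * y, size_t n, float eps) {
        float sum_sq = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            sum_sq += x[i] * x[i];
        }
        const float scale = 1.0f / std::sqrt(sum_sq / (float) n + eps);
        for (size_t i = 0; i < n; ++i) {
            y[i] = x[i] * scale;
        }
    }
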
1572
+
1573
+ // GGML_OP_SSM_CONV
1574
+ struct test_ssm_conv : public test_case {
1575
+ const ggml_type type;
1576
+ const std::array<int64_t, 4> ne_a;
1577
+ const std::array<int64_t, 4> ne_b;
1578
+
1579
+ std::string vars() override {
1580
+ return VARS_TO_STR3(type, ne_a, ne_b);
1581
+ }
1582
+
1583
+ test_ssm_conv(ggml_type type = GGML_TYPE_F32,
1584
+ std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
1585
+ std::array<int64_t, 4> ne_b = {3, 3, 1, 1})
1586
+ : type(type), ne_a(ne_a), ne_b(ne_b) {}
1587
+
1588
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1589
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1590
+ ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
1591
+ ggml_tensor * out = ggml_ssm_conv(ctx, a, b);
1592
+ return out;
1593
+ }
1594
+ };
1595
+
1596
+ // GGML_OP_SSM_SCAN
1597
+ struct test_ssm_scan : public test_case {
1598
+ const ggml_type type;
1599
+
1600
+ const int64_t d_state;
1601
+ const int64_t d_inner;
1602
+ const int64_t n_seq_tokens;
1603
+ const int64_t n_seqs;
1604
+
1605
+ std::string vars() override {
1606
+ return VARS_TO_STR5(type, d_state, d_inner, n_seq_tokens, n_seqs);
1607
+ }
1608
+
1609
+ test_ssm_scan(ggml_type type = GGML_TYPE_F32,
1610
+ int64_t d_state = 32, int64_t d_inner = 32, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
1611
+ : type(type), d_state(d_state), d_inner(d_inner), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
1612
+
1613
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1614
+ ggml_tensor * s = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, n_seqs, 1 }.data());
1615
+ ggml_tensor * x = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
1616
+ ggml_tensor * dt = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_inner, n_seq_tokens, n_seqs, 1 }.data());
1617
+ ggml_tensor * A = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, d_inner, 1 , 1 }.data());
1618
+ ggml_tensor * B = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
1619
+ ggml_tensor * C = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ d_state, n_seq_tokens, n_seqs, 1 }.data());
1620
+ ggml_tensor * out = ggml_ssm_scan(ctx, s, x, dt, A, B, C);
1621
+ return out;
1622
+ }
1623
+ };
1624
+
1625
+ // GGML_OP_RWKV_WKV6
1626
+ struct test_rwkv_wkv6 : public test_case {
1627
+ const ggml_type type;
1628
+
1629
+ const int64_t head_count;
1630
+ const int64_t head_size;
1631
+ const int64_t n_seq_tokens;
1632
+ const int64_t n_seqs;
1633
+
1634
+ std::string vars() override {
1635
+ return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
1636
+ }
1637
+
1638
+ test_rwkv_wkv6(ggml_type type = GGML_TYPE_F32,
1639
+ int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
1640
+ : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
1641
+
1642
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1643
+ const int64_t n_tokens = n_seq_tokens * n_seqs;
1644
+ ggml_tensor * r = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
1645
+ ggml_tensor * k = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ head_size, 1, head_count, n_tokens }.data());
1646
+ ggml_tensor * v = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
1647
+ ggml_tensor * tf = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size, head_count }.data());
1648
+ ggml_tensor * td = ggml_new_tensor(ctx, type, 4, std::vector<int64_t>{ 1, head_size, head_count, n_tokens }.data());
1649
+ ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
1650
+ ggml_tensor * out = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, s);
949
1651
  return out;
950
1652
  }
951
1653
  };
@@ -957,37 +1659,68 @@ struct test_mul_mat : public test_case {
957
1659
  const int64_t m;
958
1660
  const int64_t n;
959
1661
  const int64_t k;
960
- const std::array<int64_t, 2> bs; // dims 3 and 4
961
- const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
1662
+ const std::array<int64_t, 2> bs; // dims 3 and 4
1663
+ const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
1664
+ const std::array<int64_t, 4> per; // permutation of dimensions
962
1665
 
963
1666
  std::string vars() override {
964
- return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
1667
+ return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
965
1668
  }
966
1669
 
967
1670
  double max_nmse_err() override {
968
1671
  return 5e-4;
969
1672
  }
970
1673
 
971
- size_t op_size(ggml_tensor * t) override {
972
- size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
973
- size_t b = ggml_nbytes(t->src[1]) * m;
974
- size_t c = ggml_nbytes(t);
975
- return a + b + c;
976
-
1674
+ uint64_t op_flops(ggml_tensor * t) override {
977
1675
  GGML_UNUSED(t);
1676
+ return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
978
1677
  }
979
1678
 
980
1679
  test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
981
1680
  int64_t m = 32, int64_t n = 32, int64_t k = 32,
982
1681
  std::array<int64_t, 2> bs = {10, 10},
983
- std::array<int64_t, 2> nr = {2, 2})
984
- : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
1682
+ std::array<int64_t, 2> nr = {2, 2},
1683
+ std::array<int64_t, 4> per = {0, 1, 2, 3})
1684
+ : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
985
1685
 
986
1686
  ggml_tensor * build_graph(ggml_context * ctx) override {
987
1687
  // C^T = A * B^T: (k, m) * (k, n) => (m, n)
988
- ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
989
- ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
1688
+ ggml_tensor * a;
1689
+ ggml_tensor * b;
1690
+
1691
+ const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
1692
+ if (npermuted > 0) {
1693
+ GGML_ASSERT(npermuted == 2);
1694
+ GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
1695
+ GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
1696
+
1697
+ // Create tensors with the permuted dimensions, then permute them back to the dimensions given by m,n,k.
1698
+ const int64_t ne_a[4] = {k, m, bs[0], bs[1]};
1699
+ const int64_t ne_b[4] = {k, n, bs[0]*nr[0], bs[1]*nr[1]};
1700
+
1701
+ a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
1702
+ b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
1703
+ ggml_set_param(ctx, a);
1704
+ ggml_set_param(ctx, b);
1705
+ ggml_set_name(a, "a");
1706
+ ggml_set_name(b, "b");
1707
+
1708
+ a = ggml_permute(ctx, a, per[0], per[1], per[2], per[3]);
1709
+ b = ggml_permute(ctx, b, per[0], per[1], per[2], per[3]);
1710
+ ggml_set_name(a, "a_permuted");
1711
+ ggml_set_name(b, "b_permuted");
1712
+ } else {
1713
+ a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
1714
+ b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
1715
+ ggml_set_param(ctx, a);
1716
+ ggml_set_param(ctx, b);
1717
+ ggml_set_name(a, "a");
1718
+ ggml_set_name(b, "b");
1719
+ }
1720
+
990
1721
  ggml_tensor * out = ggml_mul_mat(ctx, a, b);
1722
+ ggml_set_name(out, "out");
1723
+
991
1724
  return out;
992
1725
  }
993
1726
  };
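
The op_flops override above replaces the old op_size heuristic, so performance mode can report throughput from an analytic FLOP count: 2*m*n*k per matrix pair (one multiply plus one add per inner-product term) times the effective batch. Restated standalone under the same convention (function name is local to this sketch):

    #include <cstdint>

    // FLOPs for C^T = A * B^T with A: (k x m), B: (k x n),
    // batched over bs[0]*nr[0] and bs[1]*nr[1].
    uint64_t mul_mat_flops(int64_t m, int64_t n, int64_t k,
                           int64_t bs0, int64_t bs1, int64_t nr0, int64_t nr1) {
        return 2ull * m * n * k * (bs0 * nr0) * (bs1 * nr1);
    }

    // Example: the default case (m = n = k = 32, bs = {10, 10}, nr = {2, 2})
    // gives 2 * 32^3 * 20 * 20 = 26,214,400 FLOPs per evaluation.
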
@@ -1011,13 +1744,9 @@ struct test_mul_mat_id : public test_case {
1011
1744
  return 5e-4;
1012
1745
  }
1013
1746
 
1014
- size_t op_size(ggml_tensor * t) override {
1015
- size_t a = ggml_nbytes(t->src[2]) * n;
1016
- size_t b = ggml_nbytes(t->src[1]) * m;
1017
- size_t c = ggml_nbytes(t);
1018
- return a + b + c;
1019
-
1747
+ uint64_t op_flops(ggml_tensor * t) override {
1020
1748
  GGML_UNUSED(t);
1749
+ return 2 * m * k * n * n_used;
1021
1750
  }
1022
1751
 
1023
1752
  test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
@@ -1031,12 +1760,21 @@ struct test_mul_mat_id : public test_case {
1031
1760
  ggml_tensor * build_graph(ggml_context * ctx) override {
1032
1761
  // C^T = A * B^T: (k, m) * (k, n) => (m, n)
1033
1762
  ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
1763
+ ggml_set_name(as, "as");
1764
+
1034
1765
  ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
1766
+ ggml_set_name(ids, "ids");
1035
1767
  if (n_used != n_mats) {
1036
1768
  ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
1769
+ ggml_set_name(ids, "view_of_ids");
1037
1770
  }
1771
+
1038
1772
  ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
1773
+ ggml_set_name(b, "b");
1774
+
1039
1775
  ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
1776
+ ggml_set_name(out, "out");
1777
+
1040
1778
  return out;
1041
1779
  }
1042
1780
 
@@ -1062,6 +1800,50 @@ struct test_mul_mat_id : public test_case {
1062
1800
  }
1063
1801
  };
1064
1802
 
1803
+ // GGML_OP_OUT_PROD
1804
+ struct test_out_prod : public test_case {
1805
+ const ggml_type type_a;
1806
+ const ggml_type type_b;
1807
+ const int64_t m;
1808
+ const int64_t n;
1809
+ const int64_t k;
1810
+ const std::array<int64_t, 2> bs; // dims 3 and 4
1811
+ const bool trans_b;
1812
+
1813
+ std::string vars() override {
1814
+ return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b);
1815
+ }
1816
+
1817
+ double max_nmse_err() override {
1818
+ return 5e-4;
1819
+ }
1820
+
1821
+ test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
1822
+ int64_t m = 32, int64_t n = 32, int64_t k = 32,
1823
+ std::array<int64_t, 2> bs = {10, 10},
1824
+ bool trans_b = false)
1825
+ : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {}
1826
+
1827
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1828
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]);
1829
+ ggml_set_name(a, "a");
1830
+
1831
+ ggml_tensor * b;
1832
+ if (trans_b) {
1833
+ b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]);
1834
+ b = ggml_transpose(ctx, b);
1835
+ } else {
1836
+ b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]);
1837
+ }
1838
+ ggml_set_name(b, "b");
1839
+
1840
+ ggml_tensor * out = ggml_out_prod(ctx, a, b);
1841
+ ggml_set_name(out, "out");
1842
+
1843
+ return out;
1844
+ }
1845
+ };
1846
+
1065
1847
  // GGML_OP_SQR
1066
1848
  struct test_sqr : public test_case {
1067
1849
  const ggml_type type;
@@ -1072,14 +1854,23 @@ struct test_sqr : public test_case {
1072
1854
  }
1073
1855
 
1074
1856
  test_sqr(ggml_type type = GGML_TYPE_F32,
1075
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
1857
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
1076
1858
  : type(type), ne(ne) {}
1077
1859
 
1078
1860
  ggml_tensor * build_graph(ggml_context * ctx) override {
1079
1861
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1862
+ ggml_set_param(ctx, a);
1863
+ ggml_set_name(a, "a");
1864
+
1080
1865
  ggml_tensor * out = ggml_sqr(ctx, a);
1866
+ ggml_set_name(out, "out");
1867
+
1081
1868
  return out;
1082
1869
  }
1870
+
1871
+ float grad_eps() override {
1872
+ return 0.1f * 0.25f*ne[0]*ne[1]*ne[2]*ne[3]; // 10% of expected value of sum.
1873
+ }
1083
1874
  };
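
As the inline comment says, grad_eps here is scaled to roughly 10% of the expected magnitude of the summed output: with the default ne = {10, 5, 4, 3} the tensor holds 600 elements, so grad_eps = 0.1 * 0.25 * 600 = 15, where 0.25 stands in for the per-element expectation of x^2 under the test's initialization range.
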
1084
1875
 
1085
1876
  // GGML_OP_SQRT
@@ -1092,21 +1883,156 @@ struct test_sqrt : public test_case {
1092
1883
  }
1093
1884
 
1094
1885
  test_sqrt(ggml_type type = GGML_TYPE_F32,
1095
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
1886
+ std::array<int64_t, 4> ne = {10, 3, 3, 2})
1096
1887
  : type(type), ne(ne) {}
1097
1888
 
1098
1889
  ggml_tensor * build_graph(ggml_context * ctx) override {
1099
1890
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1891
+ ggml_set_param(ctx, a);
1892
+ ggml_set_name(a, "a");
1893
+
1100
1894
  ggml_tensor * out = ggml_sqrt(ctx, a);
1895
+ ggml_set_name(out, "out");
1896
+
1101
1897
  return out;
1102
1898
  }
1103
1899
 
1104
1900
  void initialize_tensors(ggml_context * ctx) override {
1105
1901
  // fill with positive values
1106
1902
  for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1107
- init_tensor_uniform(t, 0.0f, 100.0f);
1903
+ init_tensor_uniform(t, 50.0f, 100.0f);
1108
1904
  }
1109
1905
  }
1906
+
1907
+ float grad_eps() override {
1908
+ return 20.0f;
1909
+ }
1910
+
1911
+ bool grad_precise() override {
1912
+ return true;
1913
+ }
1914
+ };
1915
+
1916
+ // GGML_OP_LOG
1917
+ struct test_log : public test_case {
1918
+ const ggml_type type;
1919
+ const std::array<int64_t, 4> ne;
1920
+
1921
+ std::string vars() override {
1922
+ return VARS_TO_STR2(type, ne);
1923
+ }
1924
+
1925
+ test_log(ggml_type type = GGML_TYPE_F32,
1926
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
1927
+ : type(type), ne(ne) {}
1928
+
1929
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1930
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1931
+ ggml_set_param(ctx, a);
1932
+ ggml_set_name(a, "a");
1933
+
1934
+ ggml_tensor * out = ggml_log(ctx, a);
1935
+ ggml_set_name(out, "out");
1936
+
1937
+ return out;
1938
+ }
1939
+
1940
+ void initialize_tensors(ggml_context * ctx) override {
1941
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1942
+ // log(1) == 0, cluster values there to keep the sum low for better precision in the backward pass:
1943
+ init_tensor_uniform(t, 0.9f, 1.1f);
1944
+ }
1945
+ }
1946
+
1947
+ bool grad_precise() override {
1948
+ return true;
1949
+ }
1950
+ };
1951
+
1952
+ // GGML_OP_SIN
1953
+ struct test_sin : public test_case {
1954
+ const ggml_type type;
1955
+ const std::array<int64_t, 4> ne;
1956
+
1957
+ std::string vars() override {
1958
+ return VARS_TO_STR2(type, ne);
1959
+ }
1960
+
1961
+ test_sin(ggml_type type = GGML_TYPE_F32,
1962
+ std::array<int64_t, 4> ne = {10, 2, 2, 2})
1963
+ : type(type), ne(ne) {}
1964
+
1965
+ ggml_tensor * build_graph(ggml_context * ctx) override {
1966
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1967
+ ggml_set_param(ctx, a);
1968
+ ggml_set_name(a, "a");
1969
+
1970
+ ggml_tensor * out = ggml_sin(ctx, a);
1971
+ ggml_set_name(out, "out");
1972
+
1973
+ return out;
1974
+ }
1975
+
1976
+ void initialize_tensors(ggml_context * ctx) override {
1977
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1978
+ init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
1979
+ }
1980
+ }
1981
+
1982
+ double max_maa_err() override {
1983
+ return 1e-3;
1984
+ }
1985
+
1986
+ float grad_eps() override {
1987
+ return 0.2f;
1988
+ }
1989
+
1990
+ bool grad_precise() override {
1991
+ return true;
1992
+ }
1993
+ };
1994
+
1995
+ // GGML_OP_COS
1996
+ struct test_cos : public test_case {
1997
+ const ggml_type type;
1998
+ const std::array<int64_t, 4> ne;
1999
+
2000
+ std::string vars() override {
2001
+ return VARS_TO_STR2(type, ne);
2002
+ }
2003
+
2004
+ test_cos(ggml_type type = GGML_TYPE_F32,
2005
+ std::array<int64_t, 4> ne = {10, 2, 2, 2})
2006
+ : type(type), ne(ne) {}
2007
+
2008
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2009
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2010
+ ggml_set_param(ctx, a);
2011
+ ggml_set_name(a, "a");
2012
+
2013
+ ggml_tensor * out = ggml_cos(ctx, a);
2014
+ ggml_set_name(out, "out");
2015
+
2016
+ return out;
2017
+ }
2018
+
2019
+ void initialize_tensors(ggml_context * ctx) override {
2020
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
2021
+ init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi].
2022
+ }
2023
+ }
2024
+
2025
+ double max_maa_err() override {
2026
+ return 1e-3;
2027
+ }
2028
+
2029
+ float grad_eps() override {
2030
+ return 0.2f;
2031
+ }
2032
+
2033
+ bool grad_precise() override {
2034
+ return true;
2035
+ }
1110
2036
  };
1111
2037
 
1112
2038
  // GGML_OP_CLAMP
@@ -1121,15 +2047,27 @@ struct test_clamp : public test_case {
1121
2047
  }
1122
2048
 
1123
2049
  test_clamp(ggml_type type = GGML_TYPE_F32,
1124
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
2050
+ std::array<int64_t, 4> ne = {10, 5, 4, 3},
1125
2051
  float min = -0.5f, float max = 0.5f)
1126
2052
  : type(type), ne(ne), min(min), max(max) {}
1127
2053
 
1128
2054
  ggml_tensor * build_graph(ggml_context * ctx) override {
1129
2055
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2056
+ ggml_set_name(a, "a");
2057
+
1130
2058
  ggml_tensor * out = ggml_clamp(ctx, a, min, max);
2059
+ ggml_set_name(out, "out");
2060
+
1131
2061
  return out;
1132
2062
  }
2063
+
2064
+ float grad_eps() override {
2065
+ return 1e-2f;
2066
+ }
2067
+
2068
+ std::vector<float> grad_expect() override {
2069
+ return {0.0f, 1.0f};
2070
+ }
1133
2071
  };
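
grad_expect above encodes that clamp is piecewise linear: every entry of its gradient is exactly 0 (input clipped) or 1 (input passed through). The subgradient the check relies on can be stated in one line (a sketch of the rule, not the ggml backward kernel; the boundary convention is an assumption):

    // Derivative of clamp(x, lo, hi) with respect to x:
    inline float clamp_grad(float x, float lo, float hi) {
        return (x >= lo && x <= hi) ? 1.0f : 0.0f;
    }
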
1134
2072
 
1135
2073
  // GGML_OP_DIAG_MASK_INF
@@ -1143,13 +2081,18 @@ struct test_diag_mask_inf : public test_case {
1143
2081
  }
1144
2082
 
1145
2083
  test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
1146
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
2084
+ std::array<int64_t, 4> ne = {10, 10, 3, 2},
1147
2085
  int n_past = 5)
1148
2086
  : type(type), ne(ne), n_past(n_past) {}
1149
2087
 
1150
2088
  ggml_tensor * build_graph(ggml_context * ctx) override {
1151
2089
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2090
+ ggml_set_param(ctx, a);
2091
+ ggml_set_name(a, "a");
2092
+
1152
2093
  ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
2094
+ ggml_set_name(out, "out");
2095
+
1153
2096
  return out;
1154
2097
  }
1155
2098
  };
@@ -1173,7 +2116,7 @@ struct test_soft_max : public test_case {
1173
2116
  }
1174
2117
 
1175
2118
  test_soft_max(ggml_type type = GGML_TYPE_F32,
1176
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
2119
+ std::array<int64_t, 4> ne = {10, 5, 4, 3},
1177
2120
  bool mask = false,
1178
2121
  float scale = 1.0f,
1179
2122
  float max_bias = 0.0f)
@@ -1181,13 +2124,24 @@ struct test_soft_max : public test_case {
1181
2124
 
1182
2125
  ggml_tensor * build_graph(ggml_context * ctx) override {
1183
2126
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2127
+ ggml_set_param(ctx, a);
2128
+ ggml_set_name(a, "a");
2129
+
1184
2130
  ggml_tensor * mask = nullptr;
1185
2131
  if (this->mask) {
1186
2132
  mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
2133
+ ggml_set_name(mask, "mask");
1187
2134
  }
2135
+
1188
2136
  ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
2137
+ ggml_set_name(out, "out");
2138
+
1189
2139
  return out;
1190
2140
  }
2141
+
2142
+ bool grad_precise() override {
2143
+ return true;
2144
+ }
1191
2145
  };
1192
2146
 
1193
2147
 
@@ -1209,7 +2163,7 @@ struct test_rope : public test_case {
1209
2163
  }
1210
2164
 
1211
2165
  test_rope(ggml_type type = GGML_TYPE_F32,
1212
- std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
2166
+ std::array<int64_t, 4> ne_a = {10, 5, 3, 1},
1213
2167
  int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
1214
2168
  : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
1215
2169
 
@@ -1218,13 +2172,29 @@ struct test_rope : public test_case {
1218
2172
  if (v & 1) {
1219
2173
  auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
1220
2174
  a = ggml_new_tensor(ctx, type, 4, ne.data());
2175
+ ggml_set_param(ctx, a);
2176
+ ggml_set_name(a, "a");
2177
+
1221
2178
  a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
2179
+ ggml_set_name(a, "view_of_a");
1222
2180
  } else {
1223
2181
  a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2182
+ ggml_set_param(ctx, a);
2183
+ ggml_set_name(a, "a");
1224
2184
  }
2185
+
1225
2186
  ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
1226
- ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
2187
+ ggml_set_name(pos, "pos");
2188
+
2189
+ ggml_tensor * freq = nullptr;
2190
+ if (ff) {
2191
+ freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2);
2192
+ ggml_set_name(freq, "freq");
2193
+ }
2194
+
1227
2195
  ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
2196
+ ggml_set_name(out, "out");
2197
+
1228
2198
  return out;
1229
2199
  }
1230
2200
 
@@ -1247,6 +2217,14 @@ struct test_rope : public test_case {
1247
2217
  }
1248
2218
  }
1249
2219
  }
2220
+
2221
+ double max_maa_err() override {
2222
+ return 1e-3;
2223
+ }
2224
+
2225
+ bool grad_precise() override {
2226
+ return true;
2227
+ }
1250
2228
  };
1251
2229
 
1252
2230
  // GGML_OP_POOL2D
@@ -1278,7 +2256,12 @@ struct test_pool2d : public test_case {
1278
2256
 
1279
2257
  ggml_tensor * build_graph(ggml_context * ctx) override {
1280
2258
  ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
2259
+ ggml_set_param(ctx, input);
2260
+ ggml_set_name(input, "input");
2261
+
1281
2262
  ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
2263
+ ggml_set_name(out, "out");
2264
+
1282
2265
  return out;
1283
2266
  }
1284
2267
  };
@@ -1303,8 +2286,14 @@ struct test_conv_transpose_1d : public test_case {
1303
2286
 
1304
2287
  ggml_tensor * build_graph(ggml_context * ctx) override {
1305
2288
  ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
2289
+ ggml_set_name(input, "input");
2290
+
1306
2291
  ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
2292
+ ggml_set_name(kernel, "kernel");
2293
+
1307
2294
  ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
2295
+ ggml_set_name(out, "out");
2296
+
1308
2297
  return out;
1309
2298
  }
1310
2299
  };
@@ -1343,8 +2332,15 @@ struct test_im2col : public test_case {
1343
2332
 
1344
2333
  ggml_tensor * build_graph(ggml_context * ctx) override {
1345
2334
  ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
2335
+ ggml_set_param(ctx, input);
2336
+ ggml_set_name(input, "input");
2337
+
1346
2338
  ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
2339
+ ggml_set_name(kernel, "kernel");
2340
+
1347
2341
  ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
2342
+ ggml_set_name(out, "out");
2343
+
1348
2344
  return out;
1349
2345
  }
1350
2346
  };
@@ -1362,8 +2358,8 @@ struct test_concat : public test_case {
1362
2358
  }
1363
2359
 
1364
2360
  test_concat(ggml_type type = GGML_TYPE_F32,
1365
- std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
1366
- int64_t ne_b_d = 10,
2361
+ std::array<int64_t, 4> ne_a = {10, 5, 5, 5},
2362
+ int64_t ne_b_d = 5,
1367
2363
  int dim = 2, int v = 0)
1368
2364
  : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
1369
2365
 
@@ -1374,19 +2370,30 @@ struct test_concat : public test_case {
1374
2370
  if (v & 1) {
1375
2371
  auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
1376
2372
  a = ggml_new_tensor(ctx, type, 4, ne.data());
2373
+ ggml_set_name(a, "a");
2374
+
1377
2375
  a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
2376
+ ggml_set_name(a, "view_of_a");
1378
2377
  } else {
1379
2378
  a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2379
+ ggml_set_name(a, "a");
1380
2380
  }
1381
2381
  ggml_tensor * b;
1382
2382
  if (v & 2) {
1383
2383
  auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
1384
2384
  b = ggml_new_tensor(ctx, type, 4, ne.data());
2385
+ ggml_set_name(b, "b");
2386
+
1385
2387
  b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
2388
+ ggml_set_name(b, "view_of_b");
1386
2389
  } else {
1387
2390
  b = ggml_new_tensor(ctx, type, 4, ne_b.data());
2391
+ ggml_set_name(b, "b");
1388
2392
  }
2393
+
1389
2394
  ggml_tensor * out = ggml_concat(ctx, a, b, dim);
2395
+ ggml_set_name(out, "out");
2396
+
1390
2397
  return out;
1391
2398
  }
1392
2399
  };
@@ -1408,7 +2415,11 @@ struct test_argsort : public test_case {
1408
2415
 
1409
2416
  ggml_tensor * build_graph(ggml_context * ctx) override {
1410
2417
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2418
+ ggml_set_name(a, "a");
2419
+
1411
2420
  ggml_tensor * out = ggml_argsort(ctx, a, order);
2421
+ ggml_set_name(out, "out");
2422
+
1412
2423
  return out;
1413
2424
  }
1414
2425
 
@@ -1441,6 +2452,35 @@ struct test_argsort : public test_case {
1441
2452
  }
1442
2453
  };
1443
2454
 
2455
+ // GGML_OP_SUM
2456
+ struct test_sum : public test_case {
2457
+ const ggml_type type;
2458
+ const std::array<int64_t, 4> ne;
2459
+
2460
+ std::string vars() override {
2461
+ return VARS_TO_STR2(type, ne);
2462
+ }
2463
+
2464
+ test_sum(ggml_type type = GGML_TYPE_F32,
2465
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
2466
+ : type(type), ne(ne) {}
2467
+
2468
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2469
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2470
+ ggml_set_param(ctx, a);
2471
+ ggml_set_name(a, "a");
2472
+
2473
+ ggml_tensor * out = ggml_sum(ctx, a);
2474
+ ggml_set_name(out, "out");
2475
+
2476
+ return out;
2477
+ }
2478
+
2479
+ float grad_eps() override {
2480
+ return 0.1f * sqrtf(ne[0]*ne[1]*ne[2]*ne[3]);
2481
+ }
2482
+ };
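
The sqrt scaling above reflects that SUM reduces every element into a single scalar: the standard deviation of a sum of N independent, roughly unit-scale inputs grows like sqrt(N), so the finite-difference step is scaled the same way. With the default ne = {10, 5, 4, 3}, N = 600 and grad_eps = 0.1 * sqrt(600) ≈ 2.45.
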
2483
+
1444
2484
  // GGML_OP_SUM_ROWS
1445
2485
  struct test_sum_rows : public test_case {
1446
2486
  const ggml_type type;
@@ -1451,16 +2491,50 @@ struct test_sum_rows : public test_case {
1451
2491
  }
1452
2492
 
1453
2493
  test_sum_rows(ggml_type type = GGML_TYPE_F32,
1454
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
2494
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
1455
2495
  : type(type), ne(ne) {}
1456
2496
 
1457
2497
  ggml_tensor * build_graph(ggml_context * ctx) override {
1458
2498
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2499
+ ggml_set_param(ctx, a);
2500
+ ggml_set_name(a, "a");
2501
+
1459
2502
  ggml_tensor * out = ggml_sum_rows(ctx, a);
2503
+ ggml_set_name(out, "out");
2504
+
1460
2505
  return out;
1461
2506
  }
1462
2507
  };
1463
2508
 
2509
+ // GGML_OP_MEAN
2510
+ struct test_mean : public test_case {
2511
+ const ggml_type type;
2512
+ const std::array<int64_t, 4> ne;
2513
+
2514
+ std::string vars() override {
2515
+ return VARS_TO_STR2(type, ne);
2516
+ }
2517
+
2518
+ test_mean(ggml_type type = GGML_TYPE_F32,
2519
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
2520
+ : type(type), ne(ne) {}
2521
+
2522
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2523
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2524
+ ggml_set_param(ctx, a);
2525
+ ggml_set_name(a, "a");
2526
+
2527
+ ggml_tensor * out = ggml_mean(ctx, a);
2528
+ ggml_set_name(out, "out");
2529
+
2530
+ return out;
2531
+ }
2532
+
2533
+ float grad_eps() override {
2534
+ return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
2535
+ }
2536
+ };
2537
+
1464
2538
  // GGML_OP_UPSCALE
1465
2539
  struct test_upscale : public test_case {
1466
2540
  const ggml_type type;
@@ -1479,8 +2553,16 @@ struct test_upscale : public test_case {
1479
2553
 
1480
2554
  ggml_tensor * build_graph(ggml_context * ctx) override {
1481
2555
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1482
- if (transpose) a = ggml_transpose(ctx, a);
2556
+ ggml_set_name(a, "a");
2557
+
2558
+ if (transpose) {
2559
+ a = ggml_transpose(ctx, a);
2560
+ ggml_set_name(a, "a_transposed");
2561
+ }
2562
+
1483
2563
  ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
2564
+ ggml_set_name(out, "out");
2565
+
1484
2566
  return out;
1485
2567
  }
1486
2568
  };
@@ -1502,7 +2584,11 @@ struct test_upscale_ext : public test_case {
1502
2584
 
1503
2585
  ggml_tensor * build_graph(ggml_context * ctx) override {
1504
2586
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
2587
+ ggml_set_name(a, "a");
2588
+
1505
2589
  ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
2590
+ ggml_set_name(out, "out");
2591
+
1506
2592
  return out;
1507
2593
  }
1508
2594
  };
@@ -1512,6 +2598,7 @@ struct test_group_norm : public test_case {
1512
2598
  const ggml_type type;
1513
2599
  const std::array<int64_t, 4> ne;
1514
2600
  const int32_t num_groups;
2601
+ const float eps;
1515
2602
 
1516
2603
  std::string vars() override {
1517
2604
  return VARS_TO_STR3(type, ne, num_groups);
@@ -1519,12 +2606,17 @@ struct test_group_norm : public test_case {
1519
2606
 
1520
2607
  test_group_norm(ggml_type type = GGML_TYPE_F32,
1521
2608
  std::array<int64_t, 4> ne = {64, 64, 320, 1},
1522
- int32_t num_groups = 32)
1523
- : type(type), ne(ne), num_groups(num_groups) {}
2609
+ int32_t num_groups = 32,
2610
+ float eps = 1e-6f)
2611
+ : type(type), ne(ne), num_groups(num_groups), eps(eps) {}
1524
2612
 
1525
2613
  ggml_tensor * build_graph(ggml_context * ctx) override {
1526
2614
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1527
- ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
2615
+ ggml_set_name(a, "a");
2616
+
2617
+ ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
2618
+ ggml_set_name(out, "out");
2619
+
1528
2620
  return out;
1529
2621
  }
1530
2622
  };
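
This hunk tracks an upstream API change: ggml_group_norm now takes the epsilon explicitly instead of hard-coding it. A migration sketch for callers (the 1e-6f value mirrors the default this test now passes; whether it matches the previously hard-coded value is an assumption):

    #include "ggml.h"

    static ggml_tensor * build_group_norm(ggml_context * ctx, ggml_tensor * a, int num_groups) {
        // old call (no longer compiles): ggml_group_norm(ctx, a, num_groups);
        return ggml_group_norm(ctx, a, num_groups, 1e-6f);
    }
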
@@ -1540,14 +2632,22 @@ struct test_acc : public test_case {
1540
2632
  }
1541
2633
 
1542
2634
  test_acc(ggml_type type = GGML_TYPE_F32,
1543
- std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
1544
- std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
2635
+ std::array<int64_t, 4> ne_a = {256, 17, 1, 1},
2636
+ std::array<int64_t, 4> ne_b = {256, 16, 1, 1})
1545
2637
  : type(type), ne_a(ne_a), ne_b(ne_b) {}
1546
2638
 
1547
2639
  ggml_tensor * build_graph(ggml_context * ctx) override {
1548
2640
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2641
+ ggml_set_param(ctx, a);
2642
+ ggml_set_name(a, "a");
2643
+
1549
2644
  ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
2645
+ ggml_set_param(ctx, b);
2646
+ ggml_set_name(b, "b");
2647
+
1550
2648
  ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
2649
+ ggml_set_name(out, "out");
2650
+
1551
2651
  return out;
1552
2652
  }
1553
2653
  };
@@ -1570,7 +2670,11 @@ struct test_pad : public test_case {
1570
2670
 
1571
2671
  ggml_tensor * build_graph(ggml_context * ctx) override {
1572
2672
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2673
+ ggml_set_name(a, "a");
2674
+
1573
2675
  ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
2676
+ ggml_set_name(out, "out");
2677
+
1574
2678
  return out;
1575
2679
  }
1576
2680
  };
@@ -1592,6 +2696,8 @@ struct test_arange : public test_case {
1592
2696
 
1593
2697
  ggml_tensor * build_graph(ggml_context * ctx) override {
1594
2698
  ggml_tensor * out = ggml_arange(ctx, start, stop, step);
2699
+ ggml_set_name(out, "out");
2700
+
1595
2701
  return out;
1596
2702
  }
1597
2703
  };
@@ -1614,7 +2720,11 @@ struct test_timestep_embedding : public test_case {
1614
2720
 
1615
2721
  ggml_tensor * build_graph(ggml_context * ctx) override {
1616
2722
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2723
+ ggml_set_name(a, "a");
2724
+
1617
2725
  ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
2726
+ ggml_set_name(out, "out");
2727
+
1618
2728
  return out;
1619
2729
  }
1620
2730
  };
@@ -1630,13 +2740,17 @@ struct test_leaky_relu : public test_case {
1630
2740
  }
1631
2741
 
1632
2742
  test_leaky_relu(ggml_type type = GGML_TYPE_F32,
1633
- std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
2743
+ std::array<int64_t, 4> ne_a = {10, 5, 4, 3},
1634
2744
  float negative_slope = 0.1f)
1635
2745
  : type(type), ne_a(ne_a), negative_slope(negative_slope) {}
1636
2746
 
1637
2747
  ggml_tensor * build_graph(ggml_context * ctx) override {
1638
2748
  ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
2749
+ ggml_set_name(a, "a");
2750
+
1639
2751
  ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
2752
+ ggml_set_name(out, "out");
2753
+
1640
2754
  return out;
1641
2755
  }
1642
2756
  };
@@ -1651,30 +2765,151 @@ struct test_flash_attn_ext : public test_case {
1651
2765
  const bool mask; // use mask
1652
2766
 
1653
2767
  const float max_bias; // ALiBi
2768
+ const float logit_softcap; // Gemma 2
1654
2769
 
1655
2770
  const ggml_type type_KV;
1656
2771
 
1657
2772
  std::string vars() override {
1658
- return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV);
2773
+ return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
1659
2774
  }
1660
2775
 
1661
2776
  double max_nmse_err() override {
1662
2777
  return 5e-4;
1663
2778
  }
1664
2779
 
1665
- test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
1666
- : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {}
2780
+ uint64_t op_flops(ggml_tensor * t) override {
2781
+ GGML_UNUSED(t);
2782
+ // Just counting matmul costs:
2783
+ // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
2784
+ return 2 * 2 * nh * nb * hs * kv;
2785
+ }
2786
+
2787
+ test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
2788
+ bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
2789
+ : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
1667
2790
 
1668
2791
  ggml_tensor * build_graph(ggml_context * ctx) override {
1669
2792
  const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
1670
2793
 
1671
2794
  ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
2795
+ ggml_set_name(q, "q");
2796
+
1672
2797
  ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
2798
+ ggml_set_name(k, "k");
2799
+
1673
2800
  ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
1674
- ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
1675
- ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
2801
+ ggml_set_name(v, "v");
2802
+
2803
+ ggml_tensor * m = nullptr;
2804
+ if (mask) {
2805
+ m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
2806
+ ggml_set_name(m, "m");
2807
+ }
2808
+
2809
+ ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
2810
+ ggml_set_name(out, "out");
2811
+
2812
+ return out;
2813
+ }
2814
+
2815
+ bool grad_precise() override {
2816
+ return true;
2817
+ }
2818
+ };
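
The new op_flops override counts only the two matmuls inside attention, Q*K^T and P*V, each costing 2*nb*hs*kv FLOPs per head; softmax, masking and ALiBi are ignored. Standalone restatement (function name is local to this sketch):

    #include <cstdint>

    // Matmul-only FLOPs for one flash-attention evaluation.
    uint64_t flash_attn_flops(int64_t hs, int64_t nh, int64_t kv, int64_t nb) {
        return 2ull * 2ull * nh * nb * hs * kv;
    }

    // Example: the default case (hs = 128, nh = 32, kv = 96, nb = 8)
    // -> 4 * 32 * 8 * 128 * 96 = 12,582,912 FLOPs.
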
2819
+
2820
+ // GGML_OP_CROSS_ENTROPY_LOSS
2821
+ struct test_cross_entropy_loss : public test_case {
2822
+ const ggml_type type;
2823
+ const std::array<int64_t, 4> ne;
2824
+
2825
+ std::string vars() override {
2826
+ return VARS_TO_STR2(type, ne);
2827
+ }
2828
+
2829
+ test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32,
2830
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
2831
+ : type(type), ne(ne) {}
2832
+
2833
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2834
+ ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
2835
+ ggml_set_param(ctx, logits);
2836
+ ggml_set_name(logits, "logits");
2837
+
2838
+ ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
2839
+ // The labels are assumed to be constant -> no gradients.
2840
+ ggml_set_name(labels, "labels");
2841
+
2842
+ // Ensure labels add up to 1:
2843
+ labels = ggml_soft_max(ctx, labels);
2844
+ ggml_set_name(labels, "labels_normalized");
2845
+
2846
+ ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels);
2847
+ ggml_set_name(out, "out");
2848
+
2849
+ return out;
2850
+ }
2851
+
2852
+ void initialize_tensors(ggml_context * ctx) override {
2853
+ // For larger absolute differences between the logits the softmax is more linear, which makes the numerical gradients more precise.
2854
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
2855
+ init_tensor_uniform(t, -100.0f, 100.0f);
2856
+ }
2857
+ }
2858
+
2859
+ float grad_eps() override {
2860
+ return 1.0f;
2861
+ }
2862
+
2863
+ bool grad_precise() override {
2864
+ return true;
2865
+ }
2866
+ };
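
The test normalizes the labels with a softmax because cross entropy expects a probability distribution per row. A scalar reference of the quantity being checked, over one row (a standalone sketch, not the ggml kernel; passing raw logits is consistent with the op applying the softmax to them internally):

    #include <cmath>
    #include <cstddef>

    // -sum_i labels[i] * log(softmax(logits)[i]), computed via a
    // numerically stable log-softmax (subtract the max logit first).
    float cross_entropy_row(const float * logits, const float * labels, size_t n) {
        float max_logit = logits[0];
        for (size_t i = 1; i < n; ++i) {
            if (logits[i] > max_logit) max_logit = logits[i];
        }
        float sum_exp = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            sum_exp += std::exp(logits[i] - max_logit);
        }
        const float log_sum = std::log(sum_exp) + max_logit;
        float loss = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            loss -= labels[i] * (logits[i] - log_sum);
        }
        return loss;
    }
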
2867
+
2868
+ // GGML_OP_OPT_STEP_ADAMW
2869
+ struct test_opt_step_adamw : public test_case {
2870
+ const ggml_type type;
2871
+ const std::array<int64_t, 4> ne;
2872
+
2873
+ std::string vars() override {
2874
+ return VARS_TO_STR2(type, ne);
2875
+ }
2876
+
2877
+ test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
2878
+ std::array<int64_t, 4> ne = {10, 5, 4, 3})
2879
+ : type(type), ne(ne) {}
2880
+
2881
+ ggml_tensor * build_graph(ggml_context * ctx) override {
2882
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2883
+ ggml_set_param(ctx, a); // Despite tensor a having gradients, the output tensor will not.
2884
+ ggml_set_name(a, "a");
2885
+
2886
+ ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2887
+ ggml_set_name(grad, "grad");
2888
+
2889
+ ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2890
+ ggml_set_name(grad_m, "grad_m");
2891
+
2892
+ ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
2893
+ ggml_set_name(grad_v, "grad_v");
2894
+
2895
+ ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7);
2896
+ ggml_set_name(adamw_params, "adamw_params");
2897
+
2898
+ ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params);
2899
+ ggml_set_name(out, "out");
2900
+
1676
2901
  return out;
1677
2902
  }
2903
+
2904
+ void initialize_tensors(ggml_context * ctx) override {
2905
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
2906
+ init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values.
2907
+ }
2908
+ }
2909
+
2910
+ bool grad_precise() override {
2911
+ return true;
2912
+ }
1678
2913
  };
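
The adamw_params tensor above packs the optimizer's scalar hyperparameters (learning rate, beta1, beta2, eps, weight decay, plus precomputed bias-correction terms; the exact 7-value layout is defined by the op, not by this test). For reference, a textbook per-element AdamW update, which is the rule this op exercises (a sketch, not the ggml kernel):

    #include <cmath>

    // One AdamW step for parameter x with gradient g at timestep t.
    // m and v are the running first and second moment accumulators.
    void adamw_step(float & x, float g, float & m, float & v, int t,
                    float alpha, float beta1, float beta2, float eps, float wd) {
        m = beta1 * m + (1.0f - beta1) * g;
        v = beta2 * v + (1.0f - beta2) * g * g;
        const float mh = m / (1.0f - std::pow(beta1, (float) t)); // bias correction
        const float vh = v / (1.0f - std::pow(beta2, (float) t));
        x -= alpha * (mh / (std::sqrt(vh) + eps) + wd * x);
    }
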
1679
2914
 
1680
2915
  enum llm_norm_type {
@@ -2061,48 +3296,55 @@ struct test_falcon : public test_llm {
2061
3296
  }
2062
3297
  };
2063
3298
 
2064
- static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
2065
- std::vector<std::unique_ptr<test_case>> test_cases;
2066
- std::default_random_engine rng(0);
2067
3299
 
2068
- const ggml_type all_types[] = {
2069
- GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
2070
- GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
2071
- GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
2072
- GGML_TYPE_Q8_0,
2073
- GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
2074
- GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
2075
- GGML_TYPE_Q6_K,
2076
- GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
2077
- GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
2078
- GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2079
- };
3300
+ // ###########################################
3301
+ // ## Section 3: GGML Op Test Instantiation ##
3302
+ // ###########################################
3303
+ static const ggml_type all_types[] = {
3304
+ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
3305
+ GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
3306
+ GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
3307
+ GGML_TYPE_Q8_0,
3308
+ GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
3309
+ GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
3310
+ GGML_TYPE_Q6_K,
3311
+ // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
3312
+ GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
3313
+ GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
3314
+ GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
3315
+ };
2080
3316
 
2081
- const ggml_type base_types[] = {
2082
- GGML_TYPE_F32, GGML_TYPE_F16,
2083
- GGML_TYPE_Q4_0,
2084
- GGML_TYPE_Q4_K,
2085
- GGML_TYPE_IQ2_XXS
2086
- };
3317
+ static const ggml_type base_types[] = {
3318
+ GGML_TYPE_F32, GGML_TYPE_F16,
3319
+ GGML_TYPE_Q4_0,
3320
+ GGML_TYPE_Q4_K,
3321
+ GGML_TYPE_IQ2_XXS
3322
+ };
2087
3323
 
2088
- const ggml_type other_types[] = {
2089
- GGML_TYPE_Q4_1,
2090
- GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
2091
- GGML_TYPE_Q8_0,
2092
- GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
2093
- GGML_TYPE_Q5_K,
2094
- GGML_TYPE_Q6_K,
2095
- GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
2096
- GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
2097
- GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2098
- GGML_TYPE_BF16,
2099
- };
3324
+ static const ggml_type other_types[] = {
3325
+ GGML_TYPE_Q4_1,
3326
+ GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
3327
+ GGML_TYPE_Q8_0,
3328
+ GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
3329
+ GGML_TYPE_Q5_K,
3330
+ GGML_TYPE_Q6_K,
3331
+ // GGML_TYPE_TQ1_0, GGML_TYPE_TQ2_0, // TODO: implement for all backends
3332
+ GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
3333
+ GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
3334
+ GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
3335
+ GGML_TYPE_BF16,
3336
+ };
3337
+
3338
+ // Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
3339
+ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
3340
+ std::vector<std::unique_ptr<test_case>> test_cases;
3341
+ std::default_random_engine rng(0);
2100
3342
 
2101
3343
  // unary ops
2102
3344
  for (int v : {0, 1}) {
2103
3345
  for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
2104
- test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v));
2105
- test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v));
3346
+ test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 2, 2, 2 }, v));
3347
+ test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 5, 7, 11, 13 }, v));
2106
3348
  }
2107
3349
  }
2108
3350
 
@@ -2138,8 +3380,56 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2138
3380
  }
2139
3381
  }
2140
3382
 
3383
+ // im2col 1D
3384
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3385
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3386
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
3387
+ for (int s0 : {1, 3}) {
3388
+ for (int p0 : {0, 3}) {
3389
+ for (int d0 : {1, 3}) {
3390
+ test_cases.emplace_back(new test_im2col(
3391
+ GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 2, 2, 1}, {3, 2, 2, 1},
3392
+ s0, 0, p0, 0, d0, 0, false));
3393
+ }
3394
+ }
3395
+ }
3396
+
3397
+ // im2col 2D
3398
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
2141
3399
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
2142
3400
  test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
3401
+ for (int s0 : {1, 3}) {
3402
+ for (int s1 : {1, 3}) {
3403
+ for (int p0 : {0, 3}) {
3404
+ for (int p1 : {0, 3}) {
3405
+ for (int d0 : {1, 3}) {
3406
+ for (int d1 : {1, 3}) {
3407
+ test_cases.emplace_back(new test_im2col(
3408
+ GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {20, 20, 2, 2}, {3, 3, 2, 2},
3409
+ s0, s1, p0, p1, d0, d1, true));
3410
+ }
3411
+ }
3412
+ }
3413
+ }
3414
+ }
3415
+ }
3416
+
3417
+ // extra tests for im2col 2D
3418
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 32}, {3, 3, 1, 32}, 1, 1, 1, 1, 1, 1, true));
3419
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 32}, {3, 3, 2, 32}, 1, 1, 1, 1, 1, 1, true));
3420
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 1024}, {3, 3, 1, 1024}, 1, 1, 1, 1, 1, 1, true));
3421
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 1024}, {3, 3, 2, 1024}, 1, 1, 1, 1, 1, 1, true));
3422
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2048}, {3, 3, 1, 2048}, 1, 1, 1, 1, 1, 1, true));
3423
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2048}, {3, 3, 2, 2048}, 1, 1, 1, 1, 1, 1, true));
3424
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
3425
+ test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
3426
+
3427
+ // The SYCL backend limits the task global_range to < MAX_INT.
3428
+ // Test cases for 2D im2col with a large input W and H (occurs in stable-diffusion).
3429
+ // However, these cases need to allocate more memory, which may fail on some devices (e.g. Intel Arc A770).
3430
+ // These cases are verified to pass on an Intel(R) Data Center GPU Max 1100 (SYCL backend) and an NV A30 (CUDA backend).
3431
+ // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
3432
+ // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
2143
3433
 
2144
3434
  test_cases.emplace_back(new test_conv_transpose_1d());
2145
3435
  test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
@@ -2150,14 +3440,18 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2150
3440
  test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
2151
3441
  test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
2152
3442
 
3443
+ test_cases.emplace_back(new test_argmax());
3444
+ test_cases.emplace_back(new test_count_equal());
2153
3445
 
2154
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
2155
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
2156
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
2157
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
2158
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
2159
- test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
2160
- test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
3446
+ for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1
3447
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
3448
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
3449
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));
3450
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1}));
3451
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2}));
3452
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
3453
+ test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
3454
+ }
2161
3455
 
2162
3456
  test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
2163
3457
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
@@ -2167,8 +3461,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2167
3461
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
2168
3462
  test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
2169
3463
  test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
2170
- test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
2171
- test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
3464
+ test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
3465
+ test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
3466
+
3467
+ for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
3468
+ test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
3469
+ }
2172
3470
 
2173
3471
  for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2174
3472
  for (ggml_type type_dst : all_types) {
@@ -2183,6 +3481,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2183
3481
  }
2184
3482
 
2185
3483
  test_cases.emplace_back(new test_cont());
3484
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
3485
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
3486
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
3487
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
3488
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
3489
+ test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
3490
+ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
3491
+ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
3492
+ test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
2186
3493
 
2187
3494
  auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
2188
3495
  for (auto op : {ggml_add, ggml_mul, ggml_div}) {
@@ -2193,16 +3500,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
2193
3500
  add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
2194
3501
  add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
2195
3502
  add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
2196
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
2197
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
2198
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
2199
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
2200
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
2201
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
2202
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
2203
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
2204
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
2205
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});
3503
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 1, 1}, {1, 1, 1, 1});
3504
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 1}, {1, 1, 1, 1});
3505
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1});
3506
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1});
3507
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1});
3508
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1});
3509
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2});
3510
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2});
3511
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2});
3512
+ add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 2, 2, 2});
2206
3513
 
2207
3514
  // stable diffusion
2208
3515
  add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
@@ -2221,23 +3528,36 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
  //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});

+ test_cases.emplace_back(new test_add1());
  test_cases.emplace_back(new test_scale());

  for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
- test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
- test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
+ test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
+ test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
  }

+ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
+ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
+ test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
+
+ test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 16, 1024, 32, 4));
+
+ test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 1, 1));
+ test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 1));
+ test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
+ test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
+
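
SSM_CONV and SSM_SCAN cover Mamba-style state-space layers, and RWKV_WKV6 the RWKV v6 time-mix kernel, none of which were tested here before. As a rough scalar reference for the recurrence an SSM scan implements (the names and the exact discretization are illustrative; ggml's tensor layout differs):

    // one (channel, state) pair: h_t = exp(dt_t*A)*h_{t-1} + dt_t*B_t*x_t,  y_t = C_t*h_t
    static void ssm_scan_ref(int n, const float * x, const float * dt,
                             float A, const float * B, const float * C, float * y) {
        float h = 0.0f; // recurrent state
        for (int t = 0; t < n; ++t) {
            h = expf(dt[t] * A) * h + dt[t] * B[t] * x[t]; // discretized state update
            y[t] = C[t] * h;                               // readout
        }
    }
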
  #if 1
  for (ggml_type type_a : base_types) {
  for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
+ // test cases without permutation
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));

  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
@@ -2246,6 +3566,27 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
  test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
+
+ // test cases with permutation
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+ }
+ }
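
The new permuted cases add a fourth argument giving an axis order, so MUL_MAT also gets exercised on non-contiguous operands. A minimal sketch of the idea (the harness constructs the permutation slightly differently; this only shows a mul_mat over permuted views, assuming ggml_permute's axis-order semantics):

    ggml_tensor * a  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2);
    ggml_tensor * b  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256,  8, 3, 2);
    ggml_tensor * ap = ggml_permute(ctx, a, 0, 1, 3, 2); // swap the two batch dims
    ggml_tensor * bp = ggml_permute(ctx, b, 0, 1, 3, 2); // both views are non-contiguous
    ggml_tensor * out = ggml_mul_mat(ctx, ap, bp);       // [256, 16] x [256, 8] per batch
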
+ for (ggml_type type_a : other_types) {
+ for (ggml_type type_b : {GGML_TYPE_F32}) {
+ if (ggml_blck_size(type_a) != 256) {
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
+ }
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
  }
  }
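
The guard exists because k must be a multiple of the quantization block size: ggml_blck_size() is 32 for formats like Q4_0/Q8_0 but 256 for the k-quants, so the extra single-block case (k = blck_size) is only worth adding where it would not merely duplicate the k = 256 case.
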
  #else
@@ -2267,12 +3608,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  }
  #endif

- for (ggml_type type_a : other_types) {
- for (ggml_type type_b : {GGML_TYPE_F32}) {
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
- }
- }
-
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
@@ -2280,6 +3615,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
  test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));

+ // the sycl backend limits the task global_range to < MAX_INT
+ // this is a test case for the f16-to-fp32 conversion kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
+ // however, it needs to allocate more memory, which may fail on some devices (Intel Arc770, etc.)
+ // it is verified to pass on Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
+ // test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
+
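
For scale: in that disabled case src1 alone is 262144 x 9216 F16 values ≈ 4.8 GB, and the F32 output is 512 x 262144 x 4 bytes ≈ 0.5 GB, which is why it stays commented out rather than risking allocation failures on smaller devices.
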
  for (ggml_type type_a : base_types) {
  for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
  for (int n_mats : {4, 8}) {
@@ -2301,7 +3642,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  for (int n_mats : {4}) {
  for (int n_used : {2}) {
  for (bool b : {false}) {
- for (int n : {1}) {
+ for (int n : {1, 32}) {
  int m = 512;
  int k = 256;
  test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
@@ -2312,13 +3653,37 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  }
  }

+ for (ggml_type type_a : base_types) {
+ for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1, 1}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10}));
+
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}, true));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10}));
+ }
+ }
+
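
OUT_PROD coverage is new; it is the building block for MUL_MAT's backward pass. Under my reading of ggml_out_prod, the rank-1 case reduces to a plain outer product; a hypothetical scalar reference (out_prod_ref is a made-up name, and the [p][m] output layout is an assumption):

    static void out_prod_ref(int m, int p, const float * a, const float * b, float * out) {
        for (int j = 0; j < p; ++j) {
            for (int i = 0; i < m; ++i) {
                out[j * m + i] = a[i] * b[j]; // out[i][j] = a_i * b_j
            }
        }
    }
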
  test_cases.emplace_back(new test_sqr());
  test_cases.emplace_back(new test_sqrt());
+ test_cases.emplace_back(new test_log());
+ test_cases.emplace_back(new test_sin());
+ test_cases.emplace_back(new test_cos());
  test_cases.emplace_back(new test_clamp());

- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5));
- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5));
+ test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5));

  #if 0
  std::uniform_int_distribution<> dist_ne1(1, 50);
@@ -2362,23 +3727,23 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  for (float af : { 1.0f, 1.4245f }) {
  for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
  for (bool ff : {false, true}) { // freq_factors
- test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
+ test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B

  if (all) {
- test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
- test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
- test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
+ test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
+ test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
+ test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
  }

  if (all) {
- test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
- test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
- test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
+ test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+ test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+ test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
+ test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
  }

- test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+ test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
  }
  }
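
The only change across the ROPE cases is the third shape entry, the number of positions, dropping from 10 to 2 to cut runtime without losing coverage. For reference, RoPE rotates pairs of dimensions of each head by a position-dependent angle; a scalar sketch for the classic pairing (NeoX-style pairing differs, and rope_ref/base are names made up here):

    static void rope_ref(float * x, int d, int pos, float base) {
        for (int i = 0; i < d / 2; ++i) {
            const float theta = pos * powf(base, -2.0f * i / d); // per-pair frequency
            const float c = cosf(theta), s = sinf(theta);
            const float x0 = x[2 * i], x1 = x[2 * i + 1];
            x[2 * i]     = x0 * c - x1 * s;
            x[2 * i + 1] = x0 * s + x1 * c;
        }
    }
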
 
@@ -2402,7 +3767,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
  }

+ test_cases.emplace_back(new test_sum());
  test_cases.emplace_back(new test_sum_rows());
+ test_cases.emplace_back(new test_mean());
  test_cases.emplace_back(new test_upscale());
  test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
  test_cases.emplace_back(new test_upscale_ext());
@@ -2417,11 +3784,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  for (bool mask : { true, false } ) {
  for (float max_bias : { 0.0f, 8.0f }) {
  if (!mask && max_bias > 0.0f) continue;
- for (int nh : { 32, }) {
- for (int kv : { 512, 1024, }) {
- for (int nb : { 1, 2, 4, 8, }) {
- for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
- test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV));
+ for (float logit_softcap : {0.0f, 10.0f}) {
+ if (hs != 128 && logit_softcap != 0.0f) continue;
+ for (int nh : { 32, }) {
+ for (int kv : { 512, 1024, }) {
+ for (int nb : { 1, 3, 32, 35, }) {
+ for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
+ test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
+ }
  }
  }
  }
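
FLASH_ATTN_EXT gains a logit_softcap parameter, tested with a non-zero value only for head size 128. Softcapping smoothly bounds the attention scores before the softmax; a sketch of the usual formulation (as popularized by Gemma-2-style models; the helper name is made up):

    static inline float softcap(float x, float cap) { // cap == logit_softcap, e.g. 10.0f
        return cap * tanhf(x / cap);                  // smoothly clamps x into (-cap, cap)
    }
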
@@ -2430,6 +3800,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  }
  }

+ test_cases.emplace_back(new test_cross_entropy_loss());
+ test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
+
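
CROSS_ENTROPY_LOSS and OPT_STEP_ADAMW back the new in-graph training path. For reference, a scalar sketch of the update OPT_STEP_ADAMW corresponds to (standard AdamW; the reference function and its parameter plumbing are illustrative, not ggml's exact interface):

    static void adamw_step_ref(int n, float * w, const float * g, float * m, float * v,
                               float lr, float b1, float b2, float eps, float wd, int t) {
        for (int i = 0; i < n; ++i) {
            m[i] = b1 * m[i] + (1.0f - b1) * g[i];                 // first moment
            v[i] = b2 * v[i] + (1.0f - b2) * g[i] * g[i];          // second moment
            const float mh = m[i] / (1.0f - powf(b1, (float) t)); // bias correction
            const float vh = v[i] / (1.0f - powf(b2, (float) t));
            w[i] -= lr * (mh / (sqrtf(vh) + eps) + wd * w[i]);     // decoupled weight decay
        }
    }
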
  // these tests are disabled to save execution time, but they can be handy for debugging
  #if 0
  test_cases.emplace_back(new test_llama(1));
@@ -2438,8 +3811,32 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  test_cases.emplace_back(new test_falcon(2));
  #endif

- // run tests
+ return test_cases;
+ }
+
+ // Test cases for performance evaluation: should be representative of real-world use cases
+ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
+ std::vector<std::unique_ptr<test_case>> test_cases;
+
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
+ test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
+
+ test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
+
+ for (int bs : {1, 512}) {
+ for (ggml_type type_a : all_types) {
+ for (ggml_type type_b : {GGML_TYPE_F32}) {
+ test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1}));
+ }
+ }
+ }
+
+ return test_cases;
+ }
+
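
The perf shapes appear to mirror a 7B/8B-class transformer FFN projection (4096 -> 14336): at bs = 512 a single such MUL_MAT is roughly 2 x 4096 x 14336 x 512 ≈ 60 GFLOP, large enough for backend throughput differences to dominate measurement noise.
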
+ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
  if (mode == MODE_TEST) {
+ auto test_cases = make_test_cases_eval();
  ggml_backend_t backend_cpu = ggml_backend_cpu_init();

  size_t n_ok = 0;
@@ -2455,7 +3852,21 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  return n_ok == test_cases.size();
  }

+ if (mode == MODE_GRAD) {
+ auto test_cases = make_test_cases_eval();
+ size_t n_ok = 0;
+ for (auto & test : test_cases) {
+ if (test->eval_grad(backend, op_name)) {
+ n_ok++;
+ }
+ }
+ printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
+
+ return n_ok == test_cases.size();
+ }
+
3868
  if (mode == MODE_PERF) {
3869
+ auto test_cases = make_test_cases_perf();
2459
3870
  for (auto & test : test_cases) {
2460
3871
  test->eval_perf(backend, op_name);
2461
3872
  }
@@ -2463,13 +3874,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
  }

  GGML_ABORT("fatal error");
- return false;
  }

  static void usage(char ** argv) {
  printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
- printf(" valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
- printf(" op names are as given by ggml_op_desc()\n");
+ printf(" valid modes:\n");
+ printf(" - test (default, compare with CPU backend for correctness)\n");
+ printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
+ printf(" - perf (performance evaluation)\n");
+ printf(" op names for -o are as given by ggml_op_desc() (e.g. ADD, MUL_MAT, etc)\n");
  }
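
Typical invocations after this change, assuming the usual test-backend-ops binary name (the backend name below is illustrative; it follows whatever the device registry reports):

    ./test-backend-ops grad -o MUL_MAT
    ./test-backend-ops perf -b CUDA0
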
  int main(int argc, char ** argv) {
@@ -2482,6 +3895,8 @@ int main(int argc, char ** argv) {
  mode = MODE_TEST;
  } else if (strcmp(argv[i], "perf") == 0) {
  mode = MODE_PERF;
+ } else if (strcmp(argv[i], "grad") == 0) {
+ mode = MODE_GRAD;
  } else if (strcmp(argv[i], "-o") == 0) {
  if (i + 1 < argc) {
  op_name_filter = argv[++i];
@@ -2503,30 +3918,43 @@ int main(int argc, char ** argv) {
  }

  // enumerate backends
- printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
+ printf("Testing %zu devices\n\n", ggml_backend_dev_count());

  size_t n_ok = 0;

- for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
- printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);

- if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
+ printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), ggml_backend_dev_name(dev));
+
+ if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_dev_name(dev)) != 0) {
  printf(" Skipping\n");
  n_ok++;
  continue;
  }

- ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
+ ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
  GGML_ASSERT(backend != NULL);

- if (backend_filter == NULL && ggml_backend_is_cpu(backend)) {
+ if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
  printf(" Skipping CPU backend\n");
  ggml_backend_free(backend);
  n_ok++;
  continue;
  }

- printf(" Backend name: %s\n", ggml_backend_name(backend));
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+ if (ggml_backend_set_n_threads_fn) {
+ // TODO: better value for n_threads
+ ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
+ }
+
+ printf(" Device description: %s\n", ggml_backend_dev_description(dev));
+ size_t free, total; // NOLINT
+ ggml_backend_dev_memory(dev, &free, &total);
+ printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
+ printf("\n");

  bool ok = test_backend(backend, mode, op_name_filter);

@@ -2543,15 +3971,15 @@ int main(int argc, char ** argv) {
  ggml_backend_free(backend);
  }

- printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
+ ggml_quantize_free();
+
+ printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());

- if (n_ok != ggml_backend_reg_get_count()) {
+ if (n_ok != ggml_backend_dev_count()) {
  printf("\033[1;31mFAIL\033[0m\n");
  return 1;
  }

- ggml_quantize_free();
-
  printf("\033[1;32mOK\033[0m\n");
  return 0;
  }