@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -1,6 +0,0 @@
1
- set(TARGET llama-bench-matmult)
2
- add_executable(${TARGET} benchmark-matmult.cpp)
3
- install(TARGETS ${TARGET} RUNTIME)
4
- target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
5
- target_include_directories(${TARGET} PRIVATE ../../common)
6
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,275 +0,0 @@
1
- #include "common.h"
2
- #include "ggml.h"
3
-
4
- #include <locale.h>
5
- #include <assert.h>
6
- #include <math.h>
7
- #include <cstring>
8
- #include <cstdio>
9
- #include <cinttypes>
10
- #include <unordered_map>
11
- #include <queue>
12
- #include <string.h>
13
- #include <cassert>
14
- #include <fstream>
15
- #include <string>
16
- #include <iterator>
17
- #include <algorithm>
18
-
19
- #if defined(_MSC_VER)
20
- #pragma warning(disable: 4244 4267) // possible loss of data
21
- #endif
22
-
23
- static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
24
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
25
-
26
- if (plan.work_size > 0) {
27
- buf.resize(plan.work_size);
28
- plan.work_data = buf.data();
29
- }
30
-
31
- ggml_graph_compute(graph, &plan);
32
- }
33
-
34
- static float tensor_sum_elements(const ggml_tensor * tensor) {
35
- double sum = 0;
36
- if (tensor->type == GGML_TYPE_F32) {
37
- for (int j = 0; j < tensor->ne[1]; j++) {
38
- for (int k = 0; k < tensor->ne[0]; k++) {
39
- sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
40
- }
41
- }
42
- }
43
- return sum;
44
- }
45
-
46
- static void tensor_dump(const ggml_tensor * tensor, const char * name) {
47
- printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
48
- tensor->type, ggml_type_name(tensor->type),
49
- tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
50
- float sum = tensor_sum_elements(tensor);
51
- printf("Sum of tensor %s is %6.2f\n", name, sum);
52
- }
53
-
54
- #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
55
-
56
- struct benchmark_params_struct {
57
- int32_t n_threads = 1;
58
- int32_t n_iterations = 10;
59
- };
60
-
61
- static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
62
- fprintf(stderr, "usage: %s [options]\n", argv[0]);
63
- fprintf(stderr, "\n");
64
- fprintf(stderr, "options:\n");
65
- fprintf(stderr, " -h, --help show this help message and exit\n");
66
- fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
67
- fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations);
68
- fprintf(stderr, "\n");
69
- }
70
-
71
- int main(int argc, char ** argv) {
72
- struct benchmark_params_struct benchmark_params;
73
-
74
- bool invalid_param = false;
75
- std::string arg;
76
- for (int i = 1; i < argc; i++) {
77
- arg = argv[i];
78
-
79
- if (arg == "-t" || arg == "--threads") {
80
- if (++i >= argc) {
81
- invalid_param = true;
82
- break;
83
- }
84
- benchmark_params.n_threads = std::stoi(argv[i]);
85
- } else if (arg == "-i" || arg == "--iter") {
86
- if (++i >= argc) {
87
- invalid_param = true;
88
- break;
89
- }
90
- benchmark_params.n_iterations = std::stoi(argv[i]);
91
- } else if (arg == "-h" || arg == "--help") {
92
- print_usage(argc, argv, benchmark_params);
93
- exit(0);
94
- }
95
- }
96
- if (invalid_param) {
97
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
98
- print_usage(argc, argv, benchmark_params);
99
- exit(1);
100
- }
101
-
102
- print_build_info();
103
- printf("Starting Test\n");
104
-
105
- // create the ggml context
106
- struct ggml_context * ctx;
107
- //const int sizex = 4096;
108
- //const int sizey = 11008;
109
-
110
- #undef VERBOSE_DEBUGGING
111
- #ifndef VERBOSE_DEBUGGING
112
- const int sizey = 4096;
113
- const int sizex = 11008;
114
- const int sizez = 128;
115
- #else
116
- /* Working - let's increase size */
117
- const int sizey = 1;
118
- const int sizex = (8*32);
119
- const int sizez = 1;
120
-
121
- /*const int sizey = 1;
122
- const int sizex = 3*(8*32);
123
- const int sizez = 1;*/
124
- #endif
125
-
126
- //printf("Memsize required = %i\n", sizex*sizex);
127
-
128
- // TODO: perform the bench for all types or for a user specified type
129
- const ggml_type qtype = GGML_TYPE_Q4_1;
130
-
131
- size_t ctx_size = 0;
132
- ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
133
- ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
134
- ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
135
- ctx_size += ggml_row_size(qtype, sizex*sizey);
136
- ctx_size += ggml_row_size(qtype, sizex*sizey);
137
- ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
138
- ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
139
- ctx_size += 1024*1024*16;
140
-
141
- printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
142
-
143
- struct ggml_init_params params = {
144
- /*.mem_size =*/ ctx_size,
145
- /*.mem_buffer =*/ NULL,
146
- /* no_alloc =*/ 0
147
- };
148
-
149
- ctx = ggml_init(params);
150
- if (!ctx) {
151
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
152
- return 1;
153
- }
154
-
155
-
156
- printf("Creating new tensors\n");
157
- // printf("Creating new tensor m1\n");
158
- struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
159
- ggml_set_f32(m11, 1.0f);
160
-
161
- // printf("Creating new tensor m1\n");
162
- struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
163
- ggml_set_f32(m12, 1.5f);
164
-
165
- // printf("Creating new tensor m2\n");
166
- struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
167
- ggml_set_f32(m2, 2.0f);
168
-
169
- printf("\n------ Test 1 - Matrix Mult via F32 code\n");
170
- // printf("Creating new tensor m11xm2\n");
171
- struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
172
-
173
- // printf("Creating compute graph\n");
174
- struct ggml_cgraph * gf = ggml_new_graph(ctx);
175
- ggml_build_forward_expand(gf, m11xm2);
176
-
177
- printf("n_threads=%i\n", benchmark_params.n_threads);
178
-
179
- TENSOR_DUMP(m11);
180
- TENSOR_DUMP(m2);
181
-
182
- std::vector<uint8_t> work_buffer;
183
-
184
- ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
185
-
186
- TENSOR_DUMP(gf->nodes[0]);
187
-
188
- printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
189
-
190
- int32_t nelements = sizex*sizey;
191
-
192
- // Set up a the benchmark matrices
193
- // printf("Creating new tensor q11 & Running quantize\n");
194
- struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
195
- ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
196
-
197
- // Set up a the compute graph
198
- // printf("Creating new tensor q31\n");
199
- struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
200
-
201
- // printf("Creating compute graph\n");
202
- struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
203
- ggml_build_forward_expand(gf31, q31);
204
-
205
- // Set up a second graph computation to make sure we override the CPU cache lines
206
- // printf("Creating new tensor q12 & Running quantize\n");
207
- struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
208
- ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
209
-
210
- // printf("Creating new tensor q32\n");
211
- struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
212
-
213
- //printf("Creating compute graph\n");
214
- struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
215
- ggml_build_forward_expand(gf32, q32);
216
- printf("n_threads=%i\n", benchmark_params.n_threads);
217
-
218
- const int dimx = sizex;
219
- const int dimy = sizey;
220
- const int dimz = sizez;
221
- long long int flops_per_dot_product = dimy + dimy;
222
- long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
223
- printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
224
-
225
-
226
- // Let's use the F32 result from above as a reference for the quantized multiplication
227
- float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
228
-
229
- printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
230
- printf("=====================================================================================\n");
231
-
232
- double gflops_sum = 0;
233
- for (int i=0;i<benchmark_params.n_iterations ;i++) {
234
-
235
- long long int start = ggml_time_us();
236
- //printf("Running ggml_graph_compute\n");
237
- ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
238
-
239
- long long int stop = ggml_time_us();
240
- long long int usec = stop-start;
241
- double gflops = (double)(flops_per_matrix)/usec/1000.0;
242
- gflops_sum += gflops;
243
- printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
244
- i,
245
- benchmark_params.n_threads,
246
- sizex, sizey, sizez, flops_per_matrix,
247
- usec,gflops);
248
-
249
- #ifdef VERBOSE_DEBUGGING
250
- TENSOR_DUMP("res",gf31.nodes[0])
251
- #endif
252
-
253
- // Check that the matrix multiplication result is in the right ballpark
254
- // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
255
- float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
256
- float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
257
- float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
258
-
259
- if (delta > allowed_delta) {
260
- printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
261
- sum_of_F32_reference,
262
- sum_of_Q4_result,
263
- delta,
264
- allowed_delta
265
- );
266
- exit(0);
267
- }
268
-
269
- // Running a different graph computation to make sure we override the CPU cache lines
270
- ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
271
- }
272
- printf("\n");
273
- printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
274
- printf("=====================================================================================\n");
275
- }