@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/tests/test-quantize-fns.cpp (+28 -19)

@@ -1,6 +1,7 @@
 // Unit tests for quantization specific functions - quantize, dequantize and dot product
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #undef NDEBUG
 #include <assert.h>
@@ -15,11 +16,13 @@
 
 constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
 constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
 constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
+constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f;
 
 static const char* RESULT_STR[] = {"ok", "FAILED"};
 
@@ -42,26 +45,27 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
 }
 
 // Total quantization error on test data
-static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    // FIXME: why is done twice?
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.from_float_ref(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+    qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
 }
@@ -76,18 +80,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 
 // Total dot product error
 static float dot_product_error(
-    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
+    const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
 ) {
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 
-    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
 
-    qfns.from_float(test_data1, tmp_q1.data(), test_size);
-    vdot.from_float(test_data2, tmp_q2.data(), test_size);
+    qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
+    vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
-    qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+    qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
 
     const float dot_ref = dot_product(test_data1, test_data2, test_size);
 
@@ -129,10 +133,11 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
 
         // deprecated - skip
-        if (qfns.blck_size == 0) {
+        if (qfns->blck_size == 0) {
            continue;
        }
 
@@ -141,9 +146,11 @@ int main(int argc, char * argv[]) {
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
         ggml_quantize_init(ei);
 
-        if (qfns.from_float && qfns.to_float) {
-            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+        if (qfns_cpu->from_float && qfns->to_float) {
+            const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             const float max_quantization_error =
+                type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
+                type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
                 type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
                 type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
                 type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
@@ -155,17 +162,19 @@ int main(int argc, char * argv[]) {
                 printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
             }
 
-            const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+            const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
             num_failed += failed;
             if (failed || verbose) {
                 printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
             }
 
-            const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
+            const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
             const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
                                             type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
                                             ? MAX_DOT_PRODUCT_ERROR_LOWBIT
+                                            : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
+                                            ? MAX_DOT_PRODUCT_ERROR_TERNARY
                                             : MAX_DOT_PRODUCT_ERROR;
             failed = !(vec_dot_error < max_allowed_error);
             num_failed += failed;
package/src/llama.cpp/tests/test-quantize-perf.cpp (+16 -14)

@@ -1,12 +1,12 @@
 // Benchmark quantization specific functions on synthetic data
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #undef NDEBUG
 #include <algorithm>
 #include <assert.h>
 #include <functional>
-#include <inttypes.h>
 #include <math.h>
 #include <memory>
 #include <stdio.h>
@@ -122,9 +122,10 @@ static void usage(char * argv[]) {
     printf(" --type TYPE set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns.from_float && qfns.to_float) {
+            if (qfns_cpu->from_float && qfns->to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -270,12 +271,13 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;
         }
 
-        if (qfns.from_float && qfns.to_float) {
+        if (qfns_cpu->from_float && qfns->to_float) {
             printf("%s\n", ggml_type_name(type));
 
             ggml_quantize_init(type);
@@ -285,7 +287,7 @@ int main(int argc, char * argv[]) {
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
-                    qfns.from_float_ref(test_data1, test_q1, size);
+                    qfns->from_float_ref(test_data1, test_q1, size);
                     return test_q1[0];
                 };
                 size_t quantized_size = ggml_row_size(type, size);
@@ -299,7 +301,7 @@ int main(int argc, char * argv[]) {
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
-                    qfns.from_float(test_data1, test_q1, size);
+                    qfns_cpu->from_float(test_data1, test_q1, size);
                     return test_q1[0];
                 };
                 size_t quantized_size = ggml_row_size(type, size);
@@ -310,11 +312,11 @@ int main(int argc, char * argv[]) {
 
         if (params.op_dequantize_row_q) {
             printf(" dequantize_row_q\n");
-            qfns.from_float(test_data1, test_q1, largest);
+            qfns_cpu->from_float(test_data1, test_q1, largest);
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
-                    qfns.to_float(test_q1, test_out, size);
+                    qfns->to_float(test_q1, test_out, size);
                     return test_out[0];
                 };
                 size_t quantized_size = ggml_row_size(type, size);
@@ -328,8 +330,8 @@ int main(int argc, char * argv[]) {
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
-                    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-                    vdot.from_float(test_data1, test_q1, size);
+                    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
+                    vdot->from_float(test_data1, test_q1, size);
                     return test_q1[0];
                 };
                 size_t quantized_size = ggml_row_size(type, size);
@@ -340,13 +342,13 @@ int main(int argc, char * argv[]) {
 
         if (params.op_vec_dot_q) {
             printf(" vec_dot_q\n");
-            qfns.from_float(test_data1, test_q1, largest);
-            qfns.from_float(test_data2, test_q2, largest);
+            qfns_cpu->from_float(test_data1, test_q1, largest);
+            qfns_cpu->from_float(test_data2, test_q2, largest);
             for (size_t size : params.test_sizes) {
                 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                 auto quantize_fn = [&](void) -> float {
                     float result;
-                    qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+                    qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
                     return result;
                 };
                 size_t quantized_size = ggml_row_size(type, size);
package/src/llama.cpp/tests/test-rope.cpp (+2 -1)

@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "ggml-cpu.h"
 
 #include <cmath>
 #include <cstdio>
@@ -113,7 +114,7 @@ static struct ggml_tensor * get_random_tensor_f32(
 }
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
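The test-rope hunk reflects the added third argument on ggml_graph_plan() (a threadpool handle, now declared alongside the other CPU-backend entry points in ggml-cpu.h); the tests pass nullptr to keep the default threading behaviour. Below is a sketch of the updated helper as it would read after this change, assuming the graph is built elsewhere:

    #include "ggml.h"
    #include "ggml-cpu.h"
    #include <cstdint>
    #include <vector>

    // Plan and run a graph on the CPU backend; passing nullptr as the threadpool
    // lets the backend manage its own threads for the requested thread count.
    static void graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
        if (plan.work_size > 0) {
            buf.resize(plan.work_size);
            plan.work_data = buf.data();
        }
        ggml_graph_compute(graph, &plan);
    }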