@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -1,9 +1,5 @@
1
- // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
2
1
  #pragma once
3
2
 
4
- #define GGML_COMMON_DECL_C
5
- #include "ggml-common.h"
6
-
7
3
  #include "ggml.h"
8
4
 
9
5
  // GGML internal header
@@ -12,27 +8,11 @@
12
8
  extern "C" {
13
9
  #endif
14
10
 
15
- // Quantization
16
- void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
17
- void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
18
-
19
- void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
20
-
21
11
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
22
12
  size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
23
13
  size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
24
14
  size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
25
15
 
26
- // GEMV
27
- void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
28
- void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
29
- void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
30
-
31
- // GEMM
32
- void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
33
- void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
34
- void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
35
-
36
16
  #ifdef __cplusplus
37
17
  }
38
18
  #endif
@@ -14,7 +14,7 @@
14
14
 
15
15
  //#define GGML_ALLOCATOR_DEBUG
16
16
 
17
- //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
17
+ //#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
18
18
  #define AT_PRINTF(...)
19
19
 
20
20
 
@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
89
89
  size = GGML_PAD(size, talloc->alignment);
90
90
 
91
91
  if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
92
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
92
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
93
93
  __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94
94
  GGML_ABORT("not enough space in the buffer");
95
95
  }
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
172
172
  best_fit_block = alloc->n_free_blocks - 1;
173
173
  } else {
174
174
  // this should never happen
175
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
175
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
176
176
  __func__, size, max_avail);
177
177
  GGML_ABORT("not enough space in the buffer");
178
178
  }
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
209
209
  }
210
210
  }
211
211
  }
212
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
212
+ GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
213
213
  for (int i = 0; i < 1024; i++) {
214
214
  if (alloc->allocated_tensors[i].tensor) {
215
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
215
+ GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
216
216
  alloc->allocated_tensors[i].offset,
217
217
  alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
218
218
  ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
219
219
  }
220
220
  }
221
- fprintf(stderr, "\n");
221
+ GGML_LOG_DEBUG("\n");
222
222
  }
223
223
  #endif
224
224
 
@@ -294,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
294
294
  alloc->free_blocks[0].offset = 0;
295
295
  alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
296
296
  alloc->max_size = 0;
297
+
298
+ #ifdef GGML_ALLOCATOR_DEBUG
299
+ for (int i = 0; i < 1024; i++) {
300
+ alloc->allocated_tensors[i].tensor = NULL;
301
+ }
302
+ #endif
297
303
  }
298
304
 
299
305
  static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
@@ -342,7 +348,6 @@ struct tensor_alloc {
342
348
  };
343
349
 
344
350
  struct leaf_alloc {
345
- int buffer_id;
346
351
  struct tensor_alloc leaf;
347
352
  };
348
353
 
@@ -461,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
461
466
  return ggml_gallocr_hash_get(galloc, t)->allocated;
462
467
  }
463
468
 
464
- static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
465
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
466
- hn->buffer_id = buffer_id;
467
- hn->offset = offset;
468
- hn->allocated = true;
469
- }
470
-
471
469
  static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
472
470
  return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
473
471
  }
474
472
 
475
473
  static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
474
+ GGML_ASSERT(buffer_id >= 0);
476
475
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
477
476
 
478
477
  if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@@ -734,7 +733,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
734
733
  for (int i = 0; i < graph->n_leafs; i++) {
735
734
  struct ggml_tensor * leaf = graph->leafs[i];
736
735
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
737
- galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
738
736
  if (leaf->view_src || leaf->data) {
739
737
  galloc->leaf_allocs[i].leaf.buffer_id = -1;
740
738
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -762,13 +760,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
762
760
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
763
761
  if (new_size > cur_size || galloc->buffers[i] == NULL) {
764
762
  #ifndef NDEBUG
765
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
763
+ GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
766
764
  #endif
767
765
 
768
766
  ggml_backend_buffer_free(galloc->buffers[i]);
769
767
  galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
770
768
  if (galloc->buffers[i] == NULL) {
771
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
769
+ GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
772
770
  return false;
773
771
  }
774
772
  ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -812,21 +810,25 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
812
810
  }
813
811
 
814
812
  static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
815
- size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
813
+ size_t node_size = 0;
814
+ if (!node->data && !node->view_src) {
815
+ GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
816
+ node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
817
+ }
816
818
  return talloc->size_max >= node_size;
817
819
  }
818
820
 
819
821
  static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
820
822
  if (galloc->n_nodes != graph->n_nodes) {
821
823
  #ifndef NDEBUG
822
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
824
+ GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
823
825
  #endif
824
826
  return true;
825
827
  }
826
828
 
827
829
  if (galloc->n_leafs != graph->n_leafs) {
828
830
  #ifndef NDEBUG
829
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
831
+ GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
830
832
  #endif
831
833
  return true;
832
834
  }
@@ -837,7 +839,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
837
839
 
838
840
  if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
839
841
  #ifndef NDEBUG
840
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
842
+ GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
841
843
  #endif
842
844
  return true;
843
845
  }
@@ -849,7 +851,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
849
851
  }
850
852
  if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
851
853
  #ifndef NDEBUG
852
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
854
+ GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
853
855
  #endif
854
856
  return true;
855
857
  }
@@ -863,14 +865,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
863
865
  if (ggml_gallocr_needs_realloc(galloc, graph)) {
864
866
  if (galloc->n_buffers == 1) {
865
867
  #ifndef NDEBUG
866
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
868
+ GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
867
869
  #endif
868
870
  if (!ggml_gallocr_reserve(galloc, graph)) {
869
871
  return false;
870
872
  }
871
873
  } else {
872
874
  #ifndef NDEBUG
873
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
875
+ GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
874
876
  #endif
875
877
  return false;
876
878
  }
@@ -934,7 +936,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
934
936
  ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
935
937
  if (buffer == NULL) {
936
938
  #ifndef NDEBUG
937
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
939
+ GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
938
940
  #endif
939
941
  for (size_t i = 0; i < *n_buffers; i++) {
940
942
  ggml_backend_buffer_free((*buffers)[i]);
@@ -984,7 +986,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
984
986
  }
985
987
 
986
988
  if (this_size > max_size) {
987
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
989
+ GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
988
990
  __func__, t->name,
989
991
  ggml_backend_buft_name(buft),
990
992
  this_size, max_size);
@@ -1016,7 +1018,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
1016
1018
 
1017
1019
  if (n_buffers == 0) {
1018
1020
  #ifndef NDEBUG
1019
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
1021
+ GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
1020
1022
  #endif
1021
1023
  return NULL;
1022
1024
  }
@@ -0,0 +1,107 @@
1
+ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
2
+ (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
3
+ CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
4
+ CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
5
+ message(STATUS "Using AMX")
6
+
7
+ file(GLOB GGML_HEADERS_AMX "*.h")
8
+ list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
9
+
10
+ file(GLOB GGML_SOURCES_AMX "*.cpp")
11
+
12
+ add_library(ggml-amx
13
+ ${GGML_HEADERS_AMX}
14
+ ${GGML_SOURCES_AMX})
15
+
16
+ target_link_libraries(ggml-amx PRIVATE ggml-base)
17
+ target_include_directories(ggml-amx PRIVATE . ..)
18
+
19
+ # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
20
+ # TODO: integrate AMX backend into the CPU backend
21
+ if (MSVC)
22
+ # instruction set detection for MSVC only
23
+ if (GGML_NATIVE)
24
+ # TODO: improve, should not reference files from the parent folder
25
+ include(../ggml-cpu/cmake/FindSIMD.cmake)
26
+ endif ()
27
+ if (GGML_AVX512)
28
+ list(APPEND ARCH_FLAGS /arch:AVX512)
29
+ # MSVC has no compile-time flags enabling specific
30
+ # AVX512 extensions, neither it defines the
31
+ # macros corresponding to the extensions.
32
+ # Do it manually.
33
+ if (GGML_AVX512_VBMI)
34
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
35
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
36
+ endif()
37
+ if (GGML_AVX512_VNNI)
38
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
39
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
40
+ endif()
41
+ if (GGML_AVX512_BF16)
42
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
43
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
44
+ endif()
45
+ if (GGML_AMX_TILE)
46
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
47
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
48
+ endif()
49
+ if (GGML_AMX_INT8)
50
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
51
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
52
+ endif()
53
+ if (GGML_AMX_BF16)
54
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
55
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
56
+ endif()
57
+ elseif (GGML_AVX2)
58
+ list(APPEND ARCH_FLAGS /arch:AVX2)
59
+ elseif (GGML_AVX)
60
+ list(APPEND ARCH_FLAGS /arch:AVX)
61
+ endif()
62
+ else()
63
+ if (GGML_NATIVE)
64
+ list(APPEND ARCH_FLAGS -march=native)
65
+ endif()
66
+ if (GGML_F16C)
67
+ list(APPEND ARCH_FLAGS -mf16c)
68
+ endif()
69
+ if (GGML_FMA)
70
+ list(APPEND ARCH_FLAGS -mfma)
71
+ endif()
72
+ if (GGML_AVX)
73
+ list(APPEND ARCH_FLAGS -mavx)
74
+ endif()
75
+ if (GGML_AVX2)
76
+ list(APPEND ARCH_FLAGS -mavx2)
77
+ endif()
78
+ if (GGML_AVX512)
79
+ list(APPEND ARCH_FLAGS -mavx512f)
80
+ list(APPEND ARCH_FLAGS -mavx512dq)
81
+ list(APPEND ARCH_FLAGS -mavx512bw)
82
+ endif()
83
+ if (GGML_AVX512_VBMI)
84
+ list(APPEND ARCH_FLAGS -mavx512vbmi)
85
+ endif()
86
+ if (GGML_AVX512_VNNI)
87
+ list(APPEND ARCH_FLAGS -mavx512vnni)
88
+ endif()
89
+ if (GGML_AVX512_BF16)
90
+ list(APPEND ARCH_FLAGS -mavx512bf16)
91
+ endif()
92
+ if (GGML_AMX_TILE)
93
+ list(APPEND ARCH_FLAGS -mamx-tile)
94
+ endif()
95
+ if (GGML_AMX_INT8)
96
+ list(APPEND ARCH_FLAGS -mamx-int8)
97
+ endif()
98
+ if (GGML_AMX_BF16)
99
+ list(APPEND ARCH_FLAGS -mamx-bf16)
100
+ endif()
101
+ endif()
102
+
103
+ target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
104
+ else()
105
+ set(GGML_AMX OFF PARENT_SCOPE)
106
+ message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
107
+ endif()
@@ -0,0 +1,94 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ // hack until AMX is moved into the CPU backend
5
+ #include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
6
+
7
+ #include <algorithm>
8
+ #include <memory>
9
+ #include <type_traits>
10
+
11
+ #if defined(_OPENMP)
12
+ #include <omp.h>
13
+ #endif
14
+
15
+ #define TILE_M 16
16
+ #define TILE_N 16
17
+ #define TILE_K 32
18
+ #define VNNI_BLK 4
19
+
20
+ #define AMX_BLK_SIZE 32
21
+
22
+ #define TMM0 0
23
+ #define TMM1 1
24
+ #define TMM2 2
25
+ #define TMM3 3
26
+ #define TMM4 4
27
+ #define TMM5 5
28
+ #define TMM6 6
29
+ #define TMM7 7
30
+
31
+ // parallel routines
32
+ template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
33
+ inline T div_up(T x, T y) { return (x + y - 1) / y; }
34
+
35
+ template <typename T>
36
+ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
37
+ #if 0
38
+ // onednn partition pattern
39
+ T& n_my = n_end;
40
+ if (nth <= 1 || n == 0) {
41
+ n_start = 0;
42
+ n_my = n;
43
+ } else {
44
+ T n1 = div_up(n, nth);
45
+ T n2 = n1 - 1;
46
+ T T1 = n - n2 * nth;
47
+ n_my = ith < T1 ? n1 : n2;
48
+ n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
49
+ }
50
+ n_end += n_start;
51
+ #else
52
+ // pytorch aten partition pattern
53
+ T n_my = div_up(n, nth);
54
+ n_start = ith * n_my;
55
+ n_end = std::min(n_start + n_my, n);
56
+ #endif
57
+ }
58
+
59
+ template <typename func_t>
60
+ inline void parallel_for(int nth, int n, const func_t& f) {
61
+ #if defined(_OPENMP)
62
+ #pragma omp parallel num_threads(nth)
63
+ {
64
+ //int nth = omp_get_num_threads();
65
+ int ith = omp_get_thread_num();
66
+ int tbegin, tend;
67
+ balance211(n, nth, ith, tbegin, tend);
68
+ f(tbegin, tend);
69
+ }
70
+ #else
71
+ f(0, n);
72
+
73
+ GGML_UNUSED(nth);
74
+ #endif
75
+ }
76
+
77
+ // quantized types that have AMX support
78
+ inline bool qtype_has_amx_kernels(const enum ggml_type type) {
79
+ // TODO: fix padding for vnni format
80
+ return (type == GGML_TYPE_Q4_0) ||
81
+ (type == GGML_TYPE_Q4_1);
82
+ //(type == GGML_TYPE_Q8_0) ||
83
+ //(type == GGML_TYPE_Q4_K) ||
84
+ //(type == GGML_TYPE_Q5_K) ||
85
+ //(type == GGML_TYPE_Q6_K) ||
86
+ //(type == GGML_TYPE_IQ4_XS);
87
+ }
88
+
89
+ // ggml backend context
90
+ struct ggml_backend_amx_context {
91
+ int n_threads = GGML_DEFAULT_N_THREADS;
92
+ std::unique_ptr<char[]> work_data;
93
+ size_t work_size = 0;
94
+ };