@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
9
9
  get_row_q8_0.cpp
10
10
  quantize_f32_q8_0.cpp
11
11
  quantize_f16_q8_0.cpp
12
+ quantize_float_to_q4_0.cpp
12
13
  dup.cpp
13
14
  )
14
15
 
@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
29
30
  ${SRC_FILES}
30
31
  )
31
32
 
32
- #ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
33
+ # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
@@ -8,6 +8,8 @@
8
8
 
9
9
  #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
10
10
  #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
11
+ #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
12
+ #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
11
13
 
12
14
  #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
13
15
  #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
@@ -0,0 +1,278 @@
1
+ #include "kernel_operator.h"
2
+
3
+ using namespace AscendC;
4
+
5
+ #define BUFFER_NUM 2
6
+ #define Group_Size 32
7
+
8
+ template <typename SRC_T>
9
+ class QUANTIZE_FLOAT_TO_Q4_0 {
10
+ public:
11
+ __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
12
+ __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
13
+ int64_t *input_ne_ub, size_t *input_nb_ub,
14
+ int64_t *output_ne_ub) {
15
+ // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
16
+ // permute=[0,0,0,0]):
17
+ // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
18
+ int64_t op_block_num = GetBlockNum();
19
+ int64_t op_block_idx = GetBlockIdx();
20
+
21
+ // input stride of data elements
22
+ for (int i = 0; i < 4; i++) {
23
+ input_ne[i] = input_ne_ub[i];
24
+ input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
25
+ output_ne[i] = output_ne_ub[i];
26
+ }
27
+
28
+ // output stride of data elements
29
+ output_stride[0] = 1;
30
+ for (int i = 1; i < 4; i++) {
31
+ output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
32
+ }
33
+
34
+ // scale saved one by one after data:. [group1_scale, group2_scale, ...]
35
+ scale_ne = input_ne;
36
+ scale_stride[0] = 1;
37
+ scale_stride[1] = input_ne[0] / Group_Size;
38
+ for (int i = 2; i < 4; i++) {
39
+ scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
40
+ }
41
+
42
+ // split input tensor by rows.
43
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
44
+ dr = nr / op_block_num;
45
+
46
+ uint64_t tails = nr % op_block_num;
47
+ if (op_block_idx < tails) {
48
+ dr += 1;
49
+ ir = dr * op_block_idx;
50
+ } else {
51
+ ir = dr * op_block_idx + tails;
52
+ }
53
+
54
+ group_size_in_row = scale_stride[1];
55
+ int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
56
+ output_ne[3] * sizeof(uint8_t) / 2;
57
+
58
+ input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
59
+ output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
60
+ scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
61
+ group_size_in_row *
62
+ sizeof(half)));
63
+
64
+ pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
65
+ pipe.InitBuffer(output_queue, BUFFER_NUM,
66
+ Group_Size * sizeof(int8_t) / 2);
67
+ pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
68
+ pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
69
+ pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
70
+ pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
71
+ pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
72
+ pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
73
+ pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
74
+ }
75
+
76
+ __aicore__ inline void copy_in(uint32_t offset) {
77
+ LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
78
+ DataCopy(input_local, input_gm[offset], Group_Size);
79
+ input_queue.EnQue(input_local);
80
+ }
81
+
82
+ __aicore__ inline void copy_out(uint32_t offset) {
83
+ // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
84
+ // and using DataCopyPad to avoid 32 bits align.
85
+ LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
86
+ LocalTensor<int8_t> output_int8_local =
87
+ output_local.ReinterpretCast<int8_t>();
88
+
89
+ DataCopyExtParams dataCopyParams;
90
+ dataCopyParams.blockCount = 1;
91
+ dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
92
+ DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
93
+
94
+ output_queue.FreeTensor(output_local);
95
+ }
96
+
97
+ __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
98
+ LocalTensor<float> input_local) {
99
+ DataCopy(cast_local, input_local, Group_Size);
100
+ }
101
+
102
+ __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
103
+ LocalTensor<half> input_local) {
104
+ Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
105
+ }
106
+
107
+ __aicore__ inline half calculate_group(int64_t row, int64_t group) {
108
+ const int64_t i3 = row / (input_ne[1] * input_ne[2]);
109
+ const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
110
+ const int64_t i1 =
111
+ row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
112
+
113
+ const int64_t input_offset = i1 * input_stride[1] +
114
+ i2 * input_stride[2] +
115
+ i3 * input_stride[3] + Group_Size * group;
116
+
117
+ // output_offset is stride for output_gm which datatype is int8_t and
118
+ // divided by 2 is needed for int4b_t.
119
+ const int64_t output_offset = (i1 * output_stride[1] +
120
+ i2 * output_stride[2] +
121
+ i3 * output_stride[3] +
122
+ Group_Size * group) / 2;
123
+ copy_in(input_offset);
124
+
125
+ LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
126
+ LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
127
+ LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
128
+ LocalTensor<float> work_local = work_queue.AllocTensor<float>();
129
+ LocalTensor<float> max_local = max_queue.AllocTensor<float>();
130
+ LocalTensor<float> min_local = min_queue.AllocTensor<float>();
131
+ LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
132
+ LocalTensor<half> half_local = half_queue.AllocTensor<half>();
133
+
134
+ input_to_cast(cast_local, input_local);
135
+
136
+ ReduceMax(max_local, cast_local, work_local, Group_Size);
137
+ ReduceMin(min_local, cast_local, work_local, Group_Size);
138
+ const float max_value = max_local.GetValue(0);
139
+ const float min_value = min_local.GetValue(0);
140
+ float d = max_value;
141
+ if (min_value < 0 && (-1 * min_value) > max_value) {
142
+ d = min_value;
143
+ }
144
+
145
+ d = d / (-8);
146
+ if (d != 0) {
147
+ Muls(cast_local, cast_local, 1.0f / d, Group_Size);
148
+ }
149
+
150
+ // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
151
+ float scalar = 8.5f;
152
+ Adds(cast_local, cast_local, scalar, Group_Size);
153
+ Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
154
+ scalar = 15.0f;
155
+ Mins(cast_local, cast_local, scalar, Group_Size);
156
+ scalar = -8.0f;
157
+ Adds(cast_local, cast_local, scalar, Group_Size);
158
+
159
+ // float->half->int4b
160
+ Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
161
+ Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
162
+
163
+ output_queue.EnQue(output_local);
164
+ copy_out(output_offset);
165
+
166
+ input_queue.FreeTensor(input_local);
167
+ work_queue.FreeTensor(work_local);
168
+ max_queue.FreeTensor(max_local);
169
+ min_queue.FreeTensor(min_local);
170
+ int8_queue.FreeTensor(int8_local);
171
+ half_queue.FreeTensor(half_local);
172
+ cast_queue.FreeTensor(cast_local);
173
+ return (half)d;
174
+ }
175
+
176
+ __aicore__ inline void calculate() {
177
+ LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
178
+ uint32_t scale_local_offset = 0;
179
+ uint32_t scale_global_offset = 0;
180
+ for (int64_t i = ir; i < ir + dr; i++) {
181
+ for (int64_t j = 0; j < group_size_in_row; j++) {
182
+ half scale = calculate_group(i, j);
183
+ scale_local.SetValue(scale_local_offset++, scale);
184
+ // Copy Group_Size/2 length data each time.
185
+ if (scale_local_offset == Group_Size / 2) {
186
+ scale_local_offset = 0;
187
+ // TODO: OPTIMIZE ME
188
+ pipe_barrier(PIPE_ALL);
189
+ DataCopy(scale_gm[scale_global_offset], scale_local,
190
+ Group_Size / 2);
191
+ pipe_barrier(PIPE_ALL);
192
+ scale_global_offset += Group_Size / 2;
193
+ }
194
+ }
195
+ }
196
+
197
+ if (scale_local_offset != 0) {
198
+ pipe_barrier(PIPE_ALL);
199
+ DataCopyExtParams dataCopyParams;
200
+ dataCopyParams.blockCount = 1;
201
+ dataCopyParams.blockLen = scale_local_offset * sizeof(half);
202
+ DataCopyPad(scale_gm[scale_global_offset], scale_local,
203
+ dataCopyParams);
204
+ pipe_barrier(PIPE_ALL);
205
+ }
206
+ scale_queue.FreeTensor(scale_local);
207
+ }
208
+
209
+ private:
210
+ int64_t input_ne[4];
211
+ size_t input_stride[4];
212
+
213
+ int64_t *scale_ne;
214
+ size_t scale_stride[4];
215
+
216
+ int64_t output_ne[4];
217
+ size_t output_stride[4];
218
+
219
+ int64_t group_size_in_row;
220
+
221
+ int64_t ir;
222
+ int64_t dr;
223
+
224
+ TPipe pipe;
225
+ GlobalTensor<SRC_T> input_gm;
226
+ GlobalTensor<half> scale_gm;
227
+ GlobalTensor<int8_t> output_gm;
228
+ TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
229
+ TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
230
+ TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
231
+ TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
232
+ TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
233
+ TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
234
+ TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
235
+ TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
236
+ TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
237
+ };
238
+
239
+ template <typename T>
240
+ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
241
+ auto gm_ptr = (__gm__ uint8_t *)gm;
242
+ auto ub_ptr = (uint8_t *)(ub);
243
+ for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
244
+ *ub_ptr = *gm_ptr;
245
+ }
246
+ }
247
+
248
+ extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
249
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
250
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
251
+ int64_t input_ne_ub[4];
252
+ size_t input_nb_ub[4];
253
+ int64_t output_ne_ub[4];
254
+
255
+ copy_to_ub(input_ne_gm, input_ne_ub, 32);
256
+ copy_to_ub(input_nb_gm, input_nb_ub, 32);
257
+ copy_to_ub(output_ne_gm, output_ne_ub, 32);
258
+
259
+ QUANTIZE_FLOAT_TO_Q4_0<half> op;
260
+ op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
261
+ op.calculate();
262
+ }
263
+
264
+ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
265
+ GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
266
+ GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
267
+ int64_t input_ne_ub[4];
268
+ size_t input_nb_ub[4];
269
+ int64_t output_ne_ub[4];
270
+
271
+ copy_to_ub(input_ne_gm, input_ne_ub, 32);
272
+ copy_to_ub(input_nb_gm, input_nb_ub, 32);
273
+ copy_to_ub(output_ne_gm, output_ne_ub, 32);
274
+
275
+ QUANTIZE_FLOAT_TO_Q4_0<float> op;
276
+ op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
277
+ op.calculate();
278
+ }
@@ -227,6 +227,25 @@ typedef struct {
227
227
  } block_q8_0x8;
228
228
  static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
229
229
 
230
+ //
231
+ // Ternary quantization
232
+ //
233
+
234
+ // 1.6875 bpw
235
+ typedef struct {
236
+ uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
237
+ uint8_t qh[QK_K/64]; // 4 elements per byte
238
+ ggml_half d;
239
+ } block_tq1_0;
240
+ static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
241
+
242
+ // 2.0625 bpw
243
+ typedef struct {
244
+ uint8_t qs[QK_K/4]; // 2 bits per element
245
+ ggml_half d;
246
+ } block_tq2_0;
247
+ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
248
+
230
249
  //
231
250
  // Super-block quantization structures
232
251
  //
@@ -361,6 +380,7 @@ typedef struct {
361
380
  } block_iq3_s;
362
381
  static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
363
382
 
383
+ // 1.5625 bpw
364
384
  typedef struct {
365
385
  ggml_half d;
366
386
  uint8_t qs[QK_K/8];
@@ -0,0 +1,261 @@
1
# CPU backend library for ggml: core kernels, aarch64 repack helpers and
# the reference quantization routines.
add_library(ggml-cpu
    ggml-cpu.c
    ggml-cpu.cpp
    ggml-cpu-aarch64.c
    ggml-cpu-aarch64.h
    ggml-cpu-quants.c
    ggml-cpu-quants.h
    )

# Private usage requirements: ggml-base is an implementation detail, and the
# include paths (this dir and its parent) are not exported to consumers.
target_include_directories(ggml-cpu PRIVATE . ..)
target_link_libraries(ggml-cpu PRIVATE ggml-base)
12
+
13
# Optional Apple Accelerate framework (BLAS/LAPACK) support.
if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    if (NOT ACCELERATE_FRAMEWORK)
        message(WARNING "Accelerate framework not found")
    else()
        message(STATUS "Accelerate framework found")

        # ACCELERATE_NEW_LAPACK / ACCELERATE_LAPACK_ILP64 select the modern
        # LAPACK interface shipped with recent macOS SDKs.
        add_compile_definitions(
            GGML_USE_ACCELERATE
            ACCELERATE_NEW_LAPACK
            ACCELERATE_LAPACK_ILP64)

        target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
    endif()
endif()
27
+
28
# Optional OpenMP threading support.
if (GGML_OPENMP)
    find_package(OpenMP)
    if (NOT OpenMP_FOUND)
        message(WARNING "OpenMP not found")
    else()
        message(STATUS "OpenMP found")

        add_compile_definitions(GGML_USE_OPENMP)

        target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)

        # FIXME: should be replaced with a compiler id check
        #if (GGML_MUSA)
        #    list(APPEND GGML_CPU_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
        #    list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
        #endif()
    endif()
endif()
46
+
47
# Optional llamafile tinyBLAS sgemm kernels.
if (GGML_LLAMAFILE)
    message(STATUS "Using llamafile")

    add_compile_definitions(GGML_USE_LLAMAFILE)

    target_sources(ggml-cpu PRIVATE
        llamafile/sgemm.cpp
        llamafile/sgemm.h)
endif()
56
+
57
# Optional high-bandwidth CPU memory via libmemkind.
if (GGML_CPU_HBM)
    find_library(memkind memkind REQUIRED)

    message(STATUS "Using memkind for CPU HBM")

    add_compile_definitions(GGML_USE_CPU_HBM)

    # PUBLIC: memkind symbols may be referenced by consumers of ggml-cpu.
    target_link_libraries(ggml-cpu PUBLIC memkind)
endif()
66
+
67
# Per-architecture compiler flag detection. ARCH_FLAGS is accumulated here and
# applied to the ggml-cpu target below.
# NOTE(review): check_cxx_source_compiles / check_cxx_compiler_flag require
# CheckCXXSourceCompiles / CheckCXXCompilerFlag — presumably include()d by a
# parent listfile; verify.
if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
    (NOT CMAKE_OSX_ARCHITECTURES      AND
     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))

    message(STATUS "ARM detected")

    if (MSVC)
        # MSVC does not define the ACLE feature macros; set them by hand.
        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
        add_compile_definitions(__ARM_NEON)
        add_compile_definitions(__ARM_FEATURE_FMA)

        # Probe optional armv8.2 features with the matching /arch flag, then
        # restore CMAKE_REQUIRED_FLAGS so later checks are unaffected.
        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")

        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
        if (GGML_COMPILER_SUPPORT_DOTPROD)
            add_compile_definitions(__ARM_FEATURE_DOTPROD)
        endif ()

        # BUGFIX: the i8mm probe must use vmmlaq_s32 (the int8 matrix-multiply
        # intrinsic). The previous vmlaq_f32 call on int8x16_t operands can
        # never compile, so __ARM_FEATURE_MATMUL_INT8 was never defined.
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
        endif ()

        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
        endif ()

        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
    else()
        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
            # Raspberry Pi 1, Zero
            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
                # Android armeabi-v7a
                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
            else()
                # Raspberry Pi 2
                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
            endif()
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
            # Android arm64-v8a
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
        if (GGML_SVE)
            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
        endif()
    endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
    message(STATUS "x86 detected")
    if (MSVC)
        # instruction set detection for MSVC only
        if (GGML_NATIVE)
            # TODO: improve, should not reference files from the parent folder
            include(cmake/FindSIMD.cmake)
        endif ()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
            # Do it manually.
            if (GGML_AVX512_VBMI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512vbmi)
                endif()
            endif()
            if (GGML_AVX512_VNNI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512vnni)
                endif()
            endif()
            if (GGML_AVX512_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                    list(APPEND ARCH_FLAGS -mavx512bf16)
                endif()
            endif()
            if (GGML_AMX_TILE)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
            endif()
            if (GGML_AMX_INT8)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
            endif()
            if (GGML_AMX_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
            endif()
        elseif (GGML_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (GGML_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        if (GGML_NATIVE)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (GGML_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (GGML_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (GGML_AVX)
            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (GGML_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (GGML_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (GGML_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if (GGML_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
        if (GGML_AMX_TILE)
            list(APPEND ARCH_FLAGS -mamx-tile)
        endif()
        if (GGML_AMX_INT8)
            list(APPEND ARCH_FLAGS -mamx-int8)
        endif()
        if (GGML_AMX_BF16)
            list(APPEND ARCH_FLAGS -mamx-bf16)
        endif()
    endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
    # Detect POWER10 from /proc/cpuinfo at configure time.
    # NOTE(review): this only works when configuring on a Linux host with bash.
    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
    string(FIND "${POWER10_M}" "POWER10" substring_index)
    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
        set(substring_index -1)
    endif()

    if (${substring_index} GREATER_EQUAL 0)
        list(APPEND ARCH_FLAGS -mcpu=power10)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
    message(STATUS "loongarch64 detected")

    list(APPEND ARCH_FLAGS -march=loongarch64)
    if (GGML_LASX)
        list(APPEND ARCH_FLAGS -mlasx)
    endif()
    if (GGML_LSX)
        list(APPEND ARCH_FLAGS -mlsx)
    endif()
else()
    message(STATUS "Unknown architecture")
endif()
250
+
251
# Runtime repacking of Q4_0 weights into the aarch64-optimized layouts.
if (GGML_CPU_AARCH64)
    message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
    add_compile_definitions(GGML_USE_CPU_AARCH64)
endif()

# Apply the detected architecture flags to both languages of the target.
target_compile_options(ggml-cpu PRIVATE
    "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>"
    "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

if (EMSCRIPTEN)
    # Enable WebAssembly SIMD when targeting Emscripten.
    set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
endif()