@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -470,6 +470,7 @@ extern "C" {
470
470
  GGML_OP_TRANSPOSE,
471
471
  GGML_OP_GET_ROWS,
472
472
  GGML_OP_GET_ROWS_BACK,
473
+ GGML_OP_SET_ROWS,
473
474
  GGML_OP_DIAG,
474
475
  GGML_OP_DIAG_MASK_INF,
475
476
  GGML_OP_DIAG_MASK_ZERO,
@@ -489,6 +490,7 @@ extern "C" {
489
490
  GGML_OP_UPSCALE, // nearest interpolate
490
491
  GGML_OP_PAD,
491
492
  GGML_OP_PAD_REFLECT_1D,
493
+ GGML_OP_ROLL,
492
494
  GGML_OP_ARANGE,
493
495
  GGML_OP_TIMESTEP_EMBEDDING,
494
496
  GGML_OP_ARGSORT,
@@ -686,6 +688,9 @@ extern "C" {
686
688
  // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
687
689
  GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
688
690
 
691
+ // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
692
+ GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
693
+
689
694
  GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
690
695
  GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
691
696
 
@@ -1374,6 +1379,23 @@ extern "C" {
1374
1379
  struct ggml_tensor * b, // row indices
1375
1380
  struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
1376
1381
 
1382
+ // a TD [n_embd, ne1, ne2, ne3]
1383
+ // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
1384
+ // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
1385
+ //
1386
+ // undefined behavior if destination rows overlap
1387
+ //
1388
+ // broadcast:
1389
+ // ne2 % ne11 == 0
1390
+ // ne3 % ne12 == 0
1391
+ //
1392
+ // return view(a)
1393
+ GGML_API struct ggml_tensor * ggml_set_rows(
1394
+ struct ggml_context * ctx,
1395
+ struct ggml_tensor * a, // destination
1396
+ struct ggml_tensor * b, // source
1397
+ struct ggml_tensor * c); // row indices
1398
+
1377
1399
  GGML_API struct ggml_tensor * ggml_diag(
1378
1400
  struct ggml_context * ctx,
1379
1401
  struct ggml_tensor * a);
@@ -1801,6 +1823,17 @@ extern "C" {
1801
1823
  int p0,
1802
1824
  int p1);
1803
1825
 
1826
+ // Move tensor elements by an offset given for each dimension. Elements that
1827
+ // are shifted beyond the last position are wrapped around to the beginning.
1828
+ GGML_API struct ggml_tensor * ggml_roll(
1829
+ struct ggml_context * ctx,
1830
+ struct ggml_tensor * a,
1831
+ int shift0,
1832
+ int shift1,
1833
+ int shift2,
1834
+ int shift3);
1835
+
1836
+
1804
1837
  // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1805
1838
  // timesteps: [N,]
1806
1839
  // return: [N, dim]
@@ -125,7 +125,6 @@ if (NOT MSVC)
125
125
  endif()
126
126
 
127
127
  if (MINGW)
128
- # Target Windows 8 for PrefetchVirtualMemory
129
128
  add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
130
129
  endif()
131
130
 
@@ -213,6 +212,7 @@ endif()
213
212
 
214
213
  add_library(ggml
215
214
  ggml-backend-reg.cpp)
215
+ add_library(ggml::ggml ALIAS ggml)
216
216
 
217
217
  target_link_libraries(ggml PUBLIC ggml-base)
218
218
 
@@ -270,17 +270,27 @@ endfunction()
270
270
  function(ggml_add_cpu_backend_variant tag_name)
271
271
  set(GGML_CPU_TAG_NAME ${tag_name})
272
272
  # other: OPENMP LLAMAFILE CPU_HBM
273
- foreach (feat NATIVE
274
- SSE42
275
- AVX AVX2 BMI2 AVX_VNNI FMA F16C
276
- AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
277
- AMX_TILE AMX_INT8 AMX_BF16)
278
- set(GGML_${feat} OFF)
279
- endforeach()
280
-
281
- foreach (feat ${ARGN})
282
- set(GGML_${feat} ON)
283
- endforeach()
273
+ if (GGML_SYSTEM_ARCH STREQUAL "x86")
274
+ foreach (feat NATIVE
275
+ SSE42
276
+ AVX AVX2 BMI2 AVX_VNNI FMA F16C
277
+ AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
278
+ AMX_TILE AMX_INT8 AMX_BF16)
279
+ set(GGML_${feat} OFF)
280
+ endforeach()
281
+
282
+ foreach (feat ${ARGN})
283
+ set(GGML_${feat} ON)
284
+ endforeach()
285
+ elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
286
+ foreach (feat ${ARGN})
287
+ set(GGML_INTERNAL_${feat} ON)
288
+ endforeach()
289
+ elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
290
+ foreach (feat ${ARGN})
291
+ set(GGML_INTERNAL_${feat} ON)
292
+ endforeach()
293
+ endif()
284
294
 
285
295
  ggml_add_cpu_backend_variant_impl(${tag_name})
286
296
  endfunction()
@@ -290,6 +300,8 @@ ggml_add_backend(CPU)
290
300
  if (GGML_CPU_ALL_VARIANTS)
291
301
  if (NOT GGML_BACKEND_DL)
292
302
  message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
303
+ elseif (GGML_CPU_ARM_ARCH)
304
+ message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
293
305
  endif()
294
306
  if (GGML_SYSTEM_ARCH STREQUAL "x86")
295
307
  ggml_add_cpu_backend_variant(x64)
@@ -303,8 +315,47 @@ if (GGML_CPU_ALL_VARIANTS)
303
315
  # MSVC doesn't support AMX
304
316
  ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
305
317
  endif()
318
+ elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
319
+ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
320
+ # Many of these features are optional so we build versions with popular
321
+ # combinations and name the backends based on the version they were
322
+ # first released with
323
+ ggml_add_cpu_backend_variant(armv8.0_1)
324
+ ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
325
+ ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
326
+ ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
327
+ ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
328
+ ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
329
+ ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
330
+ ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
331
+ elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
332
+ # Android-specific backends with SoC-compatible feature sets
333
+ ggml_add_cpu_backend_variant(android_armv8.0_1)
334
+ ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
335
+ ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
336
+ ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
337
+ elseif (APPLE)
338
+ ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
339
+ ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
340
+ ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
341
+ else()
342
+ message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
343
+ endif()
344
+ elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
345
+ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
346
+ ggml_add_cpu_backend_variant(power0)
347
+ ggml_add_cpu_backend_variant(power7_1 POWER7)
348
+ ggml_add_cpu_backend_variant(power7_2 POWER7 VSX)
349
+ ggml_add_cpu_backend_variant(power8_1 POWER8)
350
+ ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
351
+ ggml_add_cpu_backend_variant(power9 POWER9 VSX)
352
+ ggml_add_cpu_backend_variant(power10 POWER10 VSX)
353
+ ggml_add_cpu_backend_variant(power11 POWER11 VSX)
354
+ else()
355
+ message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
356
+ endif()
306
357
  else()
307
- message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
358
+ message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
308
359
  endif()
309
360
  elseif (GGML_CPU)
310
361
  ggml_add_cpu_backend_variant_impl("")
@@ -69,6 +69,9 @@
69
69
  #if defined(__clang__)
70
70
  # pragma clang diagnostic push
71
71
  # pragma clang diagnostic ignored "-Wdeprecated-declarations"
72
+ #elif defined(__GNUC__)
73
+ # pragma GCC diagnostic push
74
+ # pragma GCC diagnostic ignored "-Wdeprecated-declarations"
72
75
  #endif
73
76
 
74
77
  namespace fs = std::filesystem;
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
91
94
 
92
95
  #if defined(__clang__)
93
96
  # pragma clang diagnostic pop
97
+ #elif defined(__GNUC__)
98
+ # pragma GCC diagnostic pop
94
99
  #endif
95
100
 
96
101
  #ifdef _WIN32
@@ -37,6 +37,7 @@
37
37
  #include <thread>
38
38
  #include <unistd.h>
39
39
  #include <functional>
40
+ #include <optional>
40
41
 
41
42
  #include "../include/ggml-cann.h"
42
43
  #include "../include/ggml.h"
@@ -103,6 +104,9 @@ const ggml_cann_device_info& ggml_cann_info();
103
104
  void ggml_cann_set_device(int32_t device);
104
105
  int32_t ggml_cann_get_device();
105
106
 
107
+ std::optional<std::string> get_env(const std::string& name);
108
+ bool parse_bool(const std::string& value);
109
+
106
110
  /**
107
111
  * @brief Abstract base class for memory pools used by CANN.
108
112
  */
@@ -354,7 +358,8 @@ struct ggml_backend_cann_context {
354
358
  : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
355
359
  ggml_cann_set_device(device);
356
360
  description = aclrtGetSocName();
357
- async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
361
+
362
+ async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
358
363
  GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
359
364
  device, async_mode ? "ON" : "OFF");
360
365
  }
@@ -31,6 +31,8 @@
31
31
  #include <mutex>
32
32
  #include <queue>
33
33
  #include <chrono>
34
+ #include <unordered_set>
35
+ #include <optional>
34
36
 
35
37
  #include "ggml-impl.h"
36
38
  #include "ggml-backend-impl.h"
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
93
95
  return id;
94
96
  }
95
97
 
98
+ /**
99
+ * @brief Get the value of the specified environment variable (name).
100
+ * if not empty, return a std::string object
101
+ */
102
+ std::optional<std::string> get_env(const std::string& name) {
103
+ const char* val = std::getenv(name.c_str());
104
+ if (!val) return std::nullopt;
105
+ std::string res = std::string(val);
106
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
107
+ return res;
108
+ }
109
+
110
+ /**
111
+ * @brief Verify whether the environment variable is a valid value.
112
+ */
113
+ bool parse_bool(const std::string& value) {
114
+ std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
115
+ return valid_values.find(value) != valid_values.end();
116
+ }
117
+
96
118
  /**
97
119
  * @brief Initialize the CANN device information.
98
120
  *
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
214
236
  * @param device The device ID to associate with this buffer pool.
215
237
  */
216
238
  explicit ggml_cann_pool_buf_prio(int device) : device(device) {
217
- disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
239
+ disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
218
240
  }
219
241
 
220
242
  /**
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
410
432
  * @param device The device ID to associate with this buffer pool.
411
433
  */
412
434
  explicit ggml_cann_pool_buf(int device) : device(device) {
413
- disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
435
+ disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
414
436
  }
415
437
 
416
438
  /**
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
731
753
  */
732
754
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
733
755
  int device) {
734
- bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
735
- if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
736
- GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
737
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
738
- }
739
- bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
740
- if (enable_buf_prio) {
756
+ std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
757
+
758
+ if (mem_pool_type == "prio") {
741
759
  GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
742
760
  return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
743
761
  }
762
+
763
+ if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
764
+ GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
765
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
766
+ }
767
+
744
768
  GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
745
769
  return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
746
770
  }
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
1074
1074
  0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
1075
1075
  GGML_TABLE_END()
1076
1076
 
1077
+ GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
1078
+ -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
1079
+ GGML_TABLE_END()
1080
+
1077
1081
  #define NGRID_IQ1S 2048
1078
1082
  #define IQ1S_DELTA 0.125f
1079
1083
  #define IQ1M_DELTA 0.125f
@@ -1,3 +1,17 @@
1
+ function(ggml_add_cpu_backend_features cpu_name arch)
2
+ # The feature detection code is compiled as a separate target so that
3
+ # it can be built without the architecture flags
4
+ # Since multiple variants of the CPU backend may be included in the same
5
+ # build, using set_source_files_properties() to set the arch flags is not possible
6
+ set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
7
+ add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
8
+ target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
9
+ target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
10
+ target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
11
+ set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
12
+ target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
13
+ endfunction()
14
+
1
15
  function(ggml_add_cpu_backend_variant_impl tag_name)
2
16
  if (tag_name)
3
17
  set(GGML_CPU_NAME ggml-cpu-${tag_name})
@@ -10,14 +24,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
10
24
  list (APPEND GGML_CPU_SOURCES
11
25
  ggml-cpu/ggml-cpu.c
12
26
  ggml-cpu/ggml-cpu.cpp
13
- ggml-cpu/ggml-cpu-aarch64.cpp
14
- ggml-cpu/ggml-cpu-aarch64.h
15
- ggml-cpu/ggml-cpu-hbm.cpp
16
- ggml-cpu/ggml-cpu-hbm.h
17
- ggml-cpu/ggml-cpu-quants.c
18
- ggml-cpu/ggml-cpu-quants.h
19
- ggml-cpu/ggml-cpu-traits.cpp
20
- ggml-cpu/ggml-cpu-traits.h
27
+ ggml-cpu/repack.cpp
28
+ ggml-cpu/repack.h
29
+ ggml-cpu/hbm.cpp
30
+ ggml-cpu/hbm.h
31
+ ggml-cpu/quants.c
32
+ ggml-cpu/quants.h
33
+ ggml-cpu/traits.cpp
34
+ ggml-cpu/traits.h
21
35
  ggml-cpu/amx/amx.cpp
22
36
  ggml-cpu/amx/amx.h
23
37
  ggml-cpu/amx/mmq.cpp
@@ -84,6 +98,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
84
98
 
85
99
  if (GGML_SYSTEM_ARCH STREQUAL "ARM")
86
100
  message(STATUS "ARM detected")
101
+ list(APPEND GGML_CPU_SOURCES
102
+ ggml-cpu/arch/arm/quants.c
103
+ ggml-cpu/arch/arm/repack.cpp
104
+ )
105
+
87
106
  if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
88
107
  message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
89
108
  else()
@@ -138,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
138
157
  else()
139
158
  if (GGML_CPU_ARM_ARCH)
140
159
  list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
160
+ elseif(GGML_CPU_ALL_VARIANTS)
161
+ # Begin with the lowest baseline
162
+ set(ARM_MCPU "armv8-a")
163
+ set(ARCH_TAGS "")
164
+ set(ARCH_DEFINITIONS "")
165
+
166
+ # When a feature is selected, bump the MCPU to the first
167
+ # version that supported it
168
+ if (GGML_INTERNAL_DOTPROD)
169
+ set(ARM_MCPU "armv8.2-a")
170
+ set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
171
+ list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
172
+ endif()
173
+ if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
174
+ set(ARM_MCPU "armv8.2-a")
175
+ set(ARCH_TAGS "${ARCH_TAGS}+fp16")
176
+ list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
177
+ endif()
178
+ if (GGML_INTERNAL_SVE)
179
+ set(ARM_MCPU "armv8.2-a")
180
+ set(ARCH_TAGS "${ARCH_TAGS}+sve")
181
+ list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
182
+ endif()
183
+ if (GGML_INTERNAL_MATMUL_INT8)
184
+ set(ARM_MCPU "armv8.6-a")
185
+ set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
186
+ list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
187
+ endif()
188
+ if (GGML_INTERNAL_SVE2)
189
+ set(ARM_MCPU "armv8.6-a")
190
+ set(ARCH_TAGS "${ARCH_TAGS}+sve2")
191
+ list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
192
+ endif()
193
+ if (GGML_INTERNAL_NOSVE)
194
+ set(ARCH_TAGS "${ARCH_TAGS}+nosve")
195
+ endif()
196
+ if (GGML_INTERNAL_SME)
197
+ set(ARM_MCPU "armv9.2-a")
198
+ set(ARCH_TAGS "${ARCH_TAGS}+sme")
199
+ list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
200
+ endif()
201
+ list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
202
+ ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
141
203
  endif()
142
204
  endif()
143
205
 
@@ -167,6 +229,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
167
229
  endif()
168
230
  elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
169
231
  message(STATUS "x86 detected")
232
+ list(APPEND GGML_CPU_SOURCES
233
+ ggml-cpu/arch/x86/quants.c
234
+ ggml-cpu/arch/x86/repack.cpp
235
+ )
236
+
170
237
  if (MSVC)
171
238
  # instruction set detection for MSVC only
172
239
  if (GGML_NATIVE)
@@ -296,21 +363,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
296
363
  # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
297
364
  message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
298
365
  endif()
299
-
300
- # The feature detection code is compiled as a separate target so that
301
- # it can be built without the architecture flags
302
- # Since multiple variants of the CPU backend may be included in the same
303
- # build, using set_source_files_properties() to set the arch flags is not possible
304
- set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
305
- add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
306
- target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
307
- target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
308
- target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
309
- set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
310
- target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
366
+ ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
311
367
  endif()
312
368
  elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
313
369
  message(STATUS "PowerPC detected")
370
+ list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
314
371
  if (GGML_NATIVE)
315
372
  if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
316
373
  file(READ "/proc/cpuinfo" POWER10_M)
@@ -318,7 +375,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
318
375
  execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
319
376
  endif()
320
377
 
321
- string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
378
+ string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
379
+ string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
322
380
  string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
323
381
 
324
382
  if (EXTRACTED_NUMBER GREATER_EQUAL 10)
@@ -330,6 +388,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
330
388
  else()
331
389
  list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
332
390
  endif()
391
+ elseif(GGML_CPU_ALL_VARIANTS)
392
+ # Begin with the lowest baseline
393
+ set(ARCH_DEFINITIONS "")
394
+
395
+ # When a feature is selected, bump the MCPU to the first
396
+ # version that supported it
397
+ foreach(PVER RANGE 7 11)
398
+ if(DEFINED GGML_INTERNAL_POWER${PVER})
399
+ set(POWERPC_MCPU "power${PVER}")
400
+ list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
401
+ endif()
402
+ endforeach()
403
+ if (GGML_INTERNAL_VSX)
404
+ list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
405
+ list(APPEND ARCH_FLAGS -mvsx)
406
+ endif()
407
+
408
+ if (DEFINED POWERPC_MCPU)
409
+ list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
410
+ endif()
411
+ ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
333
412
  else()
334
413
  if (GGML_CPU_POWERPC_CPUTYPE)
335
414
  list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
@@ -337,6 +416,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
337
416
  endif()
338
417
  elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
339
418
  message(STATUS "loongarch64 detected")
419
+ list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
420
+
340
421
  list(APPEND ARCH_FLAGS -march=loongarch64)
341
422
  if (GGML_LASX)
342
423
  list(APPEND ARCH_FLAGS -mlasx)
@@ -346,6 +427,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
346
427
  endif()
347
428
  elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
348
429
  message(STATUS "riscv64 detected")
430
+ list(APPEND GGML_CPU_SOURCES
431
+ ggml-cpu/arch/riscv/quants.c
432
+ ggml-cpu/arch/riscv/repack.cpp
433
+ )
349
434
  if (GGML_RVV)
350
435
  if (GGML_XTHEADVECTOR)
351
436
  list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
@@ -357,11 +442,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
357
442
  endif()
358
443
  elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
359
444
  message(STATUS "s390x detected")
445
+ list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
360
446
  file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
361
447
  string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
362
448
 
363
449
  # TODO: Separation to determine activation of VX/VXE/VXE2
364
450
  if (${S390X_M} MATCHES "8561|8562")
451
+ set(GGML_NNPA OFF)
365
452
  message(STATUS "z15 target")
366
453
  list(APPEND ARCH_FLAGS -march=z15)
367
454
  elseif (${S390X_M} MATCHES "3931")
@@ -378,14 +465,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
378
465
  endif()
379
466
 
380
467
  if (GGML_VXE)
468
+ message(STATUS "VX/VXE/VXE2 enabled")
381
469
  list(APPEND ARCH_FLAGS -mvx -mzvector)
470
+ list(APPEND ARCH_DEFINITIONS GGML_VXE)
471
+ endif()
472
+
473
+ if (GGML_NNPA)
474
+ message(STATUS "NNPA enabled")
475
+ list(APPEND ARCH_DEFINITIONS GGML_NNPA)
382
476
  endif()
477
+ elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
478
+ message(STATUS "Wasm detected")
479
+ list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
383
480
  else()
384
- message(STATUS "Unknown architecture")
481
+ message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
482
+ list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
385
483
  endif()
386
484
 
387
- if (GGML_CPU_AARCH64)
388
- target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
485
+ if (GGML_CPU_REPACK)
486
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
389
487
  endif()
390
488
 
391
489
  if (GGML_CPU_KLEIDIAI)
@@ -396,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
396
494
 
397
495
  # Fetch KleidiAI sources:
398
496
  include(FetchContent)
399
- set(KLEIDIAI_COMMIT_TAG "v1.6.0")
497
+ set(KLEIDIAI_COMMIT_TAG "v1.9.0")
400
498
  set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
401
- set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")
499
+ set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
402
500
 
403
501
  if (POLICY CMP0135)
404
502
  cmake_policy(SET CMP0135 NEW)
@@ -5,7 +5,7 @@
5
5
  #include "ggml-backend.h"
6
6
  #include "ggml-impl.h"
7
7
  #include "ggml-cpu.h"
8
- #include "ggml-cpu-traits.h"
8
+ #include "traits.h"
9
9
 
10
10
  #if defined(__gnu_linux__)
11
11
  #include <sys/syscall.h>
@@ -8,7 +8,8 @@
8
8
  #include "mmq.h"
9
9
  #include "ggml-impl.h"
10
10
  #include "ggml-cpu-impl.h"
11
- #include "ggml-cpu-quants.h"
11
+ #include "simd-mappings.h"
12
+ #include "quants.h"
12
13
  #include "ggml-quants.h"
13
14
  #include <algorithm>
14
15
  #include <type_traits>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
453
454
 
454
455
  // Quantize these floats
455
456
  const float iscale = 127.f / amax;
456
- y[i].d = GGML_FP32_TO_FP16(1 / iscale);
457
+ y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
457
458
  const float id = ( amax != 0.0f ) ? iscale : 0.f;
458
459
  const __m512 vscale = _mm512_set1_ps(id);
459
460
 
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
1090
1091
  const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
1091
1092
 
1092
1093
  for (int m = 0; m < nr; ++m) {
1093
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
1094
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
1094
1095
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
1095
1096
 
1096
1097
  __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
1113
1114
  const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
1114
1115
 
1115
1116
  for (int m = 0; m < nr; ++m) {
1116
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
1117
- const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
1117
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
1118
+ const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
1118
1119
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
1119
1120
 
1120
1121
  __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
1137
1138
  const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
1138
1139
 
1139
1140
  for (int m = 0; m < nr; ++m) {
1140
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
1141
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
1141
1142
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
1142
1143
 
1143
1144
  __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
1437
1438
  va[k] = _mm512_set1_epi32(a_ptr[k]);
1438
1439
  vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
1439
1440
  }
1440
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
1441
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
1441
1442
  }
1442
1443
 
1443
1444
  // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
1498
1499
  for (int k = 0; k < 8; ++k) {
1499
1500
  va[k] = _mm512_set1_epi32(a_ptr[k]);
1500
1501
  }
1501
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
1502
- vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
1502
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
1503
+ vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
1503
1504
  }
1504
1505
 
1505
1506
  // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
1571
1572
  va[k] = _mm512_set1_epi32(a_ptr[k]);
1572
1573
  va[k] = _mm512_add_epi8(va[k], off);
1573
1574
  }
1574
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
1575
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
1575
1576
  }
1576
1577
 
1577
1578
  // load b