@novastera-oss/llamarn 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/PureCppImpl.cpp +9 -27
  14. package/cpp/SystemUtils.h +2 -2
  15. package/cpp/build-info.cpp +2 -2
  16. package/cpp/llama.cpp/README.md +11 -3
  17. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  18. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  19. package/cpp/llama.cpp/common/arg.cpp +153 -113
  20. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  21. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  22. package/cpp/llama.cpp/common/chat.cpp +847 -699
  23. package/cpp/llama.cpp/common/chat.h +73 -6
  24. package/cpp/llama.cpp/common/common.cpp +50 -82
  25. package/cpp/llama.cpp/common/common.h +21 -17
  26. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  27. package/cpp/llama.cpp/common/json-partial.h +37 -0
  28. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  29. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  30. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  31. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  32. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  33. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  34. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  35. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  36. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  37. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  38. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  39. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  40. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  75. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  120. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  121. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  122. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  123. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  124. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  125. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  126. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  127. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  128. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  129. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  130. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  131. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  132. package/cpp/llama.cpp/include/llama.h +62 -125
  133. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  150. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  152. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  159. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  160. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  161. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  162. package/cpp/llama.cpp/models/templates/README.md +2 -0
  163. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  164. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  165. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  166. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  167. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  168. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  169. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  170. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  171. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  172. package/cpp/llama.cpp/src/llama-context.h +30 -0
  173. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  174. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  175. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  176. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  177. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  178. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  179. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  180. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  181. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  182. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  183. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  184. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  185. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  186. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  187. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  188. package/cpp/llama.cpp/src/llama-model.h +6 -1
  189. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  190. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  191. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  192. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  193. package/cpp/llama.cpp/src/llama.cpp +14 -0
  194. package/cpp/rn-completion.cpp +60 -5
  195. package/ios/include/chat.h +73 -6
  196. package/ios/include/common/minja/chat-template.hpp +9 -5
  197. package/ios/include/common/minja/minja.hpp +69 -36
  198. package/ios/include/common.h +21 -17
  199. package/ios/include/llama.h +62 -125
  200. package/ios/libs/llama.xcframework/Info.plist +19 -19
  201. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  205. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  206. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  212. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  213. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  227. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  228. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  233. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  234. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  240. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  241. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  242. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  246. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  247. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  253. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  254. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  255. package/package.json +1 -1
  256. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  257. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  267. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  268. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
@@ -17,29 +17,98 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
17
17
 
18
18
  #if defined(GGML_SIMD)
19
19
  float sumf = 0.0f;
20
- const int np = (n & ~(GGML_F32_STEP - 1));
21
20
 
22
- GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
21
+ #if defined(__ARM_FEATURE_SVE)
22
+ const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
23
+ const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
24
+ const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
25
+
26
+ const int np = (n & ~(ggml_f32_step - 1));
27
+ svfloat32_t sum1 = svdup_n_f32(0.0f);
28
+ svfloat32_t sum2 = svdup_n_f32(0.0f);
29
+ svfloat32_t sum3 = svdup_n_f32(0.0f);
30
+ svfloat32_t sum4 = svdup_n_f32(0.0f);
31
+ svfloat32_t sum5 = svdup_n_f32(0.0f);
32
+ svfloat32_t sum6 = svdup_n_f32(0.0f);
33
+ svfloat32_t sum7 = svdup_n_f32(0.0f);
34
+ svfloat32_t sum8 = svdup_n_f32(0.0f);
35
+ svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
36
+ svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
37
+ for (int i = 0; i < np; i += ggml_f32_step) {
38
+ ax1 = GGML_F32_VEC_LOAD(x + i);
39
+ ay1 = GGML_F32_VEC_LOAD(y + i);
40
+ sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
41
+
42
+ ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
43
+ ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
44
+ sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
45
+
46
+ ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
47
+ ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
48
+ sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
49
+
50
+ ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
51
+ ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
52
+ sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
53
+
54
+ ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
55
+ ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
56
+ sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
57
+
58
+ ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
59
+ ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
60
+ sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
61
+
62
+ ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
63
+ ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
64
+ sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
65
+
66
+ ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
67
+ ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
68
+ sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
69
+ }
70
+ // leftovers
71
+ // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
72
+ const int np2 = (n & ~(ggml_f32_epr - 1));
73
+ for (int i = np; i < np2; i += ggml_f32_epr) {
74
+ ax1 = GGML_F32_VEC_LOAD(x + i);
75
+ ay1 = GGML_F32_VEC_LOAD(y + i);
76
+ sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
77
+ }
78
+ // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
79
+ if (np2 < n) {
80
+ svbool_t pg = svwhilelt_b32(np2, n);
81
+ ax1 = svld1_f32(pg, x + np2);
82
+ ay1 = svld1_f32(pg, y + np2);
83
+ sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
84
+ }
85
+ // reduce sum1,sum2 to sum1
86
+ GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
87
+ #else
88
+ const int np = (n & ~(GGML_F32_STEP - 1));
23
89
 
24
- GGML_F32_VEC ax[GGML_F32_ARR];
25
- GGML_F32_VEC ay[GGML_F32_ARR];
90
+ GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
26
91
 
27
- for (int i = 0; i < np; i += GGML_F32_STEP) {
28
- for (int j = 0; j < GGML_F32_ARR; j++) {
29
- ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
30
- ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
92
+ GGML_F32_VEC ax[GGML_F32_ARR];
93
+ GGML_F32_VEC ay[GGML_F32_ARR];
31
94
 
32
- sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
95
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
96
+ for (int j = 0; j < GGML_F32_ARR; j++) {
97
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
98
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
99
+
100
+ sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
101
+ }
33
102
  }
34
- }
35
103
 
36
- // reduce sum0..sum3 to sum0
37
- GGML_F32_VEC_REDUCE(sumf, sum);
104
+ // reduce sum0..sum3 to sum0
105
+ GGML_F32_VEC_REDUCE(sumf, sum);
38
106
 
39
- // leftovers
40
- for (int i = np; i < n; ++i) {
41
- sumf += x[i]*y[i];
42
- }
107
+ // leftovers
108
+ for (int i = np; i < n; ++i) {
109
+ sumf += x[i]*y[i];
110
+ }
111
+ #endif
43
112
  #else
44
113
  // scalar
45
114
  ggml_float sumf = 0.0;
@@ -5,6 +5,7 @@
5
5
  #include "ggml-impl.h"
6
6
  #include "simd-mappings.h"
7
7
  #include "ggml.h"
8
+ #include "ggml-cpu.h"
8
9
 
9
10
  #if defined(GGML_USE_ACCELERATE)
10
11
  #include <Accelerate/Accelerate.h>
@@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
148
149
 
149
150
  inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
150
151
  #if defined(GGML_SIMD)
151
- const int np = (n & ~(GGML_F32_STEP - 1));
152
+ #if defined(__ARM_FEATURE_SVE)
152
153
 
153
- GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
154
+ const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
155
+ const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
156
+ const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
157
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
154
158
 
155
- GGML_F32_VEC ax[GGML_F32_ARR];
156
- GGML_F32_VEC ay[GGML_F32_ARR];
159
+ const int np = (n & ~(ggml_f32_step - 1));
160
+ svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
161
+ svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
162
+ for (int i = 0; i < np; i += ggml_f32_step) {
157
163
 
158
- for (int i = 0; i < np; i += GGML_F32_STEP) {
159
- for (int j = 0; j < GGML_F32_ARR; j++) {
160
- ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
161
- ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
162
- ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
164
+ ax1 = GGML_F32_VEC_LOAD(x + i);
165
+ ay1 = GGML_F32_VEC_LOAD(y + i);
166
+ ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
163
167
 
164
- GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
168
+ GGML_F32_VEC_STORE(y + i, ay1);
169
+
170
+ ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
171
+ ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
172
+ ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
173
+
174
+ GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
175
+
176
+ ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
177
+ ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
178
+ ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
179
+
180
+ GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
181
+
182
+ ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
183
+ ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
184
+ ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
185
+
186
+ GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
187
+
188
+ ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
189
+ ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
190
+ ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
191
+
192
+ GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
193
+
194
+ ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
195
+ ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
196
+ ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
197
+
198
+ GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
199
+
200
+ ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
201
+ ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
202
+ ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
203
+
204
+ GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
205
+
206
+ ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
207
+ ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
208
+ ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
209
+
210
+ GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
165
211
  }
166
- }
212
+ // leftovers
213
+ // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
214
+ const int np2 = (n & ~(ggml_f32_epr - 1));
215
+ for (int i = np; i < np2; i += ggml_f32_epr) {
216
+ ax1 = GGML_F32_VEC_LOAD(x + i);
217
+ ay1 = GGML_F32_VEC_LOAD(y + i);
218
+ ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
219
+
220
+ GGML_F32_VEC_STORE(y + i, ay1);
221
+ }
222
+ // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
223
+ if (np2 < n) {
224
+ svbool_t pg =svwhilelt_b32(np2, n);
225
+ ax1 = svld1_f32(pg, x + np2);
226
+ ay1 = svld1_f32(pg, y + np2);
227
+ ay1 = svmad_f32_m(pg, ax1, vx, ay1);
228
+
229
+ svst1_f32(pg, y + np2, ay1);
230
+ }
231
+ #else
232
+ const int np = (n & ~(GGML_F32_STEP - 1));
167
233
 
168
- // leftovers
169
- for (int i = np; i < n; ++i) {
170
- y[i] += x[i]*v;
171
- }
234
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
235
+
236
+ GGML_F32_VEC ax[GGML_F32_ARR];
237
+ GGML_F32_VEC ay[GGML_F32_ARR];
238
+
239
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
240
+ for (int j = 0; j < GGML_F32_ARR; j++) {
241
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
242
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
243
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
244
+
245
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
246
+ }
247
+ }
248
+
249
+ // leftovers
250
+ for (int i = np; i < n; ++i) {
251
+ y[i] += x[i]*v;
252
+ }
253
+ #endif
172
254
  #else
173
255
  // scalar
174
256
  for (int i = 0; i < n; ++i) {
@@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
220
302
  }
221
303
 
222
304
  #if defined(GGML_SIMD)
223
- const int np = (n & ~(GGML_F32_STEP - 1));
305
+ #if defined(__ARM_FEATURE_SVE)
306
+ // route to scalar implementation // TODO: write SVE code
307
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
308
+ for (int i = 0; i < n; ++i) {
309
+ y[i] += x[k][i]*v[k][0];
310
+ }
311
+ }
312
+ #else
313
+ const int np = (n & ~(GGML_F32_STEP - 1));
224
314
 
225
- GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
315
+ GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
226
316
 
227
- for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
228
- vx[k] = GGML_F32_VEC_SET1(v[k][0]);
229
- }
317
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
318
+ vx[k] = GGML_F32_VEC_SET1(v[k][0]);
319
+ }
230
320
 
231
- GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
232
- GGML_F32_VEC ay[GGML_F32_ARR];
321
+ GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
322
+ GGML_F32_VEC ay[GGML_F32_ARR];
233
323
 
234
- for (int i = 0; i < np; i += GGML_F32_STEP) {
235
- for (int j = 0; j < GGML_F32_ARR; j++) {
236
- ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
324
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
325
+ for (int j = 0; j < GGML_F32_ARR; j++) {
326
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
237
327
 
238
- for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
239
- ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
240
- ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
241
- }
328
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
329
+ ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
330
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
331
+ }
242
332
 
243
- GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
333
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
334
+ }
244
335
  }
245
- }
246
336
 
247
- // leftovers
248
- for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
249
- for (int i = np; i < n; ++i) {
250
- y[i] += x[k][i]*v[k][0];
337
+ // leftovers
338
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
339
+ for (int i = np; i < n; ++i) {
340
+ y[i] += x[k][i]*v[k][0];
341
+ }
251
342
  }
252
- }
343
+ #endif
253
344
  #else
254
345
  // scalar
255
346
  for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
265
356
  #if defined(GGML_USE_ACCELERATE)
266
357
  vDSP_vsmul(y, 1, &v, y, 1, n);
267
358
  #elif defined(GGML_SIMD)
268
- const int np = (n & ~(GGML_F32_STEP - 1));
359
+ #if defined(__ARM_FEATURE_SVE)
360
+ const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
361
+ const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
362
+ const int ggml_f32_step = 2 * ggml_f32_epr;
363
+
364
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
365
+ const int np = (n & ~(ggml_f32_step - 1));
366
+ svfloat32_t ay1;
367
+ svfloat32_t ay2;
368
+ for (int i = 0; i < np; i += ggml_f32_step) {
369
+ ay1 = GGML_F32_VEC_LOAD(y + i);
370
+ ay1 = GGML_F32_VEC_MUL(ay1, vx);
371
+ GGML_F32_VEC_STORE(y + i, ay1);
372
+
373
+ ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
374
+ ay2 = GGML_F32_VEC_MUL(ay2, vx);
375
+ GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
376
+ }
377
+ // leftovers
378
+ // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
379
+ if (np < n) {
380
+ svbool_t pg = svwhilelt_b32(np, n);
381
+ ay1 = svld1_f32(pg, y + np);
382
+ ay1 = svmul_f32_m(pg, ay1, vx);
383
+ svst1_f32(pg, y + np, ay1);
384
+ }
385
+ #else
386
+ const int np = (n & ~(GGML_F32_STEP - 1));
269
387
 
270
- GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
388
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
271
389
 
272
- GGML_F32_VEC ay[GGML_F32_ARR];
390
+ GGML_F32_VEC ay[GGML_F32_ARR];
273
391
 
274
- for (int i = 0; i < np; i += GGML_F32_STEP) {
275
- for (int j = 0; j < GGML_F32_ARR; j++) {
276
- ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
277
- ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
392
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
393
+ for (int j = 0; j < GGML_F32_ARR; j++) {
394
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
395
+ ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
278
396
 
279
- GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
397
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
398
+ }
280
399
  }
281
- }
282
400
 
283
- // leftovers
284
- for (int i = np; i < n; ++i) {
285
- y[i] *= v;
286
- }
401
+ // leftovers
402
+ for (int i = np; i < n; ++i) {
403
+ y[i] *= v;
404
+ }
405
+ #endif
287
406
  #else
288
407
  // scalar
289
408
  for (int i = 0; i < n; ++i) {
@@ -428,6 +547,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp
428
547
  static const float GELU_COEF_A = 0.044715f;
429
548
  static const float GELU_QUICK_COEF = -1.702f;
430
549
  static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
550
+ static const float SQRT_2_INV = 0.70710678118654752440084436210484f;
431
551
 
432
552
  inline static float ggml_gelu_f32(float x) {
433
553
  return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -440,6 +560,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
440
560
  }
441
561
  }
442
562
 
563
+ inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
564
+ for (int i = 0; i < n; ++i) {
565
+ float xi = GGML_FP16_TO_FP32(x[i]);
566
+ float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
567
+ y[i] = GGML_FP32_TO_FP16(res);
568
+ }
569
+ }
570
+
443
571
  #ifdef GGML_GELU_FP16
444
572
  inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
445
573
  uint16_t t;
@@ -463,6 +591,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
463
591
  }
464
592
  #endif
465
593
 
594
+ inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
595
+ for (int i = 0; i < n; ++i) {
596
+ float xi = x[i];
597
+ y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
598
+ }
599
+ }
600
+
466
601
  inline static float ggml_gelu_quick_f32(float x) {
467
602
  return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
468
603
  }
@@ -512,6 +647,42 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
512
647
  #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
513
648
  #endif
514
649
 
650
+ /* Below function was borrowed from the GitHub repository:
651
+ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
652
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
653
+ inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
654
+ // Constants
655
+ const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
656
+ const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
657
+ const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
658
+ const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
659
+ const svfloat32_t one = svdup_n_f32(1.0f);
660
+ const svfloat32_t inactive1 = svdup_n_f32(0.0f);
661
+ const svint32_t inactive2 = svdup_n_s32(0);
662
+
663
+ // Algorithm starts here
664
+ svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
665
+ svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // rount to int (float)
666
+ svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n
667
+
668
+ t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
669
+ t1 = svadd_f32_m(pg, t1, one); // b = a + 1
670
+
671
+ svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
672
+ svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
673
+ t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)
674
+
675
+ // and_(t2.d, t1.d, not_mask17.d)
676
+ svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
677
+ t5 = svsub_f32_m(pg, t1, t5); // z
678
+ t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
679
+ t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
680
+ t0 = svmul_f32_m(pg, t0, t4); // Final result
681
+
682
+ return t0;
683
+ }
684
+ #endif
685
+
515
686
  #if defined(__ARM_NEON) && defined(__aarch64__)
516
687
 
517
688
  // adapted from arm limited optimized routine
@@ -1,47 +1,61 @@
1
1
  #include "acc.cuh"
2
2
 
3
- static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
4
- const int ne10, const int ne11, const int ne12,
5
- const int nb1, const int nb2, int offset) {
6
- const int i = blockDim.x * blockIdx.x + threadIdx.x;
3
+ static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
4
+ const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
5
+ const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
6
+ const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
7
+
7
8
  if (i >= ne) {
8
9
  return;
9
10
  }
10
- int src1_idx = i - offset;
11
- int oz = src1_idx / nb2;
12
- int oy = (src1_idx - (oz * nb2)) / nb1;
13
- int ox = src1_idx % nb1;
14
- if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
15
- dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
16
- } else {
17
- dst[i] = x[i];
11
+
12
+ int64_t src1_idx = i - offset;
13
+
14
+ int64_t tmp = src1_idx;
15
+ const int64_t i13 = tmp / s13;
16
+ tmp -= i13 * s13;
17
+ const int64_t i12 = tmp / s12;
18
+ tmp -= i12 * s12;
19
+ const int64_t i11 = tmp / s11;
20
+ tmp -= i11 * s11;
21
+ const int64_t i10 = tmp;
22
+
23
+ float val = x[i];
24
+ if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
25
+ val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
18
26
  }
27
+ dst[i] = val;
19
28
  }
20
29
 
21
- static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
22
- const int ne10, const int ne11, const int ne12,
23
- const int nb1, const int nb2, const int offset, cudaStream_t stream) {
24
- int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
25
- acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
30
+ static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
31
+ const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
32
+ const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
33
+ const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
34
+ acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
26
35
  }
27
36
 
28
37
  void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
29
38
  const ggml_tensor * src0 = dst->src[0];
30
39
  const ggml_tensor * src1 = dst->src[1];
31
- const float * src0_d = (const float *)src0->data;
32
- const float * src1_d = (const float *)src1->data;
33
- float * dst_d = (float *)dst->data;
40
+
41
+ const float * src0_d = (const float *) src0->data;
42
+ const float * src1_d = (const float *) src1->data;
43
+ float * dst_d = (float *) dst->data;
44
+
34
45
  cudaStream_t stream = ctx.stream();
35
46
 
36
47
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
37
48
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
38
49
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
39
- GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
40
50
 
41
- int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
42
- int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
43
- // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
44
- int offset = dst->op_params[3] / 4; // offset in bytes
51
+ GGML_ASSERT(ggml_is_contiguous(src1));
52
+ GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
53
+ GGML_ASSERT(ggml_is_contiguously_allocated(dst));
54
+
55
+ const int64_t s1 = dst->op_params[0] / sizeof(float);
56
+ const int64_t s2 = dst->op_params[1] / sizeof(float);
57
+ const int64_t s3 = dst->op_params[2] / sizeof(float);
58
+ const int64_t offset = dst->op_params[3] / sizeof(float);
45
59
 
46
- acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
60
+ acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
47
61
  }
@@ -168,7 +168,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
168
168
 
169
169
  #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
170
170
 
171
- #if !defined(GGML_USE_HIP)
171
+ #if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
172
172
  static const char * cu_get_error_str(CUresult err) {
173
173
  const char * err_str;
174
174
  cuGetErrorString(err, &err_str);
@@ -1,5 +1,8 @@
1
1
  #include "cpy.cuh"
2
2
  #include "dequantize.cuh"
3
+ #ifdef GGML_USE_MUSA
4
+ #include "ggml-musa/mudnn.cuh"
5
+ #endif // GGML_USE_MUSA
3
6
 
4
7
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
5
8
 
@@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
597
600
  #endif
598
601
  if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
599
602
  GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
600
- CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
603
+ #ifdef GGML_USE_MUSA
604
+ if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
605
+ CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
606
+ } else
607
+ #endif // GGML_USE_MUSA
608
+ {
609
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
610
+ }
601
611
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
602
612
  ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
603
613
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
623
623
  __builtin_assume(tid < D);
624
624
 
625
625
  extern __shared__ float2 meta[];
626
- if (tid < 2*parallel_blocks) {
627
- ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
626
+ for (int i = tid; i < 2*parallel_blocks; i += D) {
627
+ ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
628
628
  }
629
629
 
630
630
  __syncthreads();
@@ -678,10 +678,14 @@ void launch_fattn(
678
678
  ) {
679
679
  constexpr int ncols = ncols1 * ncols2;
680
680
 
681
+ const bool is_mla = DV == 512; // TODO better parameterization
682
+
681
683
  const ggml_tensor * Q = dst->src[0];
682
684
  const ggml_tensor * K = dst->src[1];
683
685
  const ggml_tensor * V = dst->src[2];
684
686
 
687
+ GGML_ASSERT(V || is_mla);
688
+
685
689
  const ggml_tensor * mask = dst->src[3];
686
690
 
687
691
  ggml_tensor * KQV = dst;
@@ -689,6 +693,10 @@ void launch_fattn(
689
693
  GGML_ASSERT(Q->type == GGML_TYPE_F32);
690
694
  GGML_ASSERT(KQV->type == GGML_TYPE_F32);
691
695
 
696
+ GGML_ASSERT( Q->nb[0] == ggml_element_size(Q));
697
+ GGML_ASSERT( K->nb[0] == ggml_element_size(K));
698
+ GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));
699
+
692
700
  GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
693
701
  GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
694
702
  "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
@@ -713,10 +721,10 @@ void launch_fattn(
713
721
  size_t nb12 = K->nb[2];
714
722
  size_t nb13 = K->nb[3];
715
723
 
716
- const char * V_data = (const char *) V->data;
717
- size_t nb21 = V->nb[1];
718
- size_t nb22 = V->nb[2];
719
- size_t nb23 = V->nb[3];
724
+ const char * V_data = V ? (const char *) V->data : nullptr;
725
+ size_t nb21 = V ? V->nb[1] : nb11;
726
+ size_t nb22 = V ? V->nb[2] : nb12;
727
+ size_t nb23 = V ? V->nb[3] : nb13;
720
728
 
721
729
  if (need_f16_K && K->type != GGML_TYPE_F16) {
722
730
  GGML_ASSERT(ggml_is_contiguously_allocated(K));
@@ -733,7 +741,7 @@ void launch_fattn(
733
741
  nb13 = nb13*bs*sizeof(half)/ts;
734
742
  }
735
743
 
736
- if (need_f16_V && V->type != GGML_TYPE_F16) {
744
+ if (V && need_f16_V && V->type != GGML_TYPE_F16) {
737
745
  GGML_ASSERT(ggml_is_contiguously_allocated(V));
738
746
  V_f16.alloc(ggml_nelements(V));
739
747
  to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);