@novastera-oss/llamarn 0.2.9 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (314)
  1. package/android/build.gradle +2 -1
  2. package/android/proguard-rules.pro +12 -0
  3. package/android/src/main/cpp/include/llama.h +15 -47
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  22. package/cpp/llama.cpp/CMakePresets.json +11 -0
  23. package/cpp/llama.cpp/CODEOWNERS +1 -0
  24. package/cpp/llama.cpp/README.md +8 -8
  25. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  26. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  27. package/cpp/llama.cpp/common/arg.cpp +62 -1
  28. package/cpp/llama.cpp/common/chat.cpp +37 -20
  29. package/cpp/llama.cpp/common/chat.h +2 -0
  30. package/cpp/llama.cpp/common/common.cpp +22 -6
  31. package/cpp/llama.cpp/common/common.h +22 -4
  32. package/cpp/llama.cpp/convert_hf_to_gguf.py +1250 -43
  33. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +21 -13
  34. package/cpp/llama.cpp/ggml/CMakeLists.txt +13 -3
  35. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
  36. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  37. package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  38. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  39. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
  41. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -8
  42. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +44 -38
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +126 -8
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
  46. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +138 -18
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +11 -3
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1206 -163
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +36 -9
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +31 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +86 -17
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -64
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +47 -60
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +29 -42
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +46 -59
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -45
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +38 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +23 -36
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +255 -99
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -695
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +104 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +13 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +27 -6
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-impl.h +80 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  97. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +48 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +572 -106
  99. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +599 -105
  100. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +5 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +800 -42
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +191 -55
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  131. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +991 -307
  132. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +59 -12
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  138. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  139. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  140. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  141. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  142. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  143. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  144. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +17 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +18 -3
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +84 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
  173. package/cpp/llama.cpp/ggml/src/ggml.c +386 -67
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +307 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  177. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
  178. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
  179. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -47
  180. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  181. package/cpp/llama.cpp/include/llama.h +15 -47
  182. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
  183. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
  184. package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
  185. package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
  186. package/cpp/llama.cpp/src/llama-arch.cpp +316 -3
  187. package/cpp/llama.cpp/src/llama-arch.h +23 -1
  188. package/cpp/llama.cpp/src/llama-batch.cpp +103 -71
  189. package/cpp/llama.cpp/src/llama-batch.h +31 -18
  190. package/cpp/llama.cpp/src/llama-chat.cpp +58 -1
  191. package/cpp/llama.cpp/src/llama-chat.h +3 -0
  192. package/cpp/llama.cpp/src/llama-context.cpp +180 -106
  193. package/cpp/llama.cpp/src/llama-context.h +26 -16
  194. package/cpp/llama.cpp/src/llama-cparams.h +3 -2
  195. package/cpp/llama.cpp/src/llama-graph.cpp +310 -211
  196. package/cpp/llama.cpp/src/llama-graph.h +184 -122
  197. package/cpp/llama.cpp/src/llama-hparams.cpp +47 -1
  198. package/cpp/llama.cpp/src/llama-hparams.h +13 -2
  199. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  200. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  201. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  202. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  203. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  204. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  205. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  206. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +36 -11
  207. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  208. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  209. package/cpp/llama.cpp/src/llama-model.cpp +3545 -719
  210. package/cpp/llama.cpp/src/llama-model.h +21 -4
  211. package/cpp/llama.cpp/src/llama-quant.cpp +2 -2
  212. package/cpp/llama.cpp/src/llama-vocab.cpp +376 -10
  213. package/cpp/llama.cpp/src/llama-vocab.h +43 -0
  214. package/cpp/llama.cpp/src/unicode.cpp +207 -0
  215. package/cpp/llama.cpp/src/unicode.h +2 -0
  216. package/ios/include/chat.h +2 -0
  217. package/ios/include/common.h +22 -4
  218. package/ios/include/llama.h +15 -47
  219. package/ios/libs/llama.xcframework/Info.plist +13 -13
  220. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  221. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  223. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  224. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -47
  225. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  231. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  232. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3766
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -47
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -47
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -47
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  248. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
  250. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -47
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  254. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  255. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  261. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  262. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -4926
  263. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  264. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  265. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -47
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  267. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  268. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -4897
  269. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3794
  270. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  274. package/package.json +4 -4
  275. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  276. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  277. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  278. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  279. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  280. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -22,21 +22,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
22
22
  return t->view_src != NULL;
23
23
  }
24
24
 
25
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
- if (a->type != b->type) {
27
- return false;
28
- }
29
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
- if (a->ne[i] != b->ne[i]) {
31
- return false;
32
- }
33
- if (a->nb[i] != b->nb[i]) {
34
- return false;
35
- }
36
- }
37
- return true;
38
- }
39
-
40
25
  // ops that return true for this function must not use restrict pointers for their backend implementations
41
26
  static bool ggml_op_can_inplace(enum ggml_op op) {
42
27
  switch (op) {
@@ -45,6 +45,10 @@
45
45
  #include "ggml-vulkan.h"
46
46
  #endif
47
47
 
48
+ #ifdef GGML_USE_WEBGPU
49
+ #include "ggml-webgpu.h"
50
+ #endif
51
+
48
52
  #ifdef GGML_USE_OPENCL
49
53
  #include "ggml-opencl.h"
50
54
  #endif
@@ -61,10 +65,6 @@
61
65
  #include "ggml-cann.h"
62
66
  #endif
63
67
 
64
- #ifdef GGML_USE_KOMPUTE
65
- #include "ggml-kompute.h"
66
- #endif
67
-
68
68
  // disable C++17 deprecation warning for std::codecvt_utf8
69
69
  #if defined(__clang__)
70
70
  # pragma clang diagnostic push
@@ -177,6 +177,9 @@ struct ggml_backend_registry {
177
177
  #ifdef GGML_USE_VULKAN
178
178
  register_backend(ggml_backend_vk_reg());
179
179
  #endif
180
+ #ifdef GGML_USE_WEBGPU
181
+ register_backend(ggml_backend_webgpu_reg());
182
+ #endif
180
183
  #ifdef GGML_USE_OPENCL
181
184
  register_backend(ggml_backend_opencl_reg());
182
185
  #endif
@@ -189,9 +192,6 @@ struct ggml_backend_registry {
189
192
  #ifdef GGML_USE_RPC
190
193
  register_backend(ggml_backend_rpc_reg());
191
194
  #endif
192
- #ifdef GGML_USE_KOMPUTE
193
- register_backend(ggml_backend_kompute_reg());
194
- #endif
195
195
  #ifdef GGML_USE_CPU
196
196
  register_backend(ggml_backend_cpu_reg());
197
197
  #endif
@@ -575,7 +575,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
575
575
  ggml_backend_load_best("cann", silent, dir_path);
576
576
  ggml_backend_load_best("cuda", silent, dir_path);
577
577
  ggml_backend_load_best("hip", silent, dir_path);
578
- ggml_backend_load_best("kompute", silent, dir_path);
579
578
  ggml_backend_load_best("metal", silent, dir_path);
580
579
  ggml_backend_load_best("rpc", silent, dir_path);
581
580
  ggml_backend_load_best("sycl", silent, dir_path);
@@ -352,21 +352,6 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
352
352
 
353
353
  // backend copy
354
354
 
355
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
356
- if (a->type != b->type) {
357
- return false;
358
- }
359
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
360
- if (a->ne[i] != b->ne[i]) {
361
- return false;
362
- }
363
- if (a->nb[i] != b->nb[i]) {
364
- return false;
365
- }
366
- }
367
- return true;
368
- }
369
-
370
355
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
371
356
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
372
357
 
@@ -662,6 +647,7 @@ struct ggml_backend_sched {
662
647
  // pipeline parallelism support
663
648
  int n_copies;
664
649
  int cur_copy;
650
+ int next_copy;
665
651
  ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
666
652
  struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
667
653
  int n_graph_inputs;
@@ -817,8 +803,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
817
803
  }
818
804
  if (sched->debug > 1) {
819
805
  ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
820
- GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
821
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
806
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
807
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
808
+ graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
822
809
  for (int j = 0; j < GGML_MAX_SRC; j++) {
823
810
  struct ggml_tensor * src = node->src[j];
824
811
  if (src == NULL) {
@@ -1447,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1447
1434
  }
1448
1435
  }
1449
1436
 
1450
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1451
-
1452
1437
  return GGML_STATUS_SUCCESS;
1453
1438
  }
1454
1439
 
@@ -1549,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1549
1534
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1550
1535
  GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1551
1536
 
1552
- ggml_backend_sched_split_graph(sched, measure_graph);
1553
-
1554
1537
  ggml_backend_sched_synchronize(sched);
1555
1538
 
1539
+ ggml_backend_sched_split_graph(sched, measure_graph);
1540
+
1556
1541
  if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1557
1542
  return false;
1558
1543
  }
@@ -1564,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
1564
1549
 
1565
1550
  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1566
1551
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1552
+ GGML_ASSERT(!sched->is_alloc);
1553
+
1554
+ sched->cur_copy = sched->next_copy;
1555
+ sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
1567
1556
 
1568
1557
  ggml_backend_sched_split_graph(sched, graph);
1569
1558
 
@@ -1604,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
1604
1593
  // if the graph is not already allocated, always use copy 0 after a synchronization
1605
1594
  // this ensures that during generation the same copy is used every time,
1606
1595
  // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
1607
- sched->cur_copy = 0;
1596
+ sched->next_copy = 0;
1608
1597
  }
1609
1598
  }
1610
1599
 
@@ -1826,7 +1815,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
1826
1815
  ggml_free(copy.ctx_unallocated);
1827
1816
  }
1828
1817
 
1829
- bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
1818
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
1830
1819
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
1831
1820
  if (copy.buffer == NULL) {
1832
1821
  return false;
@@ -1837,28 +1826,45 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
1837
1826
 
1838
1827
  assert(g1->n_nodes == g2->n_nodes);
1839
1828
 
1840
- for (int i = 0; i < g1->n_nodes; i++) {
1841
- struct ggml_tensor * t1 = g1->nodes[i];
1842
- struct ggml_tensor * t2 = g2->nodes[i];
1829
+ if (test_node != nullptr) {
1830
+ // Compute the whole graph and only test the output for a specific tensor
1831
+ ggml_backend_graph_compute(backend1, g1);
1832
+ ggml_backend_graph_compute(backend2, g2);
1843
1833
 
1844
- assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
1834
+ int test_node_idx = -1;
1835
+ for (int i = 0; i < g1->n_nodes; i++) {
1836
+ struct ggml_tensor * t1 = g1->nodes[i];
1837
+ if (t1 == test_node) {
1838
+ test_node_idx = i;
1839
+ break;
1840
+ }
1841
+ }
1842
+ GGML_ASSERT(test_node_idx != -1);
1845
1843
 
1846
- struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
1847
- struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
1844
+ callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
1845
+ } else {
1846
+ for (int i = 0; i < g1->n_nodes; i++) {
1847
+ struct ggml_tensor * t1 = g1->nodes[i];
1848
+ struct ggml_tensor * t2 = g2->nodes[i];
1848
1849
 
1849
- ggml_backend_graph_compute(backend1, &g1v);
1850
- ggml_backend_graph_compute(backend2, &g2v);
1850
+ assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
1851
1851
 
1852
- if (ggml_is_view_op(t1->op)) {
1853
- continue;
1854
- }
1852
+ struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
1853
+ struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
1855
1854
 
1856
- // compare results, calculate rms etc
1857
- if (!callback(i, t1, t2, user_data)) {
1858
- break;
1855
+ ggml_backend_graph_compute(backend1, &g1v);
1856
+ ggml_backend_graph_compute(backend2, &g2v);
1857
+
1858
+ if (ggml_is_view_op(t1->op)) {
1859
+ continue;
1860
+ }
1861
+
1862
+ // compare results, calculate rms etc
1863
+ if (!callback(i, t1, t2, user_data)) {
1864
+ break;
1865
+ }
1859
1866
  }
1860
1867
  }
1861
-
1862
1868
  ggml_backend_graph_copy_free(copy);
1863
1869
 
1864
1870
  return true;
@@ -77,6 +77,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
77
77
  for (int i = 0; i < final_dims; i++) {
78
78
  acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
79
79
  }
80
+ size_t elem_offset = offset / ggml_element_size(tensor);
81
+ acl_storage_len += elem_offset;
80
82
 
81
83
  // Reverse ne and stride.
82
84
  std::reverse(acl_ne, acl_ne + final_dims);
@@ -84,7 +86,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
84
86
 
85
87
  aclTensor* acl_tensor = aclCreateTensor(
86
88
  acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
87
- offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
89
+ elem_offset, format, &acl_storage_len, 1,
88
90
  tensor->data);
89
91
 
90
92
  return acl_tensor;
@@ -65,8 +65,9 @@
65
65
  #include <aclnnop/aclnn_eq_tensor.h>
66
66
  #include <aclnnop/aclnn_gt_scalar.h>
67
67
  #include <aclnnop/aclnn_pow.h>
68
- #include <aclnnop/aclnn_grouped_matmul_v2.h>
68
+ #include <aclnnop/aclnn_grouped_matmul_v3.h>
69
69
  #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
70
+ #include <aclnnop/aclnn_zero.h>
70
71
  #include <float.h>
71
72
 
72
73
  #include <cmath>
@@ -98,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
98
99
  }
99
100
  }
100
101
 
101
- void ggml_cann_unary_op(
102
+ void ggml_cann_op_unary(
102
103
  std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
103
104
  ggml_backend_cann_context& ctx, ggml_tensor* dst) {
104
105
  ggml_tensor* src = dst->src[0];
@@ -110,6 +111,42 @@ void ggml_cann_unary_op(
110
111
  ggml_cann_release_resources(ctx, acl_src, acl_dst);
111
112
  }
112
113
 
114
+ void ggml_cann_op_unary_gated(
115
+ std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
116
+ ggml_backend_cann_context& ctx, ggml_tensor* dst) {
117
+ ggml_tensor* src0 = dst->src[0];
118
+ ggml_tensor* src1 = dst->src[1];
119
+
120
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
121
+ GGML_ASSERT(ggml_is_contiguous_1(dst));
122
+ const int32_t swapped = ggml_get_op_params_i32(dst, 1);
123
+
124
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
125
+ aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
126
+ if(src1) {
127
+ GGML_ASSERT(ggml_is_contiguous_1(src1));
128
+ GGML_ASSERT(src0->type == src1->type);
129
+
130
+ acl_src0 = ggml_cann_create_tensor(src0);
131
+ acl_src1 = ggml_cann_create_tensor(src1);
132
+ } else {
133
+ int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
134
+ size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
135
+ acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
136
+ acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
137
+ if (swapped) {
138
+ std::swap(acl_src0, acl_src1);
139
+ }
140
+ }
141
+
142
+ unary_op(ctx, acl_src0, acl_dst);
143
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);
144
+
145
+ ggml_cann_release_resources(ctx, acl_src0, acl_dst);
146
+ if(src1)
147
+ ggml_cann_release_resources(ctx, acl_src1);
148
+ }
149
+
113
150
  /**
114
151
  * @brief Repeats elements of a tensor along each dimension according to the
115
152
  * specified repeat array.
@@ -804,10 +841,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
804
841
  nb[i] = nb[i - 1] * ne[i - 1];
805
842
  }
806
843
 
807
- ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
808
844
  aclTensor* zero =
809
845
  ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
846
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
810
847
  return zero;
848
+ GGML_UNUSED(n_bytes);
811
849
  }
812
850
 
813
851
  /**
@@ -1783,8 +1821,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
1783
1821
  size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
1784
1822
  bcast_weight_nb[2], bcast_weight_nb[3],
1785
1823
  bcast_weight_nb[4], bcast_weight_nb[5]};
1786
- aclTensor* acl_weight_tensor =
1787
- ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
1824
+ aclTensor* acl_weight_tensor;
1825
+
1826
+ bool weightToNZ = false;
1827
+ #ifdef ASCEND_310P
1828
+ weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
1829
+ #endif
1830
+ if (weightToNZ && is_matmul_weight(weight)) {
1831
+ int64_t acl_stride[2] = {1, transpose_ne[1]};
1832
+
1833
+ // Reverse ne.
1834
+ std::reverse(transpose_ne, transpose_ne + n_dims);
1835
+
1836
+ std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
1837
+
1838
+ acl_weight_tensor = aclCreateTensor(
1839
+ transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
1840
+ 0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
1841
+ } else {
1842
+ acl_weight_tensor =
1843
+ ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
1844
+ }
1788
1845
  aclTensor* acl_dst =
1789
1846
  ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
1790
1847
 
@@ -2654,6 +2711,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
2654
2711
  memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
2655
2712
  }
2656
2713
 
2714
+ #ifdef ASCEND_310P
2715
+ ggml_tensor src0_row = *src0;
2716
+ ggml_tensor src1_row = *src1;
2717
+ ggml_tensor dst_row = *dst;
2718
+
2719
+ if (src0->type == GGML_TYPE_F16) {
2720
+ src0_row.type = GGML_TYPE_F32;
2721
+ }
2722
+
2723
+ // src0_row [D, M, 1, 1] weight without permute
2724
+ src0_row.ne[2] = 1;
2725
+ src0_row.ne[3] = 1;
2726
+ src0_row.nb[0] = ori_src0_nb[0];
2727
+ src0_row.nb[1] = ori_src0_nb[1];
2728
+ src0_row.nb[2] = ori_src0_nb[1];
2729
+ src0_row.nb[3] = ori_src0_nb[1];
2730
+
2731
+ // src1_row [D, 1, 1, 1] -> input
2732
+ src1_row.ne[1] = 1;
2733
+ src1_row.ne[2] = 1;
2734
+ src1_row.ne[3] = 1;
2735
+ src1_row.nb[2] = nb11;
2736
+ src1_row.nb[3] = nb11;
2737
+
2738
+ // dst_row [M, 1, 1, 1] -> out
2739
+ dst_row.ne[1] = 1;
2740
+ dst_row.ne[2] = 1;
2741
+ dst_row.ne[3] = 1;
2742
+ dst_row.nb[2] = nb1;
2743
+ dst_row.nb[3] = nb1;
2744
+
2745
+ //create weight for one row
2746
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2747
+ for (int64_t id = 0; id < n_ids; id++) {
2748
+ // expert index
2749
+ int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2750
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
2751
+
2752
+ // If B = 1 (broadcast), always use 0; otherwise, use id.
2753
+ int64_t i11 = (ne11 == 1 ? 0 : id);
2754
+ int64_t i12 = iid1;
2755
+
2756
+ int64_t i1 = id;
2757
+ int64_t i2 = i12;
2758
+
2759
+ void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
2760
+ void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
2761
+ void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
2762
+
2763
+ src0_row.data = src0_tmp_ptr;
2764
+ src1_row.data = src1_tmp_ptr;
2765
+ dst_row.data = dst_tmp_ptr;
2766
+ dst_row.src[0] = &src0_row;
2767
+ dst_row.src[1] = &src1_row;
2768
+
2769
+ ggml_cann_mul_mat(ctx, &dst_row);
2770
+ }
2771
+ }
2772
+ return;
2773
+ #endif
2774
+
2657
2775
  std::vector<aclTensor*> src0_tensor_vec;
2658
2776
  std::vector<aclTensor*> src1_tensor_vec;
2659
2777
  std::vector<aclTensor*> dst_tensor_vec;
@@ -2701,9 +2819,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
2701
2819
  }
2702
2820
 
2703
2821
  size_t GROUP_SIZE = 128;
2704
- // GroupedMatmulV2 required tensor_list.size < 128
2822
+ // GroupedMatmulV3 required tensor_list.size < 128
2705
2823
  for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
2706
- // split and call GroupedMatmulV2
2824
+ // split and call GroupedMatmulV3
2707
2825
  size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
2708
2826
  std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
2709
2827
  std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@@ -2713,7 +2831,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
2713
2831
  aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
2714
2832
  aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
2715
2833
 
2716
- GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
2834
+ GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
2717
2835
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
2718
2836
 
2719
2837
  ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
@@ -23,6 +23,7 @@
23
23
  #ifndef CANN_ACLNN_OPS
24
24
  #define CANN_ACLNN_OPS
25
25
 
26
+ #include <unordered_set>
26
27
  #include <functional>
27
28
  #include <aclnnop/aclnn_abs.h>
28
29
  #include <aclnnop/aclnn_neg.h>
@@ -1020,6 +1021,37 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
1020
1021
  */
1021
1022
  void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
1022
1023
 
1024
+ /**
1025
+ * @brief Check whether a tensor is a weight tensor for matrix multiplication.
1026
+ *
1027
+ * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
1028
+ * typically within neural network layers. The function maintains a static set of canonical weight
1029
+ * naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
1030
+ * tensors even with hierarchical naming patterns.
1031
+ *
1032
+ * @param tensor Pointer to the target ggml_tensor object (const-qualified).
1033
+ */
1034
+ static bool is_matmul_weight(const ggml_tensor* tensor) {
1035
+ std::string name = ggml_get_name(tensor);
1036
+ static const std::unordered_set<std::string> weight_suffixes{
1037
+ "output.weight",
1038
+ "attn_q.weight",
1039
+ "attn_k.weight",
1040
+ "attn_v.weight",
1041
+ "attn_output.weight",
1042
+ "ffn_gate.weight",
1043
+ "ffn_up.weight",
1044
+ "ffn_down.weight"
1045
+ };
1046
+
1047
+ for (const auto& suffix : weight_suffixes) {
1048
+ if (name.find(suffix) != std::string::npos) {
1049
+ return true;
1050
+ }
1051
+ }
1052
+ return false;
1053
+ }
1054
+
1023
1055
  /**
1024
1056
  * @brief Applies a element-wise operation to two input tensors using the CANN
1025
1057
  * backend.
@@ -1066,7 +1098,7 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1066
1098
  * @param dst The destination tensor. Its src[0] is treated as the input tensor.
1067
1099
  */
1068
1100
  template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
1069
- void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1101
+ void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1070
1102
  ggml_tensor* src = dst->src[0];
1071
1103
 
1072
1104
  aclTensor* acl_src = ggml_cann_create_tensor(src);
@@ -1077,49 +1109,125 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
1077
1109
  }
1078
1110
 
1079
1111
  /**
1080
- * @brief Applies a unary operation to a ggml tensor using the CANN backend.
1112
+ * @brief Applies a unary operation to a ggml tensor using the CANN backend.
1081
1113
  *
1082
- * @details This function performs a unary operation on the input tensor using
1083
- * a user-provided lambda or callable object `unary_op`, which accepts the CANN
1084
- * context and two ACL tensors (source and destination). Internally, this function
1085
- * creates ACL representations of the ggml tensors and invokes the unary operation.
1086
- * The result is stored in the destination tensor `dst`. This utility abstracts the
1087
- * common boilerplate of tensor conversion and cleanup when implementing unary ops.
1114
+ * @details This function applies a unary operation to the input tensor using
1115
+ * a user-provided lambda or callable `unary_op`. The lambda receives the
1116
+ * CANN backend context and two ACL tensors: the source and the destination.
1088
1117
  *
1089
- * @param unary_op A callable that performs the unary operation using CANN APIs.
1090
- * @param ctx The CANN context used for operations.
1091
- * @param dst The destination tensor where the result will be stored.
1092
- * The source tensor is retrieved from `dst->src[0]`.
1118
+ * Internally, this function handles the conversion from GGML tensors to ACL tensors,
1119
+ * calls the provided unary op, and manages resource cleanup. The input is assumed
1120
+ * to be `dst->src[0]`, and the result is written to `dst`.
1121
+ *
1122
+ * This utility simplifies writing unary op wrappers by abstracting tensor preparation.
1123
+ *
1124
+ * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
1125
+ * @param ctx The CANN context for operation execution.
1126
+ * @param dst The destination ggml_tensor where the result will be stored.
1127
+ * The input tensor is assumed to be `dst->src[0]`.
1128
+ *
1129
+ * @see GGML_CANN_CALL_OP_UNARY
1130
+ */
1131
+ void ggml_cann_op_unary(
1132
+ std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
1133
+ ggml_backend_cann_context& ctx, ggml_tensor* dst);
1134
+
1135
+ /**
1136
+ * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
1137
+ *
1138
+ * @details This function performs a gated activation such as GEGLU or ReGLU.
1139
+ * It supports two input modes:
1140
+ *
1141
+ * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
1142
+ * These are used directly as the value and gate tensors.
1143
+ *
1144
+ * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
1145
+ * contain a concatenation of value and gate along the first dimension. This tensor
1146
+ * will be split into two equal halves to form the value and gate inputs.
1147
+ *
1148
+ * The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
1149
+ * then multiplies the result in-place with the gate tensor:
1150
+ *
1151
+ * @code
1152
+ * dst = unary_op(value) * gate;
1153
+ * @endcode
1154
+ *
1155
+ * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
1156
+ * order of value/gate in the packed input case.
1157
+ *
1158
+ * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
1159
+ * It receives (ctx, acl_value_tensor, acl_output_tensor).
1160
+ * @param ctx The CANN context used for execution.
1161
+ * @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
1162
+ *
1163
+ * @see GGML_CANN_CALL_OP_UNARY_GATED
1093
1164
  */
1094
- void ggml_cann_unary_op(
1165
+ void ggml_cann_op_unary_gated(
1095
1166
  std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
1096
1167
  ggml_backend_cann_context& ctx, ggml_tensor* dst);
1097
1168
 
1098
1169
  /**
1099
- * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
1170
+ * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
1100
1171
  *
1101
- * This macro defines an inline lambda wrapping a specific ACL operation name,
1102
- * and passes it to the templated ggml_cann_unary_op function. It simplifies
1103
- * calling unary ops by hiding the lambda boilerplate.
1172
+ * This macro wraps the specified ACLNN unary operator name into a lambda expression,
1173
+ * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
1174
+ * unary ops in the CANN backend.
1104
1175
  *
1105
- * Internally, the lambda will call:
1176
+ * Internally, this macro expands to a lambda like:
1106
1177
  * @code
1107
- * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1178
+ * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
1179
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1180
+ * };
1108
1181
  * @endcode
1109
1182
  *
1183
+ * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
1184
+ *
1110
1185
  * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
1111
1186
  *
1112
- * @see ggml_cann_unary_op
1187
+ * @see ggml_cann_op_unary
1113
1188
  * @see GGML_CANN_CALL_ACLNN_OP
1114
1189
  */
1115
- #define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
1190
+ #define GGML_CANN_CALL_OP_UNARY(OP_NAME) \
1116
1191
  do { \
1117
1192
  auto lambda = [](ggml_backend_cann_context& ctx, \
1118
1193
  aclTensor* acl_src, \
1119
1194
  aclTensor* acl_dst) { \
1120
1195
  GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
1121
1196
  }; \
1122
- ggml_cann_unary_op(lambda, ctx, dst); \
1197
+ ggml_cann_op_unary(lambda, ctx, dst); \
1123
1198
  } \
1124
1199
  while (0)
1200
+
1201
+ /**
1202
+ * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
1203
+ *
1204
+ * This macro wraps the specified ACLNN unary operator name into a lambda expression,
1205
+ * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
1206
+ * executing gated unary ops in the CANN backend.
1207
+ *
1208
+ * Internally, this macro expands to a lambda like:
1209
+ * @code
1210
+ * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
1211
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
1212
+ * };
1213
+ * @endcode
1214
+ *
1215
+ * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
1216
+ *
1217
+ * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
1218
+ *
1219
+ * @see ggml_cann_op_unary_gated
1220
+ * @see GGML_CANN_CALL_ACLNN_OP
1221
+ */
1222
+ #define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \
1223
+ do { \
1224
+ auto lambda = [](ggml_backend_cann_context& ctx, \
1225
+ aclTensor* acl_src, \
1226
+ aclTensor* acl_dst) { \
1227
+ GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
1228
+ }; \
1229
+ ggml_cann_op_unary_gated(lambda, ctx, dst); \
1230
+ } \
1231
+ while (0)
1232
+
1125
1233
  #endif // CANN_ACLNN_OPS