@novastera-oss/llamarn 0.4.1 → 0.4.4

This diff represents the content of publicly available package versions as released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (976)
  1. package/RNLlamaCpp.podspec +3 -0
  2. package/android/CMakeLists.txt +2 -0
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  22. package/cpp/llama.cpp/CODEOWNERS +116 -10
  23. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  24. package/cpp/llama.cpp/README.md +13 -5
  25. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  26. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  27. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  28. package/cpp/llama.cpp/common/arg.cpp +303 -795
  29. package/cpp/llama.cpp/common/arg.h +2 -3
  30. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  31. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  32. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  33. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  34. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  35. package/cpp/llama.cpp/common/chat.h +16 -3
  36. package/cpp/llama.cpp/common/common.cpp +70 -15
  37. package/cpp/llama.cpp/common/common.h +57 -19
  38. package/cpp/llama.cpp/common/download.cpp +1072 -0
  39. package/cpp/llama.cpp/common/download.h +55 -0
  40. package/cpp/llama.cpp/common/http.h +73 -0
  41. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  42. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  43. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  44. package/cpp/llama.cpp/common/log.cpp +59 -2
  45. package/cpp/llama.cpp/common/log.h +12 -4
  46. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  47. package/cpp/llama.cpp/common/sampling.h +3 -1
  48. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  49. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  50. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  51. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  52. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  53. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  54. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  55. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  56. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  57. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  58. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  59. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  60. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  62. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  64. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  65. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  68. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  211. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  212. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  239. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  240. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  241. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  253. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  254. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  255. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  278. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  279. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  280. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  320. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  321. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  470. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  471. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  490. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  495. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  496. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  497. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  498. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  499. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  500. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  502. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  504. package/cpp/llama.cpp/include/llama.h +44 -21
  505. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  506. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  507. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  509. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  510. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  511. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  512. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  513. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  514. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  515. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  516. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  517. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  518. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  519. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  520. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  521. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  522. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  523. package/cpp/llama.cpp/src/llama-context.h +16 -6
  524. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  525. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  526. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  527. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  528. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  529. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  530. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  531. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  532. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  533. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  535. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  536. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  537. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  538. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  539. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  540. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  541. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  542. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  543. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  544. package/cpp/llama.cpp/src/llama-model.h +40 -4
  545. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  546. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  547. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  548. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  549. package/cpp/llama.cpp/src/llama.cpp +69 -10
  550. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  551. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  552. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  553. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  554. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  555. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  556. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  557. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  558. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  559. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  560. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  561. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  562. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  563. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  564. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  565. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  566. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  567. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  568. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  569. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  570. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  571. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  572. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  573. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  574. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  575. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  576. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  577. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  578. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  579. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  580. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  581. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  582. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  583. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  584. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  585. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  586. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  587. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  588. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  589. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  590. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  591. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  592. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  593. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  594. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  595. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  596. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  597. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  598. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  599. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  600. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  601. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  602. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  603. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  604. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  605. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  606. package/cpp/llama.cpp/src/models/models.h +485 -0
  607. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  608. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  609. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  610. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  611. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  612. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  613. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  614. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  615. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  617. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  618. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  619. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  620. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  621. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  622. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  623. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  624. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  625. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  626. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  628. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  629. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  630. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  631. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  632. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  633. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  635. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  636. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  637. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  638. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  639. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  640. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  641. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  642. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  643. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  644. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  645. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  646. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  647. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  648. package/cpp/llama.cpp/src/unicode.h +43 -0
  649. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  650. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  652. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  653. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  654. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  655. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  656. package/ios/include/chat.h +16 -3
  657. package/ios/include/common/minja/chat-template.hpp +9 -2
  658. package/ios/include/common/minja/minja.hpp +101 -22
  659. package/ios/include/common.h +57 -19
  660. package/ios/include/json-schema-to-grammar.h +2 -0
  661. package/ios/include/llama.h +44 -21
  662. package/ios/include/log.h +12 -4
  663. package/ios/include/sampling.h +3 -1
  664. package/ios/libs/llama.xcframework/Info.plist +20 -20
  665. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  666. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  667. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  668. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  669. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  673. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  674. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  675. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  682. package/package.json +10 -4
  683. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  684. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  685. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  686. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  777. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  778. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  779. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  780. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  781. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  782. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  783. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  784. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  785. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  786. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  787. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  804. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  805. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  814. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  815. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  816. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  826. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  827. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  828. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  829. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  830. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  831. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  832. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  833. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  834. package/cpp/llama.cpp/models/templates/README.md +0 -25
  835. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  836. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  837. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  838. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  839. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  840. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  841. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  842. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  843. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  844. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  845. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  846. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  847. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  848. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  849. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  850. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  851. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  852. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  853. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  854. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  855. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  856. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  857. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  858. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  859. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  861. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  862. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  863. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  864. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  865. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  866. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  867. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  868. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  906. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  907. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  908. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  909. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  910. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  911. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  921. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  922. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  923. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  937. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  938. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  939. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  940. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  941. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  942. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  952. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  953. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  954. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  968. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  969. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  970. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -22,24 +22,24 @@
22
22
 
23
23
  #include "ggml-cann.h"
24
24
 
25
+ #include "ggml-backend-impl.h"
26
+ #include "ggml-cann/aclnn_ops.h"
27
+ #include "ggml-cann/common.h"
28
+ #include "ggml-impl.h"
29
+ #include "ggml.h"
30
+
25
31
  #include <acl/acl.h>
26
- #include <stdarg.h>
27
32
  #include <aclnnop/aclnn_trans_matmul_weight.h>
33
+ #include <stdarg.h>
28
34
 
35
+ #include <chrono>
29
36
  #include <cmath>
30
37
  #include <cstdio>
31
38
  #include <cstring>
32
39
  #include <mutex>
40
+ #include <optional>
33
41
  #include <queue>
34
- #include <chrono>
35
42
  #include <unordered_set>
36
- #include <optional>
37
-
38
- #include "ggml-impl.h"
39
- #include "ggml-backend-impl.h"
40
- #include "ggml-cann/aclnn_ops.h"
41
- #include "ggml-cann/common.h"
42
- #include "ggml.h"
43
43
 
44
44
  #define GGML_COMMON_DECL_C
45
45
 
@@ -56,33 +56,41 @@
56
56
  * @param line The line number where the error occurred.
57
57
  * @param msg The error message.
58
58
  */
59
- [[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
60
- const char* file, int line, const char* msg) {
59
+ [[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
61
60
  int32_t id = -1;
62
61
  aclrtGetDevice(&id);
63
62
 
64
63
  GGML_LOG_ERROR("CANN error: %s\n", msg);
65
- GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
66
- file, line);
64
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
67
65
  GGML_LOG_ERROR(" %s\n", stmt);
68
66
  // abort with GGML_ASSERT to get a stack trace
69
67
  GGML_ABORT("CANN error");
70
68
  }
71
69
 
70
+ // Thread-local variable to record the current device of this thread.
71
+ thread_local int g_current_cann_device = -1;
72
+
72
73
  /**
73
- * @brief Sets the device to be used by CANN.
74
+ * @brief Set the CANN device to be used.
74
75
  *
75
- * @param device The device ID to set.
76
+ * @param device The target device ID to set.
76
77
  */
77
78
  void ggml_cann_set_device(const int32_t device) {
78
- // TODO: uncomment these lines after empty context has fixed.
79
- // int current_device;
80
- // ACL_CHECK(aclrtGetDevice(&current_device));
79
+ // int current_device = -1;
80
+ // Note: In some CANN versions, if no device has been set yet,
81
+ // aclrtGetDevice(&current_device) may return 0 by default.
82
+ // aclrtGetDevice(&current_device);
81
83
 
82
- // if (device == current_device) {
83
- // return;
84
- // }
84
+ // If the current device is already the target one, no need to switch.
85
+ if (device == g_current_cann_device) {
86
+ return;
87
+ }
88
+
89
+ // Switch to the new device.
85
90
  ACL_CHECK(aclrtSetDevice(device));
91
+
92
+ // Update the global device record.
93
+ g_current_cann_device = device;
86
94
  }
87
95
 
88
96
  /**
@@ -100,9 +108,11 @@ int32_t ggml_cann_get_device() {
100
108
  * @brief Get the value of the specified environment variable (name).
101
109
  * if not empty, return a std::string object
102
110
  */
103
- std::optional<std::string> get_env(const std::string& name) {
104
- const char* val = std::getenv(name.c_str());
105
- if (!val) return std::nullopt;
111
+ std::optional<std::string> get_env(const std::string & name) {
112
+ const char * val = std::getenv(name.c_str());
113
+ if (!val) {
114
+ return std::nullopt;
115
+ }
106
116
  std::string res = std::string(val);
107
117
  std::transform(res.begin(), res.end(), res.begin(), ::tolower);
108
118
  return res;
@@ -111,11 +121,29 @@ std::optional<std::string> get_env(const std::string& name) {
111
121
  /**
112
122
  * @brief Verify whether the environment variable is a valid value.
113
123
  */
114
- bool parse_bool(const std::string& value) {
115
- std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
124
+ bool parse_bool(const std::string & value) {
125
+ std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
116
126
  return valid_values.find(value) != valid_values.end();
117
127
  }
118
128
 
129
+ /**
130
+ * @brief Parse a string as an integer, returning 0 if invalid.
131
+ *
132
+ * This function attempts to convert the input string `value` to an `int`.
133
+ * If the string is not a valid integer or is out of the `int` range,
134
+ * it returns 0.
135
+ *
136
+ * @param value The string to parse.
137
+ * @return The parsed integer, or 0 if conversion fails.
138
+ */
139
+ int parse_integer(const std::string & value) {
140
+ try {
141
+ return std::stoi(value);
142
+ } catch (...) {
143
+ return 0;
144
+ }
145
+ }
146
+
119
147
  /**
120
148
  * @brief Initialize the CANN device information.
121
149
  *
@@ -127,11 +155,10 @@ bool parse_bool(const std::string& value) {
127
155
  static ggml_cann_device_info ggml_cann_init() {
128
156
  ggml_cann_device_info info = {};
129
157
 
130
- aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
158
+ aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count);
131
159
 
132
160
  if (err != ACL_SUCCESS) {
133
- GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
134
- __func__, aclGetRecentErrMsg());
161
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg());
135
162
  return info;
136
163
  }
137
164
 
@@ -139,16 +166,15 @@ static ggml_cann_device_info ggml_cann_init() {
139
166
 
140
167
  for (int id = 0; id < info.device_count; ++id) {
141
168
  aclrtPhysicalMemProp prop = {};
142
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
143
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
144
- prop.memAttr = ACL_HBM_MEM_HUGE;
145
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
146
- prop.location.id = id;
147
- prop.reserve = 0;
148
- err = aclrtMemGetAllocationGranularity(
149
- &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
150
- &info.devices[id].vmm_granularity);
151
- info.devices[id].vmm = err == ACL_SUCCESS;
169
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
170
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
171
+ prop.memAttr = ACL_HBM_MEM_HUGE;
172
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
173
+ prop.location.id = id;
174
+ prop.reserve = 0;
175
+ err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
176
+ &info.devices[id].vmm_granularity);
177
+ info.devices[id].vmm = err == ACL_SUCCESS;
152
178
 
153
179
  size_t free, total;
154
180
  ggml_backend_cann_get_device_memory(id, &free, &total);
@@ -168,7 +194,7 @@ static ggml_cann_device_info ggml_cann_init() {
168
194
  *
169
195
  * @return A reference to the structure containing the device information.
170
196
  */
171
- const ggml_cann_device_info& ggml_cann_info() {
197
+ const ggml_cann_device_info & ggml_cann_info() {
172
198
  static ggml_cann_device_info info = ggml_cann_init();
173
199
  return info;
174
200
  }
@@ -188,7 +214,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
188
214
  /**
189
215
  * @brief The minimum free margin for a buffer.
190
216
  */
191
- static const size_t min_free_margin = 1ull << 20; // 1MB
217
+ static const size_t min_free_margin = 1ull << 20; // 1MB
192
218
 
193
219
  /**
194
220
  * @brief The alignment for buffer allocation.
@@ -209,22 +235,18 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
209
235
  * @brief Structure representing a CANN buffer.
210
236
  */
211
237
  struct ggml_cann_buffer {
212
- void* ptr = nullptr; ///< Pointer to the buffer.
213
- size_t size = 0; ///< Size of the buffer.
214
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
238
+ void * ptr = nullptr; ///< Pointer to the buffer.
239
+ size_t size = 0; ///< Size of the buffer.
240
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
215
241
 
216
- bool operator>(const ggml_cann_buffer& other) const {
217
- return size > other.size;
218
- }
242
+ bool operator>(const ggml_cann_buffer & other) const { return size > other.size; }
219
243
  };
220
244
 
221
245
  /**
222
246
  * @brief Array of CANN buffers in the pool.
223
247
  */
224
- std::unordered_map<void*, size_t> buffer_pool;
225
- std::priority_queue<ggml_cann_buffer,
226
- std::vector<ggml_cann_buffer>,
227
- std::greater<>> free_buffers ;
248
+ std::unordered_map<void *, size_t> buffer_pool;
249
+ std::priority_queue<ggml_cann_buffer, std::vector<ggml_cann_buffer>, std::greater<>> free_buffers;
228
250
 
229
251
  /**
230
252
  * @brief Total size of all buffers in the pool.
@@ -245,7 +267,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
245
267
  */
246
268
  ~ggml_cann_pool_buf_prio() {
247
269
  ggml_cann_set_device(device);
248
- for (auto& [b_ptr, b_size] : buffer_pool) {
270
+ for (auto & [b_ptr, b_size] : buffer_pool) {
249
271
  aclrtFree(b_ptr);
250
272
  pool_size -= b_size;
251
273
  }
@@ -261,14 +283,14 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
261
283
  * the allocated buffer.
262
284
  * @return A pointer to the allocated buffer.
263
285
  */
264
- void* alloc(size_t size, size_t* actual_size) override {
286
+ void * alloc(size_t size, size_t * actual_size) override {
265
287
  size = GGML_PAD(size, alignment);
266
288
  if (size == 0) {
267
289
  size = alignment;
268
290
  }
269
291
 
270
- void* ptr = nullptr;
271
- auto now = std::chrono::steady_clock::now();
292
+ void * ptr = nullptr;
293
+ auto now = std::chrono::steady_clock::now();
272
294
 
273
295
  std::vector<ggml_cann_buffer> free_buffers_rest;
274
296
  free_buffers_rest.reserve(free_buffers.size());
@@ -281,24 +303,22 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
281
303
  const size_t margin = b.size - size;
282
304
  if (margin <= max_reuse_margin) {
283
305
  *actual_size = b.size;
284
- ptr = b.ptr;
306
+ ptr = b.ptr;
285
307
  #ifdef DEBUG_CANN_MALLOC
286
308
  GGML_LOG_INFO(
287
309
  "cann pool[%d]: reused %p, "
288
310
  "pool_size = %5u MB, "
289
311
  "size = %5u MB, "
290
312
  "margin = %5u MB\n",
291
- device, b.ptr,
292
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
293
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
294
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
313
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
314
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
315
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
295
316
  #endif
296
317
  break;
297
318
  }
298
319
  }
299
320
 
300
- bool should_clean = !disable_clean &&
301
- b.size > min_free_margin &&
321
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
302
322
  std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
303
323
  if (should_clean) {
304
324
  // free the buffer if the size is needed to be freed
@@ -310,20 +330,20 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
310
330
  "cann pool[%d]: clean %p, "
311
331
  "pool_size = %5u MB, "
312
332
  "size = %5u MB\n",
313
- device, b.ptr,
314
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
315
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
333
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
334
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
316
335
  #endif
317
336
  continue;
318
337
  }
319
338
  free_buffers_rest.push_back(b);
320
339
  }
321
- for (ggml_cann_buffer &b : free_buffers_rest) {
340
+ for (ggml_cann_buffer & b : free_buffers_rest) {
322
341
  free_buffers.push(std::move(b));
323
342
  }
324
343
 
325
344
  #ifdef DEBUG_CANN_MALLOC
326
- GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
345
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
346
+ (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
327
347
  #endif
328
348
  if (ptr != nullptr) {
329
349
  return ptr;
@@ -339,8 +359,8 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
339
359
  "cann pool[%d]: allocate %p, "
340
360
  "pool_size = %5u MB, "
341
361
  "size = %5u MB\n",
342
- device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
343
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
362
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
363
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576));
344
364
  #endif
345
365
  buffer_pool.emplace(ptr, size);
346
366
  return ptr;
@@ -352,7 +372,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
352
372
  * @param ptr Pointer to the buffer to free.
353
373
  * @param size Size of the buffer to free.
354
374
  */
355
- void free(void* ptr, size_t size) override {
375
+ void free(void * ptr, size_t size) override {
356
376
  GGML_UNUSED(size);
357
377
  auto it = buffer_pool.find(ptr);
358
378
  if (it == buffer_pool.end()) {
@@ -360,13 +380,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
360
380
  }
361
381
 
362
382
  auto now = std::chrono::steady_clock::now();
363
- free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
383
+ free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now });
364
384
  #ifdef DEBUG_CANN_MALLOC
365
385
  GGML_LOG_INFO(
366
386
  "cann pool[%d]: return %p, "
367
387
  "pool_size = %5u MB\n",
368
- device, ptr,
369
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
388
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
370
389
  #endif
371
390
  }
372
391
  };
@@ -385,7 +404,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
385
404
  /**
386
405
  * @brief The minimum free margin for a buffer.
387
406
  */
388
- static const size_t min_free_margin = 1ull << 20; // 1MB
407
+ static const size_t min_free_margin = 1ull << 20; // 1MB
389
408
 
390
409
  /**
391
410
  * @brief The alignment for buffer allocation.
@@ -411,10 +430,10 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
411
430
  * @brief Structure representing a CANN buffer.
412
431
  */
413
432
  struct ggml_cann_buffer {
414
- void* ptr = nullptr; ///< Pointer to the buffer memory.
415
- size_t size = 0; ///< Size of the buffer.
416
- bool used = false; ///< Whether the buffer is currently in use.
417
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
433
+ void * ptr = nullptr; ///< Pointer to the buffer memory.
434
+ size_t size = 0; ///< Size of the buffer.
435
+ bool used = false; ///< Whether the buffer is currently in use.
436
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
418
437
  };
419
438
 
420
439
  /**
@@ -442,7 +461,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
442
461
  ~ggml_cann_pool_buf() {
443
462
  ggml_cann_set_device(device);
444
463
  for (int i = 0; i < MAX_BUFFERS; ++i) {
445
- ggml_cann_buffer& b = buffer_pool[i];
464
+ ggml_cann_buffer & b = buffer_pool[i];
446
465
  if (b.ptr != nullptr) {
447
466
  aclrtFree(b.ptr);
448
467
  pool_size -= b.size;
@@ -459,18 +478,18 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
459
478
  * the allocated buffer.
460
479
  * @return A pointer to the allocated buffer.
461
480
  */
462
- void* alloc(size_t size, size_t* actual_size) override {
481
+ void * alloc(size_t size, size_t * actual_size) override {
463
482
  size = GGML_PAD(size, alignment);
464
483
  if (size == 0) {
465
484
  size = alignment;
466
485
  }
467
486
 
468
- void* ptr = nullptr;
469
- auto now = std::chrono::steady_clock::now();
487
+ void * ptr = nullptr;
488
+ auto now = std::chrono::steady_clock::now();
470
489
 
471
490
  int i = 0;
472
491
  for (; i < MAX_BUFFERS; ++i) {
473
- ggml_cann_buffer& b = buffer_pool[i];
492
+ ggml_cann_buffer & b = buffer_pool[i];
474
493
  if (b.ptr == nullptr) {
475
494
  break;
476
495
  }
@@ -482,25 +501,23 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
482
501
  const size_t margin = b.size - size;
483
502
  if (margin <= max_reuse_margin) {
484
503
  *actual_size = b.size;
485
- b.used = true;
486
- ptr = b.ptr;
504
+ b.used = true;
505
+ ptr = b.ptr;
487
506
  #ifdef DEBUG_CANN_MALLOC
488
507
  GGML_LOG_INFO(
489
508
  "cann pool[%d]: reused %p, "
490
509
  "pool_size = %5u MB, "
491
510
  "size = %5u MB, "
492
511
  "margin = %5u MB\n",
493
- device, b.ptr,
494
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
495
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
496
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
512
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
513
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
514
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
497
515
  #endif
498
516
  break;
499
517
  }
500
518
  }
501
519
 
502
- bool should_clean = !disable_clean &&
503
- b.size > min_free_margin &&
520
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
504
521
  std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
505
522
  if (should_clean) {
506
523
  // free the buffer if the size is needed to be freed
@@ -511,9 +528,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
511
528
  "cann pool[%d]: clean %p, "
512
529
  "pool_size = %5u MB, "
513
530
  "size = %5u MB\n",
514
- device, b.ptr,
515
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
516
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
531
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
532
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
517
533
  #endif
518
534
  b.ptr = nullptr;
519
535
  }
@@ -524,13 +540,13 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
524
540
 
525
541
  if (i < MAX_BUFFERS) {
526
542
  // allocate a new buffer if no buffer can be reused
527
- ggml_cann_buffer& b = buffer_pool[i];
543
+ ggml_cann_buffer & b = buffer_pool[i];
528
544
  ggml_cann_set_device(device);
529
545
  ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
530
546
  pool_size += size;
531
547
  *actual_size = size;
532
- b.size = size;
533
- b.used = true;
548
+ b.size = size;
549
+ b.used = true;
534
550
  if (i >= MAX_BUFFERS - 8) {
535
551
  GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
536
552
  }
@@ -539,9 +555,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
539
555
  "cann pool[%d]: allocate %p, "
540
556
  "pool_size = %5u MB, "
541
557
  "size = %5u MB\n",
542
- device, b.ptr,
543
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
544
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
558
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
559
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
545
560
  #endif
546
561
  return b.ptr;
547
562
  }
@@ -555,21 +570,20 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
555
570
  * @param ptr Pointer to the buffer to free.
556
571
  * @param size Size of the buffer to free.
557
572
  */
558
- void free(void* ptr, size_t size) override {
573
+ void free(void * ptr, size_t size) override {
559
574
  GGML_UNUSED(size);
560
575
  for (int i = 0; i < MAX_BUFFERS; ++i) {
561
- ggml_cann_buffer& b = buffer_pool[i];
576
+ ggml_cann_buffer & b = buffer_pool[i];
562
577
  if (b.ptr != ptr) {
563
578
  continue;
564
579
  }
565
- b.used = false;
580
+ b.used = false;
566
581
  b.last_used = std::chrono::steady_clock::now();
567
582
  #ifdef DEBUG_CANN_MALLOC
568
583
  GGML_LOG_INFO(
569
584
  "cann pool[%d]: return %p, "
570
585
  "pool_size = %5u MB\n",
571
- device, b.ptr,
572
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
586
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
573
587
  #endif
574
588
  return;
575
589
  }
@@ -597,7 +611,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
597
611
  /**
598
612
  * @brief Pointer to the start of the virtual memory pool.
599
613
  */
600
- void* pool_addr = 0;
614
+ void * pool_addr = 0;
601
615
 
602
616
  /**
603
617
  * @brief Amount of virtual memory used in the pool.
@@ -622,7 +636,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
622
636
  /**
623
637
  * @brief Offsets for the mapped memory regions.
624
638
  */
625
- std::vector<void*> map_offsets;
639
+ std::vector<void *> map_offsets;
626
640
 
627
641
  /**
628
642
  * @brief Constructor to initialize the buffer pool with virtual memory for
@@ -630,11 +644,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
630
644
  *
631
645
  * @param device The device ID to associate with this buffer pool.
632
646
  */
633
- explicit ggml_cann_pool_vmm(int device)
634
- : device(device) {
635
- auto dev = ggml_cann_info().devices[device];
647
+ explicit ggml_cann_pool_vmm(int device) : device(device) {
648
+ auto dev = ggml_cann_info().devices[device];
636
649
  granularity = dev.vmm_granularity;
637
- max_size = dev.total_vram;
650
+ max_size = dev.total_vram;
638
651
  }
639
652
 
640
653
  /**
@@ -642,10 +655,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
642
655
  */
643
656
  ~ggml_cann_pool_vmm() {
644
657
  if (pool_addr != 0) {
645
- for (auto& offset : map_offsets) {
658
+ for (auto & offset : map_offsets) {
646
659
  ACL_CHECK(aclrtUnmapMem(offset));
647
660
  }
648
- for (auto& handle : handles) {
661
+ for (auto & handle : handles) {
649
662
  ACL_CHECK(aclrtFreePhysical(handle));
650
663
  }
651
664
  ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
@@ -660,11 +673,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
660
673
  * the allocated buffer.
661
674
  * @return A pointer to the allocated buffer.
662
675
  */
663
- void* alloc(size_t size, size_t* actual_size) override {
676
+ void * alloc(size_t size, size_t * actual_size) override {
664
677
  // round up the allocation size to the alignment to ensure that all
665
678
  // allocations are aligned for all data types
666
679
  const size_t alignment = 128;
667
- size = GGML_PAD(size, alignment);
680
+ size = GGML_PAD(size, alignment);
668
681
  if (size == 0) {
669
682
  size = alignment;
670
683
  }
@@ -674,53 +687,51 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
674
687
  if (size > avail) {
675
688
  // round up to the next multiple of the granularity
676
689
  size_t reserve_size = size - avail;
677
- reserve_size = GGML_PAD(reserve_size, granularity);
690
+ reserve_size = GGML_PAD(reserve_size, granularity);
678
691
 
679
692
  GGML_ASSERT(pool_size + reserve_size <= max_size);
680
693
 
681
694
  // allocate more physical memory
682
695
  aclrtPhysicalMemProp prop = {};
683
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
684
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
685
- prop.memAttr = ACL_HBM_MEM_HUGE;
686
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
687
- prop.location.id = device;
688
- prop.reserve = 0;
696
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
697
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
698
+ prop.memAttr = ACL_HBM_MEM_HUGE;
699
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
700
+ prop.location.id = device;
701
+ prop.reserve = 0;
689
702
  aclrtDrvMemHandle handle;
690
703
  ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
691
704
 
692
705
  // reserve virtual address space (if not already reserved)
693
706
  if (pool_addr == 0) {
694
- ACL_CHECK(aclrtReserveMemAddress(
695
- &pool_addr, max_size, 0, NULL, 1));
707
+ ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1));
696
708
  }
697
709
 
698
710
  // map at the end of the pool
699
- ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
700
- handle, 0));
711
+ ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0));
701
712
 
702
713
  handles.push_back(handle);
703
- map_offsets.push_back((char*)pool_addr + pool_size);
714
+ map_offsets.push_back((char *) pool_addr + pool_size);
704
715
 
705
716
  // add to the pool
706
717
  pool_size += reserve_size;
707
718
 
708
719
  #ifdef DEBUG_CANN_MALLOC
709
- GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
710
- device, (unsigned long long) (pool_size/1024/1024),
711
- (unsigned long long) (reserve_size/1024/1024));
720
+ GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device,
721
+ (unsigned long long) (pool_size / 1024 / 1024),
722
+ (unsigned long long) (reserve_size / 1024 / 1024));
712
723
  #endif
713
724
  }
714
725
 
715
726
  GGML_ASSERT(pool_addr != 0);
716
727
 
717
- void* ptr = (void*)((char*)pool_addr + pool_used);
728
+ void * ptr = (void *) ((char *) pool_addr + pool_used);
718
729
  *actual_size = size;
719
730
  pool_used += size;
720
731
 
721
732
  #ifdef DEBUG_CANN_MALLOC
722
- GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
723
- (unsigned long long)size, (unsigned long long)ptr);
733
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size,
734
+ (unsigned long long) ptr);
724
735
  #endif
725
736
  return ptr;
726
737
  }
@@ -731,16 +742,16 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
731
742
  * @param ptr Pointer to the buffer to free.
732
743
  * @param size Size of the buffer to free.
733
744
  */
734
- void free(void* ptr, size_t size) override {
745
+ void free(void * ptr, size_t size) override {
735
746
  #ifdef DEBUG_CANN_MALLOC
736
- GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
737
- (unsigned long long)size, (unsigned long long)ptr);
747
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size,
748
+ (unsigned long long) ptr);
738
749
  #endif
739
750
 
740
751
  pool_used -= size;
741
752
 
742
753
  // all deallocations must be in reverse order of the allocations
743
- GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
754
+ GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used));
744
755
  }
745
756
  };
746
757
 
@@ -752,8 +763,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
752
763
  * @param device The device ID for which to create the pool.
753
764
  * @return A unique pointer to the created CANN pool.
754
765
  */
755
- std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
756
- int device) {
766
+ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
757
767
  std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
758
768
 
759
769
  if (mem_pool_type == "prio") {
@@ -778,9 +788,8 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
778
788
  * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
779
789
  */
780
790
  struct ggml_backend_cann_buffer_context {
781
- int32_t device; ///< The device ID associated with this buffer context.
782
- void* dev_ptr =
783
- nullptr; ///< Pointer to the device memory allocated for the buffer.
791
+ int32_t device; ///< The device ID associated with this buffer context.
792
+ void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer.
784
793
 
785
794
  /**
786
795
  * @brief Constructor to initialize the CANN buffer context.
@@ -788,9 +797,7 @@ struct ggml_backend_cann_buffer_context {
788
797
  * @param device The device ID associated with this buffer context.
789
798
  * @param dev_ptr Pointer to the device memory allocated for the buffer.
790
799
  */
791
- ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
792
- : device(device),
793
- dev_ptr(dev_ptr) {}
800
+ ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
794
801
 
795
802
  /**
796
803
  * @brief Destructor to free the device memory allocated for the buffer.
@@ -808,8 +815,8 @@ struct ggml_backend_cann_buffer_context {
808
815
  * @return true if the buffer is a CANN buffer, false otherwise.
809
816
  */
810
817
  static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
811
- static bool ggml_backend_buffer_is_cann(
812
- ggml_backend_buffer_t buffer) {
818
+
819
+ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
813
820
  return ggml_backend_buft_is_cann(buffer->buft);
814
821
  }
815
822
 
@@ -821,10 +828,8 @@ static bool ggml_backend_buffer_is_cann(
821
828
  *
822
829
  * @param buffer The CANN buffer to free.
823
830
  */
824
- static void ggml_backend_cann_buffer_free_buffer(
825
- ggml_backend_buffer_t buffer) {
826
- ggml_backend_cann_buffer_context* ctx =
827
- (ggml_backend_cann_buffer_context*)buffer->context;
831
+ static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
832
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
828
833
  delete ctx;
829
834
  }
830
835
 
@@ -837,10 +842,8 @@ static void ggml_backend_cann_buffer_free_buffer(
837
842
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
838
843
  * @return A pointer to the base of the device memory allocated for the buffer.
839
844
  */
840
- static void* ggml_backend_cann_buffer_get_base(
841
- ggml_backend_buffer_t buffer) {
842
- ggml_backend_cann_buffer_context* ctx =
843
- (ggml_backend_cann_buffer_context*)buffer->context;
845
+ static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) {
846
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
844
847
  return ctx->dev_ptr;
845
848
  }
846
849
 
@@ -857,21 +860,17 @@ static void* ggml_backend_cann_buffer_get_base(
857
860
  * @param dst Pointer to the destination buffer where transformed data will be
858
861
  * stored.
859
862
  */
860
- static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
861
- const void* src,
862
- void* dst) {
863
-
864
- int64_t n_elems = ggml_nelements(tensor);
865
- int64_t groups = n_elems / QK4_0;
866
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
863
+ static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) {
864
+ int64_t n_elems = ggml_nelements(tensor);
865
+ int64_t groups = n_elems / QK4_0;
866
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
867
867
 
868
- uint8_t* quant_offset = (uint8_t*)dst;
869
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
868
+ uint8_t * quant_offset = (uint8_t *) dst;
869
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
870
870
 
871
871
  for (int i = 0; i < groups; i++) {
872
- const block_q4_0* group =
873
- (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0));
874
- *scale_offset = group->d;
872
+ const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0));
873
+ *scale_offset = group->d;
875
874
  scale_offset++;
876
875
 
877
876
  // 0-15
@@ -890,8 +889,7 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
890
889
  }
891
890
 
892
891
  // put (uint4b_t -8) into int4b_t
893
- for (quant_offset = (uint8_t*)dst;
894
- quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
892
+ for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) {
895
893
  (*quant_offset) ^= 0x88;
896
894
  }
897
895
  }
@@ -909,29 +907,27 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
909
907
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
910
908
  * will be stored.
911
909
  */
912
- static void ggml_backend_cann_transform_back_q4_0(
913
- const ggml_tensor* tensor, void* src, void* dst) {
914
-
915
- int64_t n_elems = ggml_nelements(tensor);
916
- int64_t groups = n_elems / QK4_0;
917
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
910
+ static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) {
911
+ int64_t n_elems = ggml_nelements(tensor);
912
+ int64_t groups = n_elems / QK4_0;
913
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
918
914
 
919
- uint8_t* quant_offset = (uint8_t*)src;
920
- uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
915
+ uint8_t * quant_offset = (uint8_t *) src;
916
+ uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes);
921
917
 
922
- for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
918
+ for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) {
923
919
  (*quant_offset) ^= 0x88;
924
920
  }
925
- quant_offset = (uint8_t*)src;
921
+ quant_offset = (uint8_t *) src;
926
922
 
927
923
  for (int i = 0; i < groups; i++) {
928
- block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
929
- group->d = *scale_offset;
924
+ block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0));
925
+ group->d = *scale_offset;
930
926
  scale_offset++;
931
927
 
932
928
  // 0-15
933
929
  for (int j = 0; j < QK4_0 / 2; j += 2) {
934
- group->qs[j] = ((*quant_offset) & 0x0F);
930
+ group->qs[j] = ((*quant_offset) & 0x0F);
935
931
  group->qs[j + 1] = ((*quant_offset) >> 4);
936
932
  quant_offset++;
937
933
  }
@@ -958,20 +954,17 @@ static void ggml_backend_cann_transform_back_q4_0(
958
954
  * @param dst Pointer to the destination buffer where transformed data will be
959
955
  * stored.
960
956
  */
961
- static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
962
- const void* src,
963
- void* dst) {
964
- int64_t n_elems = ggml_nelements(tensor);
965
- int64_t groups = n_elems / QK8_0;
966
- size_t quant_bytes = n_elems * sizeof(uint8_t);
957
+ static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) {
958
+ int64_t n_elems = ggml_nelements(tensor);
959
+ int64_t groups = n_elems / QK8_0;
960
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
967
961
 
968
- uint8_t* quant_offset = (uint8_t*)dst;
969
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
962
+ uint8_t * quant_offset = (uint8_t *) dst;
963
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
970
964
 
971
965
  for (int i = 0; i < groups; i++) {
972
- const block_q8_0* group =
973
- (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0));
974
- *scale_offset = group->d;
966
+ const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0));
967
+ *scale_offset = group->d;
975
968
  scale_offset++;
976
969
  size_t group_quant_size = QK8_0 * sizeof(uint8_t);
977
970
  memcpy(quant_offset, group->qs, group_quant_size);
@@ -992,19 +985,17 @@ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
992
985
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
993
986
  * will be stored.
994
987
  */
995
- static void ggml_backend_cann_transform_back_q8_0(
996
- const ggml_tensor* tensor, const void* src, void* dst) {
997
- int64_t n_elems = ggml_nelements(tensor);
998
- int64_t groups = n_elems / QK8_0;
999
- size_t quant_bytes = n_elems * sizeof(uint8_t);
988
+ static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) {
989
+ int64_t n_elems = ggml_nelements(tensor);
990
+ int64_t groups = n_elems / QK8_0;
991
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
1000
992
 
1001
- const uint8_t* quant_offset = (const uint8_t*)src;
1002
- const uint16_t* scale_offset =
1003
- (const uint16_t*)((const char*)src + quant_bytes);
993
+ const uint8_t * quant_offset = (const uint8_t *) src;
994
+ const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes);
1004
995
 
1005
996
  for (int i = 0; i < groups; i++) {
1006
- block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
1007
- group->d = *scale_offset;
997
+ block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0));
998
+ group->d = *scale_offset;
1008
999
  scale_offset++;
1009
1000
  size_t group_quant_size = QK8_0 * sizeof(uint8_t);
1010
1001
  memcpy(group->qs, quant_offset, group_quant_size);
@@ -1024,8 +1015,7 @@ static void ggml_backend_cann_transform_back_q8_0(
1024
1015
  * @param dst Pointer to the destination buffer where transformed data will be
1025
1016
  * stored.
1026
1017
  */
1027
- static void ggml_backend_cann_transform(ggml_tensor* tensor,
1028
- const void* src, void* dst) {
1018
+ static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) {
1029
1019
  switch (tensor->type) {
1030
1020
  case GGML_TYPE_Q4_0:
1031
1021
  ggml_backend_cann_transform_q4_0(tensor, src, dst);
@@ -1050,8 +1040,7 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
1050
1040
  * @param dst Pointer to the destination buffer where transformed tensor data
1051
1041
  * will be stored.
1052
1042
  */
1053
- static void ggml_backend_cann_transform_back(
1054
- const ggml_tensor* tensor, void* src, void* dst) {
1043
+ static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) {
1055
1044
  switch (tensor->type) {
1056
1045
  case GGML_TYPE_Q4_0:
1057
1046
  ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
@@ -1092,8 +1081,7 @@ static bool need_transform(ggml_type type) {
1092
1081
  * @param buffer The CANN buffer from which to initialize the tensor.
1093
1082
  * @param tensor Pointer to the tensor to be initialized.
1094
1083
  */
1095
- static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1096
- ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
1084
+ static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
1097
1085
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
1098
1086
  GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
1099
1087
  return GGML_STATUS_SUCCESS;
@@ -1104,42 +1092,75 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
1104
1092
  if (ggml_is_quantized(tensor->type)) {
1105
1093
  // Initialize padding to 0 to avoid possible NaN values
1106
1094
  size_t original_size = ggml_nbytes(tensor);
1107
- size_t padded_size =
1108
- ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1095
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
1109
1096
 
1110
1097
  if (padded_size > original_size && tensor->view_src == nullptr) {
1111
1098
  size_t memset_size = padded_size - original_size;
1112
- ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
1113
- memset_size, 0, memset_size));
1099
+ ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size));
1114
1100
  }
1115
1101
  }
1116
1102
  return GGML_STATUS_SUCCESS;
1117
1103
  }
1118
1104
 
1119
- // ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
1120
- namespace {
1121
- void* g_nz_workspace = nullptr;
1122
- size_t g_nz_workspace_allocated = 0;
1105
+ /**
1106
+ * @brief Workspace for caching NZ buffers per device.
1107
+ *
1108
+ * This struct manages a device buffer used in NZ computations. It supports
1109
+ * allocation, reallocation, and clearing of cached memory. The struct is
1110
+ * designed to be used with a global array, one per device.
1111
+ */
1112
+ struct ggml_cann_nz_workspace {
1113
+ void * ptr; // Pointer to allocated device buffer
1114
+ size_t allocated; // Size of currently allocated buffer in bytes
1115
+
1116
+ /**
1117
+ * @brief Constructor. Initializes the workspace with no allocated memory.
1118
+ */
1119
+ ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
1123
1120
 
1124
- void release_nz_workspace() {
1125
- if (g_nz_workspace) {
1126
- aclrtFree(g_nz_workspace);
1127
- g_nz_workspace = nullptr;
1128
- g_nz_workspace_allocated = 0;
1121
+ /**
1122
+ * @brief Free cached memory and reset the workspace.
1123
+ *
1124
+ * If a buffer has been allocated, this function releases it using
1125
+ * aclrtFree and resets internal state.
1126
+ */
1127
+ void clear() {
1128
+ if (ptr) {
1129
+ ACL_CHECK(aclrtFree(ptr));
1130
+ ptr = nullptr;
1131
+ allocated = 0;
1129
1132
  }
1130
1133
  }
1131
1134
 
1132
- void relloc_nz_workspace(size_t new_size) {
1133
- if (new_size > g_nz_workspace_allocated) {
1134
- if (g_nz_workspace) {
1135
- aclrtFree(g_nz_workspace);
1136
- g_nz_workspace = nullptr;
1135
+ /**
1136
+ * @brief Allocate or reallocate the workspace buffer.
1137
+ *
1138
+ * If the requested size is larger than the currently allocated size,
1139
+ * the old buffer will be freed and a new buffer of the requested size
1140
+ * will be allocated on the device.
1141
+ *
1142
+ * @param new_size Size in bytes to allocate for the workspace.
1143
+ */
1144
+ void realloc(size_t new_size) {
1145
+ if (new_size > allocated) {
1146
+ clear();
1147
+ ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
1148
+ allocated = new_size;
1137
1149
  }
1138
- ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
1139
- g_nz_workspace_allocated = new_size;
1140
- }
1141
1150
  }
1142
- }
1151
+
1152
+ /**
1153
+ * @brief Get the device buffer pointer.
1154
+ *
1155
+ * @return Pointer to the allocated buffer, or nullptr if not allocated.
1156
+ */
1157
+ void * get() const { return ptr; }
1158
+ };
1159
+
1160
+ /**
1161
+ * @brief Global array of NZ workspaces, one per device.
1162
+ */
1163
+ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
1143
1164
 
1144
1165
  /**
1145
1166
  * @brief Convert tensor weights to NZ format using Ascend CANN API.
@@ -1149,26 +1170,25 @@ namespace {
1149
1170
  * improve performance on certain hardware.
1150
1171
  *
1151
1172
  * @param tensor Pointer to the input ggml_tensor containing the weights.
1152
- * @param data Pointer to the raw data buffer for the tensor weights.
1153
1173
  * @param offset Byte offset within the tensor data buffer where weights start.
1174
+ * @param device device id.
1154
1175
  *
1155
1176
  * @note The workspace buffer used in this function is managed globally and reused
1156
1177
  * across calls. This reduces overhead from repeated memory allocation and deallocation.
1157
1178
  */
1158
- static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
1159
- aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
1160
- tensor->nb, 2, ACL_FORMAT_ND, offset);
1161
- uint64_t workspaceSize = 0;
1162
- aclOpExecutor *executor;
1179
+ static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
1180
+ acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
1181
+ uint64_t workspaceSize = 0;
1182
+ aclOpExecutor * executor;
1163
1183
 
1164
1184
  // TransMatmulWeight
1165
- ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
1166
- &workspaceSize, &executor));
1185
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
1167
1186
  // Avoid frequent malloc/free of the workspace.
1168
- relloc_nz_workspace(workspaceSize);
1187
+ g_nz_workspaces[device].realloc(workspaceSize);
1188
+
1189
+ void * g_nz_workspace = g_nz_workspaces[device].get();
1169
1190
 
1170
1191
  ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
1171
- ACL_CHECK(aclDestroyTensor(weightTransposed));
1172
1192
  }
1173
1193
 
1174
1194
  // TODO: need handle tensor which has paddings.
@@ -1184,11 +1204,12 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
1184
1204
  * @param offset Offset in the source data from where to start copying.
1185
1205
  * @param size Size of the data to be copied, in bytes.
1186
1206
  */
1187
- static void ggml_backend_cann_buffer_set_tensor(
1188
- ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
1189
- size_t offset, size_t size) {
1190
- ggml_backend_cann_buffer_context *ctx =
1191
- (ggml_backend_cann_buffer_context *)buffer->context;
1207
+ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
1208
+ ggml_tensor * tensor,
1209
+ const void * data,
1210
+ size_t offset,
1211
+ size_t size) {
1212
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1192
1213
 
1193
1214
  ggml_cann_set_device(ctx->device);
1194
1215
  // TODO: refer to cann(#6017), it use thread's default stream.
@@ -1196,22 +1217,19 @@ static void ggml_backend_cann_buffer_set_tensor(
1196
1217
  // Why aclrtSynchronizeDevice?
1197
1218
 
1198
1219
  // Only check env once.
1199
- static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
1220
+ static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
1200
1221
  if (!need_transform(tensor->type)) {
1201
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
1202
- ACL_MEMCPY_HOST_TO_DEVICE));
1203
- if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
1222
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
1223
+ if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
1204
1224
  GGML_ASSERT(tensor->ne[2] == 1);
1205
1225
  GGML_ASSERT(tensor->ne[3] == 1);
1206
- weight_format_to_nz(tensor, offset);
1226
+ weight_format_to_nz(tensor, offset, ctx->device);
1207
1227
  }
1208
1228
  } else {
1209
- void *transform_buffer = malloc(size);
1229
+ void * transform_buffer = malloc(size);
1210
1230
  ggml_backend_cann_transform(tensor, data, transform_buffer);
1211
1231
 
1212
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
1213
- transform_buffer, size,
1214
- ACL_MEMCPY_HOST_TO_DEVICE));
1232
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
1215
1233
  free(transform_buffer);
1216
1234
  }
1217
1235
  }
@@ -1229,22 +1247,20 @@ static void ggml_backend_cann_buffer_set_tensor(
1229
1247
  * @param offset Offset in the destination buffer where to start copying.
1230
1248
  * @param size Size of the data to be copied, in bytes.
1231
1249
  */
1232
- static void ggml_backend_cann_buffer_get_tensor(
1233
- ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
1234
- size_t offset, size_t size) {
1235
- ggml_backend_cann_buffer_context* ctx =
1236
- (ggml_backend_cann_buffer_context*)buffer->context;
1250
+ static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
1251
+ const ggml_tensor * tensor,
1252
+ void * data,
1253
+ size_t offset,
1254
+ size_t size) {
1255
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1237
1256
 
1238
1257
  ggml_cann_set_device(ctx->device);
1239
1258
 
1240
1259
  if (!need_transform(tensor->type)) {
1241
- ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
1242
- ACL_MEMCPY_DEVICE_TO_HOST));
1260
+ ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
1243
1261
  } else {
1244
- void* transform_buffer = malloc(size);
1245
- ACL_CHECK(aclrtMemcpy(transform_buffer, size,
1246
- (char*)tensor->data + offset, size,
1247
- ACL_MEMCPY_DEVICE_TO_HOST));
1262
+ void * transform_buffer = malloc(size);
1263
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
1248
1264
  ggml_backend_cann_transform_back(tensor, transform_buffer, data);
1249
1265
  free(transform_buffer);
1250
1266
  }
@@ -1263,31 +1279,31 @@ static void ggml_backend_cann_buffer_get_tensor(
1263
1279
  * @param dst Pointer to the destination tensor where the data will be copied.
1264
1280
  * @return true if the copy operation succeeded, false otherwise.
1265
1281
  */
1266
- static bool ggml_backend_cann_buffer_cpy_tensor(
1267
- ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
1282
+ static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
1283
+ const ggml_tensor * src,
1284
+ ggml_tensor * dst) {
1268
1285
  if (ggml_backend_buffer_is_cann(src->buffer)) {
1269
- ggml_backend_cann_buffer_context* src_ctx =
1270
- (ggml_backend_cann_buffer_context*)src->buffer->context;
1271
- ggml_backend_cann_buffer_context* dst_ctx =
1272
- (ggml_backend_cann_buffer_context*)buffer->context;
1286
+ ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
1287
+ ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1273
1288
 
1274
1289
  size_t memcpy_size = ggml_nbytes(src);
1275
1290
  // Same device.
1276
1291
  if (src_ctx->device == dst_ctx->device) {
1277
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1278
- (const char*)src->data, memcpy_size,
1292
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
1279
1293
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1280
1294
  return true;
1281
1295
  } else {
1296
+ #ifdef ASCEND_310P
1297
+ // TODO: Support 310p P2P copy
1298
+ return false;
1299
+ #endif
1282
1300
  // Different device but can access by peer.
1283
1301
  int32_t canAccessPeer = 0;
1284
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
1285
- dst_ctx->device));
1302
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device));
1286
1303
  if (canAccessPeer) {
1287
1304
  ggml_cann_set_device(src_ctx->device);
1288
1305
  ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
1289
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
1290
- (const char*)src->data, memcpy_size,
1306
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
1291
1307
  ACL_MEMCPY_DEVICE_TO_DEVICE));
1292
1308
  return true;
1293
1309
  }
@@ -1305,10 +1321,8 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
1305
1321
  * @param buffer The CANN buffer to be cleared.
1306
1322
  * @param value The value to which each byte in the buffer will be set.
1307
1323
  */
1308
- static void ggml_backend_cann_buffer_clear(
1309
- ggml_backend_buffer_t buffer, uint8_t value) {
1310
- ggml_backend_cann_buffer_context* ctx =
1311
- (ggml_backend_cann_buffer_context*)buffer->context;
1324
+ static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1325
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
1312
1326
 
1313
1327
  ggml_cann_set_device(ctx->device);
1314
1328
  ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
@@ -1338,9 +1352,8 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
1338
1352
  * buffer type.
1339
1353
  */
1340
1354
  struct ggml_backend_cann_buffer_type_context {
1341
- int32_t
1342
- device; /**< Device identifier associated with the buffer context. */
1343
- std::string name; /**< Name associated with the buffer context. */
1355
+ int32_t device; /**< Device identifier associated with the buffer context. */
1356
+ std::string name; /**< Name associated with the buffer context. */
1344
1357
  };
1345
1358
 
1346
1359
  /**
@@ -1352,10 +1365,8 @@ struct ggml_backend_cann_buffer_type_context {
1352
1365
  * @param buft Pointer to the buffer type context.
1353
1366
  * @return Const pointer to the C-style string containing the name.
1354
1367
  */
1355
- static const char* ggml_backend_cann_buffer_type_name(
1356
- ggml_backend_buffer_type_t buft) {
1357
- ggml_backend_cann_buffer_type_context* buft_ctx =
1358
- (ggml_backend_cann_buffer_type_context*)buft->context;
1368
+ static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
1369
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
1359
1370
 
1360
1371
  return buft_ctx->name.c_str();
1361
1372
  }
@@ -1370,34 +1381,27 @@ static const char* ggml_backend_cann_buffer_type_name(
1370
1381
  * @param size Size in bytes of the buffer to allocate.
1371
1382
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
1372
1383
  */
1373
- static ggml_backend_buffer_t
1374
- ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1375
- size_t size) {
1376
- ggml_backend_cann_buffer_type_context* buft_ctx =
1377
- (ggml_backend_cann_buffer_type_context*)buft->context;
1384
+ static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1385
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
1378
1386
 
1379
1387
  ggml_cann_set_device(buft_ctx->device);
1380
1388
 
1381
1389
  const size_t alignment = 128;
1382
- size = GGML_PAD(size, alignment);
1390
+ size = GGML_PAD(size, alignment);
1383
1391
  if (size == 0) {
1384
1392
  size = alignment;
1385
1393
  }
1386
- void* dev_ptr;
1394
+ void * dev_ptr;
1387
1395
  aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
1388
1396
  if (err != ACL_SUCCESS) {
1389
- GGML_LOG_ERROR(
1390
- "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
1391
- __func__, size / 1024.0 / 1024.0, buft_ctx->device,
1392
- aclGetRecentErrMsg());
1397
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__,
1398
+ size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg());
1393
1399
  return nullptr;
1394
1400
  }
1395
1401
 
1396
- ggml_backend_cann_buffer_context* ctx =
1397
- new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1402
+ ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
1398
1403
 
1399
- return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
1400
- ctx, size);
1404
+ return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
1401
1405
  }
1402
1406
 
1403
1407
  /**
@@ -1412,8 +1416,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1412
1416
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
1413
1417
  * buffers).
1414
1418
  */
1415
- static size_t ggml_backend_cann_buffer_type_get_alignment(
1416
- ggml_backend_buffer_type_t buft) {
1419
+ static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1417
1420
  return 128;
1418
1421
 
1419
1422
  GGML_UNUSED(buft);
@@ -1433,13 +1436,13 @@ static size_t ggml_backend_cann_buffer_type_get_alignment(
1433
1436
  * @return The total allocation size in bytes required for the tensor in the
1434
1437
  * CANN buffer.
1435
1438
  */
1436
- static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1437
- ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
1438
- size_t size = ggml_nbytes(tensor);
1439
- int64_t ne0 = tensor->ne[0];
1439
+ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
1440
+ const ggml_tensor * tensor) {
1441
+ size_t size = ggml_nbytes(tensor);
1442
+ int64_t ne0 = tensor->ne[0];
1440
1443
 
1441
1444
  // Only check env once.
1442
- static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
1445
+ static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
1443
1446
 
1444
1447
  // last line must bigger than 32, because every single op deal at
1445
1448
  // least 32 bytes.
@@ -1449,19 +1452,17 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
1449
1452
  // size += (line_size_align_32 - line_size);
1450
1453
  if (ggml_is_quantized(tensor->type)) {
1451
1454
  if (ne0 % MATRIX_ROW_PADDING != 0) {
1452
- size += ggml_row_size(
1453
- tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1455
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
1454
1456
  }
1455
- } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
1457
+ } else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
1456
1458
  // NZ format weight are not support quantized yet.
1457
1459
  // If ND tensor transform to NZ, size may changed.
1458
- int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
1460
+ int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
1459
1461
  GGML_ASSERT(tensor->ne[2] == 1);
1460
1462
  GGML_ASSERT(tensor->ne[3] == 1);
1461
- const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
1462
- size_t new_size;
1463
- ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
1464
- ggml_cann_type_mapping(tensor->type), &new_size));
1463
+ const aclIntArray * acl_shape = aclCreateIntArray(shape, 2);
1464
+ size_t new_size;
1465
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size));
1465
1466
  ACL_CHECK(aclDestroyIntArray(acl_shape));
1466
1467
  size = std::max(size, new_size);
1467
1468
  }
@@ -1502,17 +1503,15 @@ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface
1502
1503
  * @return A pointer to the buffer type interface for the specified device, or
1503
1504
  * nullptr if the device index is out of range.
1504
1505
  */
1505
- ggml_backend_buffer_type_t
1506
- ggml_backend_cann_buffer_type(int32_t device) {
1507
- static std::mutex mutex;
1506
+ ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) {
1507
+ static std::mutex mutex;
1508
1508
  std::lock_guard<std::mutex> lock(mutex);
1509
1509
 
1510
1510
  if (device >= ggml_backend_cann_get_device_count()) {
1511
1511
  return nullptr;
1512
1512
  }
1513
1513
 
1514
- static ggml_backend_buffer_type
1515
- ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1514
+ static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
1516
1515
 
1517
1516
  static bool ggml_backend_cann_buffer_type_initialized = false;
1518
1517
 
@@ -1522,8 +1521,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
1522
1521
  /* .iface = */ ggml_backend_cann_buffer_type_interface,
1523
1522
  /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
1524
1523
  /* .context = */
1525
- new ggml_backend_cann_buffer_type_context{
1526
- i, "CANN" + std::to_string(i)},
1524
+ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) },
1527
1525
  };
1528
1526
  }
1529
1527
  ggml_backend_cann_buffer_type_initialized = true;
@@ -1587,16 +1585,16 @@ static void * ggml_cann_host_malloc(size_t size) {
1587
1585
  }
1588
1586
 
1589
1587
  const size_t alignment = 128;
1590
- size = GGML_PAD(size, alignment);
1588
+ size = GGML_PAD(size, alignment);
1591
1589
  if (size == 0) {
1592
1590
  size = alignment;
1593
1591
  }
1594
1592
 
1595
- void * hostPtr = nullptr;
1596
- aclError err = aclrtMallocHost((void **) &hostPtr, size);
1593
+ void * hostPtr = nullptr;
1594
+ aclError err = aclrtMallocHost((void **) &hostPtr, size);
1597
1595
  if (err != ACL_SUCCESS) {
1598
- GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
1599
- size / 1024.0 / 1024.0, aclGetRecentErrMsg());
1596
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0,
1597
+ aclGetRecentErrMsg());
1600
1598
  return nullptr;
1601
1599
  }
1602
1600
  return hostPtr;
@@ -1609,7 +1607,8 @@ static void * ggml_cann_host_malloc(size_t size) {
1609
1607
  * @param size Size in bytes of the host buffer to allocate.
1610
1608
  * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
1611
1609
  */
1612
- static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1610
+ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
1611
+ size_t size) {
1613
1612
  void * hostPtr = ggml_cann_host_malloc(size);
1614
1613
 
1615
1614
  if (hostPtr == nullptr) {
@@ -1618,8 +1617,8 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1618
1617
  }
1619
1618
 
1620
1619
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
1621
- buffer->buft = buft;
1622
- buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1620
+ buffer->buft = buft;
1621
+ buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
1623
1622
 
1624
1623
  return buffer;
1625
1624
  }
@@ -1633,14 +1632,15 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
1633
1632
  ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1634
1633
  static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
1635
1634
  /* .iface = */ {
1636
- /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1637
- /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1638
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1639
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1635
+ /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
1636
+ /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
1637
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1638
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1640
1639
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1641
- /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1642
- },
1643
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1640
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1641
+ },
1642
+ /* .device = */
1643
+ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
1644
1644
  /* .context = */ nullptr,
1645
1645
  };
1646
1646
 
@@ -1660,8 +1660,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
1660
1660
  * stored.
1661
1661
  * @return true if the computation was successful; false otherwise.
1662
1662
  */
1663
- static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1664
- struct ggml_tensor* dst) {
1663
+ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
1665
1664
  switch (dst->op) {
1666
1665
  case GGML_OP_REPEAT:
1667
1666
  ggml_cann_repeat(ctx, dst);
@@ -1707,14 +1706,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1707
1706
  case GGML_UNARY_OP_SILU:
1708
1707
  GGML_CANN_CALL_OP_UNARY(Silu);
1709
1708
  break;
1710
- case GGML_UNARY_OP_GELU_QUICK: {
1711
- auto lambda = [](ggml_backend_cann_context& ctx,
1712
- aclTensor* acl_src,
1713
- aclTensor* acl_dst) {
1714
- GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1715
- };
1716
- ggml_cann_op_unary(lambda, ctx, dst);
1717
- } break;
1709
+ case GGML_UNARY_OP_GELU_QUICK:
1710
+ {
1711
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1712
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1713
+ };
1714
+ ggml_cann_op_unary(lambda, ctx, dst);
1715
+ }
1716
+ break;
1718
1717
  case GGML_UNARY_OP_TANH:
1719
1718
  GGML_CANN_CALL_OP_UNARY(Tanh);
1720
1719
  break;
@@ -1759,14 +1758,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1759
1758
  case GGML_GLU_OP_SWIGLU:
1760
1759
  GGML_CANN_CALL_OP_UNARY_GATED(Silu);
1761
1760
  break;
1762
- case GGML_GLU_OP_GEGLU_QUICK: {
1763
- auto lambda = [](ggml_backend_cann_context& ctx,
1764
- aclTensor* acl_src,
1765
- aclTensor* acl_dst) {
1766
- GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1767
- };
1768
- ggml_cann_op_unary_gated(lambda, ctx, dst);
1769
- } break;
1761
+ case GGML_GLU_OP_GEGLU_QUICK:
1762
+ {
1763
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1764
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
1765
+ };
1766
+ ggml_cann_op_unary_gated(lambda, ctx, dst);
1767
+ }
1768
+ break;
1770
1769
  default:
1771
1770
  return false;
1772
1771
  }
@@ -1777,6 +1776,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1777
1776
  case GGML_OP_GROUP_NORM:
1778
1777
  ggml_cann_group_norm(ctx, dst);
1779
1778
  break;
1779
+ case GGML_OP_L2_NORM:
1780
+ ggml_cann_l2_norm(ctx, dst);
1781
+ break;
1782
+ case GGML_OP_CROSS_ENTROPY_LOSS:
1783
+ ggml_cann_cross_entropy_loss(ctx, dst);
1784
+ break;
1780
1785
  case GGML_OP_CONCAT:
1781
1786
  ggml_cann_concat(ctx, dst);
1782
1787
  break;
@@ -1898,9 +1903,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
1898
1903
  * @param backend Pointer to the CANN backend structure.
1899
1904
  * @return A pointer to a constant string representing the backend name.
1900
1905
  */
1901
- static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1902
- ggml_backend_cann_context* cann_ctx =
1903
- (ggml_backend_cann_context*)backend->context;
1906
+ static const char * ggml_backend_cann_name(ggml_backend_t backend) {
1907
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1904
1908
 
1905
1909
  return cann_ctx->name.c_str();
1906
1910
  }
@@ -1914,8 +1918,7 @@ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
1914
1918
  * @param backend Pointer to the CANN backend structure to be freed.
1915
1919
  */
1916
1920
  static void ggml_backend_cann_free(ggml_backend_t backend) {
1917
- ggml_backend_cann_context* cann_ctx =
1918
- (ggml_backend_cann_context*)backend->context;
1921
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1919
1922
  ACL_CHECK(aclrtSynchronizeDevice());
1920
1923
  ACL_CHECK(aclrtResetDevice(cann_ctx->device));
1921
1924
 
@@ -1923,7 +1926,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1923
1926
  delete backend;
1924
1927
  }
1925
1928
 
1926
-
1927
1929
  /**
1928
1930
  * @brief Sets tensor data asynchronously in the CANN backend.
1929
1931
  *
@@ -1936,21 +1938,18 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
1936
1938
  * @param size Size of the data to copy in bytes.
1937
1939
  */
1938
1940
  static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1939
- ggml_tensor *tensor,
1940
- const void *data,
1941
- size_t offset,
1942
- size_t size) {
1943
- ggml_backend_cann_context *cann_ctx =
1944
- (ggml_backend_cann_context *)backend->context;
1945
- ggml_backend_buffer_t buf =
1946
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1947
-
1948
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1949
- "unsupported buffer type");
1941
+ ggml_tensor * tensor,
1942
+ const void * data,
1943
+ size_t offset,
1944
+ size_t size) {
1945
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1946
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1947
+
1948
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
1950
1949
  GGML_ASSERT(!ggml_is_quantized(tensor->type));
1951
1950
 
1952
- ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
1953
- ACL_MEMCPY_HOST_TO_DEVICE);
1951
+ ACL_CHECK(aclrtMemcpyAsync((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE,
1952
+ cann_ctx->stream()));
1954
1953
  }
1955
1954
 
1956
1955
  /**
@@ -1964,21 +1963,19 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
1964
1963
  * @param offset Offset in bytes within the host data.
1965
1964
  * @param size Size of the data to copy in bytes.
1966
1965
  */
1967
- static void ggml_backend_cann_get_tensor_async(
1968
- ggml_backend_t backend, const ggml_tensor *tensor, void *data,
1969
- size_t offset, size_t size) {
1970
- ggml_backend_cann_context *cann_ctx =
1971
- (ggml_backend_cann_context *)backend->context;
1972
- ggml_backend_buffer_t buf =
1973
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1966
+ static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend,
1967
+ const ggml_tensor * tensor,
1968
+ void * data,
1969
+ size_t offset,
1970
+ size_t size) {
1971
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
1972
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
1974
1973
 
1975
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
1976
- "unsupported buffer type");
1974
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
1977
1975
  GGML_ASSERT(!ggml_is_quantized(tensor->type));
1978
1976
 
1979
- ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
1980
- ACL_MEMCPY_DEVICE_TO_HOST);
1981
-
1977
+ ACL_CHECK(aclrtMemcpyAsync(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST,
1978
+ cann_ctx->stream()));
1982
1979
  }
1983
1980
 
1984
1981
  /**
@@ -1994,65 +1991,67 @@ static void ggml_backend_cann_get_tensor_async(
1994
1991
  * @param dst Pointer to the destination tensor to copy data to.
1995
1992
  * @return true if the copy operation succeeds, false otherwise.
1996
1993
  */
1997
- static bool ggml_backend_cann_cpy_tensor_async(
1998
- ggml_backend_t backend_src, ggml_backend_t backend_dst,
1999
- const ggml_tensor* src, ggml_tensor* dst) {
2000
- GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
2001
- ggml_backend_is_cann(backend_dst));
1994
+ static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
1995
+ ggml_backend_t backend_dst,
1996
+ const ggml_tensor * src,
1997
+ ggml_tensor * dst) {
1998
+ GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst));
1999
+
2000
+ GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
2002
2001
 
2003
- if (!ggml_backend_buffer_is_cann(src->buffer) ||
2004
- !ggml_backend_buffer_is_cann(dst->buffer)) {
2002
+ if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
2005
2003
  return false;
2006
2004
  }
2007
2005
 
2008
- ggml_backend_buffer_t buf_src =
2009
- src->view_src ? src->view_src->buffer : src->buffer;
2010
- ggml_backend_buffer_t buf_dst =
2011
- dst->view_src ? dst->view_src->buffer : dst->buffer;
2006
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
2007
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
2012
2008
 
2013
- ggml_backend_cann_context* cann_ctx_src =
2014
- (ggml_backend_cann_context*)backend_src->context;
2015
- ggml_backend_cann_context* cann_ctx_dst =
2016
- (ggml_backend_cann_context*)backend_dst->context;
2009
+ ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context;
2010
+ ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context;
2017
2011
 
2018
2012
  size_t copy_size = ggml_nbytes(dst);
2019
2013
  if (copy_size == 0) {
2020
2014
  return true;
2021
2015
  }
2022
2016
  if (backend_src != backend_dst) {
2023
- ggml_backend_cann_buffer_context* buf_ctx_src =
2024
- (ggml_backend_cann_buffer_context*)buf_src->context;
2025
- ggml_backend_cann_buffer_context* buf_ctx_dst =
2026
- (ggml_backend_cann_buffer_context*)buf_dst->context;
2017
+ #ifdef ASCEND_310P
2018
+ // TODO: Support 310p P2P copy
2019
+ return false;
2020
+ #endif
2021
+ ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context;
2022
+ ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context;
2027
2023
 
2028
2024
  GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
2029
2025
  GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
2030
2026
 
2031
2027
  int32_t canAccessPeer = 0;
2032
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
2033
- cann_ctx_dst->device));
2028
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device));
2034
2029
  if (!canAccessPeer) {
2035
2030
  return false;
2036
2031
  }
2037
2032
 
2038
2033
  // need open both directions for memcpyasync between devices.
2039
- ggml_cann_set_device(cann_ctx_dst->device);
2040
2034
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
2041
2035
  ggml_cann_set_device(cann_ctx_src->device);
2042
2036
  ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
2043
2037
 
2044
2038
  // wait for task_queue empty to keep task order.
2045
- cann_ctx_src->task_queue.wait();
2046
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
2047
- ACL_MEMCPY_DEVICE_TO_DEVICE,
2039
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
2048
2040
  cann_ctx_src->stream()));
2049
-
2050
- //TODO: workaround for Event didn`t work here.
2051
- aclrtSynchronizeStream(cann_ctx_src->stream());
2041
+ // record event on src stream after the copy
2042
+ // TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
2043
+ // if (!cann_ctx_src->copy_event) {
2044
+ // ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
2045
+ // }
2046
+ // ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
2047
+
2048
+ // // wait on dst stream for the copy to complete
2049
+ // ggml_cann_set_device(cann_ctx_dst->device);
2050
+ // ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
2051
+ ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
2052
2052
  } else {
2053
2053
  // src and dst are on the same backend
2054
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
2055
- ACL_MEMCPY_DEVICE_TO_DEVICE,
2054
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
2056
2055
  cann_ctx_dst->stream()));
2057
2056
  }
2058
2057
 
@@ -2068,39 +2067,65 @@ static bool ggml_backend_cann_cpy_tensor_async(
2068
2067
  * @param backend Pointer to the CANN backend structure to synchronize.
2069
2068
  */
2070
2069
  static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
2071
- ggml_backend_cann_context* cann_ctx =
2072
- (ggml_backend_cann_context*)backend->context;
2073
- cann_ctx->task_queue.wait();
2070
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2074
2071
  ggml_cann_set_device(cann_ctx->device);
2075
2072
  ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
2076
2073
  }
2077
2074
 
2078
2075
  #ifdef USE_ACL_GRAPH
2079
2076
  /**
2080
- * @brief Populate the internal CANN graph node properties from the ggml computation graph.
2077
+ * @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
2081
2078
  *
2082
- * This function copies all node attributes (operation type, dimensions, strides, input sources,
2083
- * and operation parameters) into the cached CANN graph structure for later reuse or comparison.
2079
+ * This function creates a new ggml_cann_graph object and fills its node properties
2080
+ * (operation type, dimensions, strides, input sources, and operation parameters)
2081
+ * based on the current ggml computation graph.
2084
2082
  *
2085
- * @param cann_ctx The CANN backend context.
2086
- * @param cgraph The ggml computational graph.
2083
+ * Each node in the ggml graph is mapped to a property entry in the new CANN graph:
2084
+ * - node address
2085
+ * - operation type
2086
+ * - shape (ne) and strides (nb)
2087
+ * - source tensor addresses
2088
+ * - operation parameters
2089
+ *
2090
+ * After initialization, the new graph is pushed into the LRU cache owned by the
2091
+ * CANN backend context. The cache takes ownership of the graph and manages its
2092
+ * lifetime (including deletion upon eviction).
2093
+ *
2094
+ * @param cann_ctx The CANN backend context containing the graph cache.
2095
+ * @param cgraph The current ggml computation graph.
2087
2096
  */
2088
- static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2089
- for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
2090
- ggml_tensor * node = cgraph->nodes[node_idx];
2091
- cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;
2092
- cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;
2097
+ static void add_lru_matched_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2098
+ // Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
2099
+ ggml_cann_graph * new_graph = new ggml_cann_graph();
2100
+ new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2093
2101
 
2094
- for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
2095
- cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
2096
- cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
2097
- }
2098
- for (int src = 0; src < GGML_MAX_SRC; src++) {
2099
- cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
2100
- node->src[src] ? node->src[src]->data : nullptr;
2102
+ for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
2103
+ ggml_tensor * node = cgraph->nodes[node_idx];
2104
+ auto & prop = new_graph->ggml_graph_properties[node_idx];
2105
+
2106
+ prop.node_address = node->data;
2107
+ prop.node_op = node->op;
2108
+
2109
+ std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
2110
+ std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
2111
+
2112
+ for (int src = 0; src < GGML_MAX_SRC; ++src) {
2113
+ if (node->src[src]) {
2114
+ prop.src_address[src] = node->src[src]->data;
2115
+ std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
2116
+ std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
2117
+ } else {
2118
+ prop.src_address[src] = nullptr;
2119
+ std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
2120
+ std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
2121
+ }
2101
2122
  }
2102
- memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
2123
+
2124
+ memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
2103
2125
  }
2126
+
2127
+ // Insert into the LRU cache (cache takes ownership and will delete it when evicted).
2128
+ cann_ctx->graph_lru_cache.push(new_graph);
2104
2129
  }
2105
2130
 
2106
2131
  /**
@@ -2113,14 +2138,16 @@ static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx,
2113
2138
  * @param graph_node_properties The stored properties of a CANN graph node.
2114
2139
  * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
2115
2140
  */
2116
- static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
2117
- if (node->data != graph_node_properties->node_address &&
2118
- node->op != GGML_OP_VIEW) {
2141
+ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node,
2142
+ ggml_graph_node_properties * graph_node_properties) {
2143
+ if (node->data != graph_node_properties->node_address && node->op != GGML_OP_VIEW) {
2119
2144
  return false;
2120
2145
  }
2146
+
2121
2147
  if (node->op != graph_node_properties->node_op) {
2122
2148
  return false;
2123
2149
  }
2150
+
2124
2151
  for (int i = 0; i < GGML_MAX_DIMS; i++) {
2125
2152
  if (node->ne[i] != graph_node_properties->ne[i]) {
2126
2153
  return false;
@@ -2129,46 +2156,74 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
2129
2156
  return false;
2130
2157
  }
2131
2158
  }
2159
+
2132
2160
  for (int i = 0; i < GGML_MAX_SRC; i++) {
2133
- if (node->src[i] &&
2134
- node->src[i]->data != graph_node_properties->src_address[i] &&
2135
- node->op != GGML_OP_VIEW
2136
- ) {
2137
- return false;
2161
+ if (node->src[i]) {
2162
+ if (node->src[i]->data != graph_node_properties->src_address[i] && node->op != GGML_OP_VIEW) {
2163
+ return false;
2164
+ }
2165
+
2166
+ for (int d = 0; d < GGML_MAX_DIMS; d++) {
2167
+ if (node->src[i]->ne[d] != graph_node_properties->src_ne[i][d]) {
2168
+ return false;
2169
+ }
2170
+ if (node->src[i]->nb[d] != graph_node_properties->src_nb[i][d]) {
2171
+ return false;
2172
+ }
2173
+ }
2174
+ } else {
2175
+ if (graph_node_properties->src_address[i] != nullptr) {
2176
+ return false;
2177
+ }
2138
2178
  }
2139
2179
  }
2140
- if (node->op == GGML_OP_SCALE &&
2141
- memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
2142
- return false;
2180
+
2181
+ if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
2182
+ return memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
2143
2183
  }
2144
2184
  return true;
2145
2185
  }
2146
2186
 
2147
2187
  /**
2148
- * @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
2188
+ * @brief Check whether there is a cached CANN graph that matches the current ggml graph.
2189
+ *
2190
+ * This function iterates through the cached CANN graphs stored in the LRU cache and
2191
+ * compares them against the given ggml computation graph. A match requires that the
2192
+ * number of nodes is the same and that each node’s properties (operation type,
2193
+ * dimensions, strides, inputs, and operation parameters) are identical.
2149
2194
  *
2150
- * This checks whether the number or properties of ggml graph nodes have changed
2151
- * compared to the last captured CANN graph. If so, the CANN graph must be re-captured.
2195
+ * If a matching graph is found, it is promoted to the front of the LRU cache and the
2196
+ * function returns true. Otherwise, the function returns false, indicating that a new
2197
+ * CANN graph needs to be captured.
2152
2198
  *
2153
- * @param cann_ctx The CANN backend context.
2199
+ * @param cann_ctx The CANN backend context containing the graph cache.
2154
2200
  * @param cgraph The current ggml computation graph.
2155
- * @return true if an update is required; false otherwise.
2156
- */
2157
- static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2158
- // The number of nodes is different, so the graph needs to be reconstructed.
2159
- if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
2160
- cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2161
- return true;
2162
- }
2201
+ * @return true if a matching cached graph exists; false otherwise.
2202
+ */
2203
+ static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2204
+ ggml_cann_graph_lru_cache & lru_cache = cann_ctx->graph_lru_cache;
2205
+ for (auto & graph_ptr : lru_cache.cache_list) {
2206
+ // Skip graphs with a different number of nodes.
2207
+ if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
2208
+ continue;
2209
+ }
2163
2210
 
2164
- // The number of nodes is the same; iterate over each node to check whether they match.
2165
- for (int i = 0; i < cgraph->n_nodes; i++) {
2166
- bool has_matching_properties = ggml_graph_node_has_matching_properties(
2167
- cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
2168
- if(!has_matching_properties) {
2211
+ // Check if all nodes match.
2212
+ bool all_match = true;
2213
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
2214
+ if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
2215
+ all_match = false;
2216
+ break;
2217
+ }
2218
+ }
2219
+
2220
+ if (all_match) {
2221
+ // update cache_list && renturn graph_ptr
2222
+ lru_cache.move_to_front(graph_ptr);
2169
2223
  return true;
2170
2224
  }
2171
2225
  }
2226
+
2172
2227
  return false;
2173
2228
  }
2174
2229
  #endif // USE_ACL_GRAPH
@@ -2186,25 +2241,23 @@ static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx,
2186
2241
  * @param use_cann_graph Whether to use CANN graph execution.
2187
2242
  * @param cann_graph_update_required Whether graph capture is needed due to graph changes.
2188
2243
  */
2189
- static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
2190
- bool & use_cann_graph, bool & cann_graph_update_required) {
2244
+ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
2245
+ ggml_cgraph * cgraph,
2246
+ bool & use_cann_graph,
2247
+ bool & cann_graph_update_required) {
2191
2248
  #ifdef USE_ACL_GRAPH
2192
- if (use_cann_graph && cann_graph_update_required) {
2193
- if (cann_ctx->cann_graph->graph != nullptr) {
2194
- ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
2195
- cann_ctx->cann_graph->graph = nullptr;
2196
- }
2249
+ if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture
2197
2250
  ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
2198
2251
  }
2199
- #endif // USE_ACL_GRAPH
2200
-
2252
+ #endif // USE_ACL_GRAPH
2201
2253
  // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
2202
2254
  // With the use of CANN graphs, the execution will be performed by the graph launch.
2203
2255
  if (!use_cann_graph || cann_graph_update_required) {
2204
2256
  for (int i = 0; i < cgraph->n_nodes; i++) {
2205
2257
  ggml_tensor * node = cgraph->nodes[i];
2206
2258
 
2207
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2259
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
2260
+ node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
2208
2261
  continue;
2209
2262
  }
2210
2263
 
@@ -2217,18 +2270,19 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
2217
2270
  }
2218
2271
 
2219
2272
  #ifdef USE_ACL_GRAPH
2220
- if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
2221
- ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
2222
- }
2223
-
2224
2273
  if (use_cann_graph) {
2225
- // Execute graph
2226
- ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));
2274
+ ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
2275
+
2276
+ if (cann_graph_update_required) { // End CANN graph capture
2277
+ ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
2278
+ }
2279
+
2280
+ // Execute CANN graph
2281
+ ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
2227
2282
  }
2228
- #endif // USE_ACL_GRAPH
2283
+ #endif // USE_ACL_GRAPH
2229
2284
  }
2230
2285
 
2231
-
2232
2286
  /**
2233
2287
  * @brief Computes a computational graph using a CANN backend.
2234
2288
  *
@@ -2241,36 +2295,50 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
2241
2295
  * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
2242
2296
  * completes successfully, otherwise an appropriate error status.
2243
2297
  */
2244
- static enum ggml_status ggml_backend_cann_graph_compute(
2245
- ggml_backend_t backend, ggml_cgraph* cgraph) {
2246
- ggml_backend_cann_context* cann_ctx =
2247
- (ggml_backend_cann_context*)backend->context;
2298
+ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
2299
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2248
2300
  ggml_cann_set_device(cann_ctx->device);
2249
- release_nz_workspace();
2301
+ g_nz_workspaces[cann_ctx->device].clear();
2302
+
2303
+ // calculate rope cache for fist layer in current device.
2304
+ cann_ctx->rope_cache.cached = false;
2305
+
2250
2306
  #ifdef USE_ACL_GRAPH
2251
- bool use_cann_graph = true;
2307
+ bool use_cann_graph = true;
2252
2308
  bool cann_graph_update_required = false;
2253
2309
 
2254
- if (use_cann_graph) {
2255
- if (cann_ctx->cann_graph == nullptr) {
2256
- cann_ctx->cann_graph.reset(new ggml_cann_graph());
2257
- cann_graph_update_required = true;
2310
+ static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
2311
+ if (!prefill_use_graph) {
2312
+ // Do not use acl_graph for prefill.
2313
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2314
+ ggml_tensor * node = cgraph->nodes[i];
2315
+ // TODO: Optimize here. Currently, we can only
2316
+ // get seq_len by FA's input.
2317
+ if (node->op == GGML_OP_FLASH_ATTN_EXT) {
2318
+ // Q -> src[0], shape: [B, S, N, D]
2319
+ use_cann_graph = (node->src[0]->ne[1] == 1);
2320
+ break;
2321
+ }
2258
2322
  }
2323
+ }
2324
+
2325
+ if (!cann_ctx->acl_graph_mode) {
2326
+ use_cann_graph = false;
2327
+ }
2259
2328
 
2260
- cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
2261
- set_ggml_graph_node_properties(cann_ctx, cgraph);
2329
+ if (use_cann_graph) {
2330
+ // If no matching graph is found, the graph needs to be recaptured.
2331
+ cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
2332
+ if (cann_graph_update_required) {
2333
+ // If no matching graph is found, add a new ACL graph.
2334
+ add_lru_matched_graph_node_properties(cann_ctx, cgraph);
2335
+ }
2262
2336
  }
2263
2337
  #else
2264
- bool use_cann_graph = false;
2338
+ bool use_cann_graph = false;
2265
2339
  bool cann_graph_update_required = false;
2266
2340
  #endif // USE_ACL_GRAPH
2267
-
2268
- evaluate_and_capture_cann_graph(
2269
- cann_ctx,
2270
- cgraph,
2271
- use_cann_graph,
2272
- cann_graph_update_required
2273
- );
2341
+ evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);
2274
2342
 
2275
2343
  return GGML_STATUS_SUCCESS;
2276
2344
  }
@@ -2287,8 +2355,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
2287
2355
  * @return bool Returns true if the operation is supported by the backend,
2288
2356
  * otherwise false.
2289
2357
  */
2290
- static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2291
- const ggml_tensor* op) {
2358
+ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2292
2359
  switch (op->op) {
2293
2360
  case GGML_OP_UNARY:
2294
2361
  switch (ggml_get_unary_op(op)) {
@@ -2323,24 +2390,24 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2323
2390
  return false;
2324
2391
  }
2325
2392
  break;
2326
- case GGML_OP_MUL_MAT: {
2327
- switch (op->src[0]->type) {
2328
- case GGML_TYPE_F16:
2329
- case GGML_TYPE_F32:
2330
- return true;
2331
- case GGML_TYPE_Q8_0:
2332
- case GGML_TYPE_Q4_0:
2393
+ case GGML_OP_MUL_MAT:
2394
+ {
2395
+ switch (op->src[0]->type) {
2396
+ case GGML_TYPE_F16:
2397
+ case GGML_TYPE_F32:
2398
+ return true;
2399
+ case GGML_TYPE_Q8_0:
2400
+ case GGML_TYPE_Q4_0:
2333
2401
  #ifdef ASCEND_310P
2334
- // Q4 && Q8 per group is not support on 310p device
2335
- return false;
2402
+ // Q4 && Q8 per group is not support on 310p device
2403
+ return false;
2336
2404
  #endif
2337
- // only support contiguous for quantized types.
2338
- return ggml_is_contiguous(op->src[0]) &&
2339
- ggml_is_contiguous(op->src[1]);
2340
- default:
2341
- return false;
2405
+ // only support contiguous for quantized types.
2406
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2407
+ default:
2408
+ return false;
2409
+ }
2342
2410
  }
2343
- }
2344
2411
  case GGML_OP_MUL_MAT_ID:
2345
2412
  switch (op->src[0]->type) {
2346
2413
  case GGML_TYPE_F16:
@@ -2353,106 +2420,115 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2353
2420
  return false;
2354
2421
  #endif
2355
2422
  // only support contiguous for quantized types.
2356
- return ggml_is_contiguous(op->src[0]) &&
2357
- ggml_is_contiguous(op->src[1]);
2423
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2358
2424
  default:
2359
2425
  return false;
2360
2426
  }
2361
2427
  // embedding
2362
- case GGML_OP_GET_ROWS: {
2363
- switch (op->src[0]->type) {
2364
- case GGML_TYPE_F32:
2365
- case GGML_TYPE_F16:
2366
- case GGML_TYPE_Q8_0:
2367
- return true;
2368
- default:
2369
- return false;
2370
- }
2371
- } break;
2372
- case GGML_OP_SET_ROWS: {
2373
- switch (op->type) {
2374
- case GGML_TYPE_F32:
2375
- case GGML_TYPE_F16:
2376
- return true;
2377
- default:
2378
- return false;
2428
+ case GGML_OP_GET_ROWS:
2429
+ {
2430
+ switch (op->src[0]->type) {
2431
+ case GGML_TYPE_F32:
2432
+ case GGML_TYPE_F16:
2433
+ case GGML_TYPE_Q8_0:
2434
+ return true;
2435
+ default:
2436
+ return false;
2437
+ }
2379
2438
  }
2380
- } break;
2381
- case GGML_OP_CPY: {
2382
- ggml_tensor *src = op->src[0];
2383
- if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2384
- (src->type != GGML_TYPE_F32 &&
2385
- src->type != GGML_TYPE_F16)) {
2386
- // only support F32 and F16.
2387
- return false;
2439
+ break;
2440
+ case GGML_OP_SET_ROWS:
2441
+ {
2442
+ switch (op->type) {
2443
+ case GGML_TYPE_F32:
2444
+ case GGML_TYPE_F16:
2445
+ return true;
2446
+ default:
2447
+ return false;
2448
+ }
2388
2449
  }
2389
- return true;
2390
- } break;
2391
- case GGML_OP_CONT: {
2392
- // TODO: support GGML_TYPE_BF16
2393
- switch (op->src[0]->type) {
2394
- case GGML_TYPE_F32:
2395
- case GGML_TYPE_F16:
2396
- return true;
2397
- default:
2450
+ break;
2451
+ case GGML_OP_CPY:
2452
+ {
2453
+ ggml_tensor * src = op->src[0];
2454
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
2455
+ (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
2456
+ // only support F32 and F16.
2398
2457
  return false;
2458
+ }
2459
+ return true;
2399
2460
  }
2400
- }
2401
- case GGML_OP_ROPE: {
2402
- // TODO: with ops-test v == 1
2403
- float ext_factor = 0.0f;
2404
- memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
2405
- // TODO: n_dims <= ne0
2406
- if (op->src[0]->ne[0] != op->op_params[1]) {
2407
- return false;
2408
- }
2409
- // TODO: ext_factor != 0
2410
- if (ext_factor != 0) {
2411
- return false;
2412
- }
2413
-
2414
- const int mode = ((const int32_t *) op->op_params)[2];
2415
- if (mode & GGML_ROPE_TYPE_MROPE) {
2416
- return false;
2417
- }
2418
- if (mode & GGML_ROPE_TYPE_VISION) {
2419
- return false;
2461
+ break;
2462
+ case GGML_OP_CONT:
2463
+ {
2464
+ // TODO: support GGML_TYPE_BF16
2465
+ switch (op->src[0]->type) {
2466
+ case GGML_TYPE_F32:
2467
+ case GGML_TYPE_F16:
2468
+ return true;
2469
+ default:
2470
+ return false;
2471
+ }
2420
2472
  }
2473
+ case GGML_OP_ROPE:
2474
+ {
2475
+ // TODO: with ops-test v == 1
2476
+ // TODO: n_dims <= ne0
2477
+ if (op->src[0]->ne[0] != op->op_params[1]) {
2478
+ return false;
2479
+ }
2421
2480
 
2422
- if(!ggml_is_contiguous(op->src[0])){
2423
- return false;
2424
- }
2425
- return true;
2426
- }
2427
- case GGML_OP_UPSCALE: {
2428
- // aclnnUpsampleNearest2dGetWorkspaceSize not support
2429
- // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2430
- if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2431
- return false;
2481
+ const int mode = ((const int32_t *) op->op_params)[2];
2482
+ if (mode & GGML_ROPE_TYPE_MROPE) {
2483
+ return false;
2484
+ }
2485
+ if (mode & GGML_ROPE_TYPE_VISION) {
2486
+ return false;
2487
+ }
2488
+ if (op->src[0]->ne[0] > 896) {
2489
+ return false;
2490
+ }
2491
+ #ifdef ASCEND_310P
2492
+ if (!ggml_is_contiguous(op->src[0])) {
2493
+ return false;
2494
+ }
2495
+ #endif
2496
+ return true;
2432
2497
  }
2433
- if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2434
- return false;
2498
+ case GGML_OP_UPSCALE:
2499
+ {
2500
+ // aclnnUpsampleNearest2dGetWorkspaceSize not support
2501
+ // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
2502
+ if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
2503
+ return false;
2504
+ }
2505
+ if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
2506
+ return false;
2507
+ }
2508
+ return true;
2435
2509
  }
2436
- return true;
2437
- }
2438
- case GGML_OP_POOL_2D: {
2439
- const int32_t * opts = (const int32_t *) op->op_params;
2510
+ case GGML_OP_POOL_2D:
2511
+ {
2512
+ const int32_t * opts = (const int32_t *) op->op_params;
2440
2513
  #ifdef ASCEND_310P
2441
- enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2442
- if(opt == GGML_OP_POOL_MAX){
2443
- return false;
2444
- }
2514
+ enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
2515
+ if (opt == GGML_OP_POOL_MAX) {
2516
+ return false;
2517
+ }
2445
2518
  #endif
2446
- const int k0 = opts[1];
2447
- const int k1 = opts[2];
2448
- const int p0 = opts[5];
2449
- const int p1 = opts[6];
2450
- // value of paddingH should be at most half of kernelH
2451
- // value of paddingW should be at most half of kernelW
2452
- return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2453
- }
2454
- case GGML_OP_DUP:
2519
+ const int k0 = opts[1];
2520
+ const int k1 = opts[2];
2521
+ const int p0 = opts[5];
2522
+ const int p1 = opts[6];
2523
+ // value of paddingH should be at most half of kernelH
2524
+ // value of paddingW should be at most half of kernelW
2525
+ return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
2526
+ }
2455
2527
  case GGML_OP_SUM:
2528
+ return ggml_is_contiguous_rows(op->src[0]);
2529
+ case GGML_OP_L2_NORM:
2530
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2531
+ case GGML_OP_DUP:
2456
2532
  case GGML_OP_IM2COL:
2457
2533
  case GGML_OP_CONCAT:
2458
2534
  case GGML_OP_REPEAT:
@@ -2483,63 +2559,60 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2483
2559
  case GGML_OP_ARGMAX:
2484
2560
  case GGML_OP_COS:
2485
2561
  case GGML_OP_SIN:
2486
- case GGML_OP_CONV_TRANSPOSE_1D:
2487
2562
  case GGML_OP_LOG:
2488
2563
  case GGML_OP_MEAN:
2489
2564
  case GGML_OP_PAD_REFLECT_1D:
2490
2565
  case GGML_OP_COUNT_EQUAL:
2491
2566
  return true;
2567
+ case GGML_OP_CONV_TRANSPOSE_1D:
2568
+ // TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
2569
+ return (op->src[0]->ne[0] - 1) <= 255;
2492
2570
  case GGML_OP_SCALE:
2493
2571
  float bias;
2494
- memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
2495
- return bias == 0.0f; // TODO: support bias != 0.0f
2572
+ memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
2573
+ return bias == 0.0f; // TODO: support bias != 0.0f
2496
2574
  case GGML_OP_SOFT_MAX:
2497
2575
  // TODO: support attention sinks [TAG_ATTN_SINKS]
2498
2576
  if (op->src[2]) {
2499
2577
  return false;
2500
2578
  }
2501
2579
  return true;
2502
- case GGML_OP_FLASH_ATTN_EXT:{
2580
+ case GGML_OP_FLASH_ATTN_EXT:
2581
+ {
2503
2582
  #ifdef ASCEND_310P
2504
- // FA not support on 310p device
2505
- return false;
2506
- #endif
2507
- // derived from [ggml-cuda.cu]
2508
- if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
2509
- return false;
2510
- }
2511
- if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
2512
- return false;
2513
- }
2514
- if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
2515
- return false;
2516
- }
2517
- // TODO: support attention sinks [TAG_ATTN_SINKS]
2518
- if (op->src[4]) {
2519
- return false;
2520
- }
2521
- if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2522
- // different head sizes of K and V are not supported yet
2523
- return false;
2524
- }
2525
- if (op->src[0]->ne[0] == 192) {
2526
- return false;
2527
- }
2528
- if (op->src[0]->ne[0] == 576) {
2529
- // DeepSeek MLA
2530
- return false;
2531
- }
2532
- if (op->src[0]->ne[0] % 16 != 0) {
2533
- // TODO: padding to support
2534
- return false;
2535
- }
2536
- float logitSoftcap = 0.0f;
2537
- memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float));
2538
- if(logitSoftcap != 0.0f) {
2583
+ // FA not support on 310p device
2539
2584
  return false;
2585
+ #endif
2586
+ // derived from [ggml-cuda.cu]
2587
+ if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
2588
+ return false;
2589
+ }
2590
+ if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 &&
2591
+ op->src[1]->type != GGML_TYPE_BF16) {
2592
+ return false;
2593
+ }
2594
+ if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
2595
+ return false;
2596
+ }
2597
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
2598
+ if (op->src[4]) {
2599
+ return false;
2600
+ }
2601
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
2602
+ // different head sizes of K and V are not supported yet
2603
+ return false;
2604
+ }
2605
+ if (op->src[0]->ne[0] % 16 != 0) {
2606
+ // TODO: padding to support
2607
+ return false;
2608
+ }
2609
+ float logitSoftcap = 0.0f;
2610
+ memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
2611
+ if (logitSoftcap != 0.0f) {
2612
+ return false;
2613
+ }
2614
+ return true;
2540
2615
  }
2541
- return true;
2542
- }
2543
2616
  default:
2544
2617
  return false;
2545
2618
  }
@@ -2576,8 +2649,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
2576
2649
  * @return bool Returns true if the operation should be offloaded, otherwise
2577
2650
  * false.
2578
2651
  */
2579
- static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2580
- const ggml_tensor* op) {
2652
+ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2581
2653
  const int min_batch_size = 32;
2582
2654
  GGML_UNUSED(dev);
2583
2655
 
@@ -2593,9 +2665,8 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
2593
2665
  * @param event Pointer to the event structure to be recorded.
2594
2666
  */
2595
2667
  static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
2596
- ggml_backend_cann_context* cann_ctx =
2597
- (ggml_backend_cann_context*)backend->context;
2598
- ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
2668
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2669
+ ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream()));
2599
2670
  }
2600
2671
 
2601
2672
  /**
@@ -2608,13 +2679,10 @@ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_
2608
2679
  * @param event Pointer to the event structure that the backend needs to wait
2609
2680
  * for.
2610
2681
  */
2611
- static void ggml_backend_cann_event_wait(ggml_backend_t backend,
2612
- ggml_backend_event_t event) {
2613
- ggml_backend_cann_context* cann_ctx =
2614
- (ggml_backend_cann_context*)backend->context;
2682
+ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
2683
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
2615
2684
  if (ggml_backend_is_cann(backend)) {
2616
- ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
2617
- (aclrtEvent)event->context));
2685
+ ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context));
2618
2686
  } else {
2619
2687
  GGML_ABORT("fatal error");
2620
2688
  }
@@ -2641,6 +2709,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
2641
2709
  /* .graph_compute = */ ggml_backend_cann_graph_compute,
2642
2710
  /* .event_record = */ ggml_backend_cann_event_record,
2643
2711
  /* .event_wait = */ ggml_backend_cann_event_wait,
2712
+ /* .graph_optimize = */ NULL,
2644
2713
  };
2645
2714
 
2646
2715
  /**
@@ -2652,30 +2721,30 @@ static const ggml_backend_i ggml_backend_cann_interface = {
2652
2721
  * @return A pointer to the static GUID.
2653
2722
  */
2654
2723
  static ggml_guid_t ggml_backend_cann_guid() {
2655
- static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2656
- 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64};
2724
+ static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
2725
+ 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 };
2657
2726
  return &guid;
2658
2727
  }
2659
2728
 
2660
2729
  // backend device
2661
2730
  struct ggml_backend_cann_device_context {
2662
- int device;
2731
+ int device;
2663
2732
  std::string name;
2664
2733
  std::string description;
2665
2734
  };
2666
2735
 
2667
2736
  static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
2668
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2737
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2669
2738
  return ctx->name.c_str();
2670
2739
  }
2671
2740
 
2672
- static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2673
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2741
+ static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
2742
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2674
2743
  return ctx->description.c_str();
2675
2744
  }
2676
2745
 
2677
2746
  static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2678
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2747
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2679
2748
  ggml_backend_cann_get_device_memory(ctx->device, free, total);
2680
2749
  }
2681
2750
 
@@ -2702,7 +2771,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
2702
2771
 
2703
2772
  static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
2704
2773
  GGML_UNUSED(params);
2705
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2774
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2706
2775
  return ggml_backend_cann_init(ctx->device);
2707
2776
  }
2708
2777
 
@@ -2719,19 +2788,17 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
2719
2788
  * @return bool Returns true if the CANN backend supports the buffer type,
2720
2789
  * otherwise false.
2721
2790
  */
2722
- static bool ggml_backend_cann_supports_buft(
2723
- ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2791
+ static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2724
2792
  if (ggml_backend_buft_is_cann(buft)) {
2725
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2726
- ggml_backend_cann_buffer_type_context * buft_ctx =
2727
- (ggml_backend_cann_buffer_type_context *)buft->context;
2793
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
2794
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
2728
2795
  return buft_ctx->device == dev_ctx->device;
2729
2796
  }
2730
2797
  return false;
2731
2798
  }
2732
2799
 
2733
2800
  static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
2734
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
2801
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
2735
2802
  return ggml_backend_cann_buffer_type(ctx->device);
2736
2803
  }
2737
2804
 
@@ -2750,9 +2817,8 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
2750
2817
  * @param backend Pointer to the CANN backend.
2751
2818
  * @return ggml_backend_event_t Returns a pointer to the new event structure.
2752
2819
  */
2753
- static ggml_backend_event_t ggml_backend_cann_device_event_new(
2754
- ggml_backend_dev_t dev) {
2755
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
2820
+ static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) {
2821
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
2756
2822
 
2757
2823
  ggml_cann_set_device(dev_ctx->device);
2758
2824
 
@@ -2774,7 +2840,7 @@ static ggml_backend_event_t ggml_backend_cann_device_event_new(
2774
2840
  * @param event Pointer to the event structure to be freed.
2775
2841
  */
2776
2842
  static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2777
- ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
2843
+ ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context));
2778
2844
 
2779
2845
  delete event;
2780
2846
  GGML_UNUSED(dev);
@@ -2788,7 +2854,7 @@ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_bac
2788
2854
  * @param event Pointer to the event structure to be synchronized.
2789
2855
  */
2790
2856
  static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
2791
- ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
2857
+ ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context));
2792
2858
 
2793
2859
  GGML_UNUSED(dev);
2794
2860
  }
@@ -2799,10 +2865,10 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2799
2865
  /* .get_memory = */ ggml_backend_cann_device_get_memory,
2800
2866
  /* .get_type = */ ggml_backend_cann_device_get_type,
2801
2867
  /* .get_props = */ ggml_backend_cann_device_get_props,
2802
- /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2868
+ /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
2803
2869
  /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
2804
2870
  /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
2805
- /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2871
+ /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
2806
2872
  /* .supports_op = */ ggml_backend_cann_supports_op,
2807
2873
  /* .supports_buft = */ ggml_backend_cann_supports_buft,
2808
2874
  /* .offload_op = */ ggml_backend_cann_offload_op,
@@ -2811,7 +2877,6 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
2811
2877
  /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
2812
2878
  };
2813
2879
 
2814
-
2815
2880
  // backend reg
2816
2881
  struct ggml_backend_cann_reg_context {
2817
2882
  std::vector<ggml_backend_dev_t> devices;
@@ -2823,12 +2888,12 @@ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
2823
2888
  }
2824
2889
 
2825
2890
  static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
2826
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2891
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
2827
2892
  return ctx->devices.size();
2828
2893
  }
2829
2894
 
2830
2895
  static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
2831
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
2896
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
2832
2897
  GGML_ASSERT(index < ctx->devices.size());
2833
2898
  return ctx->devices[index];
2834
2899
  }
@@ -2850,34 +2915,30 @@ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
2850
2915
  // backend registry, called only once for cann backend
2851
2916
  ggml_backend_reg_t ggml_backend_cann_reg() {
2852
2917
  static ggml_backend_reg reg;
2853
- static bool initialized = false;
2918
+ static bool initialized = false;
2854
2919
 
2855
2920
  {
2856
- static std::mutex mutex;
2921
+ static std::mutex mutex;
2857
2922
  std::lock_guard<std::mutex> lock(mutex);
2858
2923
  if (!initialized) {
2859
2924
  aclInit(nullptr);
2860
2925
  ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
2861
2926
 
2862
2927
  for (int i = 0; i < ggml_cann_info().device_count; i++) {
2863
- ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
2864
- dev_ctx->description = aclrtGetSocName();
2865
- dev_ctx->device = i;
2866
- dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2928
+ ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
2929
+ dev_ctx->description = aclrtGetSocName();
2930
+ dev_ctx->device = i;
2931
+ dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
2867
2932
  ggml_cann_set_device(i);
2868
- ggml_backend_dev_t dev = new ggml_backend_device {
2869
- /* .iface = */ ggml_backend_cann_device_interface,
2870
- /* .reg = */ &reg,
2871
- /* .context = */ dev_ctx
2872
- };
2933
+ ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
2934
+ /* .reg = */ &reg,
2935
+ /* .context = */ dev_ctx };
2873
2936
  ctx->devices.push_back(dev);
2874
2937
  }
2875
2938
 
2876
- reg = ggml_backend_reg {
2877
- /* .api_version = */ GGML_BACKEND_API_VERSION,
2878
- /* .iface = */ ggml_backend_cann_reg_interface,
2879
- /* .context = */ ctx
2880
- };
2939
+ reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
2940
+ /* .iface = */ ggml_backend_cann_reg_interface,
2941
+ /* .context = */ ctx };
2881
2942
  }
2882
2943
 
2883
2944
  initialized = true;
@@ -2893,39 +2954,36 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
2893
2954
  return nullptr;
2894
2955
  }
2895
2956
 
2896
- ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
2957
+ ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device);
2897
2958
  if (ctx == nullptr) {
2898
2959
  GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
2899
2960
  return nullptr;
2900
2961
  }
2901
2962
  ggml_cann_set_device(ctx->device);
2902
2963
  ggml_backend_t cann_backend =
2903
- new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
2904
- /* .interface = */ ggml_backend_cann_interface,
2905
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
2906
- /* .context = */ ctx};
2964
+ new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
2965
+ /* .interface = */ ggml_backend_cann_interface,
2966
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
2967
+ /* .context = */ ctx };
2907
2968
 
2908
2969
  return cann_backend;
2909
2970
  }
2910
2971
 
2911
2972
  bool ggml_backend_is_cann(ggml_backend_t backend) {
2912
- return backend != NULL &&
2913
- ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2973
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
2914
2974
  }
2915
2975
 
2916
2976
  int32_t ggml_backend_cann_get_device_count() {
2917
2977
  return ggml_cann_info().device_count;
2918
2978
  }
2919
2979
 
2920
- void ggml_backend_cann_get_device_description(
2921
- int32_t device, char* description, size_t description_size) {
2980
+ void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) {
2922
2981
  ggml_cann_set_device(device);
2923
- const char* soc_name = aclrtGetSocName();
2982
+ const char * soc_name = aclrtGetSocName();
2924
2983
  snprintf(description, description_size, "%s", soc_name);
2925
2984
  }
2926
2985
 
2927
- void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
2928
- size_t* total) {
2986
+ void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) {
2929
2987
  ggml_cann_set_device(device);
2930
2988
  ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
2931
2989
  }