@novastera-oss/llamarn 0.4.0 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (979)
  1. package/RNLlamaCpp.podspec +4 -1
  2. package/android/CMakeLists.txt +13 -3
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/LlamaCppModel.cpp +2 -10
  21. package/cpp/SystemUtils.cpp +3 -7
  22. package/cpp/build-info.cpp +2 -2
  23. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  24. package/cpp/llama.cpp/CODEOWNERS +116 -10
  25. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  26. package/cpp/llama.cpp/README.md +13 -5
  27. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  28. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  29. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  30. package/cpp/llama.cpp/common/arg.cpp +303 -795
  31. package/cpp/llama.cpp/common/arg.h +2 -3
  32. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  33. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  34. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  35. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  36. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  37. package/cpp/llama.cpp/common/chat.h +16 -3
  38. package/cpp/llama.cpp/common/common.cpp +70 -15
  39. package/cpp/llama.cpp/common/common.h +57 -19
  40. package/cpp/llama.cpp/common/download.cpp +1072 -0
  41. package/cpp/llama.cpp/common/download.h +55 -0
  42. package/cpp/llama.cpp/common/http.h +73 -0
  43. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  44. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  45. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  46. package/cpp/llama.cpp/common/log.cpp +59 -2
  47. package/cpp/llama.cpp/common/log.h +12 -4
  48. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  49. package/cpp/llama.cpp/common/sampling.h +3 -1
  50. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  51. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  52. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  53. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  54. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  55. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  56. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  57. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  58. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  59. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  60. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  61. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  62. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  64. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  66. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  67. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  73. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  74. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  102. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  211. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  212. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  239. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  240. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  241. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  253. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  254. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  255. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  278. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  279. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  280. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  320. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  321. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  470. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  471. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  490. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  495. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  496. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  497. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  498. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  499. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  500. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  502. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  504. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  505. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  506. package/cpp/llama.cpp/include/llama.h +44 -21
  507. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  509. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  510. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  511. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  512. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  513. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  514. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  515. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  516. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  517. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  518. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  519. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  520. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  521. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  522. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  523. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  524. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  525. package/cpp/llama.cpp/src/llama-context.h +16 -6
  526. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  527. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  528. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  529. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  530. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  531. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  532. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  533. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  535. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  536. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  537. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  538. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  539. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  540. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  541. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  542. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  543. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  544. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  545. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  546. package/cpp/llama.cpp/src/llama-model.h +40 -4
  547. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  548. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  549. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  550. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  551. package/cpp/llama.cpp/src/llama.cpp +69 -10
  552. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  553. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  554. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  555. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  556. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  557. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  558. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  559. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  560. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  561. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  562. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  563. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  564. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  565. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  566. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  567. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  568. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  569. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  570. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  571. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  572. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  573. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  574. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  575. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  576. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  577. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  578. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  579. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  580. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  581. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  582. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  583. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  584. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  585. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  586. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  587. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  588. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  589. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  590. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  591. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  592. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  593. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  594. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  595. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  596. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  597. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  598. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  599. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  600. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  601. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  602. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  603. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  604. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  605. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  606. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  607. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  608. package/cpp/llama.cpp/src/models/models.h +485 -0
  609. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  610. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  611. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  612. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  613. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  614. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  615. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  617. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  618. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  619. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  620. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  621. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  622. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  623. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  624. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  625. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  626. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  628. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  629. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  630. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  631. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  632. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  633. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  635. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  636. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  637. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  638. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  639. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  640. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  641. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  642. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  643. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  644. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  645. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  646. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  647. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  648. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  649. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  650. package/cpp/llama.cpp/src/unicode.h +43 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  652. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  653. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  654. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  655. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  656. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  657. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  658. package/cpp/rn-completion.cpp +3 -27
  659. package/ios/include/chat.h +16 -3
  660. package/ios/include/common/minja/chat-template.hpp +9 -2
  661. package/ios/include/common/minja/minja.hpp +101 -22
  662. package/ios/include/common.h +57 -19
  663. package/ios/include/json-schema-to-grammar.h +2 -0
  664. package/ios/include/llama.h +44 -21
  665. package/ios/include/log.h +12 -4
  666. package/ios/include/sampling.h +3 -1
  667. package/ios/libs/llama.xcframework/Info.plist +20 -20
  668. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  669. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  673. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  674. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  675. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  682. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  683. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  684. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  685. package/package.json +10 -4
  686. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  777. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  778. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  779. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  780. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  781. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  782. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  783. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  784. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  785. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  786. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  787. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  804. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  805. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  814. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  815. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  816. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  826. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  827. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  828. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  829. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  830. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  831. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  832. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  833. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  834. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  835. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  836. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  837. package/cpp/llama.cpp/models/templates/README.md +0 -25
  838. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  839. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  840. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  841. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  842. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  843. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  844. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  845. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  846. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  847. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  848. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  849. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  850. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  851. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  852. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  853. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  854. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  855. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  856. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  857. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  858. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  859. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  861. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  862. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  863. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  864. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  865. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  866. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  867. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  868. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  906. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  907. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  908. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  909. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  910. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  911. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  921. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  922. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  923. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  937. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  938. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  939. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  940. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  941. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  942. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  952. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  953. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  954. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  968. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  969. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  970. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  977. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  978. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  979. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -22,54 +22,58 @@
22
22
 
23
23
  #include "aclnn_ops.h"
24
24
 
25
+ #include "ggml-impl.h"
26
+ #include "ggml.h"
27
+
28
+ #include <aclnnop/aclnn_add.h>
25
29
  #include <aclnnop/aclnn_addcdiv.h>
30
+ #include <aclnnop/aclnn_argmax.h>
26
31
  #include <aclnnop/aclnn_avgpool2d.h>
27
32
  #include <aclnnop/aclnn_batch_matmul.h>
28
33
  #include <aclnnop/aclnn_cast.h>
34
+ #include <aclnnop/aclnn_clamp.h>
29
35
  #include <aclnnop/aclnn_constant_pad_nd.h>
36
+ #include <aclnnop/aclnn_convolution.h>
30
37
  #include <aclnnop/aclnn_copy.h>
31
38
  #include <aclnnop/aclnn_div.h>
39
+ #include <aclnnop/aclnn_elu.h>
32
40
  #include <aclnnop/aclnn_embedding.h>
41
+ #include <aclnnop/aclnn_eq_tensor.h>
33
42
  #include <aclnnop/aclnn_exp.h>
34
43
  #include <aclnnop/aclnn_fill_scalar.h>
44
+ #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
35
45
  #include <aclnnop/aclnn_group_norm.h>
46
+ #include <aclnnop/aclnn_grouped_matmul_v3.h>
47
+ #include <aclnnop/aclnn_gt_scalar.h>
48
+ #include <aclnnop/aclnn_im2col.h>
49
+ #include <aclnnop/aclnn_index_copy.h>
36
50
  #include <aclnnop/aclnn_index_fill_tensor.h>
51
+ #include <aclnnop/aclnn_index_select.h>
37
52
  #include <aclnnop/aclnn_layer_norm.h>
53
+ #include <aclnnop/aclnn_log.h>
38
54
  #include <aclnnop/aclnn_matmul.h>
39
55
  #include <aclnnop/aclnn_max_pool.h>
56
+ #include <aclnnop/aclnn_mean.h>
40
57
  #include <aclnnop/aclnn_mm.h>
58
+ #include <aclnnop/aclnn_mul.h>
41
59
  #include <aclnnop/aclnn_permute.h>
60
+ #include <aclnnop/aclnn_pow.h>
42
61
  #include <aclnnop/aclnn_pow_tensor_tensor.h>
43
62
  #include <aclnnop/aclnn_reduce_sum.h>
63
+ #include <aclnnop/aclnn_reflection_pad1d.h>
44
64
  #include <aclnnop/aclnn_repeat.h>
45
65
  #include <aclnnop/aclnn_repeat_interleave.h>
66
+ #include <aclnnop/aclnn_rms_norm.h>
46
67
  #include <aclnnop/aclnn_roll.h>
47
68
  #include <aclnnop/aclnn_softmax.h>
69
+ #include <aclnnop/aclnn_sub.h>
70
+ #include <aclnnop/aclnn_sum.h>
71
+ #include <aclnnop/aclnn_threshold.h>
48
72
  #include <aclnnop/aclnn_tril.h>
49
73
  #include <aclnnop/aclnn_triu.h>
50
74
  #include <aclnnop/aclnn_upsample_nearest_2d.h>
51
75
  #include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
52
- #include <aclnnop/aclnn_argmax.h>
53
- #include <aclnnop/aclnn_sum.h>
54
- #include <aclnnop/aclnn_rms_norm.h>
55
- #include <aclnnop/aclnn_im2col.h>
56
- #include <aclnnop/aclnn_add.h>
57
- #include <aclnnop/aclnn_sub.h>
58
- #include <aclnnop/aclnn_mul.h>
59
- #include <aclnnop/aclnn_div.h>
60
- #include <aclnnop/aclnn_convolution.h>
61
- #include <aclnnop/aclnn_elu.h>
62
- #include <aclnnop/aclnn_log.h>
63
- #include <aclnnop/aclnn_mean.h>
64
- #include <aclnnop/aclnn_reflection_pad1d.h>
65
- #include <aclnnop/aclnn_eq_tensor.h>
66
- #include <aclnnop/aclnn_gt_scalar.h>
67
- #include <aclnnop/aclnn_pow.h>
68
- #include <aclnnop/aclnn_grouped_matmul_v3.h>
69
- #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
70
76
  #include <aclnnop/aclnn_zero.h>
71
- #include <aclnnop/aclnn_index_copy.h>
72
- #include <aclnnop/aclnn_index_select.h>
73
77
  #include <float.h>
74
78
 
75
79
  #include <cmath>
@@ -77,76 +81,71 @@
77
81
  #include <exception>
78
82
  #include <vector>
79
83
 
80
- #include "ggml-impl.h"
81
- #include "ggml.h"
82
-
83
84
  #define GGML_COMMON_DECL_C
84
85
 
85
86
  #include "../ggml-common.h"
86
87
 
87
-
88
- void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
89
- aclTensor ** acl_src1, aclTensor ** acl_dst) {
88
+ void bcast_shape(ggml_tensor * src0,
89
+ ggml_tensor * src1,
90
+ ggml_tensor * dst,
91
+ acl_tensor_ptr & acl_src0,
92
+ acl_tensor_ptr & acl_src1,
93
+ acl_tensor_ptr & acl_dst) {
90
94
  GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
91
95
  // Need bcast
92
96
  if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
93
97
  BCAST_SHAPE(src0, src1)
94
- *acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
95
- *acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
96
- *acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
98
+ acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
99
+ acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
100
+ acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
97
101
  } else {
98
- *acl_src0 = ggml_cann_create_tensor(src0);
99
- *acl_src1 = ggml_cann_create_tensor(src1);
100
- *acl_dst = ggml_cann_create_tensor(dst);
102
+ acl_src0 = ggml_cann_create_tensor(src0);
103
+ acl_src1 = ggml_cann_create_tensor(src1);
104
+ acl_dst = ggml_cann_create_tensor(dst);
101
105
  }
102
106
  }
103
107
 
104
- void ggml_cann_op_unary(
105
- std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
106
- ggml_backend_cann_context& ctx, ggml_tensor* dst) {
107
- ggml_tensor* src = dst->src[0];
108
+ void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
109
+ ggml_backend_cann_context & ctx,
110
+ ggml_tensor * dst) {
111
+ ggml_tensor * src = dst->src[0];
108
112
 
109
- aclTensor* acl_src = ggml_cann_create_tensor(src);
110
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
113
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
114
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
111
115
 
112
- unary_op(ctx, acl_src, acl_dst);
113
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
116
+ unary_op(ctx, acl_src.get(), acl_dst.get());
114
117
  }
115
118
 
116
- void ggml_cann_op_unary_gated(
117
- std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
118
- ggml_backend_cann_context& ctx, ggml_tensor* dst) {
119
- ggml_tensor* src0 = dst->src[0];
120
- ggml_tensor* src1 = dst->src[1];
119
+ void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
120
+ ggml_backend_cann_context & ctx,
121
+ ggml_tensor * dst) {
122
+ ggml_tensor * src0 = dst->src[0];
123
+ ggml_tensor * src1 = dst->src[1];
121
124
 
122
125
  GGML_ASSERT(ggml_is_contiguous_1(src0));
123
126
  GGML_ASSERT(ggml_is_contiguous_1(dst));
124
127
  const int32_t swapped = ggml_get_op_params_i32(dst, 1);
125
128
 
126
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
127
- aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
128
- if(src1) {
129
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
130
+ acl_tensor_ptr acl_src0, acl_src1;
131
+ if (src1) {
129
132
  GGML_ASSERT(ggml_is_contiguous_1(src1));
130
133
  GGML_ASSERT(src0->type == src1->type);
131
134
 
132
135
  acl_src0 = ggml_cann_create_tensor(src0);
133
136
  acl_src1 = ggml_cann_create_tensor(src1);
134
137
  } else {
135
- int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
136
- size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
137
- acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
138
+ int64_t ne[] = { src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3] };
139
+ size_t nb[] = { src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3] };
140
+ acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
138
141
  acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
139
142
  if (swapped) {
140
143
  std::swap(acl_src0, acl_src1);
141
144
  }
142
145
  }
143
146
 
144
- unary_op(ctx, acl_src0, acl_dst);
145
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);
146
-
147
- ggml_cann_release_resources(ctx, acl_src0, acl_dst);
148
- if(src1)
149
- ggml_cann_release_resources(ctx, acl_src1);
147
+ unary_op(ctx, acl_src0.get(), acl_dst.get());
148
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst.get(), acl_src1.get());
150
149
  }
151
150
 
152
151
  /**
@@ -159,13 +158,14 @@ void ggml_cann_op_unary_gated(
159
158
  * @param repeat_array The array specifying the number of repetitions along each
160
159
  * dimension.
161
160
  */
162
- static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
163
- aclTensor* acl_dst, int64_t* repeat_array) {
161
+ static void aclnn_repeat(ggml_backend_cann_context & ctx,
162
+ aclTensor * acl_src,
163
+ aclTensor * acl_dst,
164
+ int64_t * repeat_array) {
164
165
  // repeat tensor along each dim with repeat_array
165
- aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
166
+ acl_int_array_ptr repeats = ggml_cann_create_int_array(repeat_array, GGML_MAX_DIMS);
166
167
 
167
- GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats, acl_dst);
168
- ggml_cann_release_resources(ctx, repeats);
168
+ GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats.get(), acl_dst);
169
169
  }
170
170
 
171
171
  /**
@@ -181,61 +181,60 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
181
181
  * @param cast_data_type The target data type to which the source tensor will be
182
182
  * casted.
183
183
  */
184
- static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
185
- aclTensor* acl_dst, aclDataType cast_data_type) {
184
+ static void aclnn_cast(ggml_backend_cann_context & ctx,
185
+ aclTensor * acl_src,
186
+ aclTensor * acl_dst,
187
+ aclDataType cast_data_type) {
186
188
  GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
187
189
  }
188
190
 
189
- void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
190
- ggml_tensor* src = dst->src[0];
191
+ void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
192
+ ggml_tensor * src = dst->src[0];
191
193
  GGML_ASSERT(ggml_can_repeat(src, dst));
192
194
 
193
- aclTensor* acl_src = ggml_cann_create_tensor(src);
194
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
195
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
196
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
195
197
 
196
- int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
197
- dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
198
+ int64_t repeatsArray[] = { dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], dst->ne[1] / src->ne[1],
199
+ dst->ne[0] / src->ne[0] };
198
200
 
199
- aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
200
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
201
+ aclnn_repeat(ctx, acl_src.get(), acl_dst.get(), repeatsArray);
201
202
  }
202
203
 
203
- void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
204
- aclTensor* acl_src1, aclTensor* acl_dst) {
205
- float alphaValue = 1.0f;
206
- aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
207
- if (acl_dst != nullptr)
208
- GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
209
- else
210
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha);
211
- ggml_cann_release_resources(ctx, alpha);
204
+ void aclnn_add(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
205
+ float alphaValue = 1.0f;
206
+ acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
207
+ if (acl_dst != nullptr) {
208
+ GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha.get(), acl_dst);
209
+ } else {
210
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha.get());
211
+ }
212
212
  }
213
213
 
214
- void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
215
- aclTensor* acl_src1, aclTensor* acl_dst) {
216
- float alphaValue = 1.0f;
217
- aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
218
- if (acl_dst != nullptr)
219
- GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha, acl_dst);
220
- else
221
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha);
222
- ggml_cann_release_resources(ctx, alpha);
214
+ void aclnn_sub(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
215
+ float alphaValue = 1.0f;
216
+ acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
217
+ if (acl_dst != nullptr) {
218
+ GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha.get(), acl_dst);
219
+ } else {
220
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha.get());
221
+ }
223
222
  }
224
223
 
225
- void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
226
- aclTensor* acl_other, aclTensor* acl_dst) {
227
- if (acl_dst != nullptr)
224
+ void aclnn_mul(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
225
+ if (acl_dst != nullptr) {
228
226
  GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
229
- else
227
+ } else {
230
228
  GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
229
+ }
231
230
  }
232
231
 
233
- void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
234
- aclTensor* acl_other, aclTensor* acl_dst) {
235
- if (acl_dst != nullptr)
232
+ void aclnn_div(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
233
+ if (acl_dst != nullptr) {
236
234
  GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
237
- else
235
+ } else {
238
236
  GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
237
+ }
239
238
  }
240
239
 
241
240
  /**
@@ -260,33 +259,30 @@ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
260
259
  * @param inplace Flag indicating whether to perform the operation in-place on
261
260
  * `acl_src`.
262
261
  */
263
- static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
264
- float scale, aclTensor* acl_dst, bool inplace) {
265
- aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
262
+ static void aclnn_muls(ggml_backend_cann_context & ctx,
263
+ aclTensor * acl_src,
264
+ float scale,
265
+ aclTensor * acl_dst,
266
+ bool inplace) {
267
+ acl_scalar_ptr acl_scale = ggml_cann_create_scalar(&scale, aclDataType::ACL_FLOAT);
266
268
  if (inplace) {
267
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale);
269
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale.get());
268
270
  } else {
269
- GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale, acl_dst);
271
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale.get(), acl_dst);
270
272
  }
271
- ggml_cann_release_resources(ctx, acl_scale);
272
273
  }
273
274
 
274
- void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
275
- ggml_tensor* src = dst->src[0];
275
+ void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
276
+ ggml_tensor * src = dst->src[0];
276
277
 
277
- GGML_ASSERT(src->type == GGML_TYPE_F32);
278
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
279
-
280
- aclTensor* acl_src = ggml_cann_create_tensor(src);
281
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
278
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
279
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
282
280
 
283
281
  float negative_slope;
284
282
  memcpy(&negative_slope, dst->op_params, sizeof(float));
285
- aclScalar* acl_negative_slope =
286
- aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
283
+ acl_scalar_ptr acl_negative_slope = ggml_cann_create_scalar(&negative_slope, aclDataType::ACL_FLOAT);
287
284
 
288
- GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_negative_slope, acl_dst);
289
- ggml_cann_release_resources(ctx, acl_negative_slope, acl_src, acl_dst);
285
+ GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src.get(), acl_negative_slope.get(), acl_dst.get());
290
286
  }
291
287
 
292
288
  /**
@@ -299,29 +295,27 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
299
295
  * stored.
300
296
  * @param concat_dim The dimension along which the tensors will be concatenated.
301
297
  */
302
- static void aclnn_concat(ggml_backend_cann_context& ctx,
303
- aclTensorList* tensorList, aclTensor* acl_dst,
304
- int64_t concat_dim) {
298
+ static void aclnn_concat(ggml_backend_cann_context & ctx,
299
+ aclTensorList * tensorList,
300
+ aclTensor * acl_dst,
301
+ int64_t concat_dim) {
305
302
  GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
306
303
  }
307
304
 
308
- void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
309
- ggml_tensor* src0 = dst->src[0];
310
- ggml_tensor* src1 = dst->src[1];
311
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
312
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
313
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
305
+ void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
306
+ ggml_tensor * src0 = dst->src[0];
307
+ ggml_tensor * src1 = dst->src[1];
308
+ acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
309
+ acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
310
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
314
311
 
315
312
  const int32_t dim = ggml_get_op_params_i32(dst, 0);
316
313
 
317
314
  GGML_ASSERT(dim >= 0 && dim < 4);
318
315
  int32_t acl_dim = 3 - dim;
319
316
 
320
- aclTensor* tensors[] = {acl_src0, acl_src1};
321
- aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
322
- aclnn_concat(ctx, tensor_list, acl_dst, acl_dim);
323
-
324
- ggml_cann_release_resources(ctx, tensor_list, acl_dst);
317
+ acl_tensor_list_ptr tensor_list = ggml_cann_create_tensor_list(acl_src0, acl_src1);
318
+ aclnn_concat(ctx, tensor_list.get(), acl_dst.get(), acl_dim);
325
319
  }
326
320
 
327
321
  /**
@@ -341,169 +335,277 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
341
335
  * @param step The step size between consecutive values.
342
336
  * @param n_elements The number of elements in the destination tensor.
343
337
  */
344
- static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
345
- float start, float stop, float step,
346
- int64_t n_elements) {
347
- int64_t steps = (int64_t)std::ceil((stop - start) / step);
338
+ static void aclnn_arange(ggml_backend_cann_context & ctx,
339
+ aclTensor * acl_dst,
340
+ float start,
341
+ float stop,
342
+ float step,
343
+ int64_t n_elements) {
344
+ int64_t steps = (int64_t) std::ceil((stop - start) / step);
348
345
  GGML_ASSERT(n_elements == steps);
349
346
 
350
- aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
351
- aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
352
- aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
347
+ acl_scalar_ptr acl_start = ggml_cann_create_scalar(&start, aclDataType::ACL_FLOAT);
348
+ acl_scalar_ptr acl_end = ggml_cann_create_scalar(&stop, aclDataType::ACL_FLOAT);
349
+ acl_scalar_ptr acl_step = ggml_cann_create_scalar(&step, aclDataType::ACL_FLOAT);
353
350
 
354
- GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start, acl_end, acl_step, acl_dst);
355
- ggml_cann_release_resources(ctx, acl_start, acl_end, acl_step);
351
+ GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start.get(), acl_end.get(), acl_step.get(), acl_dst);
356
352
  }
357
353
 
358
- void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
354
+ void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
359
355
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
360
356
 
361
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
357
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
362
358
 
363
359
  int64_t n_elements = ggml_nelements(dst);
364
- float start;
365
- float stop;
366
- float step;
367
- memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
368
- memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
369
- memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
360
+ float start;
361
+ float stop;
362
+ float step;
363
+ memcpy(&start, (float *) dst->op_params + 0, sizeof(float));
364
+ memcpy(&stop, (float *) dst->op_params + 1, sizeof(float));
365
+ memcpy(&step, (float *) dst->op_params + 2, sizeof(float));
370
366
 
371
- aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
372
- ggml_cann_release_resources(ctx, acl_dst);
367
+ aclnn_arange(ctx, acl_dst.get(), start, stop, step, n_elements);
373
368
  }
374
369
 
375
- void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
376
- ggml_tensor* src = dst->src[0];
370
+ void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
371
+ ggml_tensor * src = dst->src[0];
377
372
 
378
373
  float min;
379
374
  float max;
380
375
  memcpy(&min, dst->op_params, sizeof(float));
381
- memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
376
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
382
377
 
383
- aclTensor* acl_src = ggml_cann_create_tensor(src);
384
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
378
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
379
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
385
380
 
386
- aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
387
- aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
381
+ acl_scalar_ptr acl_min = ggml_cann_create_scalar(&min, aclDataType::ACL_FLOAT);
382
+ acl_scalar_ptr acl_max = ggml_cann_create_scalar(&max, aclDataType::ACL_FLOAT);
388
383
 
389
- GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src, acl_min, acl_max, acl_dst);
390
- ggml_cann_release_resources(ctx, acl_min, acl_max, acl_src, acl_dst);
384
+ GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src.get(), acl_min.get(), acl_max.get(), acl_dst.get());
391
385
  }
392
386
 
393
- void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
394
- ggml_tensor* src = dst->src[0];
387
+ void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
388
+ ggml_tensor * src = dst->src[0];
395
389
 
396
390
  // scale factor
397
391
  float v;
398
392
  memcpy(&v, dst->op_params, sizeof(float));
399
393
 
400
- aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
401
- aclTensor* acl_src = ggml_cann_create_tensor(src);
402
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
394
+ acl_scalar_ptr scale = ggml_cann_create_scalar(&v, aclDataType::ACL_FLOAT);
395
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
396
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
403
397
 
404
- GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, scale, acl_dst);
405
- ggml_cann_release_resources(ctx, scale, acl_src, acl_dst);
398
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src.get(), scale.get(), acl_dst.get());
406
399
  }
407
400
 
408
- void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
409
- ggml_tensor* src = dst->src[0];
410
- enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
401
+ void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
402
+ ggml_tensor * src = dst->src[0];
403
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
411
404
 
412
- aclTensor* acl_src = ggml_cann_create_tensor(src);
413
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
414
- ggml_cann_pool_alloc temp_buffer_allocator(
415
- ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
416
- void* buffer = temp_buffer_allocator.get();
417
- aclTensor* tmp_tensor =
418
- ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
419
- dst->ne, dst->nb, GGML_MAX_DIMS);
420
- GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false),
421
- tmp_tensor);
422
- GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor, ggml_cann_type_mapping(dst->type), acl_dst);
423
- ggml_cann_release_resources(ctx, acl_src, tmp_tensor, acl_dst);
405
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
406
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
407
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
408
+ void * buffer = temp_buffer_allocator.get();
409
+ acl_tensor_ptr tmp_tensor =
410
+ ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, dst->nb, GGML_MAX_DIMS);
411
+ GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src.get(), -1, (order == GGML_SORT_ORDER_DESC ? true : false),
412
+ tmp_tensor.get());
413
+ GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor.get(), ggml_cann_type_mapping(dst->type), acl_dst.get());
424
414
  }
425
415
 
426
- void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
427
- ggml_tensor* src = dst->src[0];
416
+ void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
417
+ ggml_tensor * src = dst->src[0];
428
418
 
429
- aclTensor* acl_src = ggml_cann_create_tensor(src);
430
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
419
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
420
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
431
421
 
432
422
  float eps;
433
423
  memcpy(&eps, dst->op_params, sizeof(float));
434
424
 
435
- std::vector<int64_t> normData = {dst->ne[0]};
436
- aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
437
- GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr,
438
- eps, acl_dst, nullptr, nullptr);
439
- ggml_cann_release_resources(ctx, norm, acl_src, acl_dst);
425
+ std::vector<int64_t> normData = { dst->ne[0] };
426
+ acl_int_array_ptr norm = ggml_cann_create_int_array(normData.data(), normData.size());
427
+ GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src.get(), norm.get(), nullptr, nullptr, eps, acl_dst.get(), nullptr,
428
+ nullptr);
429
+ }
430
+
431
+ void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
432
+ ggml_tensor * src = dst->src[0];
433
+
434
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
435
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
436
+
437
+ size_t type_size = ggml_type_size(src->type);
438
+ int64_t n_bytes = src->ne[3] * src->ne[2] * src->ne[1] * type_size;
439
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes);
440
+ void * buffer = temp_buffer_allocator.get();
441
+
442
+ int64_t div_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
443
+ size_t div_nb[GGML_MAX_DIMS];
444
+ div_nb[0] = sizeof(float);
445
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
446
+ div_nb[i] = div_nb[i - 1] * div_ne[i - 1];
447
+ }
448
+ acl_tensor_ptr acl_div = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, div_ne, div_nb, GGML_MAX_DIMS);
449
+
450
+ std::vector<int64_t> norm_dims = { 3 };
451
+ acl_int_array_ptr dims_array = ggml_cann_create_int_array(norm_dims.data(), norm_dims.size());
452
+
453
+ float p_value = 2.0f;
454
+ acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
455
+ GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
456
+ GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
440
457
  }
441
458
 
442
- void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
443
- ggml_tensor* src = dst->src[0];
459
+ void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
460
+ ggml_tensor * src0 = dst->src[0];
461
+ ggml_tensor * src1 = dst->src[1];
462
+
463
+ const int64_t nc = src0->ne[0];
464
+ const int64_t nr = ggml_nrows(src0);
444
465
 
445
- aclTensor* acl_src = ggml_cann_create_tensor(src);
446
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
466
+ int64_t logits_ne[] = { nc, nr };
467
+ size_t logits_nb[2];
468
+ logits_nb[0] = ggml_type_size(src0->type);
469
+ logits_nb[1] = logits_nb[0] * logits_ne[0];
470
+ acl_tensor_ptr acl_logits = ggml_cann_create_tensor(src0->data, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);
471
+
472
+ size_t log_softmax_type_size = sizeof(float);
473
+ int64_t log_softmax_n_bytes = nr * nc * log_softmax_type_size;
474
+ ggml_cann_pool_alloc log_softmax_allocator(ctx.pool(), log_softmax_n_bytes);
475
+ void * log_softmax_buffer = log_softmax_allocator.get();
476
+
477
+ int64_t log_softmax_ne[] = { nc, nr };
478
+ size_t log_softmax_nb[2];
479
+ log_softmax_nb[0] = log_softmax_type_size;
480
+ log_softmax_nb[1] = log_softmax_nb[0] * log_softmax_ne[0];
481
+ acl_tensor_ptr acl_log_softmax = ggml_cann_create_tensor(log_softmax_buffer, ACL_FLOAT, log_softmax_type_size,
482
+ log_softmax_ne, log_softmax_nb, 2);
483
+
484
+ GGML_CANN_CALL_ACLNN_OP(ctx, LogSoftmax, acl_logits.get(), 1, acl_log_softmax.get());
485
+
486
+ int64_t labels_ne[] = { nc, nr };
487
+ size_t labels_nb[2];
488
+ labels_nb[0] = ggml_type_size(src1->type);
489
+ labels_nb[1] = labels_nb[0] * labels_ne[0];
490
+ acl_tensor_ptr acl_labels = ggml_cann_create_tensor(src1->data, ACL_FLOAT, sizeof(float), labels_ne, labels_nb, 2);
491
+
492
+ size_t mul_type_size = sizeof(float);
493
+ int64_t mul_n_bytes = nr * nc * mul_type_size;
494
+ ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_n_bytes);
495
+ void * mul_buffer = mul_allocator.get();
496
+
497
+ int64_t mul_ne[] = { nc, nr };
498
+ size_t mul_nb[2];
499
+ mul_nb[0] = mul_type_size;
500
+ mul_nb[1] = mul_nb[0] * mul_ne[0];
501
+ acl_tensor_ptr acl_mul_result = ggml_cann_create_tensor(mul_buffer, ACL_FLOAT, mul_type_size, mul_ne, mul_nb, 2);
502
+
503
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_log_softmax.get(), acl_labels.get(), acl_mul_result.get());
504
+
505
+ size_t sum_per_sample_type_size = sizeof(float);
506
+ int64_t sum_per_sample_n_bytes = nr * sum_per_sample_type_size;
507
+ ggml_cann_pool_alloc sum_per_sample_allocator(ctx.pool(), sum_per_sample_n_bytes);
508
+ void * sum_per_sample_buffer = sum_per_sample_allocator.get();
509
+
510
+ int64_t sum_per_sample_ne[] = { nr };
511
+ size_t sum_per_sample_nb[1];
512
+ sum_per_sample_nb[0] = sum_per_sample_type_size;
513
+ acl_tensor_ptr acl_sum_per_sample = ggml_cann_create_tensor(
514
+ sum_per_sample_buffer, ACL_FLOAT, sum_per_sample_type_size, sum_per_sample_ne, sum_per_sample_nb, 1);
515
+
516
+ std::vector<int64_t> sum_dims = { 1 };
517
+ acl_int_array_ptr dims_array = ggml_cann_create_int_array(sum_dims.data(), sum_dims.size());
518
+ bool keep_dims = false;
519
+
520
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_mul_result.get(), dims_array.get(), keep_dims, ACL_FLOAT,
521
+ acl_sum_per_sample.get());
522
+
523
+ size_t total_sum_type_size = sizeof(float);
524
+ int64_t total_sum_n_bytes = 1 * total_sum_type_size;
525
+ ggml_cann_pool_alloc total_sum_allocator(ctx.pool(), total_sum_n_bytes);
526
+ void * total_sum_buffer = total_sum_allocator.get();
527
+
528
+ int64_t total_sum_ne[] = { 1 };
529
+ size_t total_sum_nb[1];
530
+ total_sum_nb[0] = total_sum_type_size;
531
+
532
+ acl_tensor_ptr acl_total_sum =
533
+ ggml_cann_create_tensor(total_sum_buffer, ACL_FLOAT, total_sum_type_size, total_sum_ne, total_sum_nb, 1);
534
+
535
+ std::vector<int64_t> total_sum_dims = { 0 };
536
+ acl_int_array_ptr total_sum_dims_array = ggml_cann_create_int_array(total_sum_dims.data(), total_sum_dims.size());
537
+
538
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_sum_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
539
+ acl_total_sum.get());
540
+
541
+ float value = -1.0f / static_cast<float>(nr);
542
+ acl_scalar_ptr scale_factor = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
543
+ acl_tensor_ptr acl_dst =
544
+ ggml_cann_create_tensor(dst->data, ACL_FLOAT, sizeof(float), total_sum_ne, total_sum_nb, 1);
545
+
546
+ GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_total_sum.get(), scale_factor.get(), acl_dst.get());
547
+ }
548
+
549
+ void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
550
+ ggml_tensor * src = dst->src[0];
551
+
552
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
553
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
447
554
 
448
555
  int n_groups = dst->op_params[0];
449
556
 
450
557
  float eps;
451
558
  memcpy(&eps, dst->op_params + 1, sizeof(float));
452
559
 
453
- int64_t N = src->ne[3];
454
- int64_t C = src->ne[2];
560
+ int64_t N = src->ne[3];
561
+ int64_t C = src->ne[2];
455
562
  int64_t HxW = src->ne[1] * src->ne[0];
456
563
 
457
- size_t type_size = ggml_type_size(src->type);
458
- int64_t ne[] = {n_groups, N};
459
- size_t nb[] = {type_size, type_size * n_groups};
460
- size_t n_bytes = N * n_groups;
564
+ size_t type_size = ggml_type_size(src->type);
565
+ int64_t ne[] = { n_groups, N };
566
+ size_t nb[] = { type_size, type_size * n_groups };
567
+ size_t n_bytes = N * n_groups;
461
568
 
462
569
  ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
463
- void* buffer = temp_buffer_allocator.get();
464
- aclTensor* acl_mean_out = ggml_cann_create_tensor(
465
- buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
466
- aclTensor* acl_rstd_out = ggml_cann_create_tensor(
467
- (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
570
+ void * buffer = temp_buffer_allocator.get();
571
+ acl_tensor_ptr acl_mean_out = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
572
+ acl_tensor_ptr acl_rstd_out =
573
+ ggml_cann_create_tensor((char *) buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
468
574
 
469
- GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps,
470
- acl_dst, acl_mean_out, acl_rstd_out);
471
- ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_mean_out, acl_rstd_out);
575
+ GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src.get(), nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst.get(),
576
+ acl_mean_out.get(), acl_rstd_out.get());
472
577
  }
473
578
 
474
- void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
475
- ggml_tensor* src0 = dst->src[0];
476
- ggml_tensor* src1 = dst->src[1];
579
+ void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
580
+ ggml_tensor * src0 = dst->src[0];
581
+ ggml_tensor * src1 = dst->src[1];
477
582
 
478
- size_t nb1 = ((int32_t*)dst->op_params)[0];
479
- size_t nb2 = ((int32_t*)dst->op_params)[1];
480
- size_t nb3 = ((int32_t*)dst->op_params)[2];
481
- size_t offset = ((int32_t*)dst->op_params)[3];
482
- bool inplace = (bool)((int32_t*)dst->op_params)[4];
583
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
584
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
585
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
586
+ size_t offset = ((int32_t *) dst->op_params)[3];
587
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
483
588
 
484
- size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
589
+ size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 };
485
590
 
486
- aclTensor* acl_dst = ggml_cann_create_tensor(
487
- dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
488
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
591
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
592
+ acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
489
593
 
490
- aclScalar* alpha = nullptr;
491
- float alphaValue = 1.0f;
492
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
594
+ acl_scalar_ptr alpha = nullptr;
595
+ float alphaValue = 1.0f;
596
+ alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
493
597
 
494
598
  if (!inplace) {
495
599
  size_t cpy_size = ggml_nbytes(dst);
496
- ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
497
- ACL_MEMCPY_DEVICE_TO_DEVICE);
498
- aclTensor* acl_src0 = ggml_cann_create_tensor(
499
- src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
600
+ ACL_CHECK(
601
+ aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
602
+ acl_tensor_ptr acl_src0 =
603
+ ggml_cann_create_tensor(src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
500
604
 
501
- GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
502
- ggml_cann_release_resources(ctx, acl_src0);
605
+ GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0.get(), acl_src1.get(), alpha.get(), acl_dst.get());
503
606
  } else {
504
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, acl_src1, alpha);
607
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), acl_src1.get(), alpha.get());
505
608
  }
506
- ggml_cann_release_resources(ctx, acl_src1, acl_dst);
507
609
  }
508
610
 
509
611
  /**
@@ -516,42 +618,36 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
516
618
  * @param dim An array of dimension indices.
517
619
  * @param dim_size The number of dimensions.
518
620
  */
519
- static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
520
- int64_t* dim, size_t dim_size) {
621
+ static void aclnn_reduce_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t * dim, size_t dim_size) {
521
622
  GGML_ASSERT(dst->ne[0] == 1);
522
- ggml_tensor* src = dst->src[0];
523
- aclTensor* acl_src = ggml_cann_create_tensor(src);
524
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
525
- aclIntArray* reduce_dims = aclCreateIntArray(dim, dim_size);
623
+ ggml_tensor * src = dst->src[0];
624
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
625
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
626
+ acl_int_array_ptr reduce_dims = ggml_cann_create_int_array(dim, dim_size);
526
627
 
527
- GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true,
528
- ggml_cann_type_mapping(dst->type), acl_dst);
529
- ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
628
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src.get(), reduce_dims.get(), true, ggml_cann_type_mapping(dst->type),
629
+ acl_dst.get());
530
630
  }
531
631
 
532
- void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
533
- int64_t reduce_dims[] = {3};
632
+ void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
633
+ int64_t reduce_dims[] = { 3 };
534
634
  aclnn_reduce_sum(ctx, dst, reduce_dims, 1);
535
635
  }
536
636
 
537
- void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
538
- int64_t reduce_dims[] = {0, 1, 2, 3};
637
+ void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
638
+ int64_t reduce_dims[] = { 0, 1, 2, 3 };
539
639
  aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
540
640
  }
541
641
 
542
- void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
543
- ggml_tensor* dst) {
544
- ggml_tensor* src = dst->src[0];
545
- aclTensor* acl_src =
546
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
547
- aclTensor* acl_dst =
548
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
642
+ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
643
+ ggml_tensor * src = dst->src[0];
644
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
645
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
549
646
 
550
- std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
551
- auto output_size_array = aclCreateIntArray(output_size.data(), 2);
647
+ std::vector<int64_t> output_size{ dst->ne[1], dst->ne[0] };
648
+ acl_int_array_ptr output_size_array = ggml_cann_create_int_array(output_size.data(), 2);
552
649
 
553
- GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src, output_size_array, acl_dst);
554
- ggml_cann_release_resources(ctx, acl_src, acl_dst, output_size_array);
650
+ GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src.get(), output_size_array.get(), acl_dst.get());
555
651
  }
556
652
 
557
653
  /**
@@ -568,30 +664,37 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
568
664
  * The size of the array should be twice the number of dimensions of the tensor.
569
665
  * @param value The value to be used for padding. The default value is 0.0.
570
666
  */
571
- static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
572
- aclTensor* acl_dst, int64_t* paddings,
573
- float value = 0.0f) {
574
- aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
575
- aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
667
+ static void aclnn_pad(ggml_backend_cann_context & ctx,
668
+ aclTensor * acl_src,
669
+ aclTensor * acl_dst,
670
+ int64_t * paddings,
671
+ float value = 0.0f) {
672
+ acl_int_array_ptr acl_pad = ggml_cann_create_int_array(paddings, GGML_MAX_DIMS * 2);
673
+ acl_scalar_ptr acl_value = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
576
674
 
577
- GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad, acl_value, acl_dst);
578
- ggml_cann_release_resources(ctx, acl_pad, acl_value);
675
+ GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad.get(), acl_value.get(), acl_dst);
579
676
  }
580
677
 
581
- void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
582
- ggml_tensor* src = dst->src[0];
583
- aclTensor* acl_src = ggml_cann_create_tensor(src);
584
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
678
+ void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
679
+ ggml_tensor * src = dst->src[0];
680
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
681
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
585
682
 
586
683
  // padding: value in the array means how much distance will be padding.
587
684
  // the position of elements in the array means which dirction to padding,
588
685
  // each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind,
589
686
  // dim2.front, dim2.behind, dim3.front, dim3.behind]
590
- int64_t paddings[] = {
591
- 0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
592
- 0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
593
- aclnn_pad(ctx, acl_src, acl_dst, paddings);
594
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
687
+ const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
688
+ const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
689
+ const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
690
+ const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
691
+ const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
692
+ const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
693
+ const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
694
+ const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
695
+
696
+ int64_t paddings[] = { lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3 };
697
+ aclnn_pad(ctx, acl_src.get(), acl_dst.get(), paddings);
595
698
  }
596
699
 
597
700
  /**
@@ -606,46 +709,40 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
606
709
  * @param dst The destination tensor where the result will be stored. The source
607
710
  * tensor is referenced by `dst->src[0]`.
608
711
  */
609
- static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
610
- ggml_tensor* dst) {
611
- ggml_tensor* src = dst->src[0];
712
+ static void ggml_cann_avg_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
713
+ ggml_tensor * src = dst->src[0];
612
714
  GGML_ASSERT(src->type == GGML_TYPE_F32);
613
715
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
614
716
 
615
- aclTensor* acl_src =
616
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
617
- aclTensor* acl_dst =
618
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
619
-
620
- const int32_t* opts = (const int32_t*)dst->op_params;
621
- const int k0 = opts[1];
622
- const int k1 = opts[2];
623
- const int s0 = opts[3];
624
- const int s1 = opts[4];
625
- const int p0 = opts[5];
626
- const int p1 = opts[6];
627
-
628
- std::vector<int64_t> kernel_dims = {k1, k0};
629
- std::vector<int64_t> stride_dims = {s1, s0};
630
- std::vector<int64_t> padding_avg_dims = {p1, p0}; // (padH, padW)
631
-
632
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
633
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
634
- auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
635
-
636
- bool ceil_mode = false;
637
- bool count_include_pad = true;
638
- int64_t divisor_override = 0;
639
- int8_t cube_math_type = 0;
717
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
718
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
719
+
720
+ const int32_t * opts = (const int32_t *) dst->op_params;
721
+ const int k0 = opts[1];
722
+ const int k1 = opts[2];
723
+ const int s0 = opts[3];
724
+ const int s1 = opts[4];
725
+ const int p0 = opts[5];
726
+ const int p1 = opts[6];
727
+
728
+ std::vector<int64_t> kernel_dims = { k1, k0 };
729
+ std::vector<int64_t> stride_dims = { s1, s0 };
730
+ std::vector<int64_t> padding_avg_dims = { p1, p0 }; // (padH, padW)
731
+
732
+ acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
733
+ acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
734
+ acl_int_array_ptr paddings_avg = ggml_cann_create_int_array(padding_avg_dims.data(), 2);
735
+
736
+ bool ceil_mode = false;
737
+ bool count_include_pad = true;
738
+ int64_t divisor_override = 0;
739
+ int8_t cube_math_type = 0;
640
740
  #ifdef ASCEND_310P
641
741
  cube_math_type = 1;
642
742
  #endif
643
743
 
644
- GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg,
645
- ceil_mode, count_include_pad, divisor_override,
646
- cube_math_type, acl_dst);
647
- ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides,
648
- paddings_avg);
744
+ GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src.get(), kernel_size.get(), strides.get(), paddings_avg.get(),
745
+ ceil_mode, count_include_pad, divisor_override, cube_math_type, acl_dst.get());
649
746
  }
650
747
 
651
748
  /**
@@ -660,68 +757,60 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
660
757
  * @param dst The destination tensor where the result will be stored. The source
661
758
  * tensor is referenced by `dst->src[0]`.
662
759
  */
663
- static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
664
- ggml_tensor* dst) {
665
- ggml_tensor* src = dst->src[0];
760
+ static void ggml_cann_max_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
761
+ ggml_tensor * src = dst->src[0];
666
762
  GGML_ASSERT(src->type == GGML_TYPE_F32);
667
763
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
668
764
 
669
- aclTensor* acl_src =
670
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
671
- aclTensor* acl_dst =
672
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
765
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
766
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
673
767
 
674
- const int32_t* opts = (const int32_t*)dst->op_params;
675
- const int k0 = opts[1];
676
- const int k1 = opts[2];
677
- const int s0 = opts[3];
678
- const int s1 = opts[4];
679
- const int p0 = opts[5];
680
- const int p1 = opts[6];
768
+ const int32_t * opts = (const int32_t *) dst->op_params;
769
+ const int k0 = opts[1];
770
+ const int k1 = opts[2];
771
+ const int s0 = opts[3];
772
+ const int s1 = opts[4];
773
+ const int p0 = opts[5];
774
+ const int p1 = opts[6];
681
775
 
682
- int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
683
- src->ne[3]};
684
- size_t temp_nb[GGML_MAX_DIMS];
776
+ int64_t temp_ne[] = { src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], src->ne[3] };
777
+ size_t temp_nb[GGML_MAX_DIMS];
685
778
 
686
779
  temp_nb[0] = ggml_element_size(src);
687
780
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
688
781
  temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
689
782
  }
690
783
 
691
- ggml_cann_pool_alloc temp_buffer_allocator(
692
- ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
693
- void* buffer = temp_buffer_allocator.get();
694
- aclTensor* tmp_tensor = ggml_cann_create_tensor(
695
- buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
696
- GGML_MAX_DIMS, ACL_FORMAT_NCHW);
784
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
785
+ void * buffer = temp_buffer_allocator.get();
786
+ acl_tensor_ptr tmp_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
787
+ GGML_MAX_DIMS, ACL_FORMAT_NCHW);
697
788
 
698
789
  // pad: see padding in ggml_cann_pad()
699
- int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
700
- float value = -FLT_MAX;
701
- aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
790
+ int64_t paddings[] = { p0, p0, p1, p1, 0, 0, 0, 0 };
791
+ float value = -FLT_MAX;
792
+ aclnn_pad(ctx, acl_src.get(), tmp_tensor.get(), paddings, value);
702
793
 
703
794
  // max_pool
704
- std::vector<int64_t> kernel_dims = {k1, k0};
705
- std::vector<int64_t> stride_dims = {s1, s0};
795
+ std::vector<int64_t> kernel_dims = { k1, k0 };
796
+ std::vector<int64_t> stride_dims = { s1, s0 };
706
797
  // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
707
- std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
708
- std::vector<int64_t> dilation_size = {1, 1};
709
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
710
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
711
- auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
712
- auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
713
-
714
- bool ceil_mode = false;
798
+ std::vector<int64_t> padding_max_dims = { 0, 0, 0, 0 };
799
+ std::vector<int64_t> dilation_size = { 1, 1 };
800
+ acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
801
+ acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
802
+ acl_int_array_ptr paddings_max = ggml_cann_create_int_array(padding_max_dims.data(), 4);
803
+ acl_int_array_ptr dilations = ggml_cann_create_int_array(dilation_size.data(), 2);
804
+
805
+ bool ceil_mode = false;
715
806
  int64_t auto_pads = 0;
716
- GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads,
717
- paddings_max, dilations, ceil_mode, acl_dst);
718
- ggml_cann_release_resources(ctx, acl_src, acl_dst, tmp_tensor, kernel_size,
719
- strides, paddings_max, dilations);
807
+ GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor.get(), kernel_size.get(), strides.get(), auto_pads,
808
+ paddings_max.get(), dilations.get(), ceil_mode, acl_dst.get());
720
809
  }
721
810
 
722
- void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
723
- const int32_t* opts = (const int32_t*)dst->op_params;
724
- enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
811
+ void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
812
+ const int32_t * opts = (const int32_t *) dst->op_params;
813
+ enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
725
814
  switch (op) {
726
815
  case GGML_OP_POOL_AVG:
727
816
  ggml_cann_avg_pool2d(ctx, dst);
@@ -745,42 +834,37 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
745
834
  * @param acl_src The source tensor from which data will be copied.
746
835
  * @param acl_dst The destination tensor where the data will be copied to.
747
836
  */
748
- static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
749
- aclTensor* acl_dst) {
837
+ static void cann_copy(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
750
838
  GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src);
751
839
  }
752
840
 
753
- void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
754
- ggml_tensor* src0 = dst->src[0];
841
+ void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
842
+ ggml_tensor * src0 = dst->src[0];
755
843
 
756
844
  if (ggml_are_same_shape(src0, dst)) {
757
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
758
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
845
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
846
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
759
847
  if (dst->type == src0->type) {
760
- cann_copy(ctx, acl_src, acl_dst);
848
+ cann_copy(ctx, acl_src.get(), acl_dst.get());
761
849
  } else {
762
- aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
850
+ aclnn_cast(ctx, acl_src.get(), acl_dst.get(), ggml_cann_type_mapping(dst->type));
763
851
  }
764
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
765
852
  } else {
766
- void* src_trans_buffer = src0->data;
853
+ void * src_trans_buffer = src0->data;
767
854
  ggml_cann_pool_alloc src_buffer_allocator;
768
855
  if (!ggml_is_contiguous(src0)) {
769
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
770
- src_buffer_allocator.alloc(ctx.pool(),
771
- ggml_nelements(src0) * ggml_type_size(src0->type));
856
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
857
+ src_buffer_allocator.alloc(ctx.pool(), ggml_nelements(src0) * ggml_type_size(src0->type));
772
858
  src_trans_buffer = src_buffer_allocator.get();
773
859
  size_t src_trans_nb[GGML_MAX_DIMS];
774
860
  src_trans_nb[0] = ggml_type_size(src0->type);
775
861
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
776
862
  src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
777
863
  }
778
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
779
- src_trans_buffer, ggml_cann_type_mapping(src0->type),
780
- ggml_type_size(src0->type), src0->ne, src_trans_nb,
781
- GGML_MAX_DIMS);
782
- cann_copy(ctx, acl_src, src_trans_tensor);
783
- ggml_cann_release_resources(ctx, acl_src, src_trans_tensor);
864
+ acl_tensor_ptr src_trans_tensor =
865
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type),
866
+ ggml_type_size(src0->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
867
+ cann_copy(ctx, acl_src.get(), src_trans_tensor.get());
784
868
  }
785
869
 
786
870
  size_t src_reshape_nb[GGML_MAX_DIMS];
@@ -789,19 +873,17 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
789
873
  src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
790
874
  }
791
875
 
792
- aclTensor* trans_acl_src = ggml_cann_create_tensor(src_trans_buffer,
793
- ggml_cann_type_mapping(src0->type),ggml_type_size(src0->type),
794
- dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
795
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
876
+ acl_tensor_ptr trans_acl_src =
877
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
878
+ dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
879
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
796
880
 
797
881
  if (dst->type == src0->type) {
798
- cann_copy(ctx, trans_acl_src, acl_dst);
882
+ cann_copy(ctx, trans_acl_src.get(), acl_dst.get());
799
883
  } else {
800
- aclnn_cast(ctx, trans_acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
884
+ aclnn_cast(ctx, trans_acl_src.get(), acl_dst.get(), ggml_cann_type_mapping(dst->type));
801
885
  }
802
- ggml_cann_release_resources(ctx, trans_acl_src, acl_dst);
803
886
  }
804
- return;
805
887
  }
806
888
 
807
889
  /**
@@ -818,20 +900,23 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
818
900
  * @param dims The number of dimensions of the tensor.
819
901
  * @param type The data type of the tensor.
820
902
  * @param type_size The size of each element in the tensor data type.
821
- * @return An ACL tensor initialized with zeros.
903
+ * @return A tensor smart pointer initialized with zeros.
822
904
  */
823
- static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
824
- size_t n_bytes, int64_t* ne, int64_t dims,
825
- aclDataType type, size_t type_size) {
905
+ static acl_tensor_ptr aclnn_zero(ggml_backend_cann_context & ctx,
906
+ void * buffer,
907
+ size_t n_bytes,
908
+ int64_t * ne,
909
+ int64_t dims,
910
+ aclDataType type,
911
+ size_t type_size) {
826
912
  size_t nb[GGML_MAX_DIMS];
827
913
  nb[0] = type_size;
828
914
  for (int i = 1; i < dims; i++) {
829
915
  nb[i] = nb[i - 1] * ne[i - 1];
830
916
  }
831
917
 
832
- aclTensor* zero =
833
- ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
834
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
918
+ acl_tensor_ptr zero = ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
919
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero.get());
835
920
  return zero;
836
921
  GGML_UNUSED(n_bytes);
837
922
  }
@@ -852,18 +937,21 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
852
937
  * @param type_size The size of each element in the tensor data type.
853
938
  * @param value The value to be used for initializing the tensor (default
854
939
  * is 1.0).
855
- * @return An ACL tensor initialized with value.
940
+ * @return A tensor smart pointer initialized with value.
856
941
  */
857
- static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
858
- size_t n_bytes, int64_t* ne, int64_t dims,
859
- aclDataType type, size_t type_size,
860
- float value = 1.0f) {
861
- aclTensor* acl_tensor =
862
- aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
863
- float alpha_host = 1.0f;
864
- aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
865
- aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
866
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor, other, alpha);
942
+ static acl_tensor_ptr aclnn_values(ggml_backend_cann_context & ctx,
943
+ void * buffer,
944
+ size_t n_bytes,
945
+ int64_t * ne,
946
+ int64_t dims,
947
+ aclDataType type,
948
+ size_t type_size,
949
+ float value = 1.0f) {
950
+ acl_tensor_ptr acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
951
+ float alpha_host = 1.0f;
952
+ acl_scalar_ptr alpha = ggml_cann_create_scalar(&alpha_host, aclDataType::ACL_FLOAT);
953
+ acl_scalar_ptr other = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
954
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor.get(), other.get(), alpha.get());
867
955
  return acl_tensor;
868
956
  }
869
957
 
@@ -877,22 +965,19 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
877
965
  * @param scalar The scalar value used to fill the tensor.
878
966
  * @param acl_dst The destination tensor to be filled with the scalar value.
879
967
  */
880
- static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
881
- aclTensor* acl_dst) {
882
- auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
883
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
884
- ggml_cann_release_resources(ctx, acl_scalar);
968
+ static void aclnn_fill_scalar(ggml_backend_cann_context & ctx, float scalar, aclTensor * acl_dst) {
969
+ acl_scalar_ptr acl_scalar = ggml_cann_create_scalar(&scalar, aclDataType::ACL_FLOAT);
970
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar.get());
885
971
  }
886
972
 
887
973
  /**
888
- * @brief Get or expand a cached float32 tensor filled with a scalar value.
974
+ * @brief Get or expand a cached tensor filled with a scalar value.
889
975
  *
890
- * This function manages cached device memory for float32 tensors. If the current
976
+ * This function manages cached device memory for tensors. If the current
891
977
  * cache size is insufficient for the requested tensor shape, the old memory will
892
- * be released and new memory will be allocated. The allocated buffer is then
893
- * initialized either with zeros (when @p value == 0.0f) or with the given scalar
894
- * value using CANN operations. Finally, an aclTensor object is created from the
895
- * cached memory and returned.
978
+ * be released and new memory will be allocated. The allocated buffer is
979
+ * initialized with the given scalar value using CANN operations.
980
+ * Finally, an aclTensor object is created from the cached memory and returned.
896
981
  *
897
982
  * @param ctx The CANN backend context that manages device memory.
898
983
  * @param buffer A pointer to the cached device buffer (will be allocated
@@ -901,25 +986,26 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
901
986
  * updated when the cache is expanded.
902
987
  * @param ne The tensor shape array (number of elements in each dimension).
903
988
  * @param nb The stride size for each dimension.
989
+ * @param dtype Data type of cached tensor.
904
990
  * @param dims The number of tensor dimensions.
905
991
  * @param value The scalar value used to fill the tensor (supports zero
906
992
  * initialization via memset or arbitrary values via fill_scalar).
907
- * @return An aclTensor pointer created from the cached buffer.
993
+ * @return A tensor smart pointer created from the cached buffer.
908
994
  */
909
- static aclTensor* get_f32_cache_acl_tensor(
910
- ggml_backend_cann_context& ctx,
911
- void** buffer,
912
- int64_t &cache_element,
913
- int64_t* ne,
914
- size_t* nb,
915
- int64_t dims,
916
- float value) {
995
+ static acl_tensor_ptr get_cache_acl_tensor(ggml_backend_cann_context & ctx,
996
+ void ** buffer,
997
+ int64_t & cache_element,
998
+ int64_t * ne,
999
+ size_t * nb,
1000
+ ggml_type dtype,
1001
+ int64_t dims,
1002
+ float value) {
917
1003
  // Calculate total number of elements
918
1004
  int64_t n_element = 1;
919
1005
  for (int i = 0; i < dims; i++) {
920
1006
  n_element *= ne[i];
921
1007
  }
922
- size_t size = n_element * sizeof(float);
1008
+ size_t size = n_element * ggml_type_size(dtype);
923
1009
 
924
1010
  // Allocate or expand cache if needed
925
1011
  if (cache_element < n_element) {
@@ -932,92 +1018,78 @@ static aclTensor* get_f32_cache_acl_tensor(
932
1018
  cache_element = n_element;
933
1019
 
934
1020
  // Initialize cache
935
- if (value == 0.0f) {
936
- ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
937
- } else {
938
- int64_t pool_ne[1] = { n_element };
939
- size_t pool_nb[1] = { sizeof(float) };
940
- aclTensor* acl_value = ggml_cann_create_tensor(
941
- *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
942
- aclnn_fill_scalar(ctx, 1, acl_value);
943
- ggml_cann_release_resources(ctx, acl_value);
944
- }
1021
+ int64_t pool_ne[1] = { n_element };
1022
+ size_t pool_nb[1] = { ggml_type_size(dtype) };
1023
+ acl_tensor_ptr acl_value =
1024
+ ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), pool_ne, pool_nb, 1);
1025
+ aclnn_fill_scalar(ctx, value, acl_value.get());
945
1026
  }
946
1027
 
947
- return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
1028
+ return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), ne, nb, dims);
948
1029
  }
949
1030
 
950
- void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
951
- ggml_tensor* src = dst->src[0];
1031
+ void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1032
+ ggml_tensor * src = dst->src[0];
952
1033
 
953
- aclTensor* acl_src = ggml_cann_create_tensor(src);
954
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1034
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
1035
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
955
1036
 
956
1037
  float eps;
957
1038
  memcpy(&eps, dst->op_params, sizeof(float));
958
1039
 
959
- // build gamma, one...
1040
+ // build gamma.
960
1041
  size_t acl_gamma_nb[GGML_MAX_DIMS];
961
- acl_gamma_nb[0] = sizeof(float);
1042
+ // gamma's type is the same with dst.
1043
+ acl_gamma_nb[0] = ggml_type_size(dst->type);
962
1044
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
963
1045
  acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
964
1046
  }
965
- aclTensor* acl_gamma = get_f32_cache_acl_tensor(
966
- ctx,
967
- &ctx.f32_one_cache,
968
- ctx.f32_one_cache_element,
969
- src->ne,
970
- acl_gamma_nb,
971
- 1, // dims
972
- 1.0f // value
1047
+ acl_tensor_ptr acl_gamma = get_cache_acl_tensor(
1048
+ ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, src->ne, acl_gamma_nb, dst->type,
1049
+ 1, // dims
1050
+ 1.0f // value
973
1051
  );
974
1052
 
975
- // build rstd, zero...
976
- size_t acl_rstd_nb[GGML_MAX_DIMS];
1053
+ // build rstd.
1054
+ int64_t acl_rstd_ne[] = { src->ne[1], src->ne[2], src->ne[3] };
1055
+ size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
1056
+ // rstd will always be F32.
977
1057
  acl_rstd_nb[0] = sizeof(float);
978
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
979
- acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1];
980
- }
981
- aclTensor* acl_rstd = get_f32_cache_acl_tensor(
982
- ctx,
983
- &ctx.f32_zero_cache,
984
- ctx.f32_zero_cache_element,
985
- src->ne,
986
- acl_rstd_nb,
987
- GGML_MAX_DIMS,
988
- 0.0f // value
989
- );
1058
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1059
+ acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
1060
+ }
1061
+ acl_tensor_ptr acl_rstd =
1062
+ get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
1063
+ acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS - 1,
1064
+ 0.0f // value
1065
+ );
990
1066
 
991
- GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
992
- ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
1067
+ GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src.get(), acl_gamma.get(), eps, acl_dst.get(), acl_rstd.get());
993
1068
  }
994
1069
 
995
1070
  // TODO: performace is low.
996
- void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
997
- float value) {
998
- ggml_tensor* src = dst->src[0];
1071
+ void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value) {
1072
+ ggml_tensor * src = dst->src[0];
999
1073
 
1000
- aclTensor* acl_src = ggml_cann_create_tensor(src);
1001
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1074
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
1075
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
1002
1076
 
1003
- const int n_past = ((int32_t*)dst->op_params)[0];
1077
+ const int n_past = ((int32_t *) dst->op_params)[0];
1004
1078
 
1005
1079
  ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
1006
- void* buffer = one_tensor_allocator.get();
1080
+ void * buffer = one_tensor_allocator.get();
1007
1081
 
1008
- aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
1009
- ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
1082
+ acl_tensor_ptr mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
1083
+ ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
1010
1084
 
1011
- aclnn_fill_scalar(ctx, value, mask_tensor);
1085
+ aclnn_fill_scalar(ctx, value, mask_tensor.get());
1012
1086
 
1013
- aclScalar* alpha = nullptr;
1014
- float alphaValue = 1.0f;
1015
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
1087
+ float alphaValue = 1.0f;
1088
+ acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
1016
1089
 
1017
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor, n_past + 1);
1018
- GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src, n_past + 1, acl_dst);
1019
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, mask_tensor, alpha);
1020
- ggml_cann_release_resources(ctx, alpha, acl_src, acl_dst, mask_tensor);
1090
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor.get(), n_past + 1);
1091
+ GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), n_past + 1, acl_dst.get());
1092
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), mask_tensor.get(), alpha.get());
1021
1093
  }
1022
1094
 
1023
1095
  /**
@@ -1035,129 +1107,121 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
1035
1107
  * tensor.
1036
1108
  * @param dims The number of dimensions in the tensor.
1037
1109
  */
1038
- static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1039
- aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
1040
- aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
1041
- GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims, acl_dst);
1042
- ggml_cann_release_resources(ctx, acl_dims);
1043
- }
1044
-
1045
- static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
1046
- ggml_tensor* dst,
1047
- ggml_tensor* src1,
1048
- aclTensor* tmp_cast_tensor,
1049
- aclTensor* tmp_im2col_tensor) {
1110
+ static void aclnn_permute(ggml_backend_cann_context & ctx,
1111
+ aclTensor * acl_src,
1112
+ aclTensor * acl_dst,
1113
+ int64_t * new_dim,
1114
+ uint64_t dims) {
1115
+ acl_int_array_ptr acl_dims = ggml_cann_create_int_array(new_dim, dims);
1116
+ GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims.get(), acl_dst);
1117
+ }
1118
+
1119
+ static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context & ctx,
1120
+ ggml_tensor * dst,
1121
+ ggml_tensor * src1,
1122
+ aclTensor * tmp_cast_tensor,
1123
+ aclTensor * tmp_im2col_tensor) {
1050
1124
  // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
1051
- int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
1052
- size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
1053
- aclTensor* acl_dst =
1054
- ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
1125
+ int64_t dst_ne[] = { dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3] };
1126
+ size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[3] };
1127
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
1055
1128
 
1056
- int64_t permute_dim[] = {0, 2, 1};
1129
+ int64_t permute_dim[] = { 0, 2, 1 };
1057
1130
  if (src1->type != dst->type) {
1058
- aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
1131
+ aclnn_permute(ctx, tmp_cast_tensor, acl_dst.get(), permute_dim, 3);
1059
1132
  } else {
1060
- aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
1133
+ aclnn_permute(ctx, tmp_im2col_tensor, acl_dst.get(), permute_dim, 3);
1061
1134
  }
1062
-
1063
- ggml_cann_release_resources(ctx, acl_dst);
1064
1135
  }
1065
1136
 
1066
- static void ggml_cann_im2col_1d_post_process(
1067
- ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
1068
- aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
1069
- const std::vector<int64_t>& im2col_op_params) {
1137
+ static void ggml_cann_im2col_1d_post_process(ggml_backend_cann_context & ctx,
1138
+ ggml_tensor * dst,
1139
+ ggml_tensor * src1,
1140
+ aclTensor * tmp_cast_tensor,
1141
+ aclTensor * tmp_im2col_tensor,
1142
+ const std::vector<int64_t> & im2col_op_params) {
1070
1143
  // get params
1071
- const int64_t KH = im2col_op_params[0];
1072
- const int64_t KW = im2col_op_params[1];
1073
- const int64_t IW = im2col_op_params[2];
1074
- const int64_t IC = im2col_op_params[3];
1075
- const int64_t N = im2col_op_params[4];
1076
- const int64_t OH = im2col_op_params[5];
1077
- const int64_t OW = im2col_op_params[6];
1078
- const int64_t s0 = im2col_op_params[7];
1079
- const int64_t p0 = im2col_op_params[8];
1080
- const int64_t d0 = im2col_op_params[9];
1144
+ const int64_t KH = im2col_op_params[0];
1145
+ const int64_t KW = im2col_op_params[1];
1146
+ const int64_t IW = im2col_op_params[2];
1147
+ const int64_t IC = im2col_op_params[3];
1148
+ const int64_t N = im2col_op_params[4];
1149
+ const int64_t OH = im2col_op_params[5];
1150
+ const int64_t OW = im2col_op_params[6];
1151
+ const int64_t s0 = im2col_op_params[7];
1152
+ const int64_t p0 = im2col_op_params[8];
1153
+ const int64_t d0 = im2col_op_params[9];
1081
1154
  const int64_t n_bytes_factor = im2col_op_params[10];
1082
1155
 
1083
1156
  // Permute: [N, IC * KH * KW, OW * OH] ->
1084
1157
  // [N, OW * OH * n_bytes_factor, IC * KH * KW]
1085
1158
  ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
1086
1159
  tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1087
- void* tmp_permute_buffer = tmp_permute_allocator.get();
1160
+ void * tmp_permute_buffer = tmp_permute_allocator.get();
1088
1161
 
1089
- int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
1090
- size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
1162
+ int64_t tmp_permute_ne[] = { IC * KH * KW, OW * OH * n_bytes_factor, N };
1163
+ size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
1091
1164
  tmp_permute_nb[0] = ggml_type_size(dst->type);
1092
1165
  for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1093
1166
  tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1094
1167
  }
1095
1168
 
1096
- aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
1097
- tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
1098
- ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
1099
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1169
+ acl_tensor_ptr tmp_permute_tensor =
1170
+ ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1171
+ tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1100
1172
 
1101
- int64_t permute_dim[] = {0, 2, 1};
1173
+ int64_t permute_dim[] = { 0, 2, 1 };
1102
1174
  if (src1->type != dst->type) {
1103
- aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
1175
+ aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor.get(), permute_dim, 3);
1104
1176
  } else {
1105
- aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
1106
- 3);
1177
+ aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor.get(), permute_dim, 3);
1107
1178
  }
1108
1179
 
1109
1180
  // number of times the kernel moves in W dimension
1110
1181
  const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
1111
- size_t offset;
1112
- void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
1182
+ size_t offset;
1183
+ void * cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
1113
1184
 
1114
1185
  // memory copy with offset to restore 1D im2col from 2d
1115
1186
  if (IC > 1) {
1116
- offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
1117
- size_t size_cpy = KH * KW * ggml_type_size(dst->type);
1187
+ offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
1188
+ size_t cpy_size = KH * KW * ggml_type_size(dst->type);
1118
1189
 
1119
1190
  for (int c = 0; c < IC; c++) {
1120
- cur_permute_buffer = (char*)tmp_permute_buffer + offset +
1121
- KH * KW * c * ggml_type_size(dst->type);
1122
- cur_dst_buffer = (char*)dst->data +
1123
- c * KH * KW * n_step_w * ggml_type_size(dst->type);
1191
+ cur_permute_buffer = (char *) tmp_permute_buffer + offset + KH * KW * c * ggml_type_size(dst->type);
1192
+ cur_dst_buffer = (char *) dst->data + c * KH * KW * n_step_w * ggml_type_size(dst->type);
1124
1193
 
1125
1194
  for (int i = 0; i < n_step_w; i++) {
1126
- ggml_cann_async_memcpy(ctx, cur_dst_buffer, cur_permute_buffer, size_cpy,
1127
- ACL_MEMCPY_DEVICE_TO_DEVICE);
1128
- cur_dst_buffer =
1129
- (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1130
- cur_permute_buffer = (char*)cur_permute_buffer +
1131
- KH * KW * IC * ggml_type_size(dst->type);
1195
+ ACL_CHECK(aclrtMemcpyAsync(cur_dst_buffer, cpy_size, cur_permute_buffer, cpy_size,
1196
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1197
+ cur_dst_buffer = (char *) cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1198
+ cur_permute_buffer = (char *) cur_permute_buffer + KH * KW * IC * ggml_type_size(dst->type);
1132
1199
  }
1133
1200
  }
1134
1201
  } else {
1135
- offset = KH * KW * n_step_w *
1136
- ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
1137
- ggml_cann_async_memcpy(ctx, dst->data, (char*)tmp_permute_buffer + offset, offset,
1138
- ACL_MEMCPY_DEVICE_TO_DEVICE);
1202
+ offset = KH * KW * n_step_w * ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
1203
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, (char *) tmp_permute_buffer + offset, offset,
1204
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1139
1205
  }
1140
-
1141
- ggml_cann_release_resources(ctx, tmp_permute_tensor);
1142
1206
  }
1143
1207
 
1144
- void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1145
- ggml_tensor* src0 = dst->src[0]; // kernel
1146
- ggml_tensor* src1 = dst->src[1]; // input
1208
+ void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1209
+ ggml_tensor * src0 = dst->src[0]; // kernel
1210
+ ggml_tensor * src1 = dst->src[1]; // input
1147
1211
 
1148
1212
  GGML_TENSOR_BINARY_OP_LOCALS;
1149
1213
 
1150
1214
  // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
1151
1215
  // im2col and do post-processing to restore it to 1D.
1152
- const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
1153
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
1154
- const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
1155
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
1156
- const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
1157
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
1158
- const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
1159
-
1160
- const int64_t N = ne13;
1216
+ const bool is_2D = ((const int32_t *) (dst->op_params))[6] == 1;
1217
+ const int32_t s0 = ((const int32_t *) (dst->op_params))[0];
1218
+ const int32_t s1 = is_2D ? ((const int32_t *) (dst->op_params))[1] : 1;
1219
+ const int32_t p0 = ((const int32_t *) (dst->op_params))[2];
1220
+ const int32_t p1 = is_2D ? ((const int32_t *) (dst->op_params))[3] : 1;
1221
+ const int32_t d0 = ((const int32_t *) (dst->op_params))[4];
1222
+ const int32_t d1 = is_2D ? ((const int32_t *) (dst->op_params))[5] : 1;
1223
+
1224
+ const int64_t N = ne13;
1161
1225
  const int64_t IC = ne12;
1162
1226
  const int64_t KH = ne01;
1163
1227
  const int64_t KW = ne00;
@@ -1170,9 +1234,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1170
1234
  const int64_t n_bytes_factor = is_2D ? 1 : 3;
1171
1235
 
1172
1236
  // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
1173
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
1174
- int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
1175
- size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
1237
+ acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
1238
+ int64_t tmp_im2col_ne[] = { OW * OH * n_bytes_factor, IC * KH * KW, N };
1239
+ size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
1176
1240
 
1177
1241
  tmp_im2col_nb[0] = ggml_type_size(src1->type);
1178
1242
  for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
@@ -1182,31 +1246,28 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1182
1246
  // Calculate im2col.
1183
1247
  // If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
1184
1248
  // dst.elemcount.
1185
- ggml_cann_pool_alloc im2col_allocator(
1186
- ctx.pool(),
1187
- ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
1188
- void* tmp_im2col_buffer = im2col_allocator.get();
1189
-
1190
- aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
1191
- tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
1192
- ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
1193
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1194
-
1195
- std::vector<int64_t> kernel_dims = {KH, KW};
1196
- std::vector<int64_t> dilation_size = {d1, d0};
1197
- std::vector<int64_t> padding_dims = {p1, p0};
1198
- std::vector<int64_t> stride_dims = {s1, s0};
1199
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
1200
- auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
1201
- auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
1202
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
1203
- GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations,
1204
- paddings, strides, tmp_im2col_tensor);
1249
+ ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
1250
+ void * tmp_im2col_buffer = im2col_allocator.get();
1251
+
1252
+ acl_tensor_ptr tmp_im2col_tensor =
1253
+ ggml_cann_create_tensor(tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
1254
+ tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1255
+
1256
+ std::vector<int64_t> kernel_dims = { KH, KW };
1257
+ std::vector<int64_t> dilation_size = { d1, d0 };
1258
+ std::vector<int64_t> padding_dims = { p1, p0 };
1259
+ std::vector<int64_t> stride_dims = { s1, s0 };
1260
+ acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
1261
+ acl_int_array_ptr dilations = ggml_cann_create_int_array(dilation_size.data(), 2);
1262
+ acl_int_array_ptr paddings = ggml_cann_create_int_array(padding_dims.data(), 2);
1263
+ acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
1264
+ GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1.get(), kernel_size.get(), dilations.get(), paddings.get(),
1265
+ strides.get(), tmp_im2col_tensor.get());
1205
1266
 
1206
1267
  // Cast if dst is f16.
1207
- aclTensor* tmp_cast_tensor = nullptr;
1268
+ acl_tensor_ptr tmp_cast_tensor;
1208
1269
  ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
1209
- void* tmp_cast_buffer = nullptr;
1270
+ void * tmp_cast_buffer = nullptr;
1210
1271
  if (src1->type != dst->type) {
1211
1272
  tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1212
1273
  tmp_cast_buffer = tmp_cast_allocator.get();
@@ -1216,26 +1277,20 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1216
1277
  temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
1217
1278
  }
1218
1279
 
1219
- tmp_cast_tensor = ggml_cann_create_tensor(
1220
- tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
1221
- ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
1222
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1223
- aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, ggml_cann_type_mapping(dst->type));
1280
+ tmp_cast_tensor =
1281
+ ggml_cann_create_tensor(tmp_cast_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1282
+ tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1283
+ aclnn_cast(ctx, tmp_im2col_tensor.get(), tmp_cast_tensor.get(), ggml_cann_type_mapping(dst->type));
1224
1284
  }
1225
1285
 
1226
1286
  // post-processing
1227
1287
  if (is_2D) {
1228
- ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
1229
- tmp_im2col_tensor);
1288
+ ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor.get(), tmp_im2col_tensor.get());
1230
1289
  } else {
1231
- std::vector<int64_t> im2col_op_params = {
1232
- KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
1233
- ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
1234
- tmp_im2col_tensor, im2col_op_params);
1290
+ std::vector<int64_t> im2col_op_params = { KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor };
1291
+ ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor.get(), tmp_im2col_tensor.get(),
1292
+ im2col_op_params);
1235
1293
  }
1236
-
1237
- ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor,
1238
- kernel_size, dilations, paddings, strides);
1239
1294
  }
1240
1295
 
1241
1296
  /**
@@ -1251,136 +1306,117 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1251
1306
  * @param ctx The context for the CANN backend operations.
1252
1307
  * @param acl_src The tensor on which the exponential function will be applied.
1253
1308
  */
1254
- static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
1309
+ static void aclnn_exp(ggml_backend_cann_context & ctx, aclTensor * acl_src) {
1255
1310
  GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src);
1256
1311
  }
1257
1312
 
1258
- void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1259
- aclTensor* acl_dst) {
1260
- if(acl_dst == nullptr) {
1313
+ void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1314
+ if (acl_dst == nullptr) {
1261
1315
  GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCos, acl_src);
1262
1316
  } else {
1263
1317
  GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
1264
1318
  }
1265
1319
  }
1266
1320
 
1267
- void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
1268
- aclTensor* acl_dst) {
1269
- if(acl_dst == nullptr) {
1321
+ void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
1322
+ if (acl_dst == nullptr) {
1270
1323
  GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSin, acl_src);
1271
1324
  } else {
1272
1325
  GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
1273
1326
  }
1274
1327
  }
1275
1328
 
1276
- void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1277
- ggml_tensor* dst) {
1278
- const ggml_tensor* src = dst->src[0];
1329
+ void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1330
+ const ggml_tensor * src = dst->src[0];
1279
1331
 
1280
1332
  GGML_ASSERT(src->type == GGML_TYPE_F32);
1281
1333
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
1282
1334
 
1283
- const int dim = dst->op_params[0];
1335
+ const int dim = dst->op_params[0];
1284
1336
  const int max_period = dst->op_params[1];
1285
- int half = dim / 2;
1337
+ int half = dim / 2;
1286
1338
 
1287
- aclTensor* acl_src = ggml_cann_create_tensor(src);
1339
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
1288
1340
 
1289
1341
  // arange: [0, ..., half)
1290
- float start = 0;
1291
- float stop = half;
1292
- float step = 1;
1342
+ float start = 0;
1343
+ float stop = half;
1344
+ float step = 1;
1293
1345
  int64_t n_elements_arange = half;
1294
- int64_t tmp_arange_ne[] = {half};
1295
- size_t tmp_arange_nb[] = {sizeof(dst->type)};
1346
+ int64_t tmp_arange_ne[] = { half };
1347
+ size_t tmp_arange_nb[] = { sizeof(dst->type) };
1296
1348
 
1297
1349
  ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
1298
- void* tmp_arange_buffer = arange_allocator.get();
1299
- aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
1300
- tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
1301
- ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb,
1302
- GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1350
+ void * tmp_arange_buffer = arange_allocator.get();
1351
+ acl_tensor_ptr tmp_arange_tensor =
1352
+ ggml_cann_create_tensor(tmp_arange_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1353
+ tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
1303
1354
 
1304
- aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange);
1355
+ aclnn_arange(ctx, tmp_arange_tensor.get(), start, stop, step, n_elements_arange);
1305
1356
 
1306
1357
  // freq
1307
1358
  float freq_param = -logf(max_period) / half;
1308
- bool inplace = true;
1309
- aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace);
1310
- aclnn_exp(ctx, tmp_arange_tensor);
1359
+ bool inplace = true;
1360
+ aclnn_muls(ctx, tmp_arange_tensor.get(), freq_param, nullptr, inplace);
1361
+ aclnn_exp(ctx, tmp_arange_tensor.get());
1311
1362
 
1312
1363
  // permute: src [0,1,2,3]->[0,1,3,2]
1313
- int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
1314
- size_t tmp_permute_nb[GGML_MAX_DIMS];
1364
+ int64_t tmp_permute_ne[] = { src->ne[1], src->ne[0], src->ne[2], src->ne[3] };
1365
+ size_t tmp_permute_nb[GGML_MAX_DIMS];
1315
1366
  tmp_permute_nb[0] = ggml_type_size(src->type);
1316
1367
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
1317
1368
  tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1318
1369
  }
1319
1370
 
1320
1371
  ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
1321
- void* tmp_permute_buffer = permute_allocator.get();
1322
- aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
1323
- tmp_permute_buffer, ggml_cann_type_mapping(src->type),
1324
- ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
1325
- GGML_MAX_DIMS, ACL_FORMAT_ND);
1326
- int64_t permute_dim[] = {0, 1, 3, 2};
1327
- int64_t num_dims = 4;
1328
- aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims);
1372
+ void * tmp_permute_buffer = permute_allocator.get();
1373
+ acl_tensor_ptr tmp_permute_tensor =
1374
+ ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
1375
+ tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
1376
+ int64_t permute_dim[] = { 0, 1, 3, 2 };
1377
+ int64_t num_dims = 4;
1378
+ aclnn_permute(ctx, acl_src.get(), tmp_permute_tensor.get(), permute_dim, num_dims);
1329
1379
 
1330
1380
  // timestep * freq
1331
- int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
1332
- src->ne[3]};
1333
- size_t tmp_mul_nb[GGML_MAX_DIMS];
1381
+ int64_t tmp_mul_ne[] = { src->ne[1] * half, src->ne[0], src->ne[2], src->ne[3] };
1382
+ size_t tmp_mul_nb[GGML_MAX_DIMS];
1334
1383
  tmp_mul_nb[0] = ggml_type_size(src->type);
1335
1384
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
1336
1385
  tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
1337
1386
  }
1338
1387
 
1339
- int mul_nelements =
1340
- src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
1388
+ int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
1341
1389
 
1342
- ggml_cann_pool_alloc mul_allocator(
1343
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
1344
- void* tmp_mul_buffer = mul_allocator.get();
1345
- aclTensor* tmp_mul_tensor = ggml_cann_create_tensor(
1346
- tmp_mul_buffer, ggml_cann_type_mapping(src->type),
1347
- ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1348
- ACL_FORMAT_ND);
1349
- aclnn_mul(ctx, tmp_permute_tensor, tmp_arange_tensor, tmp_mul_tensor);
1390
+ ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
1391
+ void * tmp_mul_buffer = mul_allocator.get();
1392
+ acl_tensor_ptr tmp_mul_tensor =
1393
+ ggml_cann_create_tensor(tmp_mul_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
1394
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
1395
+ aclnn_mul(ctx, tmp_permute_tensor.get(), tmp_arange_tensor.get(), tmp_mul_tensor.get());
1350
1396
 
1351
1397
  // cos
1352
- ggml_cann_pool_alloc cos_allocator(
1353
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
1354
- void* tmp_cos_buffer = cos_allocator.get();
1355
- aclTensor* tmp_cos_tensor = ggml_cann_create_tensor(
1356
- tmp_cos_buffer, ggml_cann_type_mapping(dst->type),
1357
- ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1358
- ACL_FORMAT_ND);
1398
+ ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
1399
+ void * tmp_cos_buffer = cos_allocator.get();
1400
+ acl_tensor_ptr tmp_cos_tensor =
1401
+ ggml_cann_create_tensor(tmp_cos_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1402
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
1359
1403
 
1360
- aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor);
1404
+ aclnn_cos(ctx, tmp_mul_tensor.get(), tmp_cos_tensor.get());
1361
1405
 
1362
1406
  // sin
1363
- ggml_cann_pool_alloc sin_allocator(
1364
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
1365
- void* tmp_sin_buffer = sin_allocator.get();
1366
- aclTensor* tmp_sin_tensor = ggml_cann_create_tensor(
1367
- tmp_sin_buffer, ggml_cann_type_mapping(dst->type),
1368
- ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
1369
- ACL_FORMAT_ND);
1407
+ ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
1408
+ void * tmp_sin_buffer = sin_allocator.get();
1409
+ acl_tensor_ptr tmp_sin_tensor =
1410
+ ggml_cann_create_tensor(tmp_sin_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
1411
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
1370
1412
 
1371
- aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor);
1413
+ aclnn_sin(ctx, tmp_mul_tensor.get(), tmp_sin_tensor.get());
1372
1414
 
1373
1415
  // concat
1374
- int64_t concat_dim = 3;
1375
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1376
- aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
1377
- aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
1378
- aclnn_concat(ctx, tensor_list, acl_dst, concat_dim);
1379
-
1380
- // release
1381
- // segmentation fault when delete both tensorList and his elements.
1382
- ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor,
1383
- tmp_permute_tensor, tmp_mul_tensor, acl_dst);
1416
+ int64_t concat_dim = 3;
1417
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
1418
+ acl_tensor_list_ptr tensor_list = ggml_cann_create_tensor_list(tmp_cos_tensor, tmp_sin_tensor);
1419
+ aclnn_concat(ctx, tensor_list.get(), acl_dst.get(), concat_dim);
1384
1420
  }
1385
1421
 
1386
1422
  /**
@@ -1399,8 +1435,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
1399
1435
  * @param acl_exp The exponent tensor, each element of which is used to raise
1400
1436
  * the corresponding element in the destination tensor.
1401
1437
  */
1402
- static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
1403
- aclTensor* acl_dst, aclTensor* acl_exp) {
1438
+ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context & ctx, aclTensor * acl_dst, aclTensor * acl_exp) {
1404
1439
  GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
1405
1440
  }
1406
1441
 
@@ -1423,26 +1458,33 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
1423
1458
  * @param start Starting exponent offset.
1424
1459
  * @param stop Stopping exponent offset (exclusive).
1425
1460
  * @param step Step size for the exponent increment.
1461
+ * @param dtype Data type for slope tensor.
1426
1462
  */
1427
- static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
1428
- float m, int64_t size, float start, float stop, float step){
1429
- int64_t ne[] = {size};
1430
- size_t nb[] = {sizeof(uint16_t)};
1463
+ static void aclnn_get_slope_inner(ggml_backend_cann_context & ctx,
1464
+ void * slope_buffer,
1465
+ float m,
1466
+ int64_t size,
1467
+ float start,
1468
+ float stop,
1469
+ float step,
1470
+ ggml_type dtype) {
1471
+ aclDataType acl_type = ggml_cann_type_mapping(dtype);
1472
+ size_t type_size = ggml_type_size(dtype);
1431
1473
 
1432
- ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(uint16_t));
1433
- void* arange_buffer = arange_allocator.get();
1474
+ int64_t ne[] = { size };
1475
+ size_t nb[] = { type_size };
1434
1476
 
1435
- aclTensor* arange_tensor = ggml_cann_create_tensor(
1436
- arange_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
1437
- aclnn_arange(ctx, arange_tensor, start, stop, step, size);
1477
+ ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
1478
+ void * arange_buffer = arange_allocator.get();
1438
1479
 
1439
- aclTensor* slope_tensor = ggml_cann_create_tensor(
1440
- slope_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
1480
+ acl_tensor_ptr arange_tensor = ggml_cann_create_tensor(arange_buffer, acl_type, type_size, ne, nb, 1);
1481
+ aclnn_arange(ctx, arange_tensor.get(), start, stop, step, size);
1441
1482
 
1442
- aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
1483
+ acl_tensor_ptr slope_tensor = ggml_cann_create_tensor(slope_buffer, acl_type, type_size, ne, nb, 1);
1443
1484
 
1444
- GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
1445
- ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
1485
+ acl_scalar_ptr sc = ggml_cann_create_scalar(&m, aclDataType::ACL_FLOAT);
1486
+
1487
+ GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc.get(), arange_tensor.get(), slope_tensor.get());
1446
1488
  }
1447
1489
 
1448
1490
  /**
@@ -1468,10 +1510,14 @@ static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_bu
1468
1510
  * @param n_head Total number of attention heads.
1469
1511
  * @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
1470
1512
  * @param max_bias Maximum bias value for slope computation.
1513
+ * @param dtype Data type for slope tensor.
1471
1514
  *
1472
1515
  */
1473
- static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
1474
- void* slope_buffer, float max_bias) {
1516
+ static void aclnn_get_slope(ggml_backend_cann_context & ctx,
1517
+ int64_t n_head,
1518
+ void * slope_buffer,
1519
+ float max_bias,
1520
+ ggml_type dtype) {
1475
1521
  const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
1476
1522
 
1477
1523
  float m0 = powf(2.0f, -(max_bias) / n_head_log2);
@@ -1488,16 +1534,15 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
1488
1534
  float step = 1;
1489
1535
  float count = n_head_log2;
1490
1536
  // end needs to be +1 because aclnn uses a left-closed, right-open interval.
1491
- aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
1537
+ aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
1492
1538
  if (n_head_log2 < n_head) {
1493
1539
  // arange2
1494
1540
  start = 2 * (n_head_log2 - n_head_log2) + 1;
1495
1541
  end = 2 * ((n_head - 1) - n_head_log2) + 1;
1496
1542
  step = 2;
1497
1543
  count = n_head - n_head_log2;
1498
- aclnn_get_slope_inner(
1499
- ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
1500
- m1, count, start, end + 1, step);
1544
+ aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
1545
+ dtype);
1501
1546
  }
1502
1547
  }
1503
1548
 
@@ -1522,19 +1567,21 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
1522
1567
  * - Write data into dst_ptr using only the shape information of the dst tensor.
1523
1568
  * - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
1524
1569
  */
1525
- static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
1526
- ggml_tensor* dst, void* dst_ptr, float max_bias) {
1527
- void* slope_buffer = nullptr;
1528
- void* bias_buffer = nullptr;
1570
+ static void aclnn_add_alibi(ggml_backend_cann_context & ctx,
1571
+ ggml_tensor * mask,
1572
+ ggml_tensor * dst,
1573
+ void * dst_ptr,
1574
+ float max_bias) {
1575
+ void * slope_buffer = nullptr;
1576
+ void * bias_buffer = nullptr;
1529
1577
 
1530
1578
  if (max_bias > 0.0f) {
1531
- int64_t n_heads = dst->ne[2];
1579
+ int64_t n_heads = dst->ne[2];
1532
1580
  ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
1533
1581
  slope_buffer = slope_allocator.get();
1534
- ggml_cann_pool_alloc bias_allocator(
1535
- ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
1582
+ ggml_cann_pool_alloc bias_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
1536
1583
  bias_buffer = bias_allocator.get();
1537
- aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
1584
+ aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
1538
1585
  }
1539
1586
 
1540
1587
  // broadcast for mask, slop and dst;
@@ -1543,16 +1590,12 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
1543
1590
 
1544
1591
  // broadcast the mask across rows
1545
1592
  int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
1546
- size_t mask_nb[] = {
1547
- mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
1548
- mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
1549
- };
1593
+ size_t mask_nb[] = { mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
1594
+ mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3] };
1550
1595
 
1551
1596
  int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
1552
- size_t dst_nb[] = {
1553
- dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
1554
- dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
1555
- };
1597
+ size_t dst_nb[] = { dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
1598
+ dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3] };
1556
1599
 
1557
1600
  // slope is a 1 dim tensor, slope.ne2 == dst.ne2
1558
1601
  int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
@@ -1562,17 +1605,13 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
1562
1605
  slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
1563
1606
  }
1564
1607
 
1565
- aclTensor* acl_slope = ggml_cann_create_tensor(
1566
- slope_buffer, ACL_FLOAT, sizeof(float),
1567
- slope_ne, slope_nb, GGML_MAX_DIMS + 2);
1568
- aclTensor* acl_mask = ggml_cann_create_tensor(
1569
- mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
1608
+ acl_tensor_ptr acl_slope =
1609
+ ggml_cann_create_tensor(slope_buffer, ACL_FLOAT, sizeof(float), slope_ne, slope_nb, GGML_MAX_DIMS + 2);
1610
+ acl_tensor_ptr acl_mask = ggml_cann_create_tensor(mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
1570
1611
 
1571
1612
  // write data into dst_ptr using only the shape information of the dst tensor.
1572
- aclTensor* acl_dst = ggml_cann_create_tensor(
1573
- dst_ptr, ggml_cann_type_mapping(dst->type),
1574
- ggml_type_size(dst->type), dst_ne, dst_nb,
1575
- GGML_MAX_DIMS + 2);
1613
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst_ptr, ggml_cann_type_mapping(dst->type),
1614
+ ggml_type_size(dst->type), dst_ne, dst_nb, GGML_MAX_DIMS + 2);
1576
1615
 
1577
1616
  if (max_bias > 0.0f) {
1578
1617
  int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
@@ -1581,17 +1620,14 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
1581
1620
  for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
1582
1621
  bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
1583
1622
  }
1584
- aclTensor* bias_tensor = ggml_cann_create_tensor(
1585
- bias_buffer, ACL_FLOAT, sizeof(float),
1586
- bias_ne, bias_nb, GGML_MAX_DIMS + 2);
1623
+ acl_tensor_ptr bias_tensor =
1624
+ ggml_cann_create_tensor(bias_buffer, ACL_FLOAT, sizeof(float), bias_ne, bias_nb, GGML_MAX_DIMS + 2);
1587
1625
 
1588
- aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
1589
- aclnn_add(ctx, acl_dst, bias_tensor);
1590
- ggml_cann_release_resources(ctx, bias_tensor);
1626
+ aclnn_mul(ctx, acl_slope.get(), acl_mask.get(), bias_tensor.get());
1627
+ aclnn_add(ctx, acl_dst.get(), bias_tensor.get());
1591
1628
  } else {
1592
- aclnn_add(ctx, acl_dst, acl_mask);
1629
+ aclnn_add(ctx, acl_dst.get(), acl_mask.get());
1593
1630
  }
1594
- ggml_cann_release_resources(ctx, acl_slope, acl_mask, acl_dst);
1595
1631
  }
1596
1632
 
1597
1633
  void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
@@ -1612,17 +1648,16 @@ void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1612
1648
  * @param acl_dst The destination tensor where the softmax results will be
1613
1649
  * stored.
1614
1650
  */
1615
- static void aclnn_softmax(ggml_backend_cann_context & ctx,
1616
- aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) {
1651
+ static void aclnn_softmax(ggml_backend_cann_context & ctx, aclTensor * acl_src, int64_t dim, aclTensor * acl_dst) {
1617
1652
  GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
1618
1653
  }
1619
1654
 
1620
1655
  void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1621
- ggml_tensor* src0 = dst->src[0];
1622
- ggml_tensor* src1 = dst->src[1]; // mask
1656
+ ggml_tensor * src0 = dst->src[0];
1657
+ ggml_tensor * src1 = dst->src[1]; // mask
1623
1658
 
1624
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
1625
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
1659
+ acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
1660
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
1626
1661
 
1627
1662
  float scale = 1.0f;
1628
1663
  float max_bias = 0.0f;
@@ -1631,22 +1666,20 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1631
1666
  memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
1632
1667
 
1633
1668
  // input mul scale
1634
- aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
1669
+ acl_scalar_ptr acl_scale = ggml_cann_create_scalar(&scale, aclDataType::ACL_FLOAT);
1635
1670
  ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
1636
- void* src_tensor_buffer = src_tensor_allocator.get();
1637
- aclTensor* softmax_tensor = ggml_cann_create_tensor(
1638
- src_tensor_buffer, ggml_cann_type_mapping(src0->type),
1639
- ggml_element_size(src0), src0->ne, src0->nb,GGML_MAX_DIMS);
1671
+ void * src_tensor_buffer = src_tensor_allocator.get();
1672
+ acl_tensor_ptr softmax_tensor = ggml_cann_create_tensor(src_tensor_buffer, ggml_cann_type_mapping(src0->type),
1673
+ ggml_element_size(src0), src0->ne, src0->nb, GGML_MAX_DIMS);
1640
1674
 
1641
- aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);
1675
+ aclnn_muls(ctx, acl_src0.get(), scale, softmax_tensor.get(), false);
1642
1676
 
1643
1677
  // mask
1644
1678
  if (src1) {
1645
1679
  aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
1646
1680
  }
1647
1681
  // softmax
1648
- aclnn_softmax(ctx, softmax_tensor, 3, acl_dst);
1649
- ggml_cann_release_resources(ctx, acl_src0, acl_dst, acl_scale, softmax_tensor);
1682
+ aclnn_softmax(ctx, softmax_tensor.get(), 3, acl_dst.get());
1650
1683
  }
1651
1684
 
1652
1685
  /**
@@ -1668,31 +1701,32 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1668
1701
  * @param index The index tensor specifying the indices to select from the source tensor.
1669
1702
  * @param type The data type of the source and destination tensors.
1670
1703
  */
1671
- static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
1672
- void* src_buffer,int64_t* src_ne, size_t* src_nb,
1673
- void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
1674
- ggml_tensor* index, ggml_type type) {
1704
+ static void aclnn_index_select_4d(ggml_backend_cann_context & ctx,
1705
+ void * src_buffer,
1706
+ int64_t * src_ne,
1707
+ size_t * src_nb,
1708
+ void * dst_buffer,
1709
+ int64_t * dst_ne,
1710
+ size_t * dst_nb,
1711
+ ggml_tensor * index,
1712
+ ggml_type type) {
1675
1713
  for (int64_t i = 0; i < src_ne[3]; i++) {
1676
1714
  for (int64_t j = 0; j < src_ne[2]; j++) {
1677
1715
  // src
1678
- aclTensor* acl_src_tensor = ggml_cann_create_tensor(
1679
- (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
1680
- ggml_cann_type_mapping(type), ggml_type_size(type),
1681
- src_ne, src_nb, 2);
1716
+ acl_tensor_ptr acl_src_tensor =
1717
+ ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
1718
+ ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
1682
1719
 
1683
1720
  // index
1684
- aclTensor* acl_index = ggml_cann_create_tensor(
1685
- (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
1686
- ggml_cann_type_mapping(index->type), ggml_element_size(index),
1687
- index->ne, index->nb, 1);
1721
+ acl_tensor_ptr acl_index = ggml_cann_create_tensor(
1722
+ (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
1723
+ ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
1688
1724
 
1689
1725
  // out
1690
- aclTensor* acl_out = ggml_cann_create_tensor(
1691
- (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
1692
- ggml_cann_type_mapping(type), ggml_type_size(type),
1693
- dst_ne, dst_nb, 2);
1694
- GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor, 0, acl_index, acl_out);
1695
- ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
1726
+ acl_tensor_ptr acl_out =
1727
+ ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
1728
+ ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
1729
+ GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor.get(), 0, acl_index.get(), acl_out.get());
1696
1730
  }
1697
1731
  }
1698
1732
  }
@@ -1717,167 +1751,149 @@ static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
1717
1751
  * @param index The index tensor specifying target positions in the destination tensor.
1718
1752
  * @param type The data type of the source and destination tensors.
1719
1753
  */
1720
- static void aclnn_index_copy_4d(ggml_backend_cann_context& ctx,
1721
- void* src_buffer,int64_t* src_ne, size_t* src_nb,
1722
- void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
1723
- ggml_tensor* index, ggml_type type) {
1754
+ static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx,
1755
+ void * src_buffer,
1756
+ int64_t * src_ne,
1757
+ size_t * src_nb,
1758
+ void * dst_buffer,
1759
+ int64_t * dst_ne,
1760
+ size_t * dst_nb,
1761
+ ggml_tensor * index,
1762
+ ggml_type type) {
1724
1763
  for (int64_t i = 0; i < src_ne[3]; i++) {
1725
1764
  for (int64_t j = 0; j < src_ne[2]; j++) {
1726
1765
  // src
1727
- aclTensor* acl_src_tensor = ggml_cann_create_tensor(
1728
- (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
1729
- ggml_cann_type_mapping(type), ggml_type_size(type),
1730
- src_ne, src_nb, 2);
1766
+ acl_tensor_ptr acl_src_tensor =
1767
+ ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
1768
+ ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
1731
1769
 
1732
1770
  // index
1733
- aclTensor* acl_index = ggml_cann_create_tensor(
1734
- (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
1735
- ggml_cann_type_mapping(index->type), ggml_element_size(index),
1736
- index->ne, index->nb, 1);
1771
+ acl_tensor_ptr acl_index = ggml_cann_create_tensor(
1772
+ (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
1773
+ ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
1737
1774
 
1738
1775
  // out
1739
- aclTensor* acl_out = ggml_cann_create_tensor(
1740
- (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
1741
- ggml_cann_type_mapping(type), ggml_type_size(type),
1742
- dst_ne, dst_nb, 2);
1743
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out, 0, acl_index, acl_src_tensor);
1744
- ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
1776
+ acl_tensor_ptr acl_out =
1777
+ ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
1778
+ ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
1779
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out.get(), 0, acl_index.get(), acl_src_tensor.get());
1745
1780
  }
1746
1781
  }
1747
1782
  }
1748
1783
 
1749
- void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1750
- ggml_tensor* src0 = dst->src[0]; // src
1751
- ggml_tensor* src1 = dst->src[1]; // index
1784
+ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1785
+ ggml_tensor * src0 = dst->src[0]; // src
1786
+ ggml_tensor * src1 = dst->src[1]; // index
1787
+
1788
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1752
1789
 
1753
1790
  switch (src0->type) {
1754
- case GGML_TYPE_F32: {
1755
- aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
1756
- dst->data, dst->ne, dst->nb,
1757
- src1, dst->type);
1758
- break;
1759
- }
1760
- case GGML_TYPE_F16: {
1761
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
1762
- ggml_cann_pool_alloc src_buffer_allocator(
1763
- ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
1764
- void* src_trans_buffer = src_buffer_allocator.get();
1765
- size_t src_trans_nb[GGML_MAX_DIMS];
1766
- src_trans_nb[0] = sizeof(float_t);
1767
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1768
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
1791
+ case GGML_TYPE_F16:
1792
+ case GGML_TYPE_F32:
1793
+ if (src0->type == dst->type) {
1794
+ aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1,
1795
+ dst->type);
1796
+ } else {
1797
+ acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
1798
+ ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
1799
+ void * src_trans_buffer = src_buffer_allocator.get();
1800
+ size_t src_trans_nb[GGML_MAX_DIMS];
1801
+ src_trans_nb[0] = dst->nb[0];
1802
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1803
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
1804
+ }
1805
+ acl_tensor_ptr src_trans_tensor =
1806
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type),
1807
+ ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
1808
+ aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
1809
+ aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
1810
+ dst->type);
1769
1811
  }
1770
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
1771
- src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
1772
- src0->ne, src_trans_nb, GGML_MAX_DIMS);
1773
- aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
1774
- aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
1775
- dst->data, dst->ne, dst->nb,
1776
- src1, dst->type);
1777
- ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
1778
1812
  break;
1779
- }
1780
- case GGML_TYPE_Q8_0: {
1781
- // add 1 dim for bcast mul.
1782
- size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
1783
- dequant_nb[GGML_MAX_DIMS + 1];
1784
- int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
1785
- *dequant_ne;
1786
- int64_t scale_offset = 0;
1787
-
1788
- // [3,4,5,64] -> [3,4,5,2,32]
1789
- weight_ne[0] = QK8_0;
1790
- weight_ne[1] = src0->ne[0] / QK8_0;
1791
- weight_nb[0] = sizeof(int8_t);
1792
- weight_nb[1] = weight_nb[0] * weight_ne[0];
1793
- for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1794
- weight_ne[i] = src0->ne[i - 1];
1795
- weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
1796
- }
1797
-
1798
- // [3,4,5,64] -> [3,4,5,2,1]
1799
- scale_ne[0] = 1;
1800
- scale_ne[1] = src0->ne[0] / QK8_0;
1801
- scale_nb[0] = sizeof(uint16_t);
1802
- scale_nb[1] = scale_nb[0] * scale_ne[0];
1803
- for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1804
- scale_ne[i] = src0->ne[i - 1];
1805
- scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
1806
- }
1807
-
1808
- // [3,4,5,64] -> [3,4,5,2,32]
1809
- dequant_ne = weight_ne;
1810
- dequant_nb[0] = sizeof(float_t);
1811
- for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
1812
- dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
1813
- }
1814
-
1815
- scale_offset = ggml_nelements(src0) * sizeof(int8_t);
1816
- ggml_cann_pool_alloc dequant_buffer_allocator(
1817
- ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
1818
-
1819
- aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
1820
- src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
1821
- GGML_MAX_DIMS + 1);
1822
- aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
1823
- src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
1824
- GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
1825
- aclTensor* dequant_tensor = ggml_cann_create_tensor(
1826
- dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
1827
- dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
1828
-
1829
- aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
1830
- dequant_nb[0] = sizeof(float_t);
1831
- dequant_ne = src0->ne;
1832
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1833
- dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
1813
+ case GGML_TYPE_Q8_0:
1814
+ {
1815
+ // add 1 dim for bcast mul.
1816
+ size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1];
1817
+ int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne;
1818
+ int64_t scale_offset = 0;
1819
+ // [3,4,5,64] -> [3,4,5,2,32]
1820
+ weight_ne[0] = QK8_0;
1821
+ weight_ne[1] = src0->ne[0] / QK8_0;
1822
+ weight_nb[0] = sizeof(int8_t);
1823
+ weight_nb[1] = weight_nb[0] * weight_ne[0];
1824
+ for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1825
+ weight_ne[i] = src0->ne[i - 1];
1826
+ weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
1827
+ }
1828
+ // [3,4,5,64] -> [3,4,5,2,1]
1829
+ scale_ne[0] = 1;
1830
+ scale_ne[1] = src0->ne[0] / QK8_0;
1831
+ scale_nb[0] = sizeof(uint16_t);
1832
+ scale_nb[1] = scale_nb[0] * scale_ne[0];
1833
+ for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
1834
+ scale_ne[i] = src0->ne[i - 1];
1835
+ scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
1836
+ }
1837
+ // [3,4,5,64] -> [3,4,5,2,32]
1838
+ dequant_ne = weight_ne;
1839
+ dequant_nb[0] = ggml_type_size(dst->type);
1840
+ for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
1841
+ dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
1842
+ }
1843
+ scale_offset = ggml_nelements(src0) * sizeof(int8_t);
1844
+ ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(),
1845
+ ggml_nelements(src0) * ggml_type_size(dst->type));
1846
+ acl_tensor_ptr acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
1847
+ weight_ne, weight_nb, GGML_MAX_DIMS + 1);
1848
+ acl_tensor_ptr acl_scale_tensor =
1849
+ ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
1850
+ GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
1851
+ acl_tensor_ptr dequant_tensor =
1852
+ ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type),
1853
+ ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
1854
+ aclnn_mul(ctx, acl_weight_tensor.get(), acl_scale_tensor.get(), dequant_tensor.get());
1855
+ dequant_nb[0] = ggml_type_size(dst->type);
1856
+ dequant_ne = src0->ne;
1857
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1858
+ dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
1859
+ }
1860
+ aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne,
1861
+ dst->nb, src1, dst->type);
1862
+ break;
1834
1863
  }
1835
-
1836
- aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
1837
- dequant_ne, dequant_nb,
1838
- dst->data, dst->ne, dst->nb,
1839
- src1, dst->type);
1840
-
1841
- ggml_cann_release_resources(ctx, dequant_tensor);
1842
- break;
1843
- }
1844
1864
  default:
1845
1865
  GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
1846
1866
  break;
1847
1867
  }
1848
1868
  }
1849
1869
 
1850
- void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1851
- ggml_tensor* src0 = dst->src[0]; // src
1852
- ggml_tensor* src1 = dst->src[1]; // index
1870
+ void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1871
+ ggml_tensor * src0 = dst->src[0]; // src
1872
+ ggml_tensor * src1 = dst->src[1]; // index
1853
1873
 
1854
1874
  switch (dst->type) {
1855
- case GGML_TYPE_F32: {
1856
- aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb,
1857
- dst->data, dst->ne, dst->nb,
1858
- src1, dst->type);
1859
- break;
1860
- }
1861
- case GGML_TYPE_F16: {
1862
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
1863
- ggml_cann_pool_alloc src_buffer_allocator(
1864
- ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
1865
- void* src_trans_buffer = src_buffer_allocator.get();
1866
- size_t src_trans_nb[GGML_MAX_DIMS];
1867
- src_trans_nb[0] = sizeof(uint16_t);
1868
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
1869
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
1875
+ case GGML_TYPE_F32:
1876
+ {
1877
+ aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type);
1878
+ break;
1879
+ }
1880
+ case GGML_TYPE_F16:
1881
+ {
1882
+ acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
1883
+ ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
1884
+ void * src_trans_buffer = src_buffer_allocator.get();
1885
+ size_t src_trans_nb[GGML_MAX_DIMS];
1886
+ src_trans_nb[0] = sizeof(uint16_t);
1887
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
1888
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
1889
+ }
1890
+ acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
1891
+ src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
1892
+ aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
1893
+ aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
1894
+ dst->type);
1895
+ break;
1870
1896
  }
1871
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
1872
- src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type),
1873
- src0->ne, src_trans_nb, GGML_MAX_DIMS);
1874
- aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
1875
- aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
1876
- dst->data, dst->ne, dst->nb,
1877
- src1, dst->type);
1878
- ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
1879
- break;
1880
- }
1881
1897
  default:
1882
1898
  GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
1883
1899
  break;
@@ -1899,12 +1915,13 @@ void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1899
1915
  * @param repeats The number of times each element will be repeated.
1900
1916
  * @param output_size The size of the output tensor.
1901
1917
  */
1902
- static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
1903
- aclTensor* acl_src, aclTensor* acl_dst,
1904
- int64_t dim, int64_t repeats,
1905
- int64_t output_size) {
1906
- GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim,
1907
- output_size, acl_dst);
1918
+ static void aclnn_repeat_interleave(ggml_backend_cann_context & ctx,
1919
+ aclTensor * acl_src,
1920
+ aclTensor * acl_dst,
1921
+ int64_t dim,
1922
+ int64_t repeats,
1923
+ int64_t output_size) {
1924
+ GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim, output_size, acl_dst);
1908
1925
  }
1909
1926
 
1910
1927
  /**
@@ -1919,10 +1936,9 @@ static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
1919
1936
  * @param dst The destination tensor where the result of the matrix
1920
1937
  * multiplication will be stored.
1921
1938
  */
1922
- static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
1923
- ggml_tensor* dst) {
1924
- ggml_tensor* weight = dst->src[0]; // weight
1925
- ggml_tensor* input = dst->src[1]; // input
1939
+ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
1940
+ ggml_tensor * weight = dst->src[0]; // weight
1941
+ ggml_tensor * input = dst->src[1]; // input
1926
1942
 
1927
1943
  // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto
1928
1944
  // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
@@ -1937,51 +1953,36 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
1937
1953
  }
1938
1954
  }
1939
1955
 
1940
- aclTensor* acl_input_tensor =
1941
- ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
1942
- int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
1943
- bcast_weight_ne[2], bcast_weight_ne[3],
1944
- bcast_weight_ne[4], bcast_weight_ne[5]};
1945
- size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
1946
- bcast_weight_nb[2], bcast_weight_nb[3],
1947
- bcast_weight_nb[4], bcast_weight_nb[5]};
1948
- aclTensor* acl_weight_tensor;
1956
+ acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
1957
+ int64_t transpose_ne[] = { bcast_weight_ne[1], bcast_weight_ne[0], bcast_weight_ne[2],
1958
+ bcast_weight_ne[3], bcast_weight_ne[4], bcast_weight_ne[5] };
1959
+ size_t transpose_nb[] = { bcast_weight_nb[1], bcast_weight_nb[0], bcast_weight_nb[2],
1960
+ bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5] };
1961
+ acl_tensor_ptr acl_weight_tensor;
1949
1962
 
1950
1963
  // Only check env once.
1951
- static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
1964
+ static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
1952
1965
  if (weight_to_nz && is_matmul_weight(weight)) {
1953
- int64_t acl_stride[2] = {1, transpose_ne[1]};
1954
-
1955
- // Reverse ne.
1956
- std::reverse(transpose_ne, transpose_ne + n_dims);
1957
-
1958
- std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
1959
-
1960
- acl_weight_tensor = aclCreateTensor(
1961
- transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
1962
- 0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
1966
+ acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
1963
1967
  } else {
1964
- acl_weight_tensor =
1965
- ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
1968
+ acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
1966
1969
  }
1967
- aclTensor* acl_dst =
1968
- ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
1970
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
1969
1971
 
1970
1972
  switch (n_dims) {
1971
1973
  case 2:
1972
- GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
1974
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(), 2);
1973
1975
  break;
1974
1976
  case 3:
1975
- GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
1977
+ GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(),
1978
+ 2);
1976
1979
  break;
1977
1980
  default:
1978
1981
  // ALLOW_FP32_DOWN_PRECISION, when input is
1979
1982
  // fp32, atlas a2 will transpose it to HFLOAT32.
1980
- GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor, acl_weight_tensor, acl_dst, 1);
1983
+ GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(), 1);
1981
1984
  break;
1982
1985
  }
1983
-
1984
- ggml_cann_release_resources(ctx, acl_weight_tensor, acl_input_tensor, acl_dst);
1985
1986
  }
1986
1987
 
1987
1988
  /**
@@ -1997,11 +1998,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
1997
1998
  * @param dst The destination tensor where the result of the matrix
1998
1999
  * multiplication will be stored.
1999
2000
  */
2000
- static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2001
- ggml_tensor* dst,
2002
- const enum ggml_type type) {
2003
- ggml_tensor* src0 = dst->src[0]; // weight
2004
- ggml_tensor* src1 = dst->src[1]; // input
2001
+ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst, const enum ggml_type type) {
2002
+ ggml_tensor * src0 = dst->src[0]; // weight
2003
+ ggml_tensor * src1 = dst->src[1]; // input
2005
2004
 
2006
2005
  // The shape of the weight is NCHW.
2007
2006
  // Matrix multiplication uses HW dims.
@@ -2015,56 +2014,51 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2015
2014
  } else {
2016
2015
  GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
2017
2016
  }
2018
- float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
2017
+ float weight_nb[] = { src0->ne[0] * weight_elem_size, weight_elem_size };
2019
2018
  size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
2020
- size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
2019
+ size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
2021
2020
 
2022
2021
  // scale stored at the end of weight. Also need transpose.
2023
2022
  size_t scale_elem_size = sizeof(uint16_t);
2024
- size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
2025
- scale_elem_size};
2026
- size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
2027
- char* scale_offset = (char*)src0->data + weight_size;
2023
+ size_t scale_nb[] = { src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size };
2024
+ size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
2025
+ char * scale_offset = (char *) src0->data + weight_size;
2028
2026
 
2029
2027
  // input
2030
- size_t input_elem_size = sizeof(uint16_t);
2031
- int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
2032
- size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
2033
- size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
2028
+ size_t input_elem_size = sizeof(uint16_t);
2029
+ int64_t input_ne[] = { src1->ne[0], src1->ne[1] };
2030
+ size_t input_nb[] = { input_elem_size, input_ne[0] * input_elem_size };
2031
+ size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
2034
2032
  ggml_cann_pool_alloc input_alloctor(ctx.pool());
2035
- void* input_buffer = src1->data;
2033
+ void * input_buffer = src1->data;
2036
2034
 
2037
2035
  // case in
2038
2036
  if (src1->type != GGML_TYPE_F16) {
2039
- aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
2040
- input_buffer =
2041
- input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
2037
+ acl_tensor_ptr acl_src1_tensor = ggml_cann_create_tensor(src1);
2038
+ input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
2042
2039
 
2043
- int64_t* input_cast_ne = src1->ne;
2044
- size_t input_cast_nb[GGML_MAX_DIMS];
2040
+ int64_t * input_cast_ne = src1->ne;
2041
+ size_t input_cast_nb[GGML_MAX_DIMS];
2045
2042
  input_cast_nb[0] = sizeof(uint16_t);
2046
2043
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2047
2044
  input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
2048
2045
  }
2049
2046
 
2050
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2051
- input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
2052
- input_cast_nb, GGML_MAX_DIMS);
2053
- aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
2054
- ggml_cann_release_resources(ctx, acl_input_tensor, acl_src1_tensor);
2047
+ acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, input_elem_size,
2048
+ input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
2049
+ aclnn_cast(ctx, acl_src1_tensor.get(), acl_input_tensor.get(), ACL_FLOAT16);
2055
2050
  }
2056
2051
 
2057
2052
  // output
2058
- size_t output_elem_size = sizeof(uint16_t);
2059
- size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
2053
+ size_t output_elem_size = sizeof(uint16_t);
2054
+ size_t output_nb[] = { output_elem_size, dst->ne[0] * output_elem_size };
2060
2055
  ggml_cann_pool_alloc output_allocator(ctx.pool());
2061
- void* output_buffer =
2062
- output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
2063
- size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
2056
+ void * output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
2057
+ size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
2064
2058
 
2065
2059
  // aclnn
2066
- int64_t max_elem_size = 65535;
2067
- int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
2060
+ int64_t max_elem_size = 65535;
2061
+ int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
2068
2062
  ggml_cann_pool_alloc workspace_allocator(ctx.pool());
2069
2063
  for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
2070
2064
  for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
@@ -2074,98 +2068,77 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2074
2068
  int64_t batch1 = (n1 * src1->ne[2]) + c1;
2075
2069
  int64_t batch0 = (n0 * src0->ne[2]) + c0;
2076
2070
 
2077
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2078
- (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
2079
- input_elem_size, input_ne, input_nb, 2);
2071
+ acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(
2072
+ (char *) input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2);
2080
2073
 
2081
2074
  // first split
2082
2075
  int64_t weight_ne_offset = 0;
2083
- int64_t weight_ne[2] = {
2084
- max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
2085
- src0->ne[0]};
2086
- int64_t scale_ne_offset = 0;
2087
- int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
2076
+ int64_t weight_ne[2] = { max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0] };
2077
+ int64_t scale_ne_offset = 0;
2078
+ int64_t scale_ne[2] = { weight_ne[0], weight_ne[1] / QK8_0 };
2088
2079
  int64_t output_ne_offset = 0;
2089
- int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
2090
-
2091
- aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
2092
- (char*)src0->data + batch0 * weight_stride,
2093
- ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
2094
- weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
2095
- aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
2096
- scale_offset + batch0 * scale_stride, ACL_FLOAT16,
2097
- scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
2098
- scale_ne_offset);
2099
- aclTensor* acl_output_tensor = ggml_cann_create_tensor(
2100
- (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
2101
- output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
2102
- output_ne_offset);
2080
+ int64_t output_ne[2] = { weight_ne[0], dst->ne[1] };
2081
+
2082
+ acl_tensor_ptr acl_weight_tensor =
2083
+ ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
2084
+ weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
2085
+ acl_tensor_ptr acl_scale_tensor =
2086
+ ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne,
2087
+ scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
2088
+ acl_tensor_ptr acl_output_tensor =
2089
+ ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size,
2090
+ output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
2103
2091
  int64_t antiquantGroupSize = 0;
2104
2092
  if (src0->ne[0] > QK8_0) {
2105
2093
  antiquantGroupSize = QK8_0;
2106
2094
  }
2107
- GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
2108
- acl_weight_tensor, acl_scale_tensor, nullptr,
2109
- nullptr, nullptr, nullptr, antiquantGroupSize,
2110
- acl_output_tensor);
2111
- ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
2095
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor.get(), acl_weight_tensor.get(),
2096
+ acl_scale_tensor.get(), nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
2097
+ acl_output_tensor.get());
2112
2098
 
2113
2099
  // other splits
2114
2100
  for (int64_t split = 1; split < split_size; split++) {
2115
- weight_ne_offset +=
2116
- weight_elem_size * weight_ne[0] * weight_ne[1];
2117
- weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
2118
- ? src0->ne[1] - (max_elem_size * split)
2119
- : max_elem_size;
2101
+ weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
2102
+ weight_ne[0] =
2103
+ max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
2120
2104
  scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
2121
2105
  scale_ne[0] = weight_ne[0];
2122
- output_ne_offset +=
2123
- output_elem_size * output_ne[0] * output_ne[1];
2106
+ output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
2124
2107
  output_ne[0] = weight_ne[0];
2125
2108
 
2126
- acl_weight_tensor = ggml_cann_create_tensor(
2127
- (char*)src0->data + batch0 * weight_stride,
2128
- ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
2129
- weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
2130
- acl_scale_tensor = ggml_cann_create_tensor(
2131
- scale_offset + batch0 * scale_stride, ACL_FLOAT16,
2132
- scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
2133
- scale_ne_offset);
2134
- acl_output_tensor = ggml_cann_create_tensor(
2135
- (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
2136
- output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
2137
- output_ne_offset);
2138
- GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
2139
- acl_weight_tensor, acl_scale_tensor, nullptr,
2140
- nullptr, nullptr, nullptr, antiquantGroupSize,
2141
- acl_output_tensor);
2142
- ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
2109
+ acl_weight_tensor =
2110
+ ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
2111
+ weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
2112
+ acl_scale_tensor =
2113
+ ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size,
2114
+ scale_ne, scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
2115
+ acl_output_tensor =
2116
+ ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16,
2117
+ output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
2118
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor.get(), acl_weight_tensor.get(),
2119
+ acl_scale_tensor.get(), nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
2120
+ acl_output_tensor.get());
2143
2121
  }
2144
-
2145
- ggml_cann_release_resources(ctx, acl_input_tensor);
2146
2122
  }
2147
2123
  }
2148
2124
 
2149
2125
  // cast out
2150
2126
  if (dst->type != GGML_TYPE_F16) {
2151
- int64_t* output_cast_ne = dst->ne;
2152
- size_t output_cast_nb[GGML_MAX_DIMS];
2127
+ int64_t * output_cast_ne = dst->ne;
2128
+ size_t output_cast_nb[GGML_MAX_DIMS];
2153
2129
  output_cast_nb[0] = sizeof(uint16_t);
2154
2130
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2155
2131
  output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
2156
2132
  }
2157
2133
 
2158
- aclTensor* acl_output_tensor = ggml_cann_create_tensor(
2159
- output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
2160
- output_cast_nb, GGML_MAX_DIMS);
2161
- aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
2162
- aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
2163
-
2164
- ggml_cann_release_resources(ctx, acl_output_tensor, acl_dst_tensor);
2134
+ acl_tensor_ptr acl_output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
2135
+ output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
2136
+ acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
2137
+ aclnn_cast(ctx, acl_output_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
2165
2138
  }
2166
2139
  }
2167
2140
 
2168
- void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2141
+ void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2169
2142
  const enum ggml_type type = dst->src[0]->type;
2170
2143
  switch (type) {
2171
2144
  case GGML_TYPE_F32:
@@ -2198,12 +2171,14 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2198
2171
  * @param dims An array specifying the dimensions along which elements are
2199
2172
  * shifted.
2200
2173
  */
2201
- static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2202
- aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
2203
- aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
2204
- aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
2205
- GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts, acl_dims, acl_dst);
2206
- ggml_cann_release_resources(ctx, acl_shifts, acl_dims);
2174
+ static void aclnn_roll(ggml_backend_cann_context & ctx,
2175
+ aclTensor * acl_src,
2176
+ aclTensor * acl_dst,
2177
+ int64_t * shifts,
2178
+ int64_t * dims) {
2179
+ acl_int_array_ptr acl_shifts = ggml_cann_create_int_array(shifts, 1);
2180
+ acl_int_array_ptr acl_dims = ggml_cann_create_int_array(dims, 1);
2181
+ GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts.get(), acl_dims.get(), acl_dst);
2207
2182
  }
2208
2183
 
2209
2184
  /**
@@ -2219,14 +2194,15 @@ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
2219
2194
  * @param index_num The number of positions specified in the index array.
2220
2195
  * @param value The scalar value used to fill the specified positions.
2221
2196
  */
2222
- static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
2223
- aclTensor* acl_src, int64_t dim,
2224
- int64_t* index, int64_t index_num,
2225
- float value) {
2226
- aclIntArray* acl_index = aclCreateIntArray(index, index_num);
2227
- aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
2228
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index, acl_value);
2229
- ggml_cann_release_resources(ctx, acl_index, acl_value);
2197
+ static void aclnn_index_fill_tensor(ggml_backend_cann_context & ctx,
2198
+ aclTensor * acl_src,
2199
+ int64_t dim,
2200
+ int64_t * index,
2201
+ int64_t index_num,
2202
+ float value) {
2203
+ acl_int_array_ptr acl_index = ggml_cann_create_int_array(index, index_num);
2204
+ acl_scalar_ptr acl_value = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
2205
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index.get(), acl_value.get());
2230
2206
  }
2231
2207
 
2232
2208
  /**
@@ -2248,420 +2224,435 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
2248
2224
  * 5. Compute sin(θ), cos(θ) and optionally scale by attn_factor.
2249
2225
  * 6. Expand sin/cos values by repeat or repeat_interleave depending
2250
2226
  * on whether @param is_neox is enabled.
2251
- * 7. Store the computed values into persistent buffers
2252
- * (ctx.rope_sin_ptr / ctx.rope_cos_ptr).
2253
- *
2254
- * @param ctx The CANN backend context, holding memory pool,
2255
- * stream, and persistent buffers for rope init/cache.
2256
- * @param dst The destination ggml_tensor whose computation
2257
- * depends on the cached RoPE values (usually Qcur/Kcur).
2258
- * @param theta_scale Scalar exponent base for computing theta scale values.
2259
- * @param freq_scale Frequency scaling factor, applied to theta scale.
2260
- * @param attn_factor Attention scaling factor, applied to sin/cos.
2261
- * @param is_neox Whether to use Neox-style repeat strategy
2262
- * (dim expansion vs repeat_interleave).
2227
+ *
2228
+ * @param ctx The CANN backend context, holding memory pool,
2229
+ * stream, and persistent buffers for rope init/cache.
2230
+ * @param dst The destination ggml_tensor whose computation
2231
+ * depends on the RoPE values (usually Qcur/Kcur).
2232
+ * @param theta_scale Scalar exponent base for computing theta scale values.
2233
+ * @param freq_scale Frequency scaling factor, applied to theta scale.
2234
+ * @param attn_factor Attention scaling factor, applied to sin/cos.
2235
+ * @param is_neox Whether to use Neox-style repeat strategy
2236
+ * (dim expansion vs repeat_interleave).
2263
2237
  */
2264
- static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
2265
- float theta_scale, float freq_scale,
2266
- float attn_factor, bool is_neox) {
2267
- // int sin/cos cache, cache has different repeat method depond on
2268
- // @param.is_neox
2269
- bool is_q = (std::strncmp(dst->name, "Qcur-", 5) == 0);
2270
- bool is_k = (std::strncmp(dst->name, "Kcur-", 5) == 0);
2271
-
2272
- // used for accuracy testing
2273
- bool is_attention = is_q || is_k;
2274
-
2275
- // just compute in first layer in attention
2276
- bool is_fisrt_layer = (std::strncmp(dst->name, "Qcur-0", GGML_MAX_NAME) == 0);
2277
- if(is_attention && !is_fisrt_layer) {
2238
+ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
2239
+ ggml_tensor * dst,
2240
+ float * corr_dims,
2241
+ float ext_factor,
2242
+ float theta_scale,
2243
+ float freq_scale,
2244
+ float attn_factor,
2245
+ bool is_neox) {
2246
+ ggml_tensor * src0 = dst->src[0]; // input
2247
+ ggml_tensor * src1 = dst->src[1]; // position
2248
+ ggml_tensor * src2 = dst->src[2]; // freq_factors
2249
+
2250
+ if (src2 == nullptr && ctx.rope_cache.cached && ctx.rope_cache.ext_factor == ext_factor &&
2251
+ ctx.rope_cache.theta_scale == theta_scale && ctx.rope_cache.freq_scale == freq_scale &&
2252
+ ctx.rope_cache.attn_factor == attn_factor && ctx.rope_cache.is_neox == is_neox) {
2253
+ // use cache.
2278
2254
  return;
2279
2255
  }
2280
2256
 
2281
- ggml_tensor* src0 = dst->src[0]; // input
2282
- ggml_tensor* src1 = dst->src[1]; // position
2283
- ggml_tensor* src2 = dst->src[2]; // freq_factors
2284
-
2285
- GGML_TENSOR_BINARY_OP_LOCALS
2286
-
2287
- int64_t theta_scale_length = ne00 / 2;
2288
- int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
2289
- size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
2290
- theta_scale_length * sizeof(float_t)};
2257
+ int64_t theta_scale_length = src0->ne[0] / 2;
2258
+ int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 };
2259
+ size_t theta_scale_nb[] = { sizeof(float), sizeof(float), sizeof(float), theta_scale_length * sizeof(float) };
2291
2260
 
2292
2261
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
2293
2262
  int64_t position_length = src1->ne[0];
2294
- int64_t position_ne[] = {1, 1, position_length, 1};
2295
- size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t),
2296
- sizeof(int32_t) * position_length};
2263
+ int64_t position_ne[] = { 1, 1, position_length, 1 };
2264
+ size_t position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };
2297
2265
 
2298
- int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
2299
- size_t theta_nb[GGML_MAX_DIMS];
2300
- theta_nb[0] = sizeof(float_t);
2266
+ int64_t theta_ne[] = { theta_scale_length, 1, position_length, 1 };
2267
+ size_t theta_nb[GGML_MAX_DIMS];
2268
+ theta_nb[0] = sizeof(float);
2301
2269
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2302
2270
  theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
2303
2271
  }
2304
2272
 
2305
- // init theta scale, just one time
2306
- if(ctx.rope_init_ptr == nullptr || !is_attention) {
2307
- // theta_scale arange, [0,1,...,ne00/2 - 1]
2308
- if(ctx.rope_init_ptr != nullptr){
2309
- ACL_CHECK(aclrtFree(ctx.rope_init_ptr));
2273
+ // theta_scale arange, [0,1,...,ne00/2 - 1]
2274
+ acl_tensor_ptr acl_theta_scale_tensor;
2275
+ // cache theta scale
2276
+ if (ctx.rope_cache.theta_scale_length != theta_scale_length ||
2277
+ // theta_scale and freq_scale should not change during the current token inference process,
2278
+ // so we can directly use == here instead of comparing the absolute difference.
2279
+ ctx.rope_cache.theta_scale != theta_scale || ctx.rope_cache.freq_scale != freq_scale) {
2280
+ ctx.rope_cache.theta_scale_length = theta_scale_length;
2281
+
2282
+ if (ctx.rope_cache.theta_scale_cache != nullptr) {
2283
+ ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
2284
+ }
2285
+ ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
2286
+ ACL_MEM_MALLOC_HUGE_FIRST));
2287
+
2288
+ acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
2289
+ theta_scale_ne, theta_scale_nb, 1);
2290
+
2291
+ float start = 0;
2292
+ float step = 1;
2293
+ float stop = theta_scale_length;
2294
+ float n_elements = theta_scale_length;
2295
+ aclnn_arange(ctx, acl_theta_scale_tensor.get(), start, stop, step, n_elements);
2296
+
2297
+ ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
2298
+ acl_tensor_ptr acl_yarn_ramp_tensor;
2299
+ if (ext_factor != 0) {
2300
+ // -rope_yarn_ramp
2301
+ // const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
2302
+ // return MIN(1, MAX(0, y)) - 1;
2303
+ yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
2304
+ void * yarn_ramp_buffer = yarn_ramp_allocator.get();
2305
+ acl_yarn_ramp_tensor =
2306
+ ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
2307
+ float zero_value = 0, one_value = 1;
2308
+ float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
2309
+ acl_scalar_ptr low = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
2310
+ acl_scalar_ptr zero = ggml_cann_create_scalar(&zero_value, aclDataType::ACL_FLOAT);
2311
+ acl_scalar_ptr one = ggml_cann_create_scalar(&one_value, aclDataType::ACL_FLOAT);
2312
+ acl_scalar_ptr denom_safe = ggml_cann_create_scalar(&denom_safe_value, aclDataType::ACL_FLOAT);
2313
+ acl_scalar_ptr ext_factor_sc = ggml_cann_create_scalar(&ext_factor, aclDataType::ACL_FLOAT);
2314
+
2315
+ GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor.get(), low.get(), one.get(),
2316
+ acl_yarn_ramp_tensor.get());
2317
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor.get(), denom_safe.get());
2318
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor.get(), zero.get(), zero.get());
2319
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor.get(), one.get());
2320
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), one.get(), one.get());
2321
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), ext_factor_sc.get());
2322
+
2323
+ // theta_interp = freq_scale * theta_extrap;
2324
+ // theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
2325
+ // theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
2326
+ // theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
2327
+ // theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
2328
+ //
2329
+ // we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse
2330
+ // cache freq_scale + (freq_scale - 1) * ramp_mix
2331
+ float freq_scale_1 = freq_scale - 1;
2332
+ acl_scalar_ptr freq_scale_sc = ggml_cann_create_scalar(&freq_scale, aclDataType::ACL_FLOAT);
2333
+ acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
2334
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
2335
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
2310
2336
  }
2311
- ACL_CHECK(aclrtMalloc(&ctx.rope_init_ptr, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
2312
-
2313
- aclTensor* acl_theta_scale_tensor =
2314
- ggml_cann_create_tensor(ctx.rope_init_ptr, ACL_FLOAT, sizeof(float_t),
2315
- theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2316
- float start = 0;
2317
- float step = 1;
2318
- float stop = ne00 / 2;
2319
- float n_elements = ne00 / 2;
2320
- aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);
2321
2337
 
2322
2338
  // power
2323
- aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
2324
- GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
2325
- acl_theta_scale_tensor);
2326
-
2327
- // freq_scale
2328
- if (freq_scale != 1) {
2329
- aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
2339
+ acl_scalar_ptr acl_theta_scale = ggml_cann_create_scalar(&theta_scale, aclDataType::ACL_FLOAT);
2340
+ GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale.get(), acl_theta_scale_tensor.get(),
2341
+ acl_theta_scale_tensor.get());
2342
+
2343
+ if (ext_factor != 0) {
2344
+ aclnn_mul(ctx, acl_theta_scale_tensor.get(), acl_yarn_ramp_tensor.get());
2345
+ } else if (freq_scale != 1) {
2346
+ aclnn_muls(ctx, acl_theta_scale_tensor.get(), freq_scale, nullptr, true);
2330
2347
  }
2348
+ } else {
2349
+ // use cache
2350
+ acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
2351
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2352
+ }
2331
2353
 
2332
- // freq_factors
2333
- if (src2) {
2334
- aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
2335
- src2->data, ggml_cann_type_mapping(src2->type),
2336
- ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2337
- aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor);
2338
- ggml_cann_release_resources(ctx, acl_freq_factors_tensor);
2339
- }
2340
- // release
2341
- ggml_cann_release_resources(ctx, acl_theta_scale_tensor,acl_theta_scale);
2354
+ ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
2355
+ // freq_factors
2356
+ if (src2) {
2357
+ freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
2358
+ void * freq_fac_res_ptr = freq_fac_res_allocator.get();
2359
+ acl_tensor_ptr acl_freq_factors_tensor =
2360
+ ggml_cann_create_tensor(src2->data, ggml_cann_type_mapping(src2->type), ggml_type_size(src2->type),
2361
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2362
+ acl_tensor_ptr acl_freq_fac_res_tensor = ggml_cann_create_tensor(freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
2363
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2364
+ aclnn_div(ctx, acl_theta_scale_tensor.get(), acl_freq_factors_tensor.get(), acl_freq_fac_res_tensor.get());
2365
+ std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
2342
2366
  }
2343
2367
 
2344
- // init sin_repeat && cos_repeat, one token just init in 0 layer
2345
- if(position_length > ctx.max_prompt_length) {
2346
- ctx.max_prompt_length = position_length;
2347
- int64_t repeat_theta_length = theta_scale_length * ctx.max_prompt_length * 2;
2348
- if(ctx.rope_sin_ptr != nullptr) {
2349
- ACL_CHECK(aclrtFree(ctx.rope_sin_ptr));
2350
- ACL_CHECK(aclrtFree(ctx.rope_cos_ptr));
2368
+ // init sin_repeat && cos_repeat, only to accelerate first layer on each device
2369
+ if (position_length > ctx.rope_cache.position_length) {
2370
+ ctx.rope_cache.position_length = position_length;
2371
+ if (ctx.rope_cache.sin_cache != nullptr) {
2372
+ ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache));
2373
+ }
2374
+ if (ctx.rope_cache.cos_cache != nullptr) {
2375
+ ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
2351
2376
  }
2352
- ACL_CHECK(aclrtMalloc(&ctx.rope_sin_ptr, repeat_theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
2353
- ACL_CHECK(aclrtMalloc(&ctx.rope_cos_ptr, repeat_theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
2377
+ int64_t repeat_theta_length = theta_scale_length * position_length * 2;
2378
+ ACL_CHECK(
2379
+ aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
2380
+ ACL_CHECK(
2381
+ aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
2354
2382
  }
2355
2383
 
2356
- aclTensor* acl_theta_scale_tensor =
2357
- ggml_cann_create_tensor(ctx.rope_init_ptr, ACL_FLOAT, sizeof(float_t),
2358
- theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2359
-
2360
2384
  // position
2361
- aclTensor* acl_position_tensor = ggml_cann_create_tensor(
2362
- src1->data, ggml_cann_type_mapping(src1->type),
2363
- ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);
2385
+ acl_tensor_ptr acl_position_tensor =
2386
+ ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), position_ne,
2387
+ position_nb, GGML_MAX_DIMS);
2364
2388
 
2365
2389
  // power * position
2366
- int64_t theta_length = theta_scale_length * position_length;
2367
- ggml_cann_pool_alloc theta_allocator(ctx.pool(),
2368
- theta_length * sizeof(float_t));
2369
- void* theta_buffer = theta_allocator.get();
2390
+ int64_t theta_length = theta_scale_length * position_length;
2391
+ ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
2392
+ void * theta_buffer = theta_allocator.get();
2370
2393
 
2371
- aclTensor* acl_theta_tensor =
2372
- ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
2373
- theta_ne, theta_nb, GGML_MAX_DIMS);
2374
- aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
2375
- acl_theta_tensor);
2394
+ acl_tensor_ptr acl_theta_tensor =
2395
+ ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS);
2396
+ aclnn_mul(ctx, acl_position_tensor.get(), acl_theta_scale_tensor.get(), acl_theta_tensor.get());
2376
2397
 
2377
2398
  // sin/cos
2378
- ggml_cann_pool_alloc sin_allocator(ctx.pool(),
2379
- theta_length * sizeof(float_t));
2380
- void* sin_buffer = sin_allocator.get();
2381
- aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
2382
- sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
2383
- GGML_MAX_DIMS, ACL_FORMAT_ND);
2384
- aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);
2385
-
2386
- ggml_cann_pool_alloc cos_allocator(ctx.pool(),
2387
- theta_length * sizeof(float_t));
2388
- void* cos_buffer = cos_allocator.get();
2389
- aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
2390
- cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
2391
- GGML_MAX_DIMS, ACL_FORMAT_ND);
2392
- aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
2399
+ ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length * sizeof(float));
2400
+ void * sin_buffer = sin_allocator.get();
2401
+ acl_tensor_ptr acl_sin_tensor =
2402
+ ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
2403
+ aclnn_sin(ctx, acl_theta_tensor.get(), acl_sin_tensor.get());
2404
+
2405
+ ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length * sizeof(float));
2406
+ void * cos_buffer = cos_allocator.get();
2407
+ acl_tensor_ptr acl_cos_tensor =
2408
+ ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
2409
+ aclnn_cos(ctx, acl_theta_tensor.get(), acl_cos_tensor.get());
2410
+
2411
+ if (ext_factor != 0) {
2412
+ attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
2413
+ }
2393
2414
 
2394
2415
  // attn_factor
2395
2416
  if (attn_factor != 1) {
2396
- aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
2397
- aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
2417
+ aclnn_muls(ctx, acl_sin_tensor.get(), attn_factor, nullptr, true);
2418
+ aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
2398
2419
  }
2399
2420
 
2400
- int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
2401
- size_t sin_reshape_nb[GGML_MAX_DIMS];
2402
- sin_reshape_nb[0] = sizeof(float_t);
2421
+ int64_t sin_reshape_ne[4] = { src0->ne[0], 1, src0->ne[2], 1 };
2422
+ size_t sin_reshape_nb[GGML_MAX_DIMS];
2423
+ sin_reshape_nb[0] = sizeof(float);
2403
2424
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2404
2425
  sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
2405
2426
  }
2406
- aclTensor* acl_sin_repeat_tensor =
2407
- ggml_cann_create_tensor(ctx.rope_sin_ptr, ACL_FLOAT, sizeof(float_t),
2408
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2409
- aclTensor* acl_cos_repeat_tensor =
2410
- ggml_cann_create_tensor(ctx.rope_cos_ptr, ACL_FLOAT, sizeof(float_t),
2411
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2427
+ acl_tensor_ptr acl_sin_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
2428
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2429
+ acl_tensor_ptr acl_cos_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
2430
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2412
2431
 
2413
2432
  // repeat
2414
2433
  if (is_neox) {
2415
- int64_t repeatsArray[] = {1, 1, 1, 2};
2416
- aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray);
2417
- aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray);
2434
+ int64_t repeatsArray[] = { 1, 1, 1, 2 };
2435
+ aclnn_repeat(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), repeatsArray);
2436
+ aclnn_repeat(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), repeatsArray);
2418
2437
  } else {
2419
2438
  int64_t num_repeats = 2;
2420
- int64_t dim = 3;
2439
+ int64_t dim = 3;
2421
2440
  int64_t output_size = theta_scale_length * num_repeats;
2422
- aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
2423
- num_repeats, output_size);
2424
- aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
2425
- num_repeats, output_size);
2441
+ aclnn_repeat_interleave(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), dim, num_repeats, output_size);
2442
+ aclnn_repeat_interleave(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), dim, num_repeats, output_size);
2426
2443
  }
2427
2444
 
2428
- ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
2429
- acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor,
2430
- acl_cos_repeat_tensor);
2445
+ // Other layers use cache except first layer.
2446
+ ctx.rope_cache.cached = true;
2447
+ ctx.rope_cache.ext_factor = ext_factor;
2448
+ ctx.rope_cache.theta_scale = theta_scale;
2449
+ ctx.rope_cache.freq_scale = freq_scale;
2450
+ ctx.rope_cache.attn_factor = attn_factor;
2451
+ ctx.rope_cache.is_neox = is_neox;
2431
2452
  }
2432
2453
 
2433
2454
  #ifdef __cplusplus
2434
2455
  extern "C" {
2435
2456
  #endif
2436
- aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
2437
- const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
2438
- int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
2439
- aclOpExecutor** executor);
2440
- aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
2441
- uint64_t workspaceSize,
2442
- aclOpExecutor* executor,
2443
- aclrtStream stream);
2457
+ aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(const aclTensor * x,
2458
+ const aclTensor * cos,
2459
+ const aclTensor * sin,
2460
+ int64_t mode,
2461
+ const aclTensor * yOut,
2462
+ uint64_t * workspaceSize,
2463
+ aclOpExecutor ** executor);
2464
+ aclnnStatus aclnnRotaryPositionEmbedding(void * workspace,
2465
+ uint64_t workspaceSize,
2466
+ aclOpExecutor * executor,
2467
+ aclrtStream stream);
2444
2468
  #ifdef __cplusplus
2445
2469
  }
2446
2470
  #endif
2447
2471
 
2448
- void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2449
- // TODO: use ascendc
2450
- // Only test with LLAMA model.
2451
- ggml_tensor* src0 = dst->src[0]; // input
2472
+ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2473
+ ggml_tensor * src0 = dst->src[0]; // input
2452
2474
 
2453
2475
  // param
2454
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
2476
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
2455
2477
  // const int n_past = ((int32_t *) dst->op_params)[0];
2456
- const int n_dims = ((int32_t*)dst->op_params)[1];
2457
- const int mode = ((int32_t*)dst->op_params)[2];
2478
+ const int n_dims = ((int32_t *) dst->op_params)[1];
2479
+ const int mode = ((int32_t *) dst->op_params)[2];
2458
2480
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
2459
- const int n_ctx_orig = ((int32_t*)dst->op_params)[4];
2481
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
2460
2482
 
2461
2483
  GGML_TENSOR_UNARY_OP_LOCALS
2462
2484
 
2463
- memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float));
2464
- memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float));
2465
- memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float));
2466
- memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float));
2467
- memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
2468
- memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
2485
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
2486
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
2487
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
2488
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
2489
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
2490
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
2469
2491
 
2470
2492
  // TODO: n_dims <= ne0
2471
2493
  GGML_ASSERT(n_dims == ne0);
2472
2494
  GGML_ASSERT(n_dims % 2 == 0);
2473
- // TODO: ext_factor != 0
2474
- GGML_ASSERT(ext_factor == 0);
2475
2495
 
2476
2496
  const float theta_scale = powf(freq_base, -2.0f / n_dims);
2477
2497
 
2478
2498
  float corr_dims[2];
2479
- ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
2480
- beta_slow, corr_dims);
2499
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
2481
2500
 
2482
2501
  const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
2483
2502
 
2484
2503
  // init ctx.rope_cos/rope_sin cache
2485
- aclnn_cache_init(ctx, dst, theta_scale, freq_scale, attn_factor, is_neox);
2504
+ aclnn_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox);
2486
2505
 
2487
- int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
2488
- size_t sin_reshape_nb[GGML_MAX_DIMS];
2489
- sin_reshape_nb[0] = sizeof(float_t);
2506
+ int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 };
2507
+ size_t sin_reshape_nb[GGML_MAX_DIMS];
2508
+ sin_reshape_nb[0] = sizeof(float);
2490
2509
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2491
2510
  sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
2492
2511
  }
2493
- aclTensor* acl_sin_reshape_tensor =
2494
- ggml_cann_create_tensor(ctx.rope_sin_ptr, ACL_FLOAT, sizeof(float_t),
2495
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2496
- aclTensor* acl_cos_reshape_tensor =
2497
- ggml_cann_create_tensor(ctx.rope_cos_ptr, ACL_FLOAT, sizeof(float_t),
2498
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2512
+ acl_tensor_ptr acl_sin_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
2513
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2514
+ acl_tensor_ptr acl_cos_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
2515
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
2499
2516
 
2500
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
2501
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2517
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
2518
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
2502
2519
 
2503
2520
  #ifdef ASCEND_310P
2504
2521
  // Special ROPE operation for 310P
2505
2522
 
2506
2523
  // roll input
2507
- void* input_roll_buffer;
2508
- aclTensor* acl_minus_one_tensor;
2509
- void* minus_one_scale_buffer = nullptr;
2524
+ void * input_roll_buffer;
2525
+ acl_tensor_ptr acl_minus_one_tensor;
2526
+ void * minus_one_scale_buffer = nullptr;
2510
2527
  ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
2511
- ggml_cann_pool_alloc minus_one_scale_allocator(
2512
- ctx.pool(), sizeof(float_t) * src0->ne[0]);
2528
+ ggml_cann_pool_alloc minus_one_scale_allocator(ctx.pool(), sizeof(float) * src0->ne[0]);
2513
2529
  if (!is_neox) {
2514
2530
  // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
2515
- input_roll_buffer = roll_allocator.get();
2516
- int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
2517
- src0->ne[2], src0->ne[3]};
2518
- size_t input_roll_nb[GGML_MAX_DIMS];
2531
+ input_roll_buffer = roll_allocator.get();
2532
+ int64_t input_roll_ne[4] = { 2, src0->ne[1] * (src0->ne[0] / 2), src0->ne[2], src0->ne[3] };
2533
+ size_t input_roll_nb[GGML_MAX_DIMS];
2519
2534
  input_roll_nb[0] = ggml_type_size(src0->type);
2520
2535
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2521
2536
  input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
2522
2537
  }
2523
- aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
2524
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
2525
- ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
2526
- GGML_MAX_DIMS);
2527
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
2528
- src0->data, ggml_cann_type_mapping(src0->type),
2529
- ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
2530
- GGML_MAX_DIMS);
2531
-
2532
- int64_t shifts[] = {1};
2533
- int64_t dims[] = {3};
2534
- aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2535
- ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
2538
+ acl_tensor_ptr acl_input_roll_tensor =
2539
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
2540
+ input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
2541
+ acl_tensor_ptr acl_input_tensor =
2542
+ ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
2543
+ input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
2544
+
2545
+ int64_t shifts[] = { 1 };
2546
+ int64_t dims[] = { 3 };
2547
+ aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
2536
2548
 
2537
2549
  // init [-1, 1, -1, 1, ...]
2538
2550
  minus_one_scale_buffer = minus_one_scale_allocator.get();
2539
2551
 
2540
- int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
2541
- size_t minus_one_nb[GGML_MAX_DIMS];
2542
- minus_one_nb[0] = sizeof(float_t);
2552
+ int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
2553
+ size_t minus_one_nb[GGML_MAX_DIMS];
2554
+ minus_one_nb[0] = sizeof(float);
2543
2555
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2544
2556
  minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
2545
2557
  }
2546
- acl_minus_one_tensor = aclnn_values(
2547
- ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
2548
- minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
2549
- int64_t dim = 3;
2550
- int64_t* index = new int64_t[src0->ne[0]];
2558
+ acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
2559
+ GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
2560
+ int64_t dim = 3;
2561
+ int64_t * index = new int64_t[src0->ne[0]];
2551
2562
  for (int i = 0; i < src0->ne[0]; i++) {
2552
2563
  index[i] = i / 2 * 2;
2553
2564
  }
2554
2565
  int64_t index_num = src0->ne[0];
2555
- float value = -1;
2556
- aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
2557
- index_num, value);
2566
+ float value = -1;
2567
+ aclnn_index_fill_tensor(ctx, acl_minus_one_tensor.get(), dim, index, index_num, value);
2558
2568
  } else {
2559
2569
  // roll input: [q0,q1,q2,...] ->
2560
2570
  // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
2561
2571
  input_roll_buffer = roll_allocator.get();
2562
- aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
2563
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
2564
- ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
2565
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
2572
+ acl_tensor_ptr acl_input_roll_tensor =
2573
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
2574
+ src0->ne, src0->nb, GGML_MAX_DIMS);
2575
+ acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(src0);
2566
2576
 
2567
- int64_t shifts[] = {src0->ne[0] / 2};
2568
- int64_t dims[] = {3};
2569
- aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
2577
+ int64_t shifts[] = { src0->ne[0] / 2 };
2578
+ int64_t dims[] = { 3 };
2579
+ aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
2570
2580
 
2571
- ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
2572
2581
  // init [-1, -1, -1, 1, 1,1,...]
2573
- minus_one_scale_buffer = minus_one_scale_allocator.get();
2574
- int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
2575
- size_t minus_one_nb[GGML_MAX_DIMS];
2576
- minus_one_nb[0] = sizeof(float_t);
2582
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
2583
+ int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
2584
+ size_t minus_one_nb[GGML_MAX_DIMS];
2585
+ minus_one_nb[0] = sizeof(float);
2577
2586
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2578
2587
  minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
2579
2588
  }
2580
- acl_minus_one_tensor = aclnn_values(
2581
- ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
2582
- minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
2589
+ acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
2590
+ GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
2583
2591
  // -1 * first half
2584
- int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
2585
- size_t first_half_nb[GGML_MAX_DIMS];
2586
- first_half_nb[0] = sizeof(float_t);
2592
+ int64_t first_half_ne[4] = { src0->ne[0] / 2, 1, 1, 1 };
2593
+ size_t first_half_nb[GGML_MAX_DIMS];
2594
+ first_half_nb[0] = sizeof(float);
2587
2595
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2588
2596
  first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
2589
2597
  }
2590
- aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
2591
- minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
2592
- first_half_nb, GGML_MAX_DIMS);
2593
- bool inplace = true;
2594
- float scale = -1;
2595
- aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
2596
- ggml_cann_release_resources(ctx, acl_first_half_tensor);
2598
+ acl_tensor_ptr acl_first_half_tensor = ggml_cann_create_tensor(minus_one_scale_buffer, ACL_FLOAT, sizeof(float),
2599
+ first_half_ne, first_half_nb, GGML_MAX_DIMS);
2600
+ bool inplace = true;
2601
+ float scale = -1;
2602
+ aclnn_muls(ctx, acl_first_half_tensor.get(), scale, nullptr, inplace);
2597
2603
  }
2598
2604
 
2599
2605
  // TODO: n_dims < ne0
2600
2606
  GGML_ASSERT(n_dims == src0->ne[0]);
2601
2607
 
2602
2608
  // input * scale
2603
- ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
2604
- ggml_nbytes(src0));
2605
- void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
2606
- size_t input_nb[GGML_MAX_DIMS];
2609
+ ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0));
2610
+ void * input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
2611
+ size_t input_nb[GGML_MAX_DIMS];
2607
2612
  input_nb[0] = ggml_type_size(src0->type);
2608
2613
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2609
2614
  input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
2610
2615
  }
2611
- aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
2612
- input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
2613
- ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
2614
- aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
2615
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
2616
- ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
2616
+ acl_tensor_ptr acl_input_roll_mul_scale_tensor =
2617
+ ggml_cann_create_tensor(input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
2618
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
2619
+ acl_tensor_ptr acl_input_roll_reshape_tensor =
2620
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
2621
+ src0->ne, input_nb, GGML_MAX_DIMS);
2617
2622
 
2618
- aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
2619
- acl_input_roll_mul_scale_tensor);
2623
+ aclnn_mul(ctx, acl_input_roll_reshape_tensor.get(), acl_minus_one_tensor.get(),
2624
+ acl_input_roll_mul_scale_tensor.get());
2620
2625
 
2621
2626
  // output
2622
- void* output_fp32_buffer;
2627
+ void * output_fp32_buffer;
2623
2628
  if (src0->type == GGML_TYPE_F32) {
2624
- aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor);
2625
- aclnn_mul(ctx, acl_input_roll_mul_scale_tensor,
2626
- acl_sin_reshape_tensor);
2627
- aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
2629
+ aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get());
2630
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get());
2631
+ aclnn_add(ctx, acl_src.get(), acl_input_roll_mul_scale_tensor.get(), acl_dst.get());
2628
2632
  // TODO: ne0 != n_dims in mode2
2629
2633
  } else if (src0->type == GGML_TYPE_F16) {
2630
2634
  size_t input_fp32_nb[GGML_MAX_DIMS];
2631
- input_fp32_nb[0] = sizeof(float_t);
2635
+ input_fp32_nb[0] = sizeof(float);
2632
2636
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
2633
2637
  input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
2634
2638
  }
2635
- ggml_cann_pool_alloc fp32_allocator1(
2636
- ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2637
- void* input_fp32_buffer1 = fp32_allocator1.get();
2638
- aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
2639
- input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
2640
- input_fp32_nb, GGML_MAX_DIMS);
2641
- ggml_cann_pool_alloc fp32_allocator2(
2642
- ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2643
- void* input_fp32_buffer2 = fp32_allocator2.get();
2644
- aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
2645
- input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
2646
- input_fp32_nb, GGML_MAX_DIMS);
2647
-
2648
- ggml_cann_pool_alloc fp32_allocator(
2649
- ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
2650
- output_fp32_buffer = fp32_allocator.get();
2651
- aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
2652
- output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
2653
- input_fp32_nb, GGML_MAX_DIMS);
2654
- aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
2655
- aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
2656
- input_fp32_tensor2);
2657
- aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
2658
- output_fp32_tensor);
2659
- aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
2660
-
2661
- ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2,
2662
- output_fp32_tensor, acl_sin_reshape_tensor,
2663
- acl_minus_one_tensor, acl_input_roll_mul_scale_tensor,
2664
- acl_input_roll_reshape_tensor, acl_src);
2639
+ ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), ggml_nelements(dst) * sizeof(float));
2640
+ void * input_fp32_buffer1 = fp32_allocator1.get();
2641
+ acl_tensor_ptr input_fp32_tensor1 = ggml_cann_create_tensor(input_fp32_buffer1, ACL_FLOAT, sizeof(float),
2642
+ dst->ne, input_fp32_nb, GGML_MAX_DIMS);
2643
+ ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), ggml_nelements(dst) * sizeof(float));
2644
+ void * input_fp32_buffer2 = fp32_allocator2.get();
2645
+ acl_tensor_ptr input_fp32_tensor2 = ggml_cann_create_tensor(input_fp32_buffer2, ACL_FLOAT, sizeof(float),
2646
+ dst->ne, input_fp32_nb, GGML_MAX_DIMS);
2647
+
2648
+ ggml_cann_pool_alloc fp32_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
2649
+ output_fp32_buffer = fp32_allocator.get();
2650
+ acl_tensor_ptr output_fp32_tensor = ggml_cann_create_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float),
2651
+ dst->ne, input_fp32_nb, GGML_MAX_DIMS);
2652
+ aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get(), input_fp32_tensor1.get());
2653
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get(), input_fp32_tensor2.get());
2654
+ aclnn_add(ctx, input_fp32_tensor1.get(), input_fp32_tensor2.get(), output_fp32_tensor.get());
2655
+ aclnn_cast(ctx, output_fp32_tensor.get(), acl_dst.get(), ACL_FLOAT16);
2665
2656
  }
2666
2657
  return;
2667
2658
  #endif
@@ -2670,178 +2661,150 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2670
2661
  int64_t acl_mode = mode == 0 ? 1 : mode;
2671
2662
 
2672
2663
  switch (src0->type) {
2673
- case GGML_TYPE_F32: {
2674
- GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src,
2675
- acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst);
2676
- break;
2677
- }
2678
- case GGML_TYPE_F16: {
2679
- ggml_cann_pool_alloc src_trans_allocator(
2680
- ctx.pool(), ggml_nelements(src0) * sizeof(float));
2681
- void* src_trans_buffer = src_trans_allocator.get();
2682
- ggml_cann_pool_alloc dst_trans_allocator(
2683
- ctx.pool(), ggml_nelements(dst) * sizeof(float));
2684
- void* dst_trans_buffer = dst_trans_allocator.get();
2685
-
2686
- size_t src_trans_nb[GGML_MAX_DIMS];
2687
- src_trans_nb[0] = sizeof(float);
2688
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2689
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
2664
+ case GGML_TYPE_F32:
2665
+ {
2666
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
2667
+ acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
2668
+ break;
2690
2669
  }
2670
+ case GGML_TYPE_F16:
2671
+ {
2672
+ ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float));
2673
+ void * src_trans_buffer = src_trans_allocator.get();
2674
+ ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
2675
+ void * dst_trans_buffer = dst_trans_allocator.get();
2676
+
2677
+ size_t src_trans_nb[GGML_MAX_DIMS];
2678
+ src_trans_nb[0] = sizeof(float);
2679
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
2680
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
2681
+ }
2691
2682
 
2692
- aclTensor* acl_src_trans_tensor = ggml_cann_create_tensor(
2693
- src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb,
2694
- GGML_MAX_DIMS);
2695
- aclTensor* acl_dst_trans_tensor = ggml_cann_create_tensor(
2696
- dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb,
2697
- GGML_MAX_DIMS);
2698
-
2699
- aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);
2683
+ acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor(
2684
+ src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS);
2685
+ acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor(
2686
+ dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS);
2700
2687
 
2701
- GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor,
2702
- acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
2703
- acl_dst_trans_tensor);
2688
+ aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
2704
2689
 
2705
- aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16);
2690
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(),
2691
+ acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode,
2692
+ acl_dst_trans_tensor.get());
2706
2693
 
2707
- ggml_cann_release_resources(ctx, acl_src_trans_tensor,
2708
- acl_dst_trans_tensor);
2709
- break;
2710
- }
2694
+ aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
2695
+ break;
2696
+ }
2711
2697
  default:
2712
2698
  GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
2713
2699
  break;
2714
2700
  }
2715
- ggml_cann_release_resources(ctx, acl_cos_reshape_tensor,
2716
- acl_sin_reshape_tensor, acl_src, acl_dst);
2717
2701
  }
2718
2702
 
2719
-
2720
- void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2703
+ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2721
2704
  ggml_tensor * src0 = dst->src[0];
2722
2705
 
2723
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
2724
- aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
2706
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
2707
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
2725
2708
 
2726
- GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src, 3, false, acl_dst);
2727
-
2728
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
2709
+ GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
2729
2710
  }
2730
2711
 
2731
- void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2712
+ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2732
2713
  ggml_tensor * src0 = dst->src[0];
2733
2714
  ggml_tensor * src1 = dst->src[1];
2734
2715
 
2735
2716
  // stride
2736
- int64_t s0 = ((const int32_t*)(dst->op_params))[0];
2717
+ int64_t s0 = ((const int32_t *) (dst->op_params))[0];
2737
2718
 
2738
- aclTensor* acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
2739
- aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
2740
- aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
2719
+ acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
2720
+ acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
2721
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
2741
2722
 
2742
2723
  int64_t strideVal[1];
2743
- strideVal[0] = s0;
2744
- aclIntArray *stride = aclCreateIntArray(strideVal, 1);
2745
- int64_t paddingVal[] = {0};
2746
- aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
2747
- int64_t dilationVal[] = {1};
2748
- aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
2749
- bool transposed = true;
2750
- int64_t groups = 1;
2751
- int8_t cubeMathType = 0;
2724
+ strideVal[0] = s0;
2725
+ acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
2726
+ int64_t paddingVal[] = { 0 };
2727
+ acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
2728
+ int64_t dilationVal[] = { 1 };
2729
+ acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
2730
+ int8_t cubeMathType = 0;
2752
2731
 
2753
2732
  #ifdef ASCEND_310P
2754
2733
  cubeMathType = 1;
2755
2734
  #endif
2756
2735
 
2757
- GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride,
2758
- padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
2759
-
2760
- ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation);
2736
+ GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), acl_weight.get(), nullptr, stride.get(), padding.get(),
2737
+ dilation.get(), true, padding.get(), 1, acl_dst.get(), cubeMathType);
2761
2738
  }
2762
2739
 
2763
- void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2740
+ void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2764
2741
  ggml_tensor * src0 = dst->src[0];
2765
2742
 
2766
- aclTensor* acl_input = ggml_cann_create_tensor(src0);
2767
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2768
-
2769
- float alphaValue = 1.0f;
2770
- aclScalar* alpha = nullptr;
2771
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2743
+ acl_tensor_ptr acl_input = ggml_cann_create_tensor(src0);
2744
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
2772
2745
 
2773
- GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input, alpha, alpha, alpha,
2774
- acl_dst);
2746
+ float alphaValue = 1.0f;
2747
+ acl_scalar_ptr alpha = nullptr;
2748
+ alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
2775
2749
 
2776
- ggml_cann_release_resources(ctx, acl_input, acl_dst, alpha);
2750
+ GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input.get(), alpha.get(), alpha.get(), alpha.get(), acl_dst.get());
2777
2751
  }
2778
2752
 
2779
- void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2753
+ void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2780
2754
  ggml_tensor * src0 = dst->src[0];
2781
2755
 
2782
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
2783
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2756
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
2757
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
2784
2758
 
2785
- int64_t reduceDimValue[] = {3};
2786
- aclIntArray* reduceDim = aclCreateIntArray(reduceDimValue, 1);
2787
- bool keepDim = true;
2759
+ int64_t reduceDimValue[] = { 3 };
2760
+ acl_int_array_ptr reduceDim = ggml_cann_create_int_array(reduceDimValue, 1);
2761
+ bool keepDim = true;
2788
2762
 
2789
- GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src, reduceDim, keepDim, ACL_FLOAT, acl_dst);
2790
-
2791
- ggml_cann_release_resources(ctx, acl_src, acl_dst, reduceDim);
2763
+ GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src.get(), reduceDim.get(), keepDim, ACL_FLOAT, acl_dst.get());
2792
2764
  }
2793
2765
 
2794
- void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2795
- ggml_tensor * src0 = dst->src[0];
2796
- int32_t *opts = (int32_t *) dst->op_params;
2797
- int64_t paddingsArray[2] = {opts[0], opts[1]};
2798
- aclIntArray* paddings = aclCreateIntArray(paddingsArray, 2);
2766
+ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2767
+ ggml_tensor * src0 = dst->src[0];
2768
+ int32_t * opts = (int32_t *) dst->op_params;
2769
+ int64_t paddingsArray[2] = { opts[0], opts[1] };
2770
+ acl_int_array_ptr paddings = ggml_cann_create_int_array(paddingsArray, 2);
2799
2771
 
2800
2772
  for (int64_t i = 0; i < src0->ne[3]; i++) {
2801
- aclTensor* acl_src = ggml_cann_create_tensor(
2802
- (char*)src0->data + i * src0->ne[3],
2803
- ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
2804
- src0->ne, src0->nb, 3);
2773
+ acl_tensor_ptr acl_src =
2774
+ ggml_cann_create_tensor((char *) src0->data + i * src0->ne[3], ggml_cann_type_mapping(src0->type),
2775
+ ggml_element_size(src0), src0->ne, src0->nb, 3);
2805
2776
 
2806
- aclTensor* acl_dst = ggml_cann_create_tensor(
2807
- (char*)dst->data + i * src0->ne[3],
2808
- ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
2809
- dst->ne, dst->nb, 3);
2777
+ acl_tensor_ptr acl_dst =
2778
+ ggml_cann_create_tensor((char *) dst->data + i * src0->ne[3], ggml_cann_type_mapping(dst->type),
2779
+ ggml_element_size(dst), dst->ne, dst->nb, 3);
2810
2780
 
2811
- GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src, paddings, acl_dst);
2812
-
2813
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
2781
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
2814
2782
  }
2815
- ggml_cann_release_resources(ctx, paddings);
2816
2783
  }
2817
2784
 
2818
- void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2785
+ void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2819
2786
  ggml_tensor * src0 = dst->src[0];
2820
2787
  ggml_tensor * src1 = dst->src[1];
2821
2788
 
2822
- aclTensor* acl_self = ggml_cann_create_tensor(src0);
2823
- aclTensor* acl_other = ggml_cann_create_tensor(src1);
2789
+ acl_tensor_ptr acl_self = ggml_cann_create_tensor(src0);
2790
+ acl_tensor_ptr acl_other = ggml_cann_create_tensor(src1);
2824
2791
 
2825
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self, acl_other);
2792
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self.get(), acl_other.get());
2826
2793
 
2827
2794
  ggml_cann_sum(ctx, dst);
2828
-
2829
- ggml_cann_release_resources(ctx, acl_self, acl_other);
2830
2795
  }
2831
2796
 
2832
- void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2797
+ void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2833
2798
  ggml_tensor * src0 = dst->src[0];
2834
2799
 
2835
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
2836
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
2800
+ acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
2801
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
2837
2802
 
2838
- float alphaValue = 0.0f;
2839
- aclScalar* alpha = nullptr;
2840
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
2803
+ float alphaValue = 0.0f;
2804
+ acl_scalar_ptr alpha = nullptr;
2805
+ alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
2841
2806
 
2842
- GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src, alpha, acl_dst);
2843
-
2844
- ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
2807
+ GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src.get(), alpha.get(), acl_dst.get());
2845
2808
  }
2846
2809
 
2847
2810
  /**
@@ -2862,176 +2825,54 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2862
2825
  * @note This function assumes floating-point data types and is designed for
2863
2826
  * MoE architectures, possibly involving sparse expert routing.
2864
2827
  */
2865
- static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2828
+ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2866
2829
  //dst [M, K, N, 1]
2867
- ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
2868
- ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
2830
+ ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1] -> [D, M, K, 1]
2831
+ ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1 -> [D, 1, K, 1]
2869
2832
  ggml_tensor * ids = dst->src[2]; //ids [K, N]
2870
2833
 
2871
- GGML_TENSOR_BINARY_OP_LOCALS
2834
+ GGML_ASSERT(src0->ne[3] == 1);
2835
+ GGML_ASSERT(src1->ne[3] == 1);
2836
+ GGML_ASSERT(dst->ne[3] == 1);
2872
2837
 
2873
- // copy index from npu to cpu
2874
- int64_t n_as = ne02; // A
2875
- int64_t n_ids = ids->ne[0]; // K
2876
-
2877
- std::vector<char> ids_host(ggml_nbytes(ids));
2878
- ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
2879
- ACL_MEMCPY_DEVICE_TO_HOST);
2880
- ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
2881
-
2882
- char * src0_original = (char *) src0->data;
2883
- char * src1_original = (char *) src1->data;
2884
- char * dst_original = (char *) dst->data;
2885
- size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
2838
+ int64_t batch = src1->ne[2];
2839
+ GGML_ASSERT(batch == ids->ne[1]);
2886
2840
 
2887
- // src0 is F16, src1 is F32, dst is F32
2888
- ggml_cann_pool_alloc src0_cast_allocator;
2889
- if (src0->type == GGML_TYPE_F16) {
2890
- src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
2891
- void* src0_cast_buf = src0_cast_allocator.get();
2841
+ ggml_cann_pool_alloc export_allocator(ctx.pool(), src0->ne[0] * src0->ne[1] * ids->ne[0] * ggml_element_size(src0));
2842
+ void * export_ptr = export_allocator.get();
2843
+ for (int64_t i = 0; i < batch; i++) {
2844
+ acl_tensor_ptr select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]);
2845
+ acl_tensor_ptr export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3);
2892
2846
 
2893
- size_t cast_nb[GGML_MAX_DIMS];
2894
- cast_nb[0] = sizeof(float_t);
2895
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
2896
- cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
2847
+ int64_t select_export_ne[] = { src0->ne[0], src0->ne[1], ids->ne[0] };
2848
+ size_t select_export_nb[3];
2849
+ select_export_nb[0] = src0->nb[0];
2850
+ for (int k = 1; k < 3; k++) {
2851
+ select_export_nb[k] = select_export_nb[k - 1] * select_export_ne[k - 1];
2897
2852
  }
2898
2853
 
2899
- aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
2900
- aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
2901
- ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
2902
- GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
2903
- ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
2854
+ acl_tensor_ptr select_export =
2855
+ ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
2856
+ select_export_ne, select_export_nb, 3);
2857
+ GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, export_weight.get(), 0, select_index.get(), select_export.get());
2904
2858
 
2905
- src0_original = (char *) src0_cast_buf;
2906
- memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
2907
- }
2859
+ int64_t select_transpose_ne[] = { select_export_ne[1], select_export_ne[0], select_export_ne[2] };
2860
+ size_t select_transpose_nb[] = { select_export_nb[1], select_export_nb[0], select_export_nb[2] };
2861
+ acl_tensor_ptr select_export_transpose =
2862
+ ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
2863
+ select_transpose_ne, select_transpose_nb, 3);
2908
2864
 
2909
- #ifdef ASCEND_310P
2910
- ggml_tensor src0_row = *src0;
2911
- ggml_tensor src1_row = *src1;
2912
- ggml_tensor dst_row = *dst;
2865
+ int64_t active_tensor_ne[] = { src1->ne[0], 1, src1->ne[1] };
2866
+ size_t active_tensor_nb[] = { src1->nb[0], src1->nb[1], src1->nb[1] };
2867
+ acl_tensor_ptr active_tensor =
2868
+ ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]);
2913
2869
 
2914
- if (src0->type == GGML_TYPE_F16) {
2915
- src0_row.type = GGML_TYPE_F32;
2916
- }
2917
-
2918
- // src0_row [D, M, 1, 1] weight without permute
2919
- src0_row.ne[2] = 1;
2920
- src0_row.ne[3] = 1;
2921
- src0_row.nb[0] = ori_src0_nb[0];
2922
- src0_row.nb[1] = ori_src0_nb[1];
2923
- src0_row.nb[2] = ori_src0_nb[1];
2924
- src0_row.nb[3] = ori_src0_nb[1];
2870
+ int64_t dst_ne[] = { dst->ne[0], 1, dst->ne[1] };
2871
+ size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[1] };
2872
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]);
2925
2873
 
2926
- // src1_row [D, 1, 1, 1] -> input
2927
- src1_row.ne[1] = 1;
2928
- src1_row.ne[2] = 1;
2929
- src1_row.ne[3] = 1;
2930
- src1_row.nb[2] = nb11;
2931
- src1_row.nb[3] = nb11;
2932
-
2933
- // dst_row [M, 1, 1, 1] -> out
2934
- dst_row.ne[1] = 1;
2935
- dst_row.ne[2] = 1;
2936
- dst_row.ne[3] = 1;
2937
- dst_row.nb[2] = nb1;
2938
- dst_row.nb[3] = nb1;
2939
-
2940
- //create weight for one row
2941
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2942
- for (int64_t id = 0; id < n_ids; id++) {
2943
- // expert index
2944
- int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2945
- GGML_ASSERT(i02 >= 0 && i02 < n_as);
2946
-
2947
- // If B = 1 (broadcast), always use 0; otherwise, use id.
2948
- int64_t i11 = (ne11 == 1 ? 0 : id);
2949
- int64_t i12 = iid1;
2950
-
2951
- int64_t i1 = id;
2952
- int64_t i2 = i12;
2953
-
2954
- void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
2955
- void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
2956
- void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
2957
-
2958
- src0_row.data = src0_tmp_ptr;
2959
- src1_row.data = src1_tmp_ptr;
2960
- dst_row.data = dst_tmp_ptr;
2961
- dst_row.src[0] = &src0_row;
2962
- dst_row.src[1] = &src1_row;
2963
-
2964
- ggml_cann_mul_mat(ctx, &dst_row);
2965
- }
2966
- }
2967
- return;
2968
- #endif
2969
-
2970
- std::vector<aclTensor*> src0_tensor_vec;
2971
- std::vector<aclTensor*> src1_tensor_vec;
2972
- std::vector<aclTensor*> dst_tensor_vec;
2973
- for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2974
- for (int64_t id = 0; id < n_ids; id++) {
2975
- // src0_row [M, D] -> weight && permute
2976
- int64_t src0_ne[2] = {ne01, ne00};
2977
- size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
2978
- // src1_row [D, 1] -> input
2979
- int64_t src1_ne[2] = {ne10, 1};
2980
- size_t src1_nb[2] = {nb10, nb11};
2981
- // dst_row [M, 1] -> out
2982
- int64_t dst_ne[2] = {ne0, 1};
2983
- size_t dst_nb[2] = {nb0, nb1};
2984
-
2985
- // expert index
2986
- int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2987
- GGML_ASSERT(i02 >= 0 && i02 < n_as);
2988
-
2989
- // If B = 1 (broadcast), always use 0; otherwise, use id.
2990
- int64_t i11 = (ne11 == 1 ? 0 : id);
2991
- int64_t i12 = iid1;
2992
-
2993
- int64_t i1 = id;
2994
- int64_t i2 = i12;
2995
-
2996
- void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
2997
- void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
2998
- void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
2999
-
3000
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
3001
- ACL_FLOAT, sizeof(float),
3002
- src0_ne, src0_nb, 2);
3003
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
3004
- ACL_FLOAT, sizeof(float),
3005
- src1_ne, src1_nb, 2);
3006
- aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
3007
- ACL_FLOAT, sizeof(float),
3008
- dst_ne, dst_nb, 2);
3009
-
3010
- src0_tensor_vec.push_back(acl_src0);
3011
- src1_tensor_vec.push_back(acl_src1);
3012
- dst_tensor_vec.push_back(acl_dst);
3013
- }
2874
+ GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, active_tensor.get(), select_export_transpose.get(), acl_dst.get(), 2);
3014
2875
  }
3015
-
3016
- size_t GROUP_SIZE = 128;
3017
- // GroupedMatmulV3 required tensor_list.size < 128
3018
- for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
3019
- // split and call GroupedMatmulV3
3020
- size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
3021
- std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
3022
- std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
3023
- std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
3024
-
3025
- aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
3026
- aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
3027
- aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
3028
-
3029
- GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
3030
- nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
3031
-
3032
- ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
3033
- }
3034
- return;
3035
2876
  }
3036
2877
 
3037
2878
  /**
@@ -3057,7 +2898,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
3057
2898
  * @note This function assumes quantized data types and is designed for
3058
2899
  * MoE architectures with potential sparse expert routing.
3059
2900
  */
3060
- static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2901
+ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3061
2902
  // TODO: Use aclnnGroupedMatMul
3062
2903
  //dst [M, K, N, 1]
3063
2904
  ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
@@ -3067,24 +2908,24 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
3067
2908
  GGML_TENSOR_BINARY_OP_LOCALS
3068
2909
 
3069
2910
  // copy index from npu to cpu
3070
- int64_t n_as = ne02; // A
3071
- int64_t n_ids = ids->ne[0]; // K
2911
+ int64_t n_as = ne02; // A
2912
+ int64_t n_ids = ids->ne[0]; // K
3072
2913
 
3073
2914
  std::vector<char> ids_host(ggml_nbytes(ids));
3074
- ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
3075
- ACL_MEMCPY_DEVICE_TO_HOST);
2915
+ ACL_CHECK(aclrtMemcpyAsync(ids_host.data(), ggml_nbytes(ids), ids->data, ggml_nbytes(ids),
2916
+ ACL_MEMCPY_DEVICE_TO_HOST, ctx.stream()));
3076
2917
  ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
3077
2918
 
3078
2919
  char * src0_original = (char *) src0->data;
3079
2920
  char * src1_original = (char *) src1->data;
3080
- char * dst_original = (char *) dst->data;
2921
+ char * dst_original = (char *) dst->data;
3081
2922
 
3082
2923
  ggml_tensor src0_row = *src0;
3083
2924
  ggml_tensor src1_row = *src1;
3084
- ggml_tensor dst_row = *dst;
2925
+ ggml_tensor dst_row = *dst;
3085
2926
 
3086
2927
  const enum ggml_type type = dst->src[0]->type;
3087
- float weight_elem_size;
2928
+ float weight_elem_size;
3088
2929
  if (type == GGML_TYPE_Q4_0) {
3089
2930
  weight_elem_size = float(sizeof(uint8_t)) / 2;
3090
2931
  } else if (type == GGML_TYPE_Q8_0) {
@@ -3094,18 +2935,18 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
3094
2935
  }
3095
2936
 
3096
2937
  // src0_row [D, M, 1, 1] weight without permute
3097
- src0_row.ne[2] = 1;
3098
- src0_row.ne[3] = 1;
3099
- src0_row.nb[0] = weight_elem_size;
3100
- src0_row.nb[1] = weight_elem_size * ne00;
3101
- src0_row.nb[2] = weight_elem_size * ne00;
3102
- src0_row.nb[3] = weight_elem_size * ne00;
2938
+ src0_row.ne[2] = 1;
2939
+ src0_row.ne[3] = 1;
2940
+ src0_row.nb[0] = weight_elem_size;
2941
+ src0_row.nb[1] = weight_elem_size * ne00;
2942
+ src0_row.nb[2] = weight_elem_size * ne00;
2943
+ src0_row.nb[3] = weight_elem_size * ne00;
3103
2944
  size_t weight_stride = ne00 * ne01 * weight_elem_size;
3104
- size_t weight_size = weight_stride * ne02 * ne03;
2945
+ size_t weight_size = weight_stride * ne02 * ne03;
3105
2946
 
3106
2947
  // scale [D, M, 1, 1] -> scale && permute
3107
2948
  size_t scale_elem_size = sizeof(uint16_t);
3108
- size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
2949
+ size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
3109
2950
 
3110
2951
  // src1_row [D, 1, 1, 1] -> input
3111
2952
  src1_row.ne[1] = 1;
@@ -3123,11 +2964,11 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
3123
2964
 
3124
2965
  //create weight for one row
3125
2966
  ggml_cann_pool_alloc weight_allocator(ctx.pool());
3126
- void* weight_buffer = weight_allocator.alloc(nb02);
2967
+ void * weight_buffer = weight_allocator.alloc(nb02);
3127
2968
  for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
3128
2969
  for (int64_t id = 0; id < n_ids; id++) {
3129
2970
  // expert index
3130
- int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2971
+ int32_t i02 = *(int32_t *) (ids_host.data() + iid1 * ids->nb[1] + id * ids->nb[0]);
3131
2972
  GGML_ASSERT(i02 >= 0 && i02 < n_as);
3132
2973
 
3133
2974
  // If B = 1 (broadcast), always use 0; otherwise, use id.
@@ -3137,21 +2978,21 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
3137
2978
  int64_t i1 = id;
3138
2979
  int64_t i2 = i12;
3139
2980
 
3140
- void* src0_tmp_ptr = src0_original + i02*weight_stride;
3141
- void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
3142
- void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
3143
- void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
2981
+ void * src0_tmp_ptr = src0_original + i02 * weight_stride;
2982
+ void * scale_tmp_ptr = src0_original + weight_size + i02 * scale_stride;
2983
+ void * src1_tmp_ptr = src1_original + i11 * nb11 + i12 * nb12;
2984
+ void * dst_tmp_ptr = dst_original + i1 * nb1 + i2 * nb2;
3144
2985
 
3145
2986
  // mem cpy
3146
- ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
3147
- ACL_MEMCPY_DEVICE_TO_DEVICE);
3148
- void* scale_buffer = (char*)weight_buffer + weight_stride;
3149
- ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
3150
- ACL_MEMCPY_DEVICE_TO_DEVICE);
3151
-
3152
- src0_row.data = weight_buffer;
3153
- src1_row.data = src1_tmp_ptr;
3154
- dst_row.data = dst_tmp_ptr;
2987
+ ACL_CHECK(aclrtMemcpyAsync(weight_buffer, weight_stride, src0_tmp_ptr, weight_stride,
2988
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
2989
+ void * scale_buffer = (char *) weight_buffer + weight_stride;
2990
+ ACL_CHECK(aclrtMemcpyAsync(scale_buffer, scale_stride, scale_tmp_ptr, scale_stride,
2991
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
2992
+
2993
+ src0_row.data = weight_buffer;
2994
+ src1_row.data = src1_tmp_ptr;
2995
+ dst_row.data = dst_tmp_ptr;
3155
2996
  dst_row.src[0] = &src0_row;
3156
2997
  dst_row.src[1] = &src1_row;
3157
2998
 
@@ -3161,7 +3002,7 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
3161
3002
  return;
3162
3003
  }
3163
3004
 
3164
- void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
3005
+ void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3165
3006
  const enum ggml_type type = dst->src[0]->type;
3166
3007
  switch (type) {
3167
3008
  case GGML_TYPE_F32:
@@ -3178,12 +3019,11 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
3178
3019
  }
3179
3020
  }
3180
3021
 
3181
- void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
3182
-
3183
- ggml_tensor* src0 = dst->src[0]; // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont)
3184
- ggml_tensor* src1 = dst->src[1]; // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
3185
- ggml_tensor* src2 = dst->src[2]; // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
3186
- ggml_tensor* src3 = dst->src[3]; // mask, fp16
3022
+ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3023
+ ggml_tensor * src0 = dst->src[0]; // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont)
3024
+ ggml_tensor * src1 = dst->src[1]; // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
3025
+ ggml_tensor * src2 = dst->src[2]; // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
3026
+ ggml_tensor * src3 = dst->src[3]; // mask, fp16
3187
3027
 
3188
3028
  // B, N, S, D (uncont) -> B, S, N, D (cont)
3189
3029
  int64_t src0_bsnd_ne[GGML_MAX_DIMS];
@@ -3199,229 +3039,200 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
3199
3039
  size_t src2_bsnd_nb[GGML_MAX_DIMS];
3200
3040
  memcpy(src2_bsnd_nb, src2->nb, GGML_MAX_DIMS * sizeof(size_t));
3201
3041
 
3202
- auto transpose12 = [](int64_t* ne, size_t* nb) {
3042
+ auto transpose12 = [](int64_t * ne, size_t * nb) {
3203
3043
  int64_t ne_tmp = ne[1];
3204
3044
  size_t nb_tmp = nb[1];
3205
- ne[1] = ne[2];
3206
- nb[1] = nb[2];
3207
- ne[2] = ne_tmp;
3208
- nb[2] = nb_tmp;
3045
+ ne[1] = ne[2];
3046
+ nb[1] = nb[2];
3047
+ ne[2] = ne_tmp;
3048
+ nb[2] = nb_tmp;
3209
3049
  };
3210
3050
 
3211
3051
  transpose12(src0_bsnd_ne, src0_bsnd_nb);
3212
3052
  transpose12(src1_bsnd_ne, src1_bsnd_nb);
3213
3053
  transpose12(src2_bsnd_ne, src2_bsnd_nb);
3214
3054
 
3215
- float maxBias = 0.0f;
3216
- float scaleValue = 1.0f;
3055
+ float maxBias = 0.0f;
3056
+ float scaleValue = 1.0f;
3217
3057
  float logitSoftcap = 0.0f;
3218
- memcpy(&scaleValue, (float*)dst->op_params + 0, sizeof(float));
3219
- memcpy(&maxBias, (float*)dst->op_params + 1, sizeof(float));
3220
- memcpy(&logitSoftcap, (float*)dst->op_params + 2, sizeof(float));
3058
+ memcpy(&scaleValue, (float *) dst->op_params + 0, sizeof(float));
3059
+ memcpy(&maxBias, (float *) dst->op_params + 1, sizeof(float));
3060
+ memcpy(&logitSoftcap, (float *) dst->op_params + 2, sizeof(float));
3221
3061
 
3222
- if(logitSoftcap == 0.0f){
3062
+ if (logitSoftcap == 0.0f) {
3223
3063
  size_t faElemSize = sizeof(uint16_t);
3224
- auto faDataType = ACL_FLOAT16; //ACL_BF16;
3064
+ auto faDataType = ACL_FLOAT16; //ACL_BF16;
3225
3065
 
3226
- aclTensor* acl_src0_f16_tensor = nullptr;
3227
- aclTensor* acl_src1_f16_tensor = nullptr;
3228
- aclTensor* acl_src2_f16_tensor = nullptr;
3229
- aclTensor* acl_dst_f16_tensor = nullptr;
3066
+ acl_tensor_ptr acl_q_tensor = nullptr;
3067
+ acl_tensor_ptr acl_k_tensor = nullptr;
3068
+ acl_tensor_ptr acl_v_tensor = nullptr;
3230
3069
 
3231
3070
  // Step 1: cast the src0 (Query) to fp16 if needed
3232
3071
  ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
3233
- void* src0_f16_buffer = nullptr;
3072
+ void * src0_f16_buffer = nullptr;
3234
3073
 
3235
- if(ggml_cann_type_mapping(src0->type) != faDataType){
3236
- aclTensor* acl_src0_f32_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne,
3237
- src0_bsnd_nb, GGML_MAX_DIMS);
3238
- src0_f16_buffer = src0_f16_allocator.alloc(
3239
- ggml_nelements(src0) * faElemSize);
3074
+ if (ggml_cann_type_mapping(src0->type) != faDataType) {
3075
+ acl_tensor_ptr acl_src0_f32_tensor =
3076
+ ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
3077
+ src0_f16_buffer = src0_f16_allocator.alloc(ggml_nelements(src0) * faElemSize);
3240
3078
 
3241
- int64_t* src0_f16_ne = src0_bsnd_ne;
3242
- size_t src0_f16_nb[GGML_MAX_DIMS];
3079
+ int64_t * src0_f16_ne = src0_bsnd_ne;
3080
+ size_t src0_f16_nb[GGML_MAX_DIMS];
3243
3081
  src0_f16_nb[0] = sizeof(uint16_t);
3244
- for(int i = 1; i < GGML_MAX_DIMS; ++i){
3082
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
3245
3083
  src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
3246
3084
  }
3247
3085
 
3248
- acl_src0_f16_tensor = ggml_cann_create_tensor(
3249
- src0_f16_buffer, faDataType, faElemSize,
3250
- src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS
3251
- );
3252
- aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, faDataType);
3253
- ggml_cann_release_resources(ctx, acl_src0_f32_tensor);
3254
- }else{
3255
- acl_src0_f16_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne,
3256
- src0_bsnd_nb, GGML_MAX_DIMS);
3086
+ acl_q_tensor = ggml_cann_create_tensor(src0_f16_buffer, faDataType, faElemSize, src0_f16_ne, src0_f16_nb,
3087
+ GGML_MAX_DIMS);
3088
+ aclnn_cast(ctx, acl_src0_f32_tensor.get(), acl_q_tensor.get(), faDataType);
3089
+ } else {
3090
+ acl_q_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
3257
3091
  }
3258
3092
 
3259
3093
  // Step 2: create the acl tensors for src1 (Key), src2 (Value),
3260
3094
  // and the direct output from FusedInferAttention
3261
3095
 
3262
- acl_src1_f16_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne,
3263
- src1_bsnd_nb, GGML_MAX_DIMS);
3264
- acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
3265
- src2_bsnd_nb, GGML_MAX_DIMS);
3266
-
3267
- ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
3268
- void* out_f16_buffer = out_f16_allocator.alloc(
3269
- ggml_nelements(dst) * faElemSize);
3270
-
3271
- int64_t* out_f16_ne = src0_bsnd_ne;
3272
- size_t out_f16_nb[GGML_MAX_DIMS];
3273
- out_f16_nb[0] = faElemSize;
3274
- for(int i = 1; i < GGML_MAX_DIMS; ++i){
3275
- out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
3276
- }
3277
-
3278
- acl_dst_f16_tensor = ggml_cann_create_tensor(
3279
- out_f16_buffer, faDataType, faElemSize,
3280
- out_f16_ne, out_f16_nb, GGML_MAX_DIMS
3281
- );
3096
+ acl_k_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
3097
+ acl_v_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
3282
3098
 
3283
3099
  // Step 3: create the PSEShift tensor if needed
3284
3100
  // this tensor is considered as mask (f16) in the llama.cpp
3285
- aclTensor* bcast_pse_tensor = nullptr;
3101
+ acl_tensor_ptr bcast_pse_tensor;
3286
3102
  ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
3287
- if(src3 != nullptr){
3103
+ if (src3 != nullptr) {
3288
3104
  // Construct the truncated pse tensor (common for prefill/decode)
3289
3105
  int64_t trunc_pse_ne[GGML_MAX_DIMS] = {
3290
- src3->ne[0], // D
3291
- src0->ne[1], // S (number of Q tokens)
3292
- src3->ne[2], // mask N
3293
- src3->ne[3] // B
3106
+ src3->ne[0], // D
3107
+ src0->ne[1], // S (number of Q tokens)
3108
+ src3->ne[2], // mask N
3109
+ src3->ne[3] // B
3294
3110
  };
3295
- size_t* trunc_pse_nb = src3->nb;
3111
+ size_t * trunc_pse_nb = src3->nb;
3296
3112
 
3297
- aclTensor* acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
3298
- src3->data, ACL_FLOAT16, sizeof(uint16_t),
3299
- trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS
3300
- );
3113
+ acl_tensor_ptr acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
3114
+ src3->data, ACL_FLOAT16, sizeof(uint16_t), trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);
3301
3115
 
3302
3116
  int64_t bcast_pse_ne[GGML_MAX_DIMS];
3303
- size_t bcast_pse_nb[GGML_MAX_DIMS];
3304
- bcast_pse_ne[0] = src3->ne[0]; // D
3305
- bcast_pse_ne[1] = src0->ne[1]; // S
3306
- bcast_pse_ne[2] = src0->ne[2]; // N (num_heads)
3307
- bcast_pse_ne[3] = src3->ne[3]; // B
3117
+ size_t bcast_pse_nb[GGML_MAX_DIMS];
3118
+ bcast_pse_ne[0] = src3->ne[0]; // D
3119
+ bcast_pse_ne[1] = src0->ne[1]; // S
3120
+ bcast_pse_ne[2] = src0->ne[2]; // N (num_heads)
3121
+ bcast_pse_ne[3] = src3->ne[3]; // B
3308
3122
  if (maxBias == 0.0f) {
3309
3123
  // When maxBias == 0.0f, use nb = 0 reduce once repeat (Qwen2)
3310
3124
  // Construct the bcast tensor (simulate repeat on the head dimension using stride=0)
3311
3125
  bcast_pse_nb[0] = sizeof(uint16_t);
3312
3126
  bcast_pse_nb[1] = bcast_pse_nb[0] * bcast_pse_ne[0];
3313
- bcast_pse_nb[2] = 0; // <---- the head dimension shares the same data
3127
+ bcast_pse_nb[2] = 0; // <---- the head dimension shares the same data
3314
3128
  bcast_pse_nb[3] = src3->nb[3];
3315
3129
 
3316
- bcast_pse_tensor = ggml_cann_create_tensor(
3317
- src3->data, ACL_FLOAT16, sizeof(uint16_t),
3318
- bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS
3319
- );
3130
+ bcast_pse_tensor = ggml_cann_create_tensor(src3->data, ACL_FLOAT16, sizeof(uint16_t), bcast_pse_ne,
3131
+ bcast_pse_nb, GGML_MAX_DIMS);
3320
3132
 
3321
- ggml_cann_release_resources(ctx, acl_mask_f16_trunc_tensor);
3322
3133
  } else {
3323
3134
  bcast_pse_nb[0] = sizeof(uint16_t);
3324
3135
  for (int i = 1; i < GGML_MAX_DIMS; i++) {
3325
3136
  bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
3326
3137
  }
3327
3138
 
3328
- void* bcast_pse_buffer = bcast_pse_allocator.alloc(
3329
- ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t)
3330
- );
3139
+ void * bcast_pse_buffer =
3140
+ bcast_pse_allocator.alloc(ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));
3331
3141
 
3332
- bcast_pse_tensor = ggml_cann_create_tensor(
3333
- bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
3334
- bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS
3335
- );
3142
+ bcast_pse_tensor = ggml_cann_create_tensor(bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
3143
+ bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
3336
3144
 
3337
- int64_t repeats[] = {1, src0->ne[2], 1, 1};
3338
- aclnn_repeat(ctx, acl_mask_f16_trunc_tensor, bcast_pse_tensor, repeats);
3145
+ int64_t repeats[] = { 1, src0->ne[2], 1, 1 };
3146
+ aclnn_repeat(ctx, acl_mask_f16_trunc_tensor.get(), bcast_pse_tensor.get(), repeats);
3339
3147
 
3340
3148
  // alibi
3341
3149
  // Compute the slope if needed. Derived from ggml_cann_softmax().
3342
- const int64_t n_heads = src0->ne[2];
3150
+ const int64_t n_heads = src0->ne[2];
3343
3151
  ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
3344
- void* slope_buffer = slope_allocator.get();
3345
- aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
3152
+ void * slope_buffer = slope_allocator.get();
3153
+ aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
3346
3154
 
3347
- int64_t slope_ne[] = {1, 1, n_heads, 1};
3348
- size_t slope_nb[GGML_MAX_DIMS];
3155
+ int64_t slope_ne[] = { 1, 1, n_heads, 1 };
3156
+ size_t slope_nb[GGML_MAX_DIMS];
3349
3157
  slope_nb[0] = sizeof(uint16_t);
3350
- for(int i = 1;i<GGML_MAX_DIMS;i++) {
3351
- slope_nb[i] = slope_nb[i-1] * slope_ne[0];
3158
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3159
+ slope_nb[i] = slope_nb[i - 1] * slope_ne[0];
3352
3160
  }
3353
3161
 
3354
- aclTensor* slope_tensor = ggml_cann_create_tensor(
3355
- slope_buffer, ACL_FLOAT16, sizeof(uint16_t),
3356
- slope_ne, slope_nb, GGML_MAX_DIMS);
3357
- GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);
3358
-
3359
- ggml_cann_release_resources(ctx, slope_tensor, acl_mask_f16_trunc_tensor);
3162
+ acl_tensor_ptr slope_tensor = ggml_cann_create_tensor(slope_buffer, ACL_FLOAT16, sizeof(uint16_t),
3163
+ slope_ne, slope_nb, GGML_MAX_DIMS);
3164
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor.get(), slope_tensor.get());
3360
3165
  }
3361
3166
  }
3362
3167
 
3363
3168
  // Step 4: set the inputs for FusedInferAttention.
3364
- int kvTensorNum = 1;
3365
- aclTensor* acl_q_tensor = acl_src0_f16_tensor;
3366
- aclTensor* acl_k_tensors[] = {acl_src1_f16_tensor};
3367
- aclTensor* acl_v_tensors[] = {acl_src2_f16_tensor};
3368
- auto acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum);
3369
- auto acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum);
3370
-
3371
- int64_t numHeads = src0->ne[2]; // N
3372
- int64_t numKeyValueHeads = src1->ne[2];
3169
+ acl_tensor_list_ptr acl_k_tensor_list = ggml_cann_create_tensor_list(acl_k_tensor);
3170
+ acl_tensor_list_ptr acl_v_tensor_list = ggml_cann_create_tensor_list(acl_v_tensor);
3171
+
3172
+ int64_t numHeads = src0->ne[2]; // N
3173
+ int64_t numKeyValueHeads = src1->ne[2];
3373
3174
  // double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
3374
- int64_t preTokens = 65535;
3375
- int64_t nextTokens = 65535;
3376
- char layout[5] = {'B', 'S', 'N', 'D', 0};
3377
- int64_t sparseMode = 0;
3378
- int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
3379
- int64_t blockSize = 0;
3380
- int64_t antiquantMode = 0;
3381
- bool softmaxLseFlag = false;
3382
- int64_t keyAntiquantMode = 0;
3175
+ int64_t preTokens = 65535;
3176
+ int64_t nextTokens = 65535;
3177
+ char layout[5] = { 'B', 'S', 'N', 'D', 0 };
3178
+ int64_t sparseMode = 0;
3179
+ int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
3180
+ int64_t blockSize = 0;
3181
+ int64_t antiquantMode = 0;
3182
+ bool softmaxLseFlag = false;
3183
+ int64_t keyAntiquantMode = 0;
3383
3184
  int64_t valueAntiquantMode = 0;
3384
3185
 
3385
- // Step 5: launch the FusedInferAttentionScoreV2 kernel.
3386
- // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
3387
-
3388
- GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
3389
- acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
3390
- bcast_pse_tensor, nullptr, // pse, mask
3391
- nullptr, nullptr, // actSeqLen, actSeqLenkv
3392
- nullptr, nullptr, // deqScale1, quantScale1
3393
- nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
3394
- nullptr, nullptr, // antiquantScale, antiquantOffset
3395
- nullptr, // blockTable
3396
- nullptr, nullptr, // qPadSize, kvPadSize
3397
- nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
3398
- nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
3399
- nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
3400
- numHeads, scaleValue, // heads, scaleValue
3401
- preTokens, nextTokens, // preTokens, nextTokens
3402
- layout, // inputLayout
3403
- numKeyValueHeads, // numKVHeads
3404
- sparseMode, innerPrecise, // sparseMode, innerPrecise
3405
- blockSize, antiquantMode, // blockSize, antiquantMode
3406
- softmaxLseFlag, // softmaxLseFlag
3407
- keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
3408
- acl_dst_f16_tensor, // attentionOut
3409
- nullptr // softmaxLse
3186
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
3187
+ acl_tensor_ptr fa_dst_tensor;
3188
+ acl_tensor_ptr acl_dst_tensor;
3189
+ ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
3190
+ if (dst->type == GGML_TYPE_F32) {
3191
+ void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
3192
+
3193
+ int64_t * out_f16_ne = src0_bsnd_ne;
3194
+ size_t out_f16_nb[GGML_MAX_DIMS];
3195
+ out_f16_nb[0] = faElemSize;
3196
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
3197
+ out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
3198
+ }
3199
+
3200
+ fa_dst_tensor =
3201
+ ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
3202
+ } else {
3203
+ fa_dst_tensor = ggml_cann_create_tensor(dst);
3204
+ }
3205
+
3206
+ GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2, acl_q_tensor.get(), acl_k_tensor_list.get(),
3207
+ acl_v_tensor_list.get(), // q, k, v
3208
+ bcast_pse_tensor.get(), nullptr, // pse, mask
3209
+ nullptr, nullptr, // actSeqLen, actSeqLenkv
3210
+ nullptr, nullptr, // deqScale1, quantScale1
3211
+ nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
3212
+ nullptr, nullptr, // antiquantScale, antiquantOffset
3213
+ nullptr, // blockTable
3214
+ nullptr, nullptr, // qPadSize, kvPadSize
3215
+ nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
3216
+ nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
3217
+ nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
3218
+ numHeads, scaleValue, // heads, scaleValue
3219
+ preTokens, nextTokens, // preTokens, nextTokens
3220
+ layout, // inputLayout
3221
+ numKeyValueHeads, // numKVHeads
3222
+ sparseMode, innerPrecise, // sparseMode, innerPrecise
3223
+ blockSize, antiquantMode, // blockSize, antiquantMode
3224
+ softmaxLseFlag, // softmaxLseFlag
3225
+ keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
3226
+ fa_dst_tensor.get(), // attentionOut
3227
+ nullptr // softmaxLse
3410
3228
  );
3411
3229
 
3412
- // Step 6: post-processing, permute and cast to f32
3413
- aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
3414
- // TODO: when dst is fp16, don't need cast
3415
- aclnn_cast(ctx, acl_dst_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
3416
- ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
3417
- acl_src1_f16_tensor,
3418
- acl_src2_f16_tensor,
3419
- acl_dst_f16_tensor,
3420
- acl_dst_tensor);
3421
- if(src3 != nullptr){
3422
- ggml_cann_release_resources(ctx, bcast_pse_tensor);
3230
+ if (dst->type == GGML_TYPE_F32) {
3231
+ // Step 6: post-processing, permute and cast to f32
3232
+ acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
3233
+ aclnn_cast(ctx, fa_dst_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
3423
3234
  }
3424
- }else{
3235
+ } else {
3425
3236
  GGML_ABORT("Function is not implemented.");
3426
3237
  }
3427
3238
  }