@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (976)
  1. package/RNLlamaCpp.podspec +3 -0
  2. package/android/CMakeLists.txt +2 -0
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  22. package/cpp/llama.cpp/CODEOWNERS +116 -10
  23. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  24. package/cpp/llama.cpp/README.md +13 -5
  25. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  26. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  27. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  28. package/cpp/llama.cpp/common/arg.cpp +303 -795
  29. package/cpp/llama.cpp/common/arg.h +2 -3
  30. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  31. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  32. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  33. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  34. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  35. package/cpp/llama.cpp/common/chat.h +16 -3
  36. package/cpp/llama.cpp/common/common.cpp +70 -15
  37. package/cpp/llama.cpp/common/common.h +57 -19
  38. package/cpp/llama.cpp/common/download.cpp +1072 -0
  39. package/cpp/llama.cpp/common/download.h +55 -0
  40. package/cpp/llama.cpp/common/http.h +73 -0
  41. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  42. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  43. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  44. package/cpp/llama.cpp/common/log.cpp +59 -2
  45. package/cpp/llama.cpp/common/log.h +12 -4
  46. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  47. package/cpp/llama.cpp/common/sampling.h +3 -1
  48. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  49. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  50. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  51. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  52. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  53. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  54. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  55. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  56. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  57. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  58. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  59. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  60. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  62. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  64. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  65. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  68. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  211. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  212. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  239. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  240. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  241. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  253. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  254. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  255. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  278. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  279. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  280. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  320. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  321. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  470. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  471. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  490. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  495. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  496. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  497. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  498. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  499. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  500. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  502. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  504. package/cpp/llama.cpp/include/llama.h +44 -21
  505. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  506. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  507. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  509. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  510. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  511. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  512. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  513. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  514. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  515. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  516. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  517. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  518. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  519. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  520. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  521. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  522. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  523. package/cpp/llama.cpp/src/llama-context.h +16 -6
  524. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  525. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  526. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  527. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  528. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  529. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  530. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  531. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  532. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  533. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  535. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  536. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  537. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  538. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  539. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  540. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  541. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  542. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  543. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  544. package/cpp/llama.cpp/src/llama-model.h +40 -4
  545. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  546. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  547. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  548. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  549. package/cpp/llama.cpp/src/llama.cpp +69 -10
  550. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  551. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  552. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  553. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  554. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  555. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  556. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  557. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  558. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  559. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  560. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  561. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  562. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  563. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  564. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  565. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  566. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  567. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  568. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  569. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  570. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  571. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  572. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  573. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  574. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  575. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  576. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  577. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  578. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  579. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  580. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  581. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  582. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  583. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  584. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  585. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  586. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  587. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  588. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  589. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  590. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  591. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  592. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  593. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  594. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  595. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  596. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  597. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  598. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  599. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  600. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  601. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  602. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  603. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  604. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  605. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  606. package/cpp/llama.cpp/src/models/models.h +485 -0
  607. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  608. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  609. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  610. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  611. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  612. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  613. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  614. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  615. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  617. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  618. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  619. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  620. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  621. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  622. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  623. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  624. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  625. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  626. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  628. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  629. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  630. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  631. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  632. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  633. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  635. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  636. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  637. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  638. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  639. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  640. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  641. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  642. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  643. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  644. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  645. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  646. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  647. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  648. package/cpp/llama.cpp/src/unicode.h +43 -0
  649. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  650. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  652. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  653. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  654. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  655. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  656. package/ios/include/chat.h +16 -3
  657. package/ios/include/common/minja/chat-template.hpp +9 -2
  658. package/ios/include/common/minja/minja.hpp +101 -22
  659. package/ios/include/common.h +57 -19
  660. package/ios/include/json-schema-to-grammar.h +2 -0
  661. package/ios/include/llama.h +44 -21
  662. package/ios/include/log.h +12 -4
  663. package/ios/include/sampling.h +3 -1
  664. package/ios/libs/llama.xcframework/Info.plist +20 -20
  665. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  666. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  667. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  668. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  669. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  673. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  674. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  675. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  682. package/package.json +10 -4
  683. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  684. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  685. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  686. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  777. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  778. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  779. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  780. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  781. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  782. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  783. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  784. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  785. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  786. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  787. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  804. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  805. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  814. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  815. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  816. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  826. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  827. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  828. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  829. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  830. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  831. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  832. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  833. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  834. package/cpp/llama.cpp/models/templates/README.md +0 -25
  835. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  836. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  837. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  838. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  839. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  840. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  841. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  842. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  843. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  844. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  845. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  846. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  847. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  848. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  849. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  850. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  851. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  852. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  853. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  854. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  855. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  856. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  857. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  858. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  859. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  861. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  862. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  863. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  864. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  865. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  866. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  867. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  868. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  906. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  907. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  908. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  909. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  910. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  911. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  921. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  922. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  923. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  937. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  938. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  939. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  940. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  941. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  942. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  952. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  953. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  954. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  968. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  969. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  970. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -0,0 +1,3807 @@
1
+ #include <assert.h>
2
+ #include <inttypes.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <time.h>
7
+
8
+ #include <atomic>
9
+ #include <chrono>
10
+ #include <mutex>
11
+ #include <string>
12
+
13
+ #ifdef _WIN32
14
+ # include <sal.h>
15
+ # ifndef _WINDOWS
16
+ # define _WINDOWS
17
+ # endif
18
+ #else
19
+ # include <semaphore.h>
20
+ # include <unistd.h>
21
+ #endif
22
+
23
+ #pragma clang diagnostic ignored "-Wnested-anon-types"
24
+ #pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
25
+
26
+ #include "htp-utils.h"
27
+
28
+ #include <AEEStdErr.h>
29
+ #include <dspqueue.h>
30
+ #include <rpcmem.h>
31
+
32
+ #define GGML_COMMON_IMPL_CPP
33
+ #include "ggml-backend-impl.h"
34
+ #include "ggml-common.h"
35
+ #include "ggml-hexagon.h"
36
+ #include "ggml-impl.h"
37
+ #include "ggml-quants.h"
38
+ #include "htp-msg.h"
39
+ #include "htp_iface.h"
40
+
41
// ** backend configuration knobs (set during backend init — not visible in
// this chunk — and treated as read-only afterwards)

static size_t opt_ndev = 1;         // number of devices/sessions to create
static size_t opt_nhvx = 0;         // use all (0 = let the DSP pick the HVX thread count)
static int    opt_arch = 0;         // autodetect (hexagon arch version)
static int    opt_etm  = 0;         // NOTE(review): presumably ETM trace enable — confirm
static int    opt_verbose = 0;      // gates HEX_VERBOSE debug logging
static int    opt_profile = 0;      // gates HEX_PROFILE logging / per-op profiling
static int    opt_hostbuf = 1;      // host-buffer usage; semantics defined elsewhere in this file
static int    opt_experimental = 0; // gate for experimental code paths

// Enable all stages by default
static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE;
static int opt_opsync = 0; // synchronous ops
53
+
54
// Logging helpers gated at runtime by opt_verbose / opt_profile.
// Wrapped in do { } while (0) so each macro behaves as a single statement:
// the original bare `if (...) ...` form silently captures a following `else`
// (dangling-else) when used inside an unbraced if/else at a call site.
#define HEX_VERBOSE(...) \
    do { \
        if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__); \
    } while (0)

#define HEX_PROFILE(...) \
    do { \
        if (opt_profile) GGML_LOG_INFO(__VA_ARGS__); \
    } while (0)
59
+
60
// Returns nonzero iff addr is aligned to `align` bytes.
// `align` must be a power of two (mask trick relies on it).
static inline uint64_t hex_is_aligned(void * addr, uint32_t align) {
    const size_t a    = (size_t) addr;
    const size_t mask = (size_t) align - 1;
    return (a & mask) == 0;
}
63
+
64
// Rounds n up to the nearest multiple of m (m must be nonzero).
static inline size_t hex_round_up(size_t n, size_t m) {
    const size_t rem = n % m;
    return rem ? n + (m - rem) : n;
}
67
+
68
+ static const char * status_to_str(uint32_t status) {
69
+ switch (status) {
70
+ case HTP_STATUS_OK:
71
+ return "OK";
72
+ case HTP_STATUS_NO_SUPPORT:
73
+ return "NO-SUPPORT";
74
+ case HTP_STATUS_INVAL_PARAMS:
75
+ return "INVAL-PARAMS";
76
+ case HTP_STATUS_VTCM_TOO_SMALL:
77
+ return "VTCM-TOO-SMALL";
78
+ case HTP_STATUS_INTERNAL_ERR:
79
+ return "INTERNAL-ERROR";
80
+ default:
81
+ return "UNKNOWN";
82
+ }
83
+ }
84
+
85
+ // ** debug helpers
86
+
87
+ static inline int hex_format_tensor_dims(char * str, const struct ggml_tensor * t) {
88
+ if (t->ne[2] == 1 && t->ne[3] == 1) {
89
+ return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
90
+ } else {
91
+ return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
92
+ }
93
+ }
94
+
95
+ static inline void hex_format_op_dims(char * str, const struct ggml_tensor * t) {
96
+ char * p = str;
97
+
98
+ // append src0 and src1 (if any)
99
+ if (t->src[0]) {
100
+ p += hex_format_tensor_dims(p, t->src[0]);
101
+
102
+ for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
103
+ p += sprintf(p, " x ");
104
+ p += hex_format_tensor_dims(p, t->src[i]);
105
+ }
106
+
107
+ p += sprintf(p, " -> ");
108
+ }
109
+
110
+ // format self dims separately for better visual alignment
111
+ char self[64];
112
+ hex_format_tensor_dims(self, t);
113
+
114
+ p += sprintf(p, "%s", self);
115
+ }
116
+
117
+ static inline int hex_format_tensor_strides(char * str, const struct ggml_tensor * t) {
118
+ const char * c = ggml_is_contiguous(t) ? "" : "!";
119
+
120
+ if (t->ne[2] == 1 && t->ne[3] == 1) {
121
+ return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
122
+ } else {
123
+ return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2],
124
+ (size_t) t->nb[3], c);
125
+ }
126
+ }
127
+
128
+ static inline void hex_format_op_strides(char * str, const struct ggml_tensor * t) {
129
+ char * p = str;
130
+
131
+ // append src0 and src1 (if any)
132
+ if (t->src[0]) {
133
+ p += hex_format_tensor_strides(p, t->src[0]);
134
+
135
+ for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
136
+ p += sprintf(p, " x ");
137
+ p += hex_format_tensor_strides(p, t->src[i]);
138
+ }
139
+
140
+ p += sprintf(p, " -> ");
141
+ }
142
+
143
+ // format self dims separately for better visual alignment
144
+ char self[64];
145
+ hex_format_tensor_strides(self, t);
146
+
147
+ p += sprintf(p, "%s", self);
148
+ }
149
+
150
+ static inline void hex_format_op_types(char * str, const struct ggml_tensor * t) {
151
+ char * p = str;
152
+
153
+ // append src0 and src1 (if any)
154
+ if (t->src[0]) {
155
+ p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
156
+
157
+ for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
158
+ p += sprintf(p, " x ");
159
+ p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
160
+ }
161
+
162
+ p += sprintf(p, " -> ");
163
+ }
164
+
165
+ p += sprintf(p, "%s", ggml_type_name(t->type));
166
+ }
167
+
168
+ static inline const char * hex_tensor_buff_name(const struct ggml_tensor * t) {
169
+ if (t->buffer) {
170
+ return ggml_backend_buffer_name(t->buffer);
171
+ }
172
+ return "NONE";
173
+ }
174
+
175
+ static inline void hex_format_op_buffs(char * str, const struct ggml_tensor * t) {
176
+ char * p = str;
177
+
178
+ // append src0 and src1 (if any)
179
+ if (t->src[0]) {
180
+ p += sprintf(p, "%s", hex_tensor_buff_name(t->src[0]));
181
+
182
+ for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
183
+ p += sprintf(p, " x ");
184
+ p += sprintf(p, "%s", hex_tensor_buff_name(t->src[i]));
185
+ }
186
+
187
+ p += sprintf(p, " -> ");
188
+ }
189
+
190
+ p += sprintf(p, "%s", hex_tensor_buff_name(t));
191
+ }
192
+
193
+ static inline void hex_format_op_names(char * str, const struct ggml_tensor * t) {
194
+ char * p = str;
195
+
196
+ // append src0 and src1 (if any)
197
+ if (t->src[0]) {
198
+ p += sprintf(p, "%s", t->src[0]->name);
199
+
200
+ for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
201
+ p += sprintf(p, " x ");
202
+ p += sprintf(p, "%s", t->src[i]->name);
203
+ }
204
+
205
+ p += sprintf(p, " -> ");
206
+ }
207
+
208
+ p += sprintf(p, "%s", t->name);
209
+ }
210
+
211
+ // ** backend sessions
212
+
213
// ** backend sessions

// Per-device session state: FastRPC handle, DSP request/response queue, and
// the two ggml buffer types (plain + repacked) served by this device.
// (ctor/dtor and allocate/release are defined elsewhere in this file.)
struct ggml_hexagon_session {
    ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
    ~ggml_hexagon_session() noexcept(true);

    // Two-phase setup/teardown used by ctor/dtor; allocate() may throw.
    void allocate(int dev_id) noexcept(false);
    void release() noexcept(true);

    // Submit a request packet to the DSP queue; sync=true waits for completion.
    void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
    // Drain the response queue until no requests remain in flight.
    void flush();

    ggml_backend_buffer_type buffer_type;        // regular device buffers
    ggml_backend_buffer_type repack_buffer_type; // buffers holding repacked weights

    std::string name;  // session label, used as a prefix in log messages
    remote_handle64 handle; // FastRPC session handle
    dspqueue_t queue;       // DSP request/response queue
    uint32_t session_id;
    uint32_t domain_id;     // FastRPC domain (used for buffer mmap)
    uint64_t queue_id;
    int dev_id;
    // Which resources were successfully created — presumably consulted by
    // release() (defined elsewhere) to tear down only what exists.
    bool valid_session;
    bool valid_handle;
    bool valid_queue;
    bool valid_iface;
    std::atomic<int> op_pending; // requests in flight (inc in enqueue, dec in flush)
    // Profiling data copied from the last response (see flush();
    // per the TODO there, only meaningful in synchronous mode).
    uint32_t prof_usecs;
    uint32_t prof_cycles;
    uint32_t prof_pkts;
};
242
+
243
// Submit one request packet (plus its buffer references) to the DSP queue.
// The write itself is asynchronous; when sync is true we block via flush()
// until every outstanding request on this session has been answered.
// Aborts the process if the queue write fails.
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
    // Bump pending count (cleared in session::flush once we get the response)
    this->op_pending++; // atomic inc

    int err = dspqueue_write(this->queue,
                             0,       // flags - the framework will autoset this
                             n_bufs,  // number of buffers
                             bufs,    // buffer references
                             sizeof(req),
                             (const uint8_t *) &req, // Message
                             1000000  // Timeout
    );

    if (err != 0) {
        GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
    }

    if (sync) {
        flush();
    }
}
264
+
265
// Flush HTP response queue i.e wait for all outstanding requests to complete.
// Each successfully read response decrements op_pending; read timeouts are
// retried indefinitely, and hard read errors abort the process.
void ggml_hexagon_session::flush() {
    dspqueue_t q = this->queue;

    // Repeatedly read packets from the queue until it's empty. We don't
    // necessarily get a separate callback for each packet, and new packets
    // may arrive while we're processing the previous one.

    while (this->op_pending) {
        struct htp_general_rsp rsp;
        uint32_t rsp_size;
        uint32_t flags;

        struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
        uint32_t n_bufs;

        // Read response packet from queue (blocks up to the timeout)
        int err = dspqueue_read(q, &flags,
                                HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
                                &n_bufs,                // Number of buffer references
                                bufs,                   // Buffer references
                                sizeof(rsp),            // Max message length
                                &rsp_size,              // Message length
                                (uint8_t *) &rsp,
                                1000000); // Timeout

        if (err == AEE_EEXPIRED) {
            // Timed out with requests still pending: keep waiting.
            // TODO: might need to bail out if the HTP is stuck on something
            continue;
        }

        if (err != 0) {
            GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
        }

        // Basic sanity checks
        if (rsp_size != sizeof(rsp)) {
            GGML_ABORT("ggml-hex: dspcall : bad response (size)\n");
        }

        if (rsp.status != HTP_STATUS_OK) {
            // Errors are logged but not (yet) propagated to the caller.
            GGML_LOG_ERROR("ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status));
            // TODO: handle errors
        }

        // TODO: update profiling implementation, currently only works for opt_opsync mode
        this->prof_usecs  = rsp.prof_usecs;
        this->prof_cycles = rsp.prof_cycles;
        this->prof_pkts   = rsp.prof_pkts;

        this->op_pending--; // atomic dec
    }
}
318
+
319
+ // ** backend buffers
320
+
321
+ struct ggml_backend_hexagon_buffer_type_context {
322
+ ggml_backend_hexagon_buffer_type_context(const std::string & name, ggml_hexagon_session * sess) {
323
+ this->sess = sess;
324
+ this->name = name;
325
+ }
326
+
327
+ ggml_hexagon_session * sess;
328
+ std::string name;
329
+ };
330
+
331
// Context behind every hexagon ggml buffer: an rpcmem (ion) allocation that
// can be FastRPC-mapped into the DSP address space of the owning session.
struct ggml_backend_hexagon_buffer_context {
    // Map this buffer into session s's FastRPC domain. Returns false on failure.
    bool mmap_to(ggml_hexagon_session * s) {
        HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n",
                    s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd,
                    (int) this->repack);

        int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD);
        if (err != 0) {
            GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
                           s->domain_id, this->size, this->fd, (unsigned) err);
            return false;
        }

        return true;
    }

    // Idempotent map into the primary session's domain (no-op if already mapped).
    bool mmap() {
        if (this->mapped) {
            return true;
        }
        if (!mmap_to(this->sess)) {
            return false;
        }
        this->mapped = true;
        return true;
    }

    // Undo mmap(); safe to call when not mapped.
    void munmap() {
        if (!this->mapped) {
            return;
        }

        fastrpc_munmap(this->sess->domain_id, this->fd, this->base, this->size);
        this->mapped = false;
    }

    // Allocates `size` bytes (plus one extra page of padding) of rpcmem shared
    // memory and exports the FD used for DSP mapping. The mapping itself is
    // deferred until mmap() is called (see init_tensor).
    // Throws std::runtime_error when allocation or FD export fails.
    ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
        size += 4 * 1024; // extra page for padding

        // rpcmem_alloc2 supports larger allocations; fall back to the legacy
        // allocator when the symbol is not available at runtime.
        if (rpcmem_alloc2) {
            this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
        } else {
            GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
            this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
        }

        if (!this->base) {
            GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
            throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
        }

        this->fd = rpcmem_to_fd(this->base);
        if (this->fd < 0) {
            GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base);
            rpcmem_free(this->base);
            this->base = NULL;
            throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)");
        }

        HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(),
                    (void *) this->base, size, this->fd, (int) repack);

        this->sess   = sess;
        this->size   = size;
        this->mapped = false;
        this->repack = repack;
    }

    // Unmaps (if needed) and releases the rpcmem allocation.
    ~ggml_backend_hexagon_buffer_context() {
        munmap();
        if (this->base) {
            rpcmem_free(this->base);
            this->base = NULL;
        }
    }

    ggml_hexagon_session * sess; // primary session
    uint8_t * base;  // rpcmem allocation (host virtual address)
    size_t    size;  // allocation size incl. padding page
    int       fd;    // dma-buf FD exported from rpcmem, used for fastrpc_mmap
    bool mapped; // mmap is done
    bool repack; // repacked buffer
};
414
+
415
// Session that owns a buffer, recovered via its buffer-type context.
static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
    return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer->buft->context)->sess;
}
418
+
419
+ static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) {
420
+ auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
421
+ delete ctx;
422
+ }
423
+
424
+ static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) {
425
+ auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
426
+ return ctx->base;
427
+ }
428
+
429
// ggml buffer-iface hook, called when a tensor is placed into this buffer.
// Lazily FastRPC-maps the backing rpcmem region on first (non-view) use.
static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    auto ctx  = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
    auto sess = ctx->sess;

    HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(),
                tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage,
                (int) ctx->repack);

    if (tensor->view_src != NULL && tensor->view_offs == 0) {
        ; // nothing to do for the view
    } else {
        // NOTE(review): views with a non-zero view_offs also fall through here
        // and trigger the mapping — confirm that asymmetry is intended.
        if (!ctx->mapped) {
            // mmap() failure is ignored here; errors are logged inside mmap_to
            ctx->mmap();
        }
    }
    return GGML_STATUS_SUCCESS;
}
446
+
447
// ======== Q4x4x2 ====================

// A pair of sign-adjusted 4-bit quants unpacked from one q4_0 byte.
struct x2_q4 {
    int v[2];
};

// Splits byte v into its two int4 quants (low nibble first), shifting the
// unsigned nibble range [0,15] down to the signed range [-8,7].
static x2_q4 unpack_q4(uint8_t v) {
    x2_q4 out;
    out.v[0] = (int) (v & 0x0f) - 8;
    out.v[1] = (int) (v >> 4) - 8;
    return out;
}
456
+
457
// Debug dump of one q4_0 block (index i): the first and last four unpacked
// quants plus the fp16 block scale. Only prints when opt_verbose is set.
static void dump_block_q4_0(const block_q4_0 * b, int i) {
    HEX_VERBOSE("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0],
                unpack_q4(b->qs[1]).v[0], unpack_q4(b->qs[2]).v[0], unpack_q4(b->qs[3]).v[0], unpack_q4(b->qs[12]).v[1],
                unpack_q4(b->qs[13]).v[1], unpack_q4(b->qs[14]).v[1], unpack_q4(b->qs[15]).v[1],
                GGML_FP16_TO_FP32(b->d));
}
463
+
464
// Debug dump of the i-th packed q4x4x2 super-block in a repacked row of width
// k: sampled quants from both nibble planes plus the eight fp16 scales.
// Layout assumed: all quants first (k/2 bytes for the row), then the scales.
static void dump_packed_block_q4x4x2(const uint8_t * v, unsigned int i, size_t k) {
    static const int qk = QK_Q4_0x4x2;
    const int dblk_size = 8 * 2;  // 8x __fp16
    const int qblk_size = qk / 2; // int4
    const int qrow_size = k / 2;  // int4 (not padded)

    const uint8_t * v_q = v + 0;         // quants first
    const uint8_t * v_d = v + qrow_size; // then scales

    const uint8_t *   q = v_q + i * qblk_size;
    const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size);

    // low nibbles (first of the two interleaved blocks)
    HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
                unpack_q4(q[0]).v[0], unpack_q4(q[1]).v[0], unpack_q4(q[2]).v[0], unpack_q4(q[3]).v[0],
                unpack_q4(q[60]).v[0], unpack_q4(q[61]).v[0], unpack_q4(q[62]).v[0], unpack_q4(q[63]).v[0],
                unpack_q4(q[124]).v[0], unpack_q4(q[125]).v[0], unpack_q4(q[126]).v[0], unpack_q4(q[127]).v[0],
                GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3]));

    // high nibbles (second interleaved block) with the remaining four scales
    HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
                i + 1, unpack_q4(q[0]).v[1], unpack_q4(q[1]).v[1], unpack_q4(q[2]).v[1], unpack_q4(q[3]).v[1],
                unpack_q4(q[60]).v[1], unpack_q4(q[61]).v[1], unpack_q4(q[62]).v[1], unpack_q4(q[63]).v[1],
                unpack_q4(q[124]).v[1], unpack_q4(q[125]).v[1], unpack_q4(q[126]).v[1], unpack_q4(q[127]).v[1],
                GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7]));
}
488
+
489
+ static void unpack_q4_0_quants(uint8_t * qs, const block_q4_0 * x, unsigned int bi) {
490
+ static const int qk = QK4_0;
491
+
492
+ for (unsigned int i = 0; i < qk / 2; ++i) {
493
+ const int x0 = (x->qs[i] & 0x0F);
494
+ const int x1 = (x->qs[i] >> 4);
495
+ qs[bi * qk + i + 0] = x0;
496
+ qs[bi * qk + i + qk / 2] = x1;
497
+ }
498
+ }
499
+
500
+ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi) {
501
+ static const int qk = QK4_0;
502
+
503
+ for (unsigned int i = 0; i < qk / 2; ++i) {
504
+ const uint8_t x0 = qs[bi * qk + i + 0];
505
+ const uint8_t x1 = qs[bi * qk + i + qk / 2];
506
+ x->qs[i] = x0 | (x1 << 4);
507
+ }
508
+ }
509
+
510
+ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
511
+ static const int qk = QK_Q4_0x4x2;
512
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
513
+
514
+ const int dblk_size = 8 * 2; // 8x __fp16
515
+ const int qblk_size = qk / 2; // int4
516
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
517
+
518
+ uint8_t * y_q = y + 0; // quants first
519
+ uint8_t * y_d = y + qrow_size; // then scales
520
+
521
+ if (opt_verbose > 2) {
522
+ for (int i = 0; i < nb; i++) {
523
+ dump_block_q4_0(&x[i * 8 + 0], 0);
524
+ dump_block_q4_0(&x[i * 8 + 1], 1);
525
+ dump_block_q4_0(&x[i * 8 + 2], 2);
526
+ dump_block_q4_0(&x[i * 8 + 3], 3);
527
+ dump_block_q4_0(&x[i * 8 + 4], 4);
528
+ dump_block_q4_0(&x[i * 8 + 5], 5);
529
+ dump_block_q4_0(&x[i * 8 + 6], 6);
530
+ dump_block_q4_0(&x[i * 8 + 7], 7);
531
+ }
532
+ }
533
+
534
+ // Repack the quants
535
+ for (int i = 0; i < nb; i++) {
536
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
537
+ unpack_q4_0_quants(qs, &x[i * 8 + 0], 0);
538
+ unpack_q4_0_quants(qs, &x[i * 8 + 1], 1);
539
+ unpack_q4_0_quants(qs, &x[i * 8 + 2], 2);
540
+ unpack_q4_0_quants(qs, &x[i * 8 + 3], 3);
541
+ unpack_q4_0_quants(qs, &x[i * 8 + 4], 4);
542
+ unpack_q4_0_quants(qs, &x[i * 8 + 5], 5);
543
+ unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
544
+ unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
545
+
546
+ uint8_t * q = y_q + (i * qblk_size);
547
+ for (int j = 0; j < qk / 2; j++) {
548
+ q[j] = (qs[j + 128] << 4) | qs[j];
549
+ }
550
+ }
551
+
552
+ // Repack the scales
553
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
554
+ // the last block is truncated and overriden by the scales.
555
+ for (int i = 0; i < nb; i++) {
556
+ // Repack the scales
557
+ ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
558
+ d[0] = x[i * 8 + 0].d;
559
+ d[1] = x[i * 8 + 1].d;
560
+ d[2] = x[i * 8 + 2].d;
561
+ d[3] = x[i * 8 + 3].d;
562
+ d[4] = x[i * 8 + 4].d;
563
+ d[5] = x[i * 8 + 5].d;
564
+ d[6] = x[i * 8 + 6].d;
565
+ d[7] = x[i * 8 + 7].d;
566
+ }
567
+
568
+ if (opt_verbose > 1) {
569
+ for (int i = 0; i < nb; i++) {
570
+ dump_packed_block_q4x4x2(y, i, k);
571
+ }
572
+ }
573
+ }
574
+
575
+ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
576
+ static const int qk = QK_Q4_0x4x2;
577
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
578
+
579
+ const int dblk_size = 8 * 2; // 8x __fp16
580
+ const int qblk_size = qk / 2; // int4
581
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
582
+
583
+ const uint8_t * y_q = y + 0; // quants first
584
+ const uint8_t * y_d = y + qrow_size; // then scales
585
+
586
+ if (opt_verbose > 1) {
587
+ for (int i = 0; i < nb; i++) {
588
+ dump_packed_block_q4x4x2(y, i, k);
589
+ }
590
+ }
591
+
592
+ // Unpack the quants
593
+ for (int i = 0; i < nb; i++) {
594
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
595
+
596
+ const uint8_t * q = y_q + (i * qblk_size);
597
+ for (int j = 0; j < qk / 2; j++) {
598
+ qs[j] = q[j] & 0xf;
599
+ qs[j + 128] = q[j] >> 4;
600
+ }
601
+
602
+ pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
603
+ pack_q4_0_quants(&x[i * 8 + 1], qs, 1);
604
+ pack_q4_0_quants(&x[i * 8 + 2], qs, 2);
605
+ pack_q4_0_quants(&x[i * 8 + 3], qs, 3);
606
+ pack_q4_0_quants(&x[i * 8 + 4], qs, 4);
607
+ pack_q4_0_quants(&x[i * 8 + 5], qs, 5);
608
+ pack_q4_0_quants(&x[i * 8 + 6], qs, 6);
609
+ pack_q4_0_quants(&x[i * 8 + 7], qs, 7);
610
+ }
611
+
612
+ // Repack the scales
613
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
614
+ // the last block is truncated and overriden by the scales.
615
+ for (int i = 0; i < nb; i++) {
616
+ // Unpack the scales
617
+ const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
618
+ x[i * 8 + 0].d = d[0];
619
+ x[i * 8 + 1].d = d[1];
620
+ x[i * 8 + 2].d = d[2];
621
+ x[i * 8 + 3].d = d[3];
622
+ x[i * 8 + 4].d = d[4];
623
+ x[i * 8 + 5].d = d[5];
624
+ x[i * 8 + 6].d = d[6];
625
+ x[i * 8 + 7].d = d[7];
626
+ }
627
+
628
+ if (opt_verbose > 2) {
629
+ for (int i = 0; i < nb; i++) {
630
+ dump_block_q4_0(&x[i * 8 + 0], 0);
631
+ dump_block_q4_0(&x[i * 8 + 1], 1);
632
+ dump_block_q4_0(&x[i * 8 + 2], 2);
633
+ dump_block_q4_0(&x[i * 8 + 3], 3);
634
+ dump_block_q4_0(&x[i * 8 + 4], 4);
635
+ dump_block_q4_0(&x[i * 8 + 5], 5);
636
+ dump_block_q4_0(&x[i * 8 + 6], 6);
637
+ dump_block_q4_0(&x[i * 8 + 7], 7);
638
+ }
639
+ }
640
+ }
641
+
642
+ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
643
+ static const int qk = QK_Q4_0x4x2;
644
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
645
+
646
+ // Init the quants such that they unpack into zeros
647
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
648
+ memset(qs, 8, sizeof(qs));
649
+
650
+ for (int i = 0; i < nb; i++) {
651
+ pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
652
+ pack_q4_0_quants(&x[i * 8 + 1], qs, 1);
653
+ pack_q4_0_quants(&x[i * 8 + 2], qs, 2);
654
+ pack_q4_0_quants(&x[i * 8 + 3], qs, 3);
655
+ pack_q4_0_quants(&x[i * 8 + 4], qs, 4);
656
+ pack_q4_0_quants(&x[i * 8 + 5], qs, 5);
657
+ pack_q4_0_quants(&x[i * 8 + 6], qs, 6);
658
+ pack_q4_0_quants(&x[i * 8 + 7], qs, 7);
659
+ }
660
+
661
+ // Init the scales
662
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
663
+ // the last block is truncated and overriden by the scales.
664
+ for (int i = 0; i < nb; i++) {
665
+ // Unpack the scales
666
+ x[i * 8 + 0].d = 0;
667
+ x[i * 8 + 1].d = 0;
668
+ x[i * 8 + 2].d = 0;
669
+ x[i * 8 + 3].d = 0;
670
+ x[i * 8 + 4].d = 0;
671
+ x[i * 8 + 5].d = 0;
672
+ x[i * 8 + 6].d = 0;
673
+ x[i * 8 + 7].d = 0;
674
+ }
675
+ }
676
+
677
+ // repack q4_0 data into q4x4x2 tensor
678
+ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
679
+ int64_t nrows = ggml_nrows(t);
680
+
681
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
682
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
683
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
684
+
685
+ // Ensure we don't try to read more data than is available in the source buffer 'data'
686
+ // or write more than the tensor can hold.
687
+ const size_t total_tensor_size = (size_t)nrows * row_size;
688
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
689
+
690
+ // Calculate how many full rows and how many remaining bytes we need to process.
691
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
692
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
693
+
694
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
695
+ GGML_ASSERT(buf_pd != NULL);
696
+
697
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
698
+ GGML_ASSERT(buf_rp != NULL);
699
+
700
+ HEX_VERBOSE("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
701
+ t->ne[0], nrows, row_size);
702
+
703
+ init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
704
+
705
+ // 1. Process all the full rows
706
+ for (int64_t i = 0; i < n_full_rows; i++) {
707
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
708
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
709
+
710
+ memcpy(buf_pd, src, row_size);
711
+ repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
712
+ memcpy(dst, buf_rp, row_size);
713
+ }
714
+
715
+ // 2. Process the final, potentially partial, row
716
+ if (n_rem_bytes > 0) {
717
+ const int64_t i = n_full_rows;
718
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
719
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
720
+
721
+ // re-init the row because we are potentially copying a partial row
722
+ init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]);
723
+
724
+ // Copy only the remaining bytes from the source.
725
+ memcpy(buf_pd, src, n_rem_bytes);
726
+
727
+ // Repack the entire buffer
728
+ repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
729
+
730
+ // Write only the corresponding remaining bytes to the destination tensor.
731
+ memcpy(dst, buf_rp, n_rem_bytes);
732
+ }
733
+
734
+ ggml_aligned_free(buf_pd, row_size_pd);
735
+ ggml_aligned_free(buf_rp, row_size_rp);
736
+ }
737
+
738
+ // repack q4x4x2 tensor into q4_0 data
739
+ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) {
740
+ int64_t nrows = ggml_nrows(t);
741
+
742
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
743
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
744
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
745
+
746
+ // Ensure we don't try to copy more data than the tensor actually contains.
747
+ const size_t total_tensor_size = (size_t)nrows * row_size;
748
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
749
+
750
+ // Calculate how many full rows and how many remaining bytes we need to process.
751
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
752
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
753
+
754
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
755
+ GGML_ASSERT(buf_pd != NULL);
756
+
757
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
758
+ GGML_ASSERT(buf_rp != NULL);
759
+
760
+ HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
761
+ t->ne[0], nrows, row_size);
762
+
763
+ memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
764
+
765
+ // 1. Process all the full rows
766
+ for (int64_t i = 0; i < n_full_rows; i++) {
767
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
768
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
769
+
770
+ memcpy(buf_pd, src, row_size);
771
+ unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
772
+ memcpy(dst, buf_rp, row_size);
773
+ }
774
+
775
+ // 2. Process the final, potentially partial, row
776
+ if (n_rem_bytes > 0) {
777
+ const int64_t i = n_full_rows;
778
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
779
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
780
+
781
+ // We still need to read and unpack the entire source row because quantization is block-based.
782
+ memcpy(buf_pd, src, row_size);
783
+ unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
784
+
785
+ // But we only copy the remaining number of bytes to the destination.
786
+ memcpy(dst, buf_rp, n_rem_bytes);
787
+ }
788
+
789
+ ggml_aligned_free(buf_pd, row_size_pd);
790
+ ggml_aligned_free(buf_rp, row_size_rp);
791
+ }
792
+
793
+ // ======== Q8x4x2 ====================
794
+ static void dump_block_q8_0(const block_q8_0 * b, int i) {
795
+ HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
796
+ b->qs[3], b->qs[28], b->qs[29], b->qs[30], b->qs[31], GGML_FP16_TO_FP32(b->d));
797
+ }
798
+
799
+ static void dump_packed_block_q8x4x2(const uint8_t * v, unsigned int i, size_t k) {
800
+ static const int qk = QK_Q8_0x4x2;
801
+ const int dblk_size = 8 * 2; // 8x __fp16
802
+ const int qblk_size = qk; // int8
803
+ const int qrow_size = k; // int8 (not padded)
804
+
805
+ const uint8_t * v_q = v + 0; // quants first
806
+ const uint8_t * v_d = v + qrow_size; // then scales
807
+
808
+ const uint8_t * q = v_q + i * qblk_size;
809
+ const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size);
810
+
811
+ HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
812
+ q[0], q[1], q[2], q[3], q[60], q[61], q[62], q[63], q[124], q[125], q[126], q[127],
813
+ GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3]));
814
+
815
+ HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
816
+ i + 1, q[128], q[129], q[130], q[131], q[192], q[193], q[194], q[195], q[252], q[253], q[254], q[255],
817
+ GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7]));
818
+ }
819
+
820
+ static void unpack_q8_0_quants(uint8_t * qs, const block_q8_0 * x, unsigned int bi) {
821
+ static const int qk = QK8_0;
822
+
823
+ for (unsigned int i = 0; i < qk; ++i) {
824
+ qs[bi * qk + i] = x->qs[i];
825
+ }
826
+ }
827
+
828
+ static void pack_q8_0_quants(block_q8_0 * x, const uint8_t * qs, unsigned int bi) {
829
+ static const int qk = QK8_0;
830
+
831
+ for (unsigned int i = 0; i < qk; ++i) {
832
+ x->qs[i] = qs[bi * qk + i];
833
+ }
834
+ }
835
+
836
+ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
837
+ static const int qk = QK_Q8_0x4x2;
838
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
839
+
840
+ const int dblk_size = 8 * 2; // 8x __fp16
841
+ const int qblk_size = qk; // int8
842
+ const int qrow_size = k; // int8 (not padded to blocks)
843
+
844
+ uint8_t * y_q = y + 0; // quants first
845
+ uint8_t * y_d = y + qrow_size; // then scales
846
+
847
+ if (opt_verbose > 2) {
848
+ for (int i = 0; i < nb; i++) {
849
+ dump_block_q8_0(&x[i * 8 + 0], 0);
850
+ dump_block_q8_0(&x[i * 8 + 1], 1);
851
+ dump_block_q8_0(&x[i * 8 + 2], 2);
852
+ dump_block_q8_0(&x[i * 8 + 3], 3);
853
+ dump_block_q8_0(&x[i * 8 + 4], 4);
854
+ dump_block_q8_0(&x[i * 8 + 5], 5);
855
+ dump_block_q8_0(&x[i * 8 + 6], 6);
856
+ dump_block_q8_0(&x[i * 8 + 7], 7);
857
+ }
858
+ }
859
+
860
+ // Repack the quants
861
+ for (int i = 0; i < nb; i++) {
862
+ uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
863
+
864
+ unpack_q8_0_quants(qs, &x[i * 8 + 0], 0);
865
+ unpack_q8_0_quants(qs, &x[i * 8 + 1], 1);
866
+ unpack_q8_0_quants(qs, &x[i * 8 + 2], 2);
867
+ unpack_q8_0_quants(qs, &x[i * 8 + 3], 3);
868
+ unpack_q8_0_quants(qs, &x[i * 8 + 4], 4);
869
+ unpack_q8_0_quants(qs, &x[i * 8 + 5], 5);
870
+ unpack_q8_0_quants(qs, &x[i * 8 + 6], 6);
871
+ unpack_q8_0_quants(qs, &x[i * 8 + 7], 7);
872
+
873
+ uint8_t * q = y_q + (i * qblk_size);
874
+ for (int j = 0; j < qk; j++) {
875
+ q[j] = qs[j];
876
+ }
877
+ }
878
+
879
+ // Repack the scales
880
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
881
+ // the last block is truncated and overriden by the scales.
882
+ for (int i = 0; i < nb; i++) {
883
+ // Repack the scales
884
+ ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
885
+ d[0] = x[i * 8 + 0].d;
886
+ d[1] = x[i * 8 + 1].d;
887
+ d[2] = x[i * 8 + 2].d;
888
+ d[3] = x[i * 8 + 3].d;
889
+ d[4] = x[i * 8 + 4].d;
890
+ d[5] = x[i * 8 + 5].d;
891
+ d[6] = x[i * 8 + 6].d;
892
+ d[7] = x[i * 8 + 7].d;
893
+ }
894
+
895
+ if (opt_verbose > 1) {
896
+ for (int i = 0; i < nb; i++) {
897
+ dump_packed_block_q8x4x2(y, i, k);
898
+ }
899
+ }
900
+ }
901
+
902
+ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
903
+ static const int qk = QK_Q8_0x4x2;
904
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
905
+
906
+ const int dblk_size = 8 * 2; // 8x __fp16
907
+ const int qblk_size = qk; // int8
908
+ const int qrow_size = k; // int8 (not padded to blocks)
909
+
910
+ const uint8_t * y_q = y + 0; // quants first
911
+ const uint8_t * y_d = y + qrow_size; // then scales
912
+
913
+ if (opt_verbose > 1) {
914
+ for (int i = 0; i < nb; i++) {
915
+ dump_packed_block_q8x4x2(y, i, k);
916
+ }
917
+ }
918
+
919
+ // Unpack the quants
920
+ for (int i = 0; i < nb; i++) {
921
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
922
+
923
+ const uint8_t * q = y_q + (i * qblk_size);
924
+ for (int j = 0; j < qk; j++) {
925
+ qs[j] = q[j];
926
+ }
927
+
928
+ pack_q8_0_quants(&x[i * 8 + 0], qs, 0);
929
+ pack_q8_0_quants(&x[i * 8 + 1], qs, 1);
930
+ pack_q8_0_quants(&x[i * 8 + 2], qs, 2);
931
+ pack_q8_0_quants(&x[i * 8 + 3], qs, 3);
932
+ pack_q8_0_quants(&x[i * 8 + 4], qs, 4);
933
+ pack_q8_0_quants(&x[i * 8 + 5], qs, 5);
934
+ pack_q8_0_quants(&x[i * 8 + 6], qs, 6);
935
+ pack_q8_0_quants(&x[i * 8 + 7], qs, 7);
936
+ }
937
+
938
+ // Repack the scales
939
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
940
+ // the last block is truncated and overriden by the scales.
941
+ for (int i = 0; i < nb; i++) {
942
+ // Unpack the scales
943
+ const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
944
+ x[i * 8 + 0].d = d[0];
945
+ x[i * 8 + 1].d = d[1];
946
+ x[i * 8 + 2].d = d[2];
947
+ x[i * 8 + 3].d = d[3];
948
+ x[i * 8 + 4].d = d[4];
949
+ x[i * 8 + 5].d = d[5];
950
+ x[i * 8 + 6].d = d[6];
951
+ x[i * 8 + 7].d = d[7];
952
+ }
953
+
954
+ if (opt_verbose > 2) {
955
+ for (int i = 0; i < nb; i++) {
956
+ dump_block_q8_0(&x[i * 8 + 0], 0);
957
+ dump_block_q8_0(&x[i * 8 + 1], 1);
958
+ dump_block_q8_0(&x[i * 8 + 2], 2);
959
+ dump_block_q8_0(&x[i * 8 + 3], 3);
960
+ dump_block_q8_0(&x[i * 8 + 4], 4);
961
+ dump_block_q8_0(&x[i * 8 + 5], 5);
962
+ dump_block_q8_0(&x[i * 8 + 6], 6);
963
+ dump_block_q8_0(&x[i * 8 + 7], 7);
964
+ }
965
+ }
966
+ }
967
+
968
+ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
969
+ static const int qk = QK_Q8_0x4x2;
970
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
971
+
972
+ // Init the quants such that they unpack into zeros
973
+ uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
974
+ memset(qs, 0, sizeof(qs));
975
+
976
+ for (int i = 0; i < nb; i++) {
977
+ pack_q8_0_quants(&x[i * 8 + 0], qs, 0);
978
+ pack_q8_0_quants(&x[i * 8 + 1], qs, 1);
979
+ pack_q8_0_quants(&x[i * 8 + 2], qs, 2);
980
+ pack_q8_0_quants(&x[i * 8 + 3], qs, 3);
981
+ pack_q8_0_quants(&x[i * 8 + 4], qs, 4);
982
+ pack_q8_0_quants(&x[i * 8 + 5], qs, 5);
983
+ pack_q8_0_quants(&x[i * 8 + 6], qs, 6);
984
+ pack_q8_0_quants(&x[i * 8 + 7], qs, 7);
985
+ }
986
+
987
+ // Init the scales
988
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
989
+ // the last block is truncated and overriden by the scales.
990
+ for (int i = 0; i < nb; i++) {
991
+ // Unpack the scales
992
+ x[i * 8 + 0].d = 0;
993
+ x[i * 8 + 1].d = 0;
994
+ x[i * 8 + 2].d = 0;
995
+ x[i * 8 + 3].d = 0;
996
+ x[i * 8 + 4].d = 0;
997
+ x[i * 8 + 5].d = 0;
998
+ x[i * 8 + 6].d = 0;
999
+ x[i * 8 + 7].d = 0;
1000
+ }
1001
+ }
1002
+
1003
+ // repack q8_0 data into q8x4x2 tensor
1004
+ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) {
1005
+ int64_t nrows = ggml_nrows(t);
1006
+
1007
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
1008
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
1009
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1010
+
1011
+ // Ensure we don't try to read more data than is available in the source buffer 'data'
1012
+ // or write more than the tensor can hold.
1013
+ const size_t total_tensor_size = (size_t)nrows * row_size;
1014
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
1015
+
1016
+ // Calculate how many full rows and how many remaining bytes we need to process.
1017
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
1018
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
1019
+
1020
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
1021
+ GGML_ASSERT(buf_pd != NULL);
1022
+
1023
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
1024
+ GGML_ASSERT(buf_rp != NULL);
1025
+
1026
+ HEX_VERBOSE("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
1027
+ t->ne[0], nrows, row_size);
1028
+
1029
+ init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
1030
+
1031
+ // 1. Process all the full rows
1032
+ for (int64_t i = 0; i < n_full_rows; i++) {
1033
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
1034
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
1035
+
1036
+ memcpy(buf_pd, src, row_size);
1037
+ repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
1038
+ memcpy(dst, buf_rp, row_size);
1039
+ }
1040
+
1041
+ // 2. Process the final, potentially partial, row
1042
+ if (n_rem_bytes > 0) {
1043
+ const int64_t i = n_full_rows;
1044
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
1045
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
1046
+
1047
+ // re-init the row because we are potentially copying a partial row
1048
+ init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]);
1049
+
1050
+ // Copy only the remaining bytes from the source.
1051
+ memcpy(buf_pd, src, n_rem_bytes);
1052
+
1053
+ // Repack the entire buffer
1054
+ repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
1055
+
1056
+ // Write only the corresponding remaining bytes to the destination tensor.
1057
+ memcpy(dst, buf_rp, n_rem_bytes);
1058
+ }
1059
+
1060
+ ggml_aligned_free(buf_pd, row_size_pd);
1061
+ ggml_aligned_free(buf_rp, row_size_rp);
1062
+ }
1063
+
1064
+ // repack q8x4x2 tensor into q8_0 data
1065
+ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) {
1066
+ int64_t nrows = ggml_nrows(t);
1067
+
1068
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
1069
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
1070
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1071
+
1072
+ // Ensure we don't try to copy more data than the tensor actually contains.
1073
+ const size_t total_tensor_size = (size_t)nrows * row_size;
1074
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
1075
+
1076
+ // Calculate how many full rows and how many remaining bytes we need to process.
1077
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
1078
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
1079
+
1080
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
1081
+ GGML_ASSERT(buf_pd != NULL);
1082
+
1083
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
1084
+ GGML_ASSERT(buf_rp != NULL);
1085
+
1086
+ HEX_VERBOSE("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
1087
+ t->ne[0], nrows, row_size);
1088
+
1089
+ memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
1090
+
1091
+ // 1. Process all the full rows
1092
+ for (int64_t i = 0; i < n_full_rows; i++) {
1093
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
1094
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
1095
+
1096
+ memcpy(buf_pd, src, row_size);
1097
+ unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
1098
+ memcpy(dst, buf_rp, row_size);
1099
+ }
1100
+
1101
+ // 2. Process the final, potentially partial, row
1102
+ if (n_rem_bytes > 0) {
1103
+ const int64_t i = n_full_rows;
1104
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
1105
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
1106
+
1107
+ // We still need to read and unpack the entire source row because quantization is block-based.
1108
+ memcpy(buf_pd, src, row_size);
1109
+ unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
1110
+
1111
+ // But we only copy the remaining number of bytes to the destination.
1112
+ memcpy(dst, buf_rp, n_rem_bytes);
1113
+ }
1114
+
1115
+ ggml_aligned_free(buf_pd, row_size_pd);
1116
+ ggml_aligned_free(buf_rp, row_size_rp);
1117
+ }
1118
+
1119
+ // ======== MXFP4x4x2 ====================
1120
+ struct x2_mxfp4 {
1121
+ int v[2];
1122
+ };
1123
+
1124
+ static x2_mxfp4 unpack_mxfp4(uint8_t v) {
1125
+ x2_mxfp4 x;
1126
+ x.v[0] = kvalues_mxfp4[(v & 0x0f)];
1127
+ x.v[1] = kvalues_mxfp4[(v >> 4)];
1128
+ return x;
1129
+ }
1130
+
1131
+ static void dump_block_mxfp4(const block_mxfp4 * b, int i) {
1132
+ HEX_VERBOSE("ggml-hex: repack mxfp4 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0],
1133
+ unpack_mxfp4(b->qs[1]).v[0], unpack_mxfp4(b->qs[2]).v[0], unpack_mxfp4(b->qs[3]).v[0],
1134
+ unpack_mxfp4(b->qs[12]).v[1], unpack_mxfp4(b->qs[13]).v[1], unpack_mxfp4(b->qs[14]).v[1],
1135
+ unpack_mxfp4(b->qs[15]).v[1], GGML_E8M0_TO_FP32_HALF(b->e));
1136
+ }
1137
+
1138
+ static void dump_packed_block_mxfp4x4x2(const uint8_t * v, unsigned int i, size_t k) {
1139
+ static const int qk = QK_MXFP4x4x2;
1140
+ const int eblk_size = 8 * 1; // 8x E8M0
1141
+ const int qblk_size = qk / 2; // int4
1142
+ const int qrow_size = k / 2; // int4 (not padded)
1143
+
1144
+ const uint8_t * v_q = v + 0; // quants first
1145
+ const uint8_t * v_e = v + qrow_size; // then scales
1146
+
1147
+ const uint8_t * q = v_q + i * qblk_size;
1148
+ const uint8_t * e = (const uint8_t *) (v_e + i * eblk_size);
1149
+
1150
+ HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
1151
+ unpack_mxfp4(q[0]).v[0], unpack_mxfp4(q[1]).v[0], unpack_mxfp4(q[2]).v[0], unpack_mxfp4(q[3]).v[0],
1152
+ unpack_mxfp4(q[60]).v[0], unpack_mxfp4(q[61]).v[0], unpack_mxfp4(q[62]).v[0], unpack_mxfp4(q[63]).v[0],
1153
+ unpack_mxfp4(q[124]).v[0], unpack_mxfp4(q[125]).v[0], unpack_mxfp4(q[126]).v[0],
1154
+ unpack_mxfp4(q[127]).v[0], GGML_E8M0_TO_FP32_HALF(e[0]), GGML_E8M0_TO_FP32_HALF(e[1]),
1155
+ GGML_E8M0_TO_FP32_HALF(e[2]), GGML_E8M0_TO_FP32_HALF(e[3]));
1156
+
1157
+ HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
1158
+ i + 1, unpack_mxfp4(q[0]).v[1], unpack_mxfp4(q[1]).v[1], unpack_mxfp4(q[2]).v[1],
1159
+ unpack_mxfp4(q[3]).v[1], unpack_mxfp4(q[60]).v[1], unpack_mxfp4(q[61]).v[1], unpack_mxfp4(q[62]).v[1],
1160
+ unpack_mxfp4(q[63]).v[1], unpack_mxfp4(q[124]).v[1], unpack_mxfp4(q[125]).v[1],
1161
+ unpack_mxfp4(q[126]).v[1], unpack_mxfp4(q[127]).v[1], GGML_E8M0_TO_FP32_HALF(e[4]),
1162
+ GGML_E8M0_TO_FP32_HALF(e[5]), GGML_E8M0_TO_FP32_HALF(e[6]), GGML_E8M0_TO_FP32_HALF(e[7]));
1163
+ }
1164
+
1165
+ static void unpack_mxfp4_quants(uint8_t * qs, const block_mxfp4 * x, unsigned int bi) {
1166
+ static const int qk = QK_MXFP4;
1167
+
1168
+ for (unsigned int i = 0; i < qk / 2; ++i) {
1169
+ const uint8_t x0 = (x->qs[i] & 0x0F);
1170
+ const uint8_t x1 = (x->qs[i] >> 4);
1171
+ qs[bi * qk + i + 0] = x0;
1172
+ qs[bi * qk + i + qk / 2] = x1;
1173
+ }
1174
+ }
1175
+
1176
+ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int bi) {
1177
+ static const int qk = QK4_0;
1178
+
1179
+ for (unsigned int i = 0; i < qk / 2; ++i) {
1180
+ const uint8_t x0 = qs[bi * qk + i + 0];
1181
+ const uint8_t x1 = qs[bi * qk + i + qk / 2];
1182
+ x->qs[i] = x0 | (x1 << 4);
1183
+ }
1184
+ }
1185
+
1186
+ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
1187
+ static const int qk = QK_MXFP4x4x2;
1188
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1189
+
1190
+ const int eblk_size = 8 * 1; // 8x E8M0
1191
+ const int qblk_size = qk / 2; // int4
1192
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
1193
+
1194
+ uint8_t * y_q = y + 0; // quants first
1195
+ uint8_t * y_e = y + qrow_size; // then scales
1196
+
1197
+ if (opt_verbose > 2) {
1198
+ for (int i = 0; i < nb; i++) {
1199
+ dump_block_mxfp4(&x[i * 8 + 0], 0);
1200
+ dump_block_mxfp4(&x[i * 8 + 1], 1);
1201
+ dump_block_mxfp4(&x[i * 8 + 2], 2);
1202
+ dump_block_mxfp4(&x[i * 8 + 3], 3);
1203
+ dump_block_mxfp4(&x[i * 8 + 4], 4);
1204
+ dump_block_mxfp4(&x[i * 8 + 5], 5);
1205
+ dump_block_mxfp4(&x[i * 8 + 6], 6);
1206
+ dump_block_mxfp4(&x[i * 8 + 7], 7);
1207
+ }
1208
+ }
1209
+
1210
+ // Repack the quants
1211
+ for (int i = 0; i < nb; i++) {
1212
+ uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
1213
+
1214
+ unpack_mxfp4_quants(qs, &x[i * 8 + 0], 0);
1215
+ unpack_mxfp4_quants(qs, &x[i * 8 + 1], 1);
1216
+ unpack_mxfp4_quants(qs, &x[i * 8 + 2], 2);
1217
+ unpack_mxfp4_quants(qs, &x[i * 8 + 3], 3);
1218
+ unpack_mxfp4_quants(qs, &x[i * 8 + 4], 4);
1219
+ unpack_mxfp4_quants(qs, &x[i * 8 + 5], 5);
1220
+ unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
1221
+ unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
1222
+
1223
+ uint8_t * q = y_q + (i * qblk_size);
1224
+ for (int j = 0; j < qk / 2; j++) {
1225
+ q[j] = (qs[j + 128] << 4) | qs[j];
1226
+ }
1227
+ }
1228
+
1229
+ // Repack the scales
1230
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
1231
+ // the last block is truncated and overriden by the scales.
1232
+ for (int i = 0; i < nb; i++) {
1233
+ // Repack the scales
1234
+ uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
1235
+ e[0] = x[i * 8 + 0].e;
1236
+ e[1] = x[i * 8 + 1].e;
1237
+ e[2] = x[i * 8 + 2].e;
1238
+ e[3] = x[i * 8 + 3].e;
1239
+ e[4] = x[i * 8 + 4].e;
1240
+ e[5] = x[i * 8 + 5].e;
1241
+ e[6] = x[i * 8 + 6].e;
1242
+ e[7] = x[i * 8 + 7].e;
1243
+ }
1244
+
1245
+ if (opt_verbose > 1) {
1246
+ for (int i = 0; i < nb; i++) {
1247
+ dump_packed_block_mxfp4x4x2(y, i, k);
1248
+ }
1249
+ }
1250
+ }
1251
+
1252
+ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
1253
+ static const int qk = QK_MXFP4x4x2;
1254
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1255
+
1256
+ const int eblk_size = 8 * 1; // 8x E8M0
1257
+ const int qblk_size = qk / 2; // int4
1258
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
1259
+
1260
+ const uint8_t * y_q = y + 0; // quants first
1261
+ const uint8_t * y_e = y + qrow_size; // then scales
1262
+
1263
+ if (opt_verbose > 1) {
1264
+ for (int i = 0; i < nb; i++) {
1265
+ dump_packed_block_mxfp4x4x2(y, i, k);
1266
+ }
1267
+ }
1268
+
1269
+ // Unpack the quants
1270
+ for (int i = 0; i < nb; i++) {
1271
+ uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
1272
+
1273
+ const uint8_t * q = y_q + (i * qblk_size);
1274
+ for (int j = 0; j < qk / 2; j++) {
1275
+ qs[j] = q[j] & 0xf;
1276
+ qs[j + 128] = q[j] >> 4;
1277
+ }
1278
+
1279
+ pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
1280
+ pack_mxfp4_quants(&x[i * 8 + 1], qs, 1);
1281
+ pack_mxfp4_quants(&x[i * 8 + 2], qs, 2);
1282
+ pack_mxfp4_quants(&x[i * 8 + 3], qs, 3);
1283
+ pack_mxfp4_quants(&x[i * 8 + 4], qs, 4);
1284
+ pack_mxfp4_quants(&x[i * 8 + 5], qs, 5);
1285
+ pack_mxfp4_quants(&x[i * 8 + 6], qs, 6);
1286
+ pack_mxfp4_quants(&x[i * 8 + 7], qs, 7);
1287
+ }
1288
+
1289
+ // Repack the scales
1290
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
1291
+ // the last block is truncated and overriden by the scales.
1292
+ for (int i = 0; i < nb; i++) {
1293
+ // Unpack the scales
1294
+ const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
1295
+ x[i * 8 + 0].e = e[0];
1296
+ x[i * 8 + 1].e = e[1];
1297
+ x[i * 8 + 2].e = e[2];
1298
+ x[i * 8 + 3].e = e[3];
1299
+ x[i * 8 + 4].e = e[4];
1300
+ x[i * 8 + 5].e = e[5];
1301
+ x[i * 8 + 6].e = e[6];
1302
+ x[i * 8 + 7].e = e[7];
1303
+ }
1304
+
1305
+ if (opt_verbose > 2) {
1306
+ for (int i = 0; i < nb; i++) {
1307
+ dump_block_mxfp4(&x[i * 8 + 0], 0);
1308
+ dump_block_mxfp4(&x[i * 8 + 1], 1);
1309
+ dump_block_mxfp4(&x[i * 8 + 2], 2);
1310
+ dump_block_mxfp4(&x[i * 8 + 3], 3);
1311
+ dump_block_mxfp4(&x[i * 8 + 4], 4);
1312
+ dump_block_mxfp4(&x[i * 8 + 5], 5);
1313
+ dump_block_mxfp4(&x[i * 8 + 6], 6);
1314
+ dump_block_mxfp4(&x[i * 8 + 7], 7);
1315
+ }
1316
+ }
1317
+ }
1318
+
1319
// Initialize a row of mxfp4 blocks so that it repacks into an all-zero
// mxfp4x4x2 row. Used to pre-fill the padded staging buffer so that the
// tail beyond a partial row contributes zeros.
// x: output, nb*8 mxfp4 blocks; k: number of elements in the row.
static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
    static const int qk = QK_MXFP4x4x2;
    const int nb = (k + qk - 1) / qk; // number of blocks (padded)

    // Init the quants such that they unpack into zeros
    uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
    memset(qs, 0, sizeof(qs));

    for (int i = 0; i < nb; i++) {
        pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
        pack_mxfp4_quants(&x[i * 8 + 1], qs, 1);
        pack_mxfp4_quants(&x[i * 8 + 2], qs, 2);
        pack_mxfp4_quants(&x[i * 8 + 3], qs, 3);
        pack_mxfp4_quants(&x[i * 8 + 4], qs, 4);
        pack_mxfp4_quants(&x[i * 8 + 5], qs, 5);
        pack_mxfp4_quants(&x[i * 8 + 6], qs, 6);
        pack_mxfp4_quants(&x[i * 8 + 7], qs, 7);
    }

    // Init the scales
    // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
    // the last block is truncated and overriden by the scales.
    for (int i = 0; i < nb; i++) {
        // Zero the E8M0 scales of all 8 sub-blocks
        x[i * 8 + 0].e = 0;
        x[i * 8 + 1].e = 0;
        x[i * 8 + 2].e = 0;
        x[i * 8 + 3].e = 0;
        x[i * 8 + 4].e = 0;
        x[i * 8 + 5].e = 0;
        x[i * 8 + 6].e = 0;
        x[i * 8 + 7].e = 0;
    }
}
1353
+
1354
// repack mxfp4 data into mxfp4x4x2 tensor
// Copies host-side mxfp4 rows from 'data' into t->data, converting each row to
// the HTP-friendly mxfp4x4x2 layout via two staging buffers:
//   buf_pd: row padded up to a QK_MXFP4x4x2 multiple (tail pre-zeroed)
//   buf_rp: repacked output (2x row size for temporary pad overflow)
// Handles a trailing partial row so that 'size' need not be a row multiple.
static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size) {
    int64_t nrows = ggml_nrows(t);

    size_t row_size = ggml_row_size(t->type, t->ne[0]);
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
    size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

    // Ensure we don't try to read more data than is available in the source buffer 'data'
    // or write more than the tensor can hold.
    const size_t total_tensor_size = (size_t)nrows * row_size;
    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;

    // Calculate how many full rows and how many remaining bytes we need to process.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t n_rem_bytes = n_bytes_to_copy % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);
    GGML_ASSERT(buf_pd != NULL);

    void * buf_rp = ggml_aligned_malloc(row_size_rp);
    GGML_ASSERT(buf_rp != NULL);

    HEX_VERBOSE("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data,
                size, t->ne[0], nrows, row_size);

    init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

    // 1. Process all the full rows
    for (int64_t i = 0; i < n_full_rows; i++) {
        const uint8_t * src = (const uint8_t *) data + (i * row_size);
        uint8_t * dst = (uint8_t *) t->data + (i * row_size);

        memcpy(buf_pd, src, row_size);
        repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);
        memcpy(dst, buf_rp, row_size);
    }

    // 2. Process the final, potentially partial, row
    if (n_rem_bytes > 0) {
        const int64_t i = n_full_rows;
        const uint8_t * src = (const uint8_t *) data + (i * row_size);
        uint8_t * dst = (uint8_t *) t->data + (i * row_size);

        // re-init the row because we are potentially copying a partial row
        init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]);

        // Copy only the remaining bytes from the source.
        memcpy(buf_pd, src, n_rem_bytes);

        // Repack the entire buffer (partial data + zero padding).
        repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);

        // Write only the corresponding remaining bytes to the destination tensor.
        memcpy(dst, buf_rp, n_rem_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
1414
+
1415
// repack mxfp4x4x2 tensor into mxfp4 data
// Inverse of repack_mxfp4_mxfp4x4x2: reads packed rows from t->data and writes
// generic mxfp4 rows into 'data', clamped so at most min(size, tensor bytes)
// are produced. A trailing partial row is unpacked in full (the format is
// block-based) but only n_rem_bytes of it are copied out.
static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size) {
    int64_t nrows = ggml_nrows(t);

    size_t row_size = ggml_row_size(t->type, t->ne[0]);
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
    size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

    // Ensure we don't try to copy more data than the tensor actually contains.
    const size_t total_tensor_size = (size_t)nrows * row_size;
    const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;

    // Calculate how many full rows and how many remaining bytes we need to process.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t n_rem_bytes = n_bytes_to_copy % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);
    GGML_ASSERT(buf_pd != NULL);

    void * buf_rp = ggml_aligned_malloc(row_size_rp);
    GGML_ASSERT(buf_rp != NULL);

    HEX_VERBOSE("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data,
                size, t->ne[0], nrows, row_size);

    memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

    // 1. Process all the full rows
    for (int64_t i = 0; i < n_full_rows; i++) {
        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
        uint8_t * dst = (uint8_t *) data + (i * row_size);

        memcpy(buf_pd, src, row_size);
        unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
        memcpy(dst, buf_rp, row_size);
    }

    // 2. Process the final, potentially partial, row
    if (n_rem_bytes > 0) {
        const int64_t i = n_full_rows;
        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
        uint8_t * dst = (uint8_t *) data + (i * row_size);

        // We still need to read and unpack the entire source row because the format is block-based.
        memcpy(buf_pd, src, row_size);
        unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);

        // But we only copy the remaining number of bytes to the destination to respect the size limit.
        memcpy(dst, buf_rp, n_rem_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
1469
+
1470
// Upload host data into a hexagon-backed tensor.
// Quantized weight types (Q4_0 / Q8_0 / MXFP4) are repacked on the fly into the
// HTP-friendly x4x2 layouts; all other types are a plain memcpy at 'offset'.
// The repack paths only support whole-tensor uploads, hence the offset == 0 asserts.
static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                   ggml_tensor * tensor,
                                                   const void * data,
                                                   size_t offset,
                                                   size_t size) {
    auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
    auto sess = ctx->sess;

    HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
                offset, size);

    switch (tensor->type) {
        case GGML_TYPE_Q4_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_q4_0_q4x4x2(tensor, data, size);
            break;

        case GGML_TYPE_Q8_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_q8_0_q8x4x2(tensor, data, size);
            break;

        case GGML_TYPE_MXFP4:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_mxfp4_mxfp4x4x2(tensor, data, size);
            break;

        default:
            // non-quantized (or unrepacked) types are stored as-is
            memcpy((char *) tensor->data + offset, data, size);
            break;
    }
}
1505
+
1506
// Download tensor data back to the host.
// Mirror of set_tensor: quantized weight types are un-repacked from the x4x2
// device layout back into their generic ggml form; everything else is a memcpy.
// The un-repack paths only support whole-tensor reads, hence the offset == 0 asserts.
static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                                   const ggml_tensor * tensor,
                                                   void * data,
                                                   size_t offset,
                                                   size_t size) {
    auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
    auto sess = ctx->sess;

    HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
                offset, size);

    switch (tensor->type) {
        case GGML_TYPE_Q4_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_q4x4x2_q4_0(data, tensor, size);
            break;

        case GGML_TYPE_Q8_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_q8x4x2_q8_0(data, tensor, size);
            break;

        case GGML_TYPE_MXFP4:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            repack_mxfp4x4x2_mxfp4(data, tensor, size);
            break;

        default:
            memcpy(data, (const char *) tensor->data + offset, size);
            break;
    }
}
1541
+
1542
+ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
1543
+ const struct ggml_tensor * src,
1544
+ struct ggml_tensor * dst) {
1545
+ GGML_UNUSED(buffer);
1546
+ GGML_UNUSED(src);
1547
+ GGML_UNUSED(dst);
1548
+ // we might optimize this later, for now take the slow path (ie get/set_tensor)
1549
+ return false;
1550
+ }
1551
+
1552
// Fill the entire buffer with 'value'. The buffer is host-mapped, so a plain
// memset over the mapped base pointer is sufficient.
static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
    auto sess = ctx->sess;
    HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size);
    memset(ctx->base, value, ctx->size);
}
1558
+
1559
// vtable shared by plain and repack hexagon buffers; the entries must stay in
// the order declared by ggml_backend_buffer_i
static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
    /* .free_buffer  = */ ggml_backend_hexagon_buffer_free_buffer,
    /* .get_base     = */ ggml_backend_hexagon_buffer_get_base,
    /* .init_tensor  = */ ggml_backend_hexagon_buffer_init_tensor,
    /* .memset_tensor = */ NULL,
    /* .set_tensor   = */ ggml_backend_hexagon_buffer_set_tensor,
    /* .get_tensor   = */ ggml_backend_hexagon_buffer_get_tensor,
    /* .cpy_tensor   = */ ggml_backend_hexagon_buffer_cpy_tensor,
    /* .clear        = */ ggml_backend_hexagon_buffer_clear,
    /* .reset        = */ NULL,
};
1570
+
1571
+ // ** backend buffer type
1572
+
1573
+ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
1574
+ return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->name.c_str();
1575
+ }
1576
+
1577
// Allocate a plain (non-repacked) hexagon buffer of 'size' bytes for the
// buffer-type's session. Returns nullptr on failure (the context constructor
// throws when the underlying ION/rpcmem allocation or mapping fails).
static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
    ggml_backend_buffer_type_t buffer_type, size_t size) {
    auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
    try {
        ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
        return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
    } catch (std::exception const &exc) {
        GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
        return nullptr;
    }
}
1588
+
1589
// Same as the plain allocator but marks the buffer as repacked: tensors placed
// here are stored in the HTP x4x2 weight layouts (see set_tensor above).
static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer(
    ggml_backend_buffer_type_t buffer_type, size_t size) {
    auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
    try {
        ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
        return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
    } catch (std::exception const &exc) {
        GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
        return nullptr;
    }
}
1600
+
1601
+ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
1602
+ return 128; // HVX alignment
1603
+ GGML_UNUSED(buffer_type);
1604
+ }
1605
+
1606
+ static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) {
1607
+ return ggml_nbytes(t);
1608
+ }
1609
+
1610
+ static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
1611
+ return 1 * 1024 * 1024 * 1024; // 1GB per buffer
1612
+ GGML_UNUSED(buffer_type);
1613
+ }
1614
+
1615
+ static bool ggml_backend_hexagon_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1616
+ return opt_hostbuf;
1617
+ GGML_UNUSED(buft);
1618
+ }
1619
+
1620
+ static bool ggml_backend_hexagon_repack_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1621
+ return false;
1622
+ GGML_UNUSED(buft);
1623
+ }
1624
+
1625
// buffer-type vtable for plain (non-repacked) hexagon buffers
static ggml_backend_buffer_type_i ggml_backend_hexagon_buffer_type_interface = {
    /* .get_name       = */ ggml_backend_hexagon_buffer_type_name,
    /* .alloc_buffer   = */ ggml_backend_hexagon_buffer_type_alloc_buffer,
    /* .get_alignment  = */ ggml_backend_hexagon_buffer_type_get_alignment,
    /* .get_max_size   = */ ggml_backend_hexagon_buffer_type_get_max_size,
    /* .get_alloc_size = */ ggml_backend_hexagon_buffer_type_get_alloc_size,
    /* .is_host        = */ ggml_backend_hexagon_buffer_type_is_host,
};
1633
+
1634
// buffer-type vtable for repacked weight buffers; differs from the plain one
// only in alloc_buffer (repack=true) and is_host (always false)
static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interface = {
    /* .get_name       = */ ggml_backend_hexagon_buffer_type_name,
    /* .alloc_buffer   = */ ggml_backend_hexagon_repack_buffer_type_alloc_buffer,
    /* .get_alignment  = */ ggml_backend_hexagon_buffer_type_get_alignment,
    /* .get_max_size   = */ ggml_backend_hexagon_buffer_type_get_max_size,
    /* .get_alloc_size = */ ggml_backend_hexagon_buffer_type_get_alloc_size,
    /* .is_host        = */ ggml_backend_hexagon_repack_buffer_type_is_host,
};
1642
+
1643
// Bring up a FastRPC session on the CDSP for device dev_id:
//   1. reserve a named FastRPC session (dev_id != 0 only; dev 0 uses the default),
//   2. resolve the skel URI, 3. enable unsigned PD, 4. open the skel handle,
//   5. enable QoS, 6. create + export a dspqueue, 7. start the DSP-side service.
// Each stage sets its valid_* flag on success so release() can unwind a
// partially-constructed session. Throws std::runtime_error on fatal failures.
void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
    this->valid_session = false;
    this->valid_handle = false;
    this->valid_queue = false;
    this->valid_iface = false;

    this->domain_id = 3; // Default for CDSP, updated after the session is created
    this->session_id = 0; // Default for CDSP, updated after the session is created
    this->dev_id = dev_id;
    this->name = std::string("HTP") + std::to_string(dev_id);

    // reset op/profiling counters
    this->op_pending = 0;
    this->prof_usecs = 0;
    this->prof_cycles = 0;
    this->prof_pkts = 0;

    GGML_LOG_INFO("ggml-hex: allocating new session: %s\n", this->name.c_str());

    domain * my_domain = get_domain(this->domain_id);
    if (my_domain == NULL) {
        GGML_LOG_ERROR("ggml-hex: unable to get domain struct for CDSP\n");
        throw std::runtime_error("ggml-hex: failed to get CDSP domain (see log for details)");
    }

    // Create new session
    if (dev_id != 0) {
        struct remote_rpc_reserve_new_session n;
        n.domain_name_len = strlen(CDSP_DOMAIN_NAME);
        n.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
        n.session_name = const_cast<char *>(this->name.c_str());
        n.session_name_len = this->name.size();

        int err = remote_session_control(FASTRPC_RESERVE_NEW_SESSION, (void *) &n, sizeof(n));
        if (err != AEE_SUCCESS) {
            GGML_LOG_ERROR("ggml-hex: failed to reserve new session %d : error 0x%x\n", dev_id, err);
            throw std::runtime_error("ggml-hex: remote_session_control(new-sess) failed (see log for details)");
        }

        // Save the IDs
        this->session_id = n.session_id;
        this->domain_id = n.effective_domain_id;
        this->valid_session = true;
    }

    // Get session URI

    char session_uri[256];
    {
        char htp_uri[256];
        snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);

        struct remote_rpc_get_uri u = {};
        u.session_id = this->session_id;
        u.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
        u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
        u.module_uri = const_cast<char *>(htp_uri);
        u.module_uri_len = strlen(htp_uri);
        u.uri = session_uri;
        u.uri_len = sizeof(session_uri);

        int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u));
        if (err != AEE_SUCCESS) {
            // fallback to single session uris
            int htp_URI_domain_len = strlen(htp_uri) + MAX_DOMAIN_NAMELEN;

            // NOTE(review): the bound passed here is the computed string length,
            // not sizeof(session_uri); if htp_uri plus the domain URI ever
            // exceeds 256 bytes this could overflow -- confirm the upstream bound.
            snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri);

            GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri);
        }
    }

    // Enable Unsigned PD
    {
        struct remote_rpc_control_unsigned_module u;
        u.domain = this->domain_id;
        u.enable = 1;
        int err = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *) &u, sizeof(u));
        if (err != AEE_SUCCESS) {
            GGML_LOG_ERROR("ggml-hex: failed to enable unsigned PD for session %d : error 0x%x\n", dev_id, err);
            throw std::runtime_error("ggml-hex: remote_session_control(unsign) failed (see log for details)");
        }
    }

    // Open session
    int err = htp_iface_open(session_uri, &this->handle);
    if (err != AEE_SUCCESS) {
        GGML_LOG_ERROR("ggml-hex: failed to open session %d : error 0x%x\n", dev_id, err);
        throw std::runtime_error("ggml-hex: failed to open session (see log for details)");
    }

    this->valid_handle = true;

    GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
                  this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);

    // Enable FastRPC QoS mode (best-effort: a failure is only a warning)
    {
        struct remote_rpc_control_latency l;
        l.enable = 1;

        int err = remote_handle64_control(this->handle, DSPRPC_CONTROL_LATENCY, (void *) &l, sizeof(l));
        if (err != 0) {
            GGML_LOG_WARN("ggml-hex: failed to enable fastrpc QOS mode: 0x%08x\n", (unsigned) err);
        }
    }

    // Now let's setup the DSP queue
    err = dspqueue_create(this->domain_id,
                          0,            // Flags
                          128 * 1024,   // Request queue size (in bytes)
                          64 * 1024,    // Response queue size (in bytes)
                          nullptr,      // Read packet callback (we handle reads explicitly)
                          nullptr,      // Error callback (we handle errors during reads)
                          (void *) this, // Callback context
                          &queue);
    if (err != 0) {
        GGML_LOG_ERROR("ggml-hex: %s dspqueue_create failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
        throw std::runtime_error("ggml-hex: failed to create dspqueue (see log for details)");
    }

    this->valid_queue = true;

    // Export queue for use on the DSP
    err = dspqueue_export(queue, &this->queue_id);
    if (err != 0) {
        GGML_LOG_ERROR("ggml-hex: dspqueue_export failed: 0x%08x\n", (unsigned) err);
        throw std::runtime_error("ggml-hex: dspqueue export failed (see log for details)");
    }

    // Optional ETM tracing (best-effort)
    if (opt_etm) {
        err = htp_iface_enable_etm(this->handle);
        if (err != 0) {
            GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
        }
    }

    // Start the DSP-side service. We need to pass the queue ID to the
    // DSP in a FastRPC call; the DSP side will import the queue and start
    // listening for packets in a callback.
    err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx);
    if (err != 0) {
        GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
        throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
    }
    this->valid_iface = true;
}
1789
+
1790
+ void ggml_hexagon_session::release() noexcept(true) {
1791
+ GGML_LOG_INFO("ggml-hex: releasing session: %s\n", this->name.c_str());
1792
+
1793
+ int err;
1794
+
1795
+ // Stop the DSP-side service and close the queue
1796
+ if (this->valid_iface) {
1797
+ err = htp_iface_stop(this->handle);
1798
+ if (err != 0) {
1799
+ GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err);
1800
+ }
1801
+ }
1802
+
1803
+ if (opt_etm) {
1804
+ err = htp_iface_disable_etm(this->handle);
1805
+ if (err != 0) {
1806
+ GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
1807
+ }
1808
+ }
1809
+
1810
+ if (this->valid_queue) {
1811
+ err = dspqueue_close(queue);
1812
+ if (err != 0) {
1813
+ GGML_ABORT("ggml-hex: dspqueue_close failed: 0x%08x\n", (unsigned) err);
1814
+ }
1815
+ }
1816
+
1817
+ if (this->valid_handle) {
1818
+ htp_iface_close(this->handle);
1819
+ }
1820
+ }
1821
+
1822
// Construct a session bound to device dev_id and create its two buffer types
// (plain and repack). On any failure the partially-allocated session is
// released before the exception is re-thrown.
// NOTE(review): if the second context allocation throws, the first context is
// not freed here (release() does not delete the buffer-type contexts and the
// destructor will not run) -- confirm whether this failure-path leak matters.
ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
    buffer_type.context = nullptr;
    repack_buffer_type.context = nullptr;

    buffer_type.device = dev;
    repack_buffer_type.device = dev;

    try {
        allocate(dev_id);

        buffer_type.iface = ggml_backend_hexagon_buffer_type_interface;
        buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name, this);

        repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface;
        repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
    } catch (std::exception const &exc) {
        release();
        throw;
    }
}
1842
+
1843
+ ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) {
1844
+ release();
1845
+
1846
+ delete static_cast<ggml_backend_hexagon_buffer_type_context*>(buffer_type.context);
1847
+ delete static_cast<ggml_backend_hexagon_buffer_type_context*>(repack_buffer_type.context);
1848
+ }
1849
+
1850
+ // ** backend interface
1851
+
1852
// A buffer belongs to this backend iff its buffer-type vtable points at our
// get_alignment callback. Both the plain and the repack buffer types share
// that callback, so this matches either kind.
static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b) {
    return b->buft->iface.get_alignment == ggml_backend_hexagon_buffer_type_get_alignment;
}
1855
+
1856
// Detect the repack variant specifically: only the repack buffer type uses
// ggml_backend_hexagon_repack_buffer_type_alloc_buffer in its vtable.
static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
    return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
}
1859
+
1860
+ static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) {
1861
+ if (x->ne[0] != y->ne[0]) {
1862
+ return false;
1863
+ }
1864
+ if (x->ne[1] != y->ne[1]) {
1865
+ return false;
1866
+ }
1867
+ if (x->ne[2] != y->ne[2]) {
1868
+ return false;
1869
+ }
1870
+ if (x->ne[3] != y->ne[3]) {
1871
+ return false;
1872
+ }
1873
+
1874
+ return true;
1875
+ }
1876
+
1877
+ static bool hex_supported_src0_type(ggml_type t) {
1878
+ return t == GGML_TYPE_F32;
1879
+ }
1880
+
1881
+ static bool hex_supported_src1_type(ggml_type t) {
1882
+ return t == GGML_TYPE_F32;
1883
+ }
1884
+
1885
+ static bool hex_supported_src2_type(ggml_type t) {
1886
+ return t == GGML_TYPE_F32;
1887
+ }
1888
+
1889
+ static bool hex_supported_src1_type2(ggml_type t) {
1890
+ return t == GGML_TYPE_F16;
1891
+ }
1892
+
1893
+ static bool hex_supported_src1_type3(ggml_type t) {
1894
+ return t == GGML_TYPE_I32;
1895
+ }
1896
+
1897
+ static bool hex_supported_dst_type(ggml_type t) {
1898
+ return t == GGML_TYPE_F32;
1899
+ }
1900
+
1901
+ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_tensor * y) {
1902
+ // TODO: support broadcast for ne[2 and 3]
1903
+ if (x->ne[0] != y->ne[0]) {
1904
+ return false;
1905
+ }
1906
+ if (x->ne[2] != y->ne[2]) {
1907
+ return false;
1908
+ }
1909
+ if (x->ne[3] != y->ne[3]) {
1910
+ return false;
1911
+ }
1912
+ return true;
1913
+ }
1914
+
1915
// Can this MUL_MAT op run on the HTP for the given session?
// Requires F32 activations/output, contiguous src1/dst, a supported weight
// type (repacked quantized or, experimentally, F16), and all buffers mapped
// to the same session.
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
        return false;
    }

    // TODO: add support for non-cont tensors
    if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
        return false;
    }

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
            // row length must be a multiple of the quant block size
            if (src0->ne[0] % 32) {
                return false;
            }

            if (src0->ne[1] > 16 * 1024) {
                return false; // typically the lm-head which would be too large for VTCM
            }

            // no batched src1 yet
            // if ((src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3])) return false;
            if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
                return false;
            }

            // src0 (weights) must be repacked
            if (src0->buffer && !ggml_backend_buffer_is_hexagon_repack(src0->buffer)) {
                return false;
            }
            break;

        case GGML_TYPE_F16:
            if (!opt_experimental) {
                return false;
            }
            break;

        default:
            return false;
    }

    // src0 & src1 & dst must be mapped to the same session
    if (src0->buffer &&
        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
        return false;
    }
    if (src1->buffer &&
        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
        return false;
    }
    if (dst->buffer &&
        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
        return false;
    }

    return true;
}
1977
+
1978
// Can this MUL_MAT_ID (MoE expert matmul) op run on the HTP?
// Same rules as mul_mat plus: src2 (expert indices) must be I32 and mapped to
// the same session.
static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];
    const struct ggml_tensor * src2 = op->src[2];
    const struct ggml_tensor * dst = op;

    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32 || src2->type != GGML_TYPE_I32) {
        return false;
    }

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
            // row length must be a multiple of the quant block size
            if ((src0->ne[0] % 32)) {
                return false;
            }

            // src0 (weights) must be repacked
            if (src0->buffer && !ggml_backend_buffer_is_hexagon_repack(src0->buffer)) {
                return false;
            }
            break;

        case GGML_TYPE_F16:
            if (!opt_experimental) {
                return false;
            }
            break;

        default:
            return false;
    }

    // TODO: add support for non-cont tensors
    if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
        return false;
    }

    // src0 (weights) must be repacked and mapped to the same session
    // src1 & src2 & dst must be mapped to the same session
    if (src0->buffer &&
        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
        return false;
    }
    if (src1->buffer &&
        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
        return false;
    }
    if (src2->buffer &&
        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
        return false;
    }
    if (dst->buffer &&
        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
        return false;
    }

    return true;
}
2038
+
2039
// Can this binary element-wise op (e.g. ADD/SUB/MUL) run on the HTP?
// F32-only, src0/dst shapes must match exactly, src1 may be repeat-broadcast
// into src0, all tensors contiguous and mapped to the same session.
static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];
    const struct ggml_tensor * dst = op;

    if (!hex_supported_src0_type(src0->type)) {
        return false;
    }
    if (!hex_supported_src1_type(src1->type)) {
        return false;
    }
    if (!hex_supported_dst_type(dst->type)) {
        return false;
    }
    if (!hex_supported_dims2(src0, dst)) {
        return false;
    }
    if (!ggml_can_repeat(src1, src0)) {
        return false;
    }

    // TODO: add support for non-contiguous tensors
    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
        return false;
    }

    // src0, src1 & dst must be mapped to the same session
    if (src0->buffer &&
        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
        return false;
    }
    if (src1->buffer &&
        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
        return false;
    }
    if (dst->buffer &&
        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
        return false;
    }

    return true;
}
2081
+
2082
// Can this ADD_ID op run on the HTP?
// F32 src0/src1/dst with exact src0/dst shape match, contiguous tensors, and
// all four buffers (incl. src2) mapped to the same session.
// NOTE(review): unlike mul_mat_id, src2's type and contiguity are NOT checked
// here -- confirm whether the HTP add-id kernel accepts any src2 layout.
static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];
    const struct ggml_tensor * src2 = op->src[2];
    const struct ggml_tensor * dst = op;

    if (!hex_supported_src0_type(src0->type)) {
        return false;
    }
    if (!hex_supported_src1_type(src1->type)) {
        return false;
    }
    if (!hex_supported_dst_type(dst->type)) {
        return false;
    }
    if (!hex_supported_dims2(src0, dst)) {
        return false;
    }

    // REVISIT: add support for non-contiguous tensors
    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
        return false;
    }

    // src0, src1 & dst must be mapped to the same session
    if (src0->buffer &&
        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
        return false;
    }
    if (src1->buffer &&
        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
        return false;
    }
    if (src2->buffer &&
        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
        return false;
    }
    if (dst->buffer &&
        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
        return false;
    }

    return true;
}
2126
+
2127
+ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2128
+ const struct ggml_tensor * src0 = op->src[0];
2129
+ const struct ggml_tensor * dst = op;
2130
+
2131
+ if (!hex_supported_src0_type(src0->type)) {
2132
+ return false;
2133
+ }
2134
+ if (!hex_supported_dst_type(dst->type)) {
2135
+ return false;
2136
+ }
2137
+ if (!hex_supported_dims2(src0, dst)) {
2138
+ return false;
2139
+ }
2140
+
2141
+ // TODO: add support for non-contigiuos tensors
2142
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
2143
+ return false;
2144
+ }
2145
+
2146
+ // src0 & dst must be mapped to the same session
2147
+ if (src0->buffer &&
2148
+ (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2149
+ return false;
2150
+ }
2151
+ if (dst->buffer &&
2152
+ (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2153
+ return false;
2154
+ }
2155
+
2156
+ return true;
2157
+ }
2158
+
2159
+ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session * sess,
2160
+ const struct ggml_tensor * op) {
2161
+ const struct ggml_tensor * src0 = op->src[0];
2162
+ const struct ggml_tensor * src1 = op->src[1];
2163
+ const struct ggml_tensor * dst = op;
2164
+
2165
+ if (!hex_supported_src0_type(src0->type)) {
2166
+ return false;
2167
+ }
2168
+ if (!hex_supported_dst_type(dst->type)) {
2169
+ return false;
2170
+ }
2171
+
2172
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
2173
+ return false;
2174
+ }
2175
+
2176
+ if (src1) {
2177
+ if (!hex_supported_src1_type(src1->type)) {
2178
+ return false;
2179
+ }
2180
+ if (!hex_supported_dims2(src0, src1)) {
2181
+ return false;
2182
+ }
2183
+ if (!ggml_is_contiguous(src1)) {
2184
+ return false;
2185
+ }
2186
+ }
2187
+
2188
+ // src0, src1 & dst must be mapped to the same session
2189
+ if (src0->buffer &&
2190
+ (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2191
+ return false;
2192
+ }
2193
+ if (src1 && src1->buffer &&
2194
+ (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2195
+ return false;
2196
+ }
2197
+ if (dst->buffer &&
2198
+ (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2199
+ return false;
2200
+ }
2201
+
2202
+ return true;
2203
+ }
2204
+
2205
+ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2206
+ const struct ggml_tensor * src0 = op->src[0];
2207
+ const struct ggml_tensor * src1 = op->src[1];
2208
+ const struct ggml_tensor * src2 = op->src[2];
2209
+ const struct ggml_tensor * dst = op;
2210
+
2211
+ if (src2) {
2212
+ return false; // FIXME: add support for sinks
2213
+ }
2214
+
2215
+ if (!hex_supported_src0_type(src0->type)) {
2216
+ return false;
2217
+ }
2218
+ if (!hex_supported_dst_type(dst->type)) {
2219
+ return false;
2220
+ }
2221
+
2222
+ if (src1) {
2223
+ if (!hex_supported_src1_type(src1->type) && !hex_supported_src1_type2(src1->type)) {
2224
+ return false;
2225
+ }
2226
+ if (src0->ne[0] != src1->ne[0]) {
2227
+ return false;
2228
+ }
2229
+ if (src1->ne[1] < src0->ne[1]) {
2230
+ return false;
2231
+ }
2232
+ if (src0->ne[2] % src1->ne[2] != 0) {
2233
+ return false;
2234
+ }
2235
+ if (src0->ne[3] % src1->ne[3] != 0) {
2236
+ return false;
2237
+ }
2238
+ }
2239
+
2240
+ if (src1) {
2241
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
2242
+ return false;
2243
+ }
2244
+ } else {
2245
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
2246
+ return false;
2247
+ }
2248
+ }
2249
+
2250
+ // src0, src1 & dst must be mapped to the same session
2251
+ if (src0->buffer &&
2252
+ (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2253
+ return false;
2254
+ }
2255
+ if (src1 && src1->buffer &&
2256
+ (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2257
+ return false;
2258
+ }
2259
+ if (dst->buffer &&
2260
+ (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2261
+ return false;
2262
+ }
2263
+
2264
+ return true;
2265
+ }
2266
+
2267
+ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2268
+ const int32_t * op_params = &op->op_params[0];
2269
+
2270
+ int mode = op_params[2];
2271
+
2272
+ if ((mode & GGML_ROPE_TYPE_NEOX) || (mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
2273
+ return false;
2274
+ }
2275
+ if (mode & 1) {
2276
+ return false;
2277
+ }
2278
+
2279
+ const struct ggml_tensor * src0 = op->src[0];
2280
+ const struct ggml_tensor * src1 = op->src[1];
2281
+ const struct ggml_tensor * src2 = op->src[2];
2282
+ const struct ggml_tensor * dst = op;
2283
+
2284
+ if (!hex_supported_src0_type(src0->type)) {
2285
+ return false; // FIXME: add support for GGML_TYPE_F16 for src0
2286
+ }
2287
+ if (!hex_supported_dst_type(dst->type)) {
2288
+ return false;
2289
+ }
2290
+ if (!hex_supported_src1_type3(src1->type)) {
2291
+ return false;
2292
+ }
2293
+ if (src2) {
2294
+ if (!hex_supported_src2_type(src2->type)) {
2295
+ return false;
2296
+ }
2297
+ int n_dims = op_params[1];
2298
+ if (src2->ne[0] < (n_dims / 2)) {
2299
+ return false;
2300
+ }
2301
+ }
2302
+
2303
+ if (src2) {
2304
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(src2) ||
2305
+ !ggml_is_contiguous(dst)) {
2306
+ return false;
2307
+ }
2308
+ } else {
2309
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
2310
+ return false;
2311
+ }
2312
+ }
2313
+
2314
+ // src0, src1, src2 & dst must be mapped to the same session
2315
+ if (src0->buffer &&
2316
+ (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
2317
+ return false;
2318
+ }
2319
+ if (src1->buffer &&
2320
+ (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
2321
+ return false;
2322
+ }
2323
+ if (src2 && src2->buffer &&
2324
+ (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
2325
+ return false;
2326
+ }
2327
+ if (dst->buffer &&
2328
+ (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
2329
+ return false;
2330
+ }
2331
+
2332
+ return true;
2333
+ }
2334
+
2335
+ // Init hexagon tensor from GGML tensor and Hexagon buffer
2336
+ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
2337
+ h->data = 0; // updated by the receiver
2338
+ h->type = t->type;
2339
+ h->ne[0] = t->ne[0];
2340
+ h->ne[1] = t->ne[1];
2341
+ h->ne[2] = t->ne[2];
2342
+ h->ne[3] = t->ne[3];
2343
+ h->nb[0] = t->nb[0];
2344
+ h->nb[1] = t->nb[1];
2345
+ h->nb[2] = t->nb[2];
2346
+ h->nb[3] = t->nb[3];
2347
+ }
2348
+
2349
+ static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
2350
+ auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2351
+ auto sess = buf->sess;
2352
+
2353
+ HEX_VERBOSE("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
2354
+ t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
2355
+ (unsigned int) d->size);
2356
+ }
2357
+
2358
+ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) {
2359
+ const struct ggml_tensor * src0 = op->src[0];
2360
+ const struct ggml_tensor * src1 = op->src[1];
2361
+ const struct ggml_tensor * dst = op;
2362
+
2363
+ auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2364
+ auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2365
+ auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2366
+
2367
+ uint64_t t1, t2;
2368
+ t1 = ggml_time_us();
2369
+
2370
+ // Construct HTP message
2371
+ htp_general_req req;
2372
+ req.op = HTP_OP_MUL_MAT;
2373
+ req.flags = flags;
2374
+
2375
+ init_htp_tensor(&req.src0, src0);
2376
+ init_htp_tensor(&req.src1, src1);
2377
+ init_htp_tensor(&req.dst, dst);
2378
+
2379
+ // Use opmask to override flags
2380
+ if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2381
+ req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2382
+ }
2383
+ if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2384
+ req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2385
+ }
2386
+
2387
+ dspqueue_buffer bufs[3];
2388
+ memset(bufs, 0, sizeof(bufs));
2389
+
2390
+ // First buffer Weights.
2391
+ // The content is static, there is no need to do any cache management
2392
+ bufs[0].fd = src0_buf->fd;
2393
+ bufs[0].ptr = src0->data;
2394
+ bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
2395
+ bufs[0].size = ggml_nbytes(src0);
2396
+ bufs[0].flags = 0;
2397
+
2398
+ // Second buffer Input Activations. This is a buffer that the CPU
2399
+ // writes and the DSP reads, so we'll need to flush CPU caches and
2400
+ // invalidate DSP ones. On platforms with I/O coherency support the
2401
+ // framework will automatically skip cache operations where possible.
2402
+ bufs[1].fd = src1_buf->fd;
2403
+ bufs[1].ptr = src1->data;
2404
+ bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
2405
+ bufs[1].size = ggml_nbytes(src1);
2406
+ bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2407
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2408
+
2409
+ // Third buffer Output Activations. We'll handle DSP
2410
+ // cache maintenance in the response message but need to flush
2411
+ // CPU caches to ensure any previously written dirty lines are
2412
+ // written out before writes from the DSP start.
2413
+ bufs[2].fd = dst_buf->fd;
2414
+ bufs[2].ptr = dst->data;
2415
+ bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
2416
+ bufs[2].size = ggml_nbytes(dst);
2417
+ bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2418
+
2419
+ // Primary DSP session from the src0 (normally weight) tensor
2420
+ auto sess = src0_buf->sess;
2421
+
2422
+ if (opt_verbose) {
2423
+ char dims[64 * GGML_MAX_SRC];
2424
+ char strides[64 * GGML_MAX_SRC];
2425
+ char types[16 * GGML_MAX_SRC];
2426
+ char buffs[64 * GGML_MAX_SRC];
2427
+ char names[64 * GGML_MAX_SRC];
2428
+
2429
+ hex_format_op_dims(dims, op);
2430
+ hex_format_op_strides(strides, op);
2431
+ hex_format_op_types(types, op);
2432
+ hex_format_op_buffs(buffs, op);
2433
+ hex_format_op_names(names, op);
2434
+
2435
+ HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
2436
+ names, dims, types, strides, buffs, req.flags);
2437
+ if (opt_verbose > 1) {
2438
+ hex_dump_dspbuf(src0, &bufs[0]);
2439
+ hex_dump_dspbuf(src1, &bufs[1]);
2440
+ hex_dump_dspbuf(dst, &bufs[2]);
2441
+ }
2442
+ }
2443
+
2444
+ if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2445
+ sess->enqueue(req, bufs, 3, opt_opsync);
2446
+ }
2447
+
2448
+ t2 = ggml_time_us();
2449
+
2450
+ HEX_PROFILE(
2451
+ "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
2452
+ "call-usec %llu\n",
2453
+ sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2454
+ (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2455
+ (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2456
+ (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2457
+ (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2458
+ }
2459
+
2460
+ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) {
2461
+ const struct ggml_tensor * src0 = op->src[0];
2462
+ const struct ggml_tensor * src1 = op->src[1];
2463
+ const struct ggml_tensor * src2 = op->src[2];
2464
+ const struct ggml_tensor * dst = op;
2465
+
2466
+ auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2467
+ auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2468
+ auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
2469
+ auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2470
+
2471
+ uint64_t t1, t2;
2472
+ t1 = ggml_time_us();
2473
+
2474
+ // Construct HTP message
2475
+ htp_general_req req;
2476
+ req.op = HTP_OP_MUL_MAT_ID;
2477
+ req.flags = flags;
2478
+
2479
+ init_htp_tensor(&req.src0, src0);
2480
+ init_htp_tensor(&req.src1, src1);
2481
+ init_htp_tensor(&req.src2, src2);
2482
+ init_htp_tensor(&req.dst, dst);
2483
+
2484
+ // Use opmask to override flags
2485
+ if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2486
+ req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2487
+ }
2488
+ if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2489
+ req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2490
+ }
2491
+
2492
+ dspqueue_buffer bufs[4];
2493
+ memset(bufs, 0, sizeof(bufs));
2494
+
2495
+ // First buffer Weights.
2496
+ // The content is static, there is no need to do any cache management
2497
+ bufs[0].fd = src0_buf->fd;
2498
+ bufs[0].ptr = src0->data;
2499
+ bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
2500
+ bufs[0].size = ggml_nbytes(src0);
2501
+ bufs[0].flags = 0;
2502
+
2503
+ // Second buffer Input Activations. This is a buffer that the CPU
2504
+ // writes and the DSP reads, so we'll need to flush CPU caches and
2505
+ // invalidate DSP ones. On platforms with I/O coherency support the
2506
+ // framework will automatically skip cache operations where possible.
2507
+ bufs[1].fd = src1_buf->fd;
2508
+ bufs[1].ptr = src1->data;
2509
+ bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
2510
+ bufs[1].size = ggml_nbytes(src1);
2511
+ bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2512
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2513
+
2514
+ // Third buffer expert IDs. This is a buffer that the CPU
2515
+ // writes and the DSP reads, so we'll need to flush CPU caches and
2516
+ // invalidate DSP ones. On platforms with I/O coherency support the
2517
+ // framework will automatically skip cache operations where possible.
2518
+ bufs[2].fd = src2_buf->fd;
2519
+ bufs[2].ptr = src2->data;
2520
+ bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
2521
+ bufs[2].size = ggml_nbytes(src2);
2522
+ bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2523
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2524
+
2525
+ // Forth buffer Output Activations. We'll handle DSP
2526
+ // cache maintenance in the response message but need to flush
2527
+ // CPU caches to ensure any previously written dirty lines are
2528
+ // written out before writes from the DSP start.
2529
+ bufs[3].fd = dst_buf->fd;
2530
+ bufs[3].ptr = dst->data;
2531
+ bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
2532
+ bufs[3].size = ggml_nbytes(dst);
2533
+ bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2534
+
2535
+ // Primary DSP session from the src0 (normally weight) tensor
2536
+ auto sess = src0_buf->sess;
2537
+
2538
+ if (opt_verbose) {
2539
+ char dims[64 * GGML_MAX_SRC];
2540
+ char strides[64 * GGML_MAX_SRC];
2541
+ char types[16 * GGML_MAX_SRC];
2542
+ char buffs[64 * GGML_MAX_SRC];
2543
+ char names[64 * GGML_MAX_SRC];
2544
+
2545
+ hex_format_op_dims(dims, op);
2546
+ hex_format_op_types(types, op);
2547
+ hex_format_op_buffs(buffs, op);
2548
+ hex_format_op_names(names, op);
2549
+
2550
+ HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
2551
+ names, dims, types, strides, buffs, req.flags);
2552
+
2553
+ if (opt_verbose > 1) {
2554
+ hex_dump_dspbuf(src0, &bufs[0]);
2555
+ hex_dump_dspbuf(src1, &bufs[1]);
2556
+ hex_dump_dspbuf(src2, &bufs[2]);
2557
+ hex_dump_dspbuf(dst, &bufs[3]);
2558
+ }
2559
+ }
2560
+
2561
+ if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2562
+ sess->enqueue(req, bufs, 4, opt_opsync);
2563
+ }
2564
+
2565
+ t2 = ggml_time_us();
2566
+
2567
+ HEX_PROFILE(
2568
+ "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u "
2569
+ "op-cycles %u op-pkts %u (%f) call-usec %llu\n",
2570
+ sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
2571
+ (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
2572
+ (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], (uint32_t) src2->ne[2],
2573
+ (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
2574
+ (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2575
+ (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2576
+ }
2577
+
2578
+ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
2579
+ const struct ggml_tensor * node = op;
2580
+ const struct ggml_tensor * src0 = node->src[0];
2581
+ const struct ggml_tensor * src1 = node->src[1];
2582
+ const struct ggml_tensor * dst = node;
2583
+
2584
+ auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2585
+ auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2586
+ auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2587
+
2588
+ uint64_t t1 = 0;
2589
+ uint64_t t2 = 0;
2590
+
2591
+ t1 = ggml_time_us();
2592
+
2593
+ // Construct HTP message
2594
+ htp_general_req req;
2595
+ req.flags = flags;
2596
+
2597
+ // Use opmask to override flags
2598
+ if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2599
+ req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2600
+ }
2601
+ if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2602
+ req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2603
+ }
2604
+
2605
+ switch (node->op) {
2606
+ case GGML_OP_MUL:
2607
+ req.op = HTP_OP_MUL;
2608
+ break;
2609
+ case GGML_OP_ADD:
2610
+ req.op = HTP_OP_ADD;
2611
+ break;
2612
+ case GGML_OP_SUB:
2613
+ req.op = HTP_OP_SUB;
2614
+ break;
2615
+ default:
2616
+ GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
2617
+ }
2618
+
2619
+ init_htp_tensor(&req.src0, src0);
2620
+ init_htp_tensor(&req.src1, src1);
2621
+ init_htp_tensor(&req.dst, dst);
2622
+
2623
+ dspqueue_buffer bufs[3];
2624
+ memset(bufs, 0, sizeof(bufs));
2625
+
2626
+ // First buffer = First Operand of Binary op
2627
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
2628
+ // need to flush CPU caches and invalidate DSP ones. On platforms
2629
+ // with I/O coherency support the framework will automatically skip
2630
+ // cache operations where possible.
2631
+ bufs[0].fd = src0_buf->fd;
2632
+ bufs[0].ptr = src0->data;
2633
+ bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
2634
+ bufs[0].size = ggml_nbytes(src0);
2635
+ bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2636
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
2637
+
2638
+ // Second buffer = Second Operand of Binary op
2639
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
2640
+ // need to flush CPU caches and invalidate DSP ones. On platforms
2641
+ // with I/O coherency support the framework will automatically skip
2642
+ // cache operations where possible.
2643
+ bufs[1].fd = src1_buf->fd;
2644
+ bufs[1].ptr = src1->data;
2645
+ bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
2646
+ bufs[1].size = ggml_nbytes(src1);
2647
+ bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2648
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2649
+
2650
+ // Third buffer = Output Activations. We'll handle DSP
2651
+ // cache maintenance in the response message but need to flush
2652
+ // CPU caches to ensure any previously written dirty lines are
2653
+ // written out before writes from the DSP start.
2654
+ bufs[2].fd = dst_buf->fd;
2655
+ bufs[2].ptr = dst->data;
2656
+ bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
2657
+ bufs[2].size = ggml_nbytes(dst);
2658
+ bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2659
+
2660
+ // Primary DSP session from the src0 tensor
2661
+ ggml_hexagon_session * sess = src0_buf->sess;
2662
+
2663
+ if (opt_verbose) {
2664
+ char dims[64 * GGML_MAX_SRC];
2665
+ char strides[16 * GGML_MAX_SRC];
2666
+ char types[16 * GGML_MAX_SRC];
2667
+ char buffs[64 * GGML_MAX_SRC];
2668
+ char names[64 * GGML_MAX_SRC];
2669
+
2670
+ hex_format_op_dims(dims, op);
2671
+ hex_format_op_strides(strides, op);
2672
+ hex_format_op_types(types, op);
2673
+ hex_format_op_buffs(buffs, op);
2674
+ hex_format_op_names(names, op);
2675
+
2676
+ HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
2677
+ ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
2678
+ if (opt_verbose > 1) {
2679
+ hex_dump_dspbuf(src0, &bufs[0]);
2680
+ hex_dump_dspbuf(src1, &bufs[1]);
2681
+ hex_dump_dspbuf(dst, &bufs[2]);
2682
+ }
2683
+ }
2684
+
2685
+ if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2686
+ sess->enqueue(req, bufs, 3, opt_opsync);
2687
+ }
2688
+
2689
+ t2 = ggml_time_us();
2690
+
2691
+ HEX_PROFILE(
2692
+ "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
2693
+ "call-usec %llu\n",
2694
+ sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2695
+ (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2696
+ (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2697
+ (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2698
+ (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2699
+ }
2700
+
2701
+ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
2702
+ const struct ggml_tensor * node = op;
2703
+ const struct ggml_tensor * src0 = node->src[0];
2704
+ const struct ggml_tensor * src1 = node->src[1];
2705
+ const struct ggml_tensor * src2 = node->src[2];
2706
+ const struct ggml_tensor * dst = node;
2707
+
2708
+ auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2709
+ auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2710
+ auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
2711
+ auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2712
+
2713
+ uint64_t t1 = 0;
2714
+ uint64_t t2 = 0;
2715
+
2716
+ t1 = ggml_time_us();
2717
+
2718
+ // Construct HTP message
2719
+ htp_general_req req;
2720
+ req.flags = flags;
2721
+
2722
+ // Use opmask to override flags
2723
+ if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2724
+ req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2725
+ }
2726
+ if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2727
+ req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2728
+ }
2729
+
2730
+ switch (node->op) {
2731
+ case GGML_OP_ADD_ID:
2732
+ req.op = HTP_OP_ADD_ID;
2733
+ break;
2734
+ default:
2735
+ GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
2736
+ }
2737
+
2738
+ init_htp_tensor(&req.src0, src0);
2739
+ init_htp_tensor(&req.src1, src1);
2740
+ init_htp_tensor(&req.src2, src2);
2741
+ init_htp_tensor(&req.dst, dst);
2742
+
2743
+ dspqueue_buffer bufs[4];
2744
+ memset(bufs, 0, sizeof(bufs));
2745
+
2746
+ // First buffer = input activations
2747
+ bufs[0].fd = src0_buf->fd;
2748
+ bufs[0].ptr = src0->data;
2749
+ bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
2750
+ bufs[0].size = ggml_nbytes(src0);
2751
+ bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2752
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
2753
+
2754
+ // Second buffer = experts bias
2755
+ bufs[1].fd = src1_buf->fd;
2756
+ bufs[1].ptr = src1->data;
2757
+ bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
2758
+ bufs[1].size = ggml_nbytes(src1);
2759
+ bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2760
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2761
+
2762
+ // Third buffer = activated experts
2763
+ bufs[2].fd = src2_buf->fd;
2764
+ bufs[2].ptr = src2->data;
2765
+ bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
2766
+ bufs[2].size = ggml_nbytes(src2);
2767
+ bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2768
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2769
+
2770
+ // Forth buffer = output activations
2771
+ bufs[3].fd = dst_buf->fd;
2772
+ bufs[3].ptr = dst->data;
2773
+ bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
2774
+ bufs[3].size = ggml_nbytes(dst);
2775
+ bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2776
+
2777
+ // Primary DSP session from the src0 tensor
2778
+ ggml_hexagon_session * sess = src0_buf->sess;
2779
+
2780
+ if (opt_verbose) {
2781
+ char dims[64 * GGML_MAX_SRC];
2782
+ char strides[16 * GGML_MAX_SRC];
2783
+ char types[16 * GGML_MAX_SRC];
2784
+ char buffs[64 * GGML_MAX_SRC];
2785
+ char names[64 * GGML_MAX_SRC];
2786
+
2787
+ hex_format_op_dims(dims, op);
2788
+ hex_format_op_strides(strides, op);
2789
+ hex_format_op_types(types, op);
2790
+ hex_format_op_buffs(buffs, op);
2791
+ hex_format_op_names(names, op);
2792
+
2793
+ HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
2794
+ ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
2795
+
2796
+ if (opt_verbose > 1) {
2797
+ hex_dump_dspbuf(src0, &bufs[0]);
2798
+ hex_dump_dspbuf(src1, &bufs[1]);
2799
+ hex_dump_dspbuf(src2, &bufs[2]);
2800
+ hex_dump_dspbuf(dst, &bufs[3]);
2801
+ }
2802
+ }
2803
+
2804
+ if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2805
+ sess->enqueue(req, bufs, 4, opt_opsync);
2806
+ }
2807
+
2808
+ t2 = ggml_time_us();
2809
+
2810
+ HEX_PROFILE(
2811
+ "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
2812
+ "call-usec %llu\n",
2813
+ sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2814
+ (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2815
+ (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2816
+ (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2817
+ (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2818
+ }
2819
+
2820
+ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
2821
+ const struct ggml_tensor * src0 = op->src[0];
2822
+ const struct ggml_tensor * src1 = op->src[1];
2823
+ const struct ggml_tensor * dst = op;
2824
+
2825
+ uint64_t t1 = 0;
2826
+ uint64_t t2 = 0;
2827
+
2828
+ t1 = ggml_time_us();
2829
+
2830
+ // Construct HTP message
2831
+ htp_general_req req;
2832
+
2833
+ memset(&req, 0, sizeof(htp_general_req));
2834
+ memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
2835
+ req.flags = flags;
2836
+
2837
+ bool supported = false;
2838
+
2839
+ switch (op->op) {
2840
+ case GGML_OP_RMS_NORM:
2841
+ req.op = HTP_OP_RMS_NORM;
2842
+ supported = true;
2843
+ break;
2844
+
2845
+ case GGML_OP_UNARY:
2846
+ if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) {
2847
+ req.op = HTP_OP_UNARY_SILU;
2848
+ supported = true;
2849
+ }
2850
+ break;
2851
+
2852
+ case GGML_OP_GLU:
2853
+ if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU) {
2854
+ req.op = HTP_OP_GLU_SWIGLU;
2855
+ supported = true;
2856
+ } else if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) {
2857
+ req.op = HTP_OP_GLU_SWIGLU_OAI;
2858
+ supported = true;
2859
+ }
2860
+ break;
2861
+
2862
+ case GGML_OP_SOFT_MAX:
2863
+ req.op = HTP_OP_SOFTMAX;
2864
+ supported = true;
2865
+
2866
+ default:
2867
+ break;
2868
+ }
2869
+
2870
+ if (!supported) {
2871
+ GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op);
2872
+ }
2873
+
2874
+ init_htp_tensor(&req.dst, dst);
2875
+ init_htp_tensor(&req.src0, src0);
2876
+ if (src1) {
2877
+ init_htp_tensor(&req.src1, src1);
2878
+ }
2879
+
2880
+ // Use opmask to override flags
2881
+ if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2882
+ req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2883
+ }
2884
+ if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2885
+ req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2886
+ }
2887
+
2888
+ dspqueue_buffer bufs[3];
2889
+ int n_bufs = 0;
2890
+
2891
+ memset(bufs, 0, sizeof(bufs));
2892
+
2893
+ // First buffer = Only Operand of Unary op
2894
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
2895
+ // need to flush CPU caches and invalidate DSP ones. On platforms
2896
+ // with I/O coherency support the framework will automatically skip
2897
+ // cache operations where possible.
2898
+ auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
2899
+ bufs[n_bufs].fd = src0_buf->fd;
2900
+ bufs[n_bufs].ptr = src0->data;
2901
+ bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
2902
+ bufs[n_bufs].size = ggml_nbytes(src0);
2903
+ bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2904
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
2905
+ ++n_bufs;
2906
+
2907
+ if (src1) {
2908
+ // Second buffer = Second Operand of Binary op
2909
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
2910
+ // need to flush CPU caches and invalidate DSP ones. On platforms
2911
+ // with I/O coherency support the framework will automatically skip
2912
+ // cache operations where possible.
2913
+ auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
2914
+ bufs[n_bufs].fd = src1_buf->fd;
2915
+ bufs[n_bufs].ptr = src1->data;
2916
+ bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
2917
+ bufs[n_bufs].size = ggml_nbytes(src1);
2918
+ bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2919
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2920
+ ++n_bufs;
2921
+ }
2922
+
2923
+ // Second or third buffer = Output Activations. We'll handle DSP
2924
+ // Second buffer = Output Activations. We'll handle DSP
2925
+ // cache maintenance in the response message but need to flush
2926
+ // CPU caches to ensure any previously written dirty lines are
2927
+ // written out before writes from the DSP start.
2928
+ auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
2929
+ bufs[n_bufs].fd = dst_buf->fd;
2930
+ bufs[n_bufs].ptr = dst->data;
2931
+ bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
2932
+ bufs[n_bufs].size = ggml_nbytes(dst);
2933
+ bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
2934
+ ++n_bufs;
2935
+
2936
+ // Primary DSP session from the src0 tensor
2937
+ ggml_hexagon_session * sess = src0_buf->sess;
2938
+
2939
+ if (opt_verbose) {
2940
+ char dims[64 * GGML_MAX_SRC];
2941
+ char strides[64 * GGML_MAX_SRC];
2942
+ char types[16 * GGML_MAX_SRC];
2943
+ char buffs[64 * GGML_MAX_SRC];
2944
+ char names[64 * GGML_MAX_SRC];
2945
+
2946
+ hex_format_op_dims(dims, op);
2947
+ hex_format_op_strides(strides, op);
2948
+ hex_format_op_types(types, op);
2949
+ hex_format_op_buffs(buffs, op);
2950
+ hex_format_op_names(names, op);
2951
+
2952
+ HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
2953
+ names, dims, types, strides, buffs, req.flags);
2954
+ if (opt_verbose > 1) {
2955
+ hex_dump_dspbuf(src0, &bufs[0]);
2956
+ if (src1) {
2957
+ hex_dump_dspbuf(src1, &bufs[1]);
2958
+ hex_dump_dspbuf(dst, &bufs[2]);
2959
+ } else {
2960
+ hex_dump_dspbuf(dst, &bufs[1]);
2961
+ }
2962
+ }
2963
+ }
2964
+
2965
+ if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2966
+ sess->enqueue(req, bufs, n_bufs, opt_opsync);
2967
+ }
2968
+
2969
+ t2 = ggml_time_us();
2970
+
2971
+ if (src1) {
2972
+ HEX_PROFILE(
2973
+ "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
2974
+ "(%f) call-usec %llu\n",
2975
+ sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2976
+ (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2977
+ (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2978
+ (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2979
+ (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2980
+ } else {
2981
+ HEX_PROFILE(
2982
+ "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
2983
+ "%llu\n",
2984
+ sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2985
+ (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2986
+ (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2987
+ (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2988
+ }
2989
+ }
2990
+
2991
+ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
2992
+ const struct ggml_tensor * src0 = op->src[0];
2993
+ const struct ggml_tensor * src1 = op->src[1];
2994
+ const struct ggml_tensor * src2 = op->src[2];
2995
+ const struct ggml_tensor * dst = op;
2996
+
2997
+ uint64_t t1 = 0;
2998
+ uint64_t t2 = 0;
2999
+
3000
+ t1 = ggml_time_us();
3001
+
3002
+ // Construct HTP message
3003
+ htp_general_req req;
3004
+
3005
+ memset(&req, 0, sizeof(htp_general_req));
3006
+ memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
3007
+ req.flags = flags;
3008
+ req.op = HTP_OP_ROPE;
3009
+
3010
+ init_htp_tensor(&req.dst, dst);
3011
+ init_htp_tensor(&req.src0, src0);
3012
+ init_htp_tensor(&req.src1, src1);
3013
+ if (src2) {
3014
+ init_htp_tensor(&req.src2, src2);
3015
+ }
3016
+
3017
+ // Use opmask to override flags
3018
+ if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
3019
+ req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
3020
+ }
3021
+ if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
3022
+ req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
3023
+ }
3024
+
3025
+ dspqueue_buffer bufs[4];
3026
+ int n_bufs = 0;
3027
+
3028
+ memset(bufs, 0, sizeof(bufs));
3029
+
3030
+ // First buffer
3031
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
3032
+ // need to flush CPU caches and invalidate DSP ones. On platforms
3033
+ // with I/O coherency support the framework will automatically skip
3034
+ // cache operations where possible.
3035
+ auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
3036
+ bufs[n_bufs].fd = src0_buf->fd;
3037
+ bufs[n_bufs].ptr = src0->data;
3038
+ bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
3039
+ bufs[n_bufs].size = ggml_nbytes(src0);
3040
+ bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
3041
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
3042
+ ++n_bufs;
3043
+
3044
+ // Second buffer
3045
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
3046
+ // need to flush CPU caches and invalidate DSP ones. On platforms
3047
+ // with I/O coherency support the framework will automatically skip
3048
+ // cache operations where possible.
3049
+ auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
3050
+ bufs[n_bufs].fd = src1_buf->fd;
3051
+ bufs[n_bufs].ptr = src1->data;
3052
+ bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
3053
+ bufs[n_bufs].size = ggml_nbytes(src1);
3054
+ bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
3055
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
3056
+ ++n_bufs;
3057
+
3058
+ if (src2) {
3059
+ // Third buffer
3060
+ // This is a buffer that the CPU writes and the DSP reads, so we'll
3061
+ // need to flush CPU caches and invalidate DSP ones. On platforms
3062
+ // with I/O coherency support the framework will automatically skip
3063
+ // cache operations where possible.
3064
+ auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
3065
+ bufs[n_bufs].fd = src2_buf->fd;
3066
+ bufs[n_bufs].ptr = src2->data;
3067
+ bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
3068
+ bufs[n_bufs].size = ggml_nbytes(src2);
3069
+ bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
3070
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
3071
+ ++n_bufs;
3072
+ }
3073
+
3074
+ // Final buffer = Output Activations. We'll handle DSP
3075
+ // Second buffer = Output Activations. We'll handle DSP
3076
+ // cache maintenance in the response message but need to flush
3077
+ // CPU caches to ensure any previously written dirty lines are
3078
+ // written out before writes from the DSP start.
3079
+ auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
3080
+ bufs[n_bufs].fd = dst_buf->fd;
3081
+ bufs[n_bufs].ptr = dst->data;
3082
+ bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
3083
+ bufs[n_bufs].size = ggml_nbytes(dst);
3084
+ bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
3085
+ ++n_bufs;
3086
+
3087
+ // Primary DSP session from the src0 tensor
3088
+ ggml_hexagon_session * sess = src0_buf->sess;
3089
+
3090
+ if (opt_verbose) {
3091
+ char dims[64 * GGML_MAX_SRC];
3092
+ char strides[64 * GGML_MAX_SRC];
3093
+ char types[16 * GGML_MAX_SRC];
3094
+ char buffs[64 * GGML_MAX_SRC];
3095
+ char names[64 * GGML_MAX_SRC];
3096
+
3097
+ hex_format_op_dims(dims, op);
3098
+ hex_format_op_strides(strides, op);
3099
+ hex_format_op_types(types, op);
3100
+ hex_format_op_buffs(buffs, op);
3101
+ hex_format_op_names(names, op);
3102
+
3103
+ HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
3104
+ names, dims, types, strides, buffs, req.flags);
3105
+ if (opt_verbose > 1) {
3106
+ hex_dump_dspbuf(src0, &bufs[0]);
3107
+ if (src1) {
3108
+ hex_dump_dspbuf(src1, &bufs[1]);
3109
+ hex_dump_dspbuf(dst, &bufs[2]);
3110
+ } else {
3111
+ hex_dump_dspbuf(dst, &bufs[1]);
3112
+ }
3113
+ }
3114
+ }
3115
+
3116
+ if ((opt_opmask & HTP_OPMASK_QUEUE)) {
3117
+ sess->enqueue(req, bufs, n_bufs, opt_opsync);
3118
+ }
3119
+
3120
+ t2 = ggml_time_us();
3121
+
3122
+ if (src2) {
3123
+ HEX_PROFILE(
3124
+ "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles "
3125
+ "%u op-pkts %u (%f) call-usec %llu\n",
3126
+ sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
3127
+ (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
3128
+ (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1],
3129
+ (uint32_t) src2->ne[2], (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
3130
+ (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
3131
+ (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
3132
+ } else {
3133
+ HEX_PROFILE(
3134
+ "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
3135
+ "(%f) call-usec %llu\n",
3136
+ sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
3137
+ (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
3138
+ (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
3139
+ (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
3140
+ (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
3141
+ }
3142
+ }
3143
+
3144
+ static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
3145
+ auto sess = static_cast<ggml_hexagon_session *>(backend->context);
3146
+ return sess->name.c_str();
3147
+ }
3148
+
3149
+ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
3150
+ // we just need to delete the backend here
3151
+ // the sessions are allocated & freed as part of the registry
3152
+ delete backend;
3153
+ }
3154
+
3155
+ static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
3156
+ return (op0 && op0->src[1] == op1->src[1]);
3157
+ }
3158
+
3159
+ static inline bool is_compute_op(ggml_tensor *node)
3160
+ {
3161
+ return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
3162
+ }
3163
+
3164
+ // scan the graph and figure out last compute op index
3165
+ static inline int last_compute_op(ggml_cgraph * graph) {
3166
+ int last = 0;
3167
+ for (int i = 0; i < graph->n_nodes; ++i) {
3168
+ if (is_compute_op(graph->nodes[i])) {
3169
+ last = i;
3170
+ }
3171
+ }
3172
+
3173
+ return last;
3174
+ }
3175
+
3176
// Execute every compute op of the graph on the Hexagon DSP.
// Ops are enqueued (asynchronously unless opt_opsync) in graph order; the
// single flush() at the end waits for all of them, so intermediate results
// never need to round-trip through the CPU.
static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
    auto sess = static_cast<ggml_hexagon_session *>(backend->context);

    HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);

    const int last = last_compute_op(graph);

    const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer

    for (int i = 0; i < graph->n_nodes; ++i) {
        ggml_tensor * node = graph->nodes[i];

        // views/reshapes/empty tensors need no DSP work
        if (!is_compute_op(node)) {
            continue;
        }

        uint32_t flags = 0;

        // skip quantizer if src1 is reused: the previous matmul already left
        // the dynamically quantized src1 in VTCM
        if (op_reuse_src1(node, prev_quant_op)) {
            flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
        }

        // ask for early notification for the last Op, so the CPU-side flush
        // wakes up with minimal latency
        if (i == last) {
            flags |= HTP_OPFLAGS_EARLY_WAKEUP;
        }

        // dispatch by op type; only ops accepted by device_supports_op should
        // ever reach this point
        switch (node->op) {
            case GGML_OP_MUL_MAT:
                ggml_hexagon_mul_mat(node, flags);
                prev_quant_op = node; // remember for src1-reuse detection
                break;
            case GGML_OP_MUL_MAT_ID:
                ggml_hexagon_mul_mat_id(node, flags);
                prev_quant_op = node; // remember for src1-reuse detection
                break;
            case GGML_OP_MUL:
            case GGML_OP_ADD:
            case GGML_OP_SUB:
                ggml_hexagon_binary(node, flags);
                break;
            case GGML_OP_ADD_ID:
                ggml_hexagon_add_id(node, flags);
                break;
            case GGML_OP_RMS_NORM:
                ggml_hexagon_unary(node, flags);
                break;
            case GGML_OP_UNARY:
                if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
                    ggml_hexagon_unary(node, flags);
                }
                break;
            case GGML_OP_GLU:
                if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
                    (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
                    ggml_hexagon_unary(node, flags);
                }
                break;
            case GGML_OP_SOFT_MAX:
                ggml_hexagon_unary(node, flags);
                break;

            case GGML_OP_ROPE:
                ggml_hexagon_rope(node, flags);
                break;

            default:
                GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
        }
    }

    // Wait until all pending ops complete
    sess->flush();

    return GGML_STATUS_SUCCESS;
}
+ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
3255
+ auto sess = static_cast<ggml_hexagon_session *>(backend->context);
3256
+
3257
+ HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str());
3258
+
3259
+ // Wait until all pending ops complete
3260
+ sess->flush();
3261
+ }
3262
+
3263
// Helper used during graph optimization: a graph node together with the
// tensors fused into it, plus predicates used by the reorder pass.
struct node_info {
    ggml_tensor * node;  // the leading (first) op of the fused group

    std::vector<ggml_tensor *> fused;  // ops fused after `node`, in order

    ggml_op op() const {
        return node->op;
    }

    // The tensor that produces the group's final output: the last fused op,
    // or the node itself when nothing was fused.
    const ggml_tensor * dst() const {
        return fused.empty() ? node : fused.back();
    }

    const ggml_tensor * src0() const {
        return node->src[0];
    }

    const ggml_tensor * src1() const {
        return node->src[1];
    }

    bool is_empty() const {
        return ggml_op_is_empty(node->op);
    }

    void add_fused(ggml_tensor * t) {
        fused.push_back(t);
    }

    // A node is "stackable" when it is a matmul over quantized weights:
    // such ops can share a dynamically quantized src1 held in VTCM.
    bool stackable() const {
        switch (this->op()) {
            case GGML_OP_MUL_MAT:
            case GGML_OP_MUL_MAT_ID:
                return ggml_is_quantized(this->src0()->type);
            default:
                return false;
        }
    }

    // True when both nodes read the same src1 tensor.
    bool same_input(const node_info& n) const {
        return n.src1() == this->src1();
    }
};
+ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
3308
+ const int n = nodes.size();
3309
+
3310
+ std::vector<int> res;
3311
+ res.reserve(n);
3312
+
3313
+ std::vector<bool> used(n, false);
3314
+
3315
+ // The main goal here is to stack the MUL_MAT ops with the same src1 input.
3316
+ // This allows use to reuse dynamically quantized src1 in VTCM.
3317
+
3318
+ // TODO: the current version might do incorrect reodering in cases where quantized src0
3319
+ // input is an output of another Op.
3320
+
3321
+ for (int i0 = 0; i0 < n; i0++) {
3322
+ if (used[i0]) {
3323
+ continue;
3324
+ }
3325
+
3326
+ res.push_back(i0);
3327
+
3328
+ const auto & node0 = nodes[i0];
3329
+
3330
+ if (!node0.stackable()) {
3331
+ continue;
3332
+ }
3333
+
3334
+ // that many nodes forward to search for stackable nodes that can reuse VTCM
3335
+ constexpr int N_FORWARD = 8;
3336
+
3337
+ for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
3338
+ if (used[i1]) {
3339
+ continue;
3340
+ }
3341
+
3342
+ const auto & node1 = nodes[i1];
3343
+
3344
+ if (node1.stackable() && node1.same_input(node0)) {
3345
+ res.push_back(i1);
3346
+ used[i1] = true;
3347
+ }
3348
+ }
3349
+ }
3350
+
3351
+ return res;
3352
+ }
3353
+
3354
// Reorder the graph to maximize VTCM reuse without breaking op fusion.
// Fusable runs are first packed into single node_info entries, the reorder is
// computed over those packed entries, then the graph is rewritten ("unfused")
// in the new order.
static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgraph * gf) {
    const int n = gf->n_nodes;

    // max length of a fused run (leading op + fused followers)
    constexpr int MAX_FUSE = 16;

    enum ggml_op ops[MAX_FUSE];

    std::vector<node_info> nodes;
    nodes.reserve(gf->n_nodes);

    // fuse nodes:
    // we don't want to make reorders that break fusing, so we first pack all fusable tensors
    // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
    for (int i = 0; i < n; i++) {
        node_info node = {
            /*.node =*/ gf->nodes[i],
            /*.fused =*/ {},
        };

        // fuse only ops that start with these operations
        // can be expanded when needed
        if (node.op() == GGML_OP_ADD ||
            node.op() == GGML_OP_NORM ||
            node.op() == GGML_OP_RMS_NORM) {
            ops[0] = node.op();

            // collect the candidate run of fusable ops following node i
            int f = i + 1;
            while (f < n && f < i + MAX_FUSE) {
                // conservatively allow fusing only these ops
                // can be expanded when needed
                if (gf->nodes[f]->op != GGML_OP_ADD &&
                    gf->nodes[f]->op != GGML_OP_MUL &&
                    gf->nodes[f]->op != GGML_OP_NORM &&
                    gf->nodes[f]->op != GGML_OP_RMS_NORM) {
                    break;
                }
                ops[f - i] = gf->nodes[f]->op;
                f++;
            }

            // shrink the run until ggml_can_fuse accepts it (f becomes the
            // accepted run length, 1 == no fusion)
            f -= i;
            for (; f > 1; f--) {
                if (ggml_can_fuse(gf, i, ops, f)) {
                    break;
                }
            }

            // add the fused tensors into the node info so we can unfuse them later
            // (note: advances the outer loop index past the fused ops)
            for (int k = 1; k < f; k++) {
                ++i;

                // the .dst() becomes the last fused tensor
                node.add_fused(gf->nodes[i]);
            }
        }

        nodes.push_back(std::move(node));
    }

    const auto order = ggml_hexagon_graph_optimize_reorder(nodes);

    // unfuse: rewrite gf->nodes in the new order, expanding fused runs back out
    {
        int j = 0;
        for (const auto i : order) {
            const auto & node = nodes[i];

            gf->nodes[j++] = node.node;

            for (auto * fused : node.fused) {
                gf->nodes[j++] = fused;
            }
        }
    }
}
// Backend vtable. Async set/get/copy and graph plans are not implemented;
// synchronization is done via session flush.
static struct ggml_backend_i hexagon_backend_i = {
    /* .get_name                = */ ggml_backend_hexagon_name,
    /* .free                    = */ ggml_backend_hexagon_free,
    /* .set_tensor_async        = */ NULL,
    /* .get_tensor_async        = */ NULL,
    /* .cpy_tensor_async        = */ NULL,
    /* .synchronize             = */ ggml_backend_hexagon_synchronize,
    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_update       = */ NULL,
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_hexagon_graph_compute,
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
    /* .graph_optimize          = */ ggml_backend_hexagon_graph_optimize,
};
// Stable GUID identifying Hexagon backend instances (used by the ggml
// backend framework to match backends of the same type).
static ggml_guid_t ggml_backend_hexagon_guid() {
    static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49,
                              0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 };
    return &guid;
}
+ bool ggml_backend_is_hexagon(ggml_backend_t backend) {
3454
+ return backend && backend->iface.get_name == ggml_backend_hexagon_name;
3455
+ }
3456
+
3457
+ // device interface
3458
+
3459
// Create a backend bound to the device's session. `params` is unused.
static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, const char * params) {
    auto sess = static_cast<ggml_hexagon_session *>(dev->context);

    return new ggml_backend{
        /* .guid      = */ ggml_backend_hexagon_guid(),
        /* .interface = */ hexagon_backend_i,
        /* .device    = */ dev,
        /* .context   = */ sess,
    };

    GGML_UNUSED(params);
}
+ static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
3473
+ auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3474
+ return sess->name.c_str();
3475
+
3476
+ GGML_UNUSED(dev);
3477
+ }
3478
+
3479
+ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) {
3480
+ return "Hexagon";
3481
+ GGML_UNUSED(dev);
3482
+ }
3483
+
3484
+ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
3485
+ // ~2GB per session for now
3486
+ *free = 2ULL * 1024 * 1024 * 1024;
3487
+ *total = *free;
3488
+
3489
+ GGML_UNUSED(dev);
3490
+ }
3491
+
3492
+ static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) {
3493
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
3494
+
3495
+ GGML_UNUSED(dev);
3496
+ }
3497
+
3498
// Fill in the device property struct (name/description/type/memory/caps).
static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_hexagon_device_get_name(dev);
    props->description = ggml_backend_hexagon_device_get_description(dev);
    props->type        = ggml_backend_hexagon_device_get_type(dev);
    ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = {
        /* .async                = */ true,
        /* .host_buffer          = */ (bool) opt_hostbuf,  // toggled via GGML_HEXAGON_HOSTBUF
        /* .buffer_from_host_ptr = */ false,
        /* .events               = */ false,
    };
}
+ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) {
3512
+ auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3513
+ return &sess->buffer_type;
3514
+ }
3515
+
3516
+ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_type(ggml_backend_dev_t dev) {
3517
+ auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3518
+ return &sess->repack_buffer_type;
3519
+ }
3520
+
3521
// Decide whether this device can execute the given op; per-op helpers check
// types, shapes and buffer placement. Ops rejected here fall back to other
// backends.
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    auto sess = static_cast<ggml_hexagon_session *>(dev->context);

    bool supp = false;

    switch (op->op) {
        // pure view/layout ops never need DSP work
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            supp = true;
            break;

        case GGML_OP_MUL_MAT:
            supp = ggml_hexagon_supported_mul_mat(sess, op);
            break;

        case GGML_OP_MUL_MAT_ID:
            supp = ggml_hexagon_supported_mul_mat_id(sess, op);
            break;

        case GGML_OP_MUL:
        case GGML_OP_ADD:
        case GGML_OP_SUB:
            supp = ggml_hexagon_supported_binary(sess, op);
            break;

        case GGML_OP_ADD_ID:
            supp = ggml_hexagon_supported_add_id(sess, op);
            break;

        case GGML_OP_RMS_NORM:
            supp = ggml_hexagon_supported_unary(sess, op);
            break;

        case GGML_OP_SOFT_MAX:
            supp = ggml_hexagon_supported_softmax(sess, op);
            break;

        case GGML_OP_UNARY:
            // only SILU among the unary activations
            if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) {
                supp = ggml_hexagon_supported_activations(sess, op);
            }
            break;

        case GGML_OP_GLU:
            // SWIGLU_OAI intentionally disabled here (kept commented out)
            if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) /* || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) */) {
                supp = ggml_hexagon_supported_activations(sess, op);
            }
            break;

        case GGML_OP_ROPE:
            supp = ggml_hexagon_supported_rope(sess, op);
            break;

        default:
            break;
    }

    if (opt_verbose) {
        char dims[64 * GGML_MAX_SRC];
        char strides[64 * GGML_MAX_SRC];
        char types[16 * GGML_MAX_SRC];
        char buffs[64 * GGML_MAX_SRC];
        char names[64 * GGML_MAX_SRC];

        hex_format_op_dims(dims, op);
        hex_format_op_strides(strides, op);
        hex_format_op_types(types, op);
        hex_format_op_buffs(buffs, op);
        hex_format_op_names(names, op);

        HEX_VERBOSE("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(),
                    ggml_op_name(op->op), names, dims, types, strides, buffs, (int) supp);
    }

    return supp;

    GGML_UNUSED(dev);
}
+ static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
3604
+ if (buft->iface.get_alignment != ggml_backend_hexagon_buffer_type_get_alignment) {
3605
+ return false;
3606
+ }
3607
+
3608
+ auto s0 = static_cast<ggml_hexagon_session *>(dev->context);
3609
+ auto s1 = static_cast<ggml_backend_hexagon_buffer_type_context *>(buft->context)->sess;
3610
+
3611
+ // Need session/domain-id for buffers to be compatible
3612
+ bool supp = (s0->session_id == s1->session_id);
3613
+
3614
+ HEX_VERBOSE("ggml-hex: %s device-supports-buft %s (%d)\n", s0->name.c_str(), s1->name.c_str(), (int) supp);
3615
+
3616
+ return supp;
3617
+ }
3618
+
3619
// Return the NULL-terminated list of extra buffer types (just the repack
// type). NOTE(review): the returned array is function-static and is
// overwritten on each call — presumably only one device's list is consumed
// at a time; confirm against callers.
static ggml_backend_buffer_type_t * ggml_backend_hexagon_device_get_extra_buffers_type(ggml_backend_dev_t dev) {
    auto s0 = static_cast<ggml_hexagon_session *>(dev->context);
    HEX_VERBOSE("ggml-hex: device-get-extra-buft : %s \n", s0->name.c_str());

    static ggml_backend_buffer_type_t bufts[2];
    bufts[0] = ggml_backend_hexagon_device_get_repack_buffer_type(dev);
    bufts[1] = NULL;
    return bufts;
}
// Device vtable. Host buffers, host-pointer import, offload hook and events
// are not implemented.
static const struct ggml_backend_device_i ggml_backend_hexagon_device_i = {
    /* .get_name             = */ ggml_backend_hexagon_device_get_name,
    /* .get_description      = */ ggml_backend_hexagon_device_get_description,
    /* .get_memory           = */ ggml_backend_hexagon_device_get_memory,
    /* .get_type             = */ ggml_backend_hexagon_device_get_type,
    /* .get_props            = */ ggml_backend_hexagon_device_get_props,
    /* .init_backend         = */ ggml_backend_hexagon_device_init,
    /* .get_buffer_type      = */ ggml_backend_hexagon_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL, // ggml_backend_hexagon_device_get_host_buffer_type,
    /* .buffer_from_host_ptr = */ NULL, // ggml_backend_hexagon_device_buffer_from_ptr,
    /* .supports_op          = */ ggml_backend_hexagon_device_supports_op,
    /* .supports_buft        = */ ggml_backend_hexagon_device_supports_buft,
    /* .offload_op           = */ NULL, // ggml_backend_hexagon_device_offload_op,
    /* .event_new            = */ NULL,
    /* .event_free           = */ NULL,
    /* .event_synchronize    = */ NULL,
};
+ //** backend registry
3648
+
3649
+ #define GGML_HEXAGON_MAX_SESSIONS 16
3650
+
3651
// Owns the backend devices (one Hexagon session per device). Sessions are
// created in the constructor and destroyed with the registry; only the
// first opt_ndev array slots are populated.
struct ggml_hexagon_registry {
    ggml_hexagon_registry(ggml_backend_reg_t reg);
    ~ggml_hexagon_registry();

    ggml_backend_device devices[GGML_HEXAGON_MAX_SESSIONS];
};
+ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
3659
+ GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev);
3660
+
3661
+ if (!opt_arch) {
3662
+ int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
3663
+ if (err != 0) {
3664
+ GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
3665
+ opt_arch = 73;
3666
+ }
3667
+ }
3668
+
3669
+ if(opt_arch < 75) {
3670
+ opt_ndev = 1;
3671
+ GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
3672
+ }
3673
+
3674
+ GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
3675
+
3676
+ // Create devices / sessions
3677
+ for (size_t i = 0; i < opt_ndev; i++) {
3678
+ devices[i].iface = ggml_backend_hexagon_device_i;
3679
+ devices[i].reg = reg;
3680
+ try {
3681
+ devices[i].context = new ggml_hexagon_session(i, &devices[i]);
3682
+ } catch (std::exception const &exc) {
3683
+ GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
3684
+ devices[i].context = nullptr;
3685
+ }
3686
+ }
3687
+ }
3688
+
3689
+ ggml_hexagon_registry::~ggml_hexagon_registry() {
3690
+ GGML_LOG_INFO("ggml-hex: releasing registry\n");
3691
+
3692
+ // Release devices / sessions
3693
+ for (size_t i = 0; i < opt_ndev; i++) {
3694
+ auto sess = static_cast<ggml_hexagon_session *>(devices[i].context);
3695
+ delete sess;
3696
+ }
3697
+ }
3698
+
3699
+ static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) {
3700
+ return "HTP";
3701
+ GGML_UNUSED(reg);
3702
+ }
3703
+
3704
+ static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) {
3705
+ return opt_ndev;
3706
+ GGML_UNUSED(reg);
3707
+ }
3708
+
3709
+ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) {
3710
+ auto hreg = static_cast<ggml_hexagon_registry *>(reg->context);
3711
+
3712
+ if (index >= opt_ndev || !hreg->devices[index].context) {
3713
+ return nullptr;
3714
+ }
3715
+
3716
+ return &hreg->devices[index];
3717
+ }
3718
+
3719
+ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) {
3720
+ if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
3721
+ ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type;
3722
+ return (void *) fct;
3723
+ }
3724
+
3725
+ return NULL;
3726
+ }
3727
+
3728
// One-time backend initialization: validate type-enum parity with ggml,
// read all GGML_HEXAGON_* environment knobs into the opt_* globals, then
// build the registry (which creates the sessions).
static void ggml_hexagon_init(ggml_backend_reg * reg) {
    // Basic sanity checks to make sure definitions match
    static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
                  "please update hexagon_type to match ggml_type");

    const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
    const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");

    // verbosity level (0 = quiet); remaining toggles are presence-only flags
    opt_verbose      = str_verbose ? atoi(str_verbose) : 0;
    opt_profile      = getenv("GGML_HEXAGON_PROFILE") != nullptr;
    opt_etm          = getenv("GGML_HEXAGON_ETM") != nullptr;
    opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr;

    // bitmask controlling which op stages run (quantize/compute/queue)
    const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
    if (str_opmask != nullptr) {
        opt_opmask = strtoul(str_opmask, NULL, 0);
    }
    opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr;

    // number of devices/sessions, clamped to the registry's capacity
    const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
    if (str_ndev) {
        opt_ndev = strtoul(str_ndev, NULL, 0);
        if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
            opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
        }
    }

    // number of HVX threads per session
    const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
    if (str_nhvx) {
        opt_nhvx = strtoul(str_nhvx, NULL, 0);
    }

    // override HTP arch version, accepting both "75" and "v75" spellings
    const char * str_arch = getenv("GGML_HEXAGON_ARCH");
    if (str_arch) {
        if (str_arch[0] == 'v') {
            str_arch++;
        }
        opt_arch = strtoul(str_arch, NULL, 0);
    }

    // host buffers enabled by default
    opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;

    reg->context = new ggml_hexagon_registry(reg);

    HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
                sizeof(struct htp_general_rsp));
}
// Registry vtable.
static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = {
    /* .get_name         = */ ggml_backend_hexagon_reg_get_name,
    /* .get_device_count = */ ggml_backend_hexagon_reg_get_device_count,
    /* .get_device       = */ ggml_backend_hexagon_reg_get_device,
    /* .get_proc_address = */ ggml_backend_hexagon_get_proc_address,
};
// Public entry point: return the (singleton) Hexagon backend registry.
// Initialization runs exactly once, guarded by a mutex so concurrent first
// calls are safe.
ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
    static bool initialized = false;

    static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION,
                                    /* .iface       = */ ggml_backend_hexagon_reg_i,
                                    /* .context     = */ NULL };

    {
        static std::mutex mutex;
        std::lock_guard<std::mutex> lock(mutex);
        if (!initialized) {
            ggml_hexagon_init(&reg);
        }

        initialized = true;
    }

    return &reg;
}
+ GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg)