llama-cpp-pydist 0.18.0__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (362)
  1. llama_cpp/binaries/{llama-b7376-bin-win-cpu-x64.zip → llama-b7621-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.20.0.dist-info/METADATA +4539 -0
  3. {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/RECORD +358 -318
  4. {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/WHEEL +1 -1
  5. scripts/generate_changelog.py +10 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.devops/cann.Dockerfile +1 -1
  7. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  8. vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cli-cann.Dockerfile +3 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cpp-cuda.srpm.spec +2 -0
  10. vendor_llama_cpp_pydist/llama.cpp/.devops/llama-cpp.srpm.spec +2 -0
  11. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  12. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  13. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +19 -5
  14. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +14 -2
  15. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +33 -2
  16. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  17. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +12 -48
  18. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server-webui.yml +225 -0
  19. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +0 -264
  20. vendor_llama_cpp_pydist/llama.cpp/.gitignore +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  22. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  23. vendor_llama_cpp_pydist/llama.cpp/CODEOWNERS +3 -2
  24. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  25. vendor_llama_cpp_pydist/llama.cpp/README.md +4 -2
  26. vendor_llama_cpp_pydist/llama.cpp/SECURITY.md +3 -0
  27. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +6 -0
  28. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  29. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +259 -66
  30. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +12 -2
  31. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  32. vendor_llama_cpp_pydist/llama.cpp/common/chat-peg-parser.cpp +12 -2
  33. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +172 -3
  34. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  35. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +151 -88
  36. vendor_llama_cpp_pydist/llama.cpp/common/common.h +38 -13
  37. vendor_llama_cpp_pydist/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  38. vendor_llama_cpp_pydist/llama.cpp/common/json-schema-to-grammar.h +20 -0
  39. vendor_llama_cpp_pydist/llama.cpp/common/peg-parser.cpp +1 -1
  40. vendor_llama_cpp_pydist/llama.cpp/common/preset.cpp +218 -6
  41. vendor_llama_cpp_pydist/llama.cpp/common/preset.h +45 -3
  42. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +67 -54
  43. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +8 -0
  44. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +904 -454
  45. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +6 -0
  46. vendor_llama_cpp_pydist/llama.cpp/docs/android/imported-into-android-studio.jpg +0 -0
  47. vendor_llama_cpp_pydist/llama.cpp/docs/android.md +22 -2
  48. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +1 -1
  49. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +3 -1
  50. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/CMakeUserPresets.json +2 -0
  51. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  52. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  53. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  54. vendor_llama_cpp_pydist/llama.cpp/docs/development/HOWTO-add-model.md +3 -2
  55. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  56. vendor_llama_cpp_pydist/llama.cpp/docs/docker.md +15 -11
  57. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  58. vendor_llama_cpp_pydist/llama.cpp/docs/ops/SYCL.csv +797 -361
  59. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +10 -10
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +18 -1
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +2 -1
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml.h +2 -1
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +24 -7
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-alloc.c +56 -12
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +32 -13
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +248 -19
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +3 -0
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +21 -172
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +16 -2
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +42 -1
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argmax.cu +2 -2
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +2 -2
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +36 -0
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +69 -33
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +67 -31
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +189 -111
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +14 -10
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +204 -42
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +14 -20
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +3 -1
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -746
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +2 -1
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +369 -129
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +6 -5
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +154 -47
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +65 -2
  113. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +4 -1
  114. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +196 -48
  115. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  116. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  117. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  118. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  119. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  120. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +12 -0
  121. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  122. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  123. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  124. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  125. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  126. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  127. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  128. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  129. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  130. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/add-id.cpp +77 -0
  131. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/add-id.hpp +8 -0
  132. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/common.hpp +17 -0
  133. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/convert.cpp +15 -0
  134. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +18 -0
  135. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +56 -3
  136. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +97 -0
  137. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +4 -0
  138. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +17 -6
  139. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +22 -0
  140. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/pad.cpp +5 -5
  141. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +1 -1
  142. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +58 -0
  143. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +916 -337
  144. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  145. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  146. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  147. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  148. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  149. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +29 -0
  150. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +3 -0
  151. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +3 -0
  152. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  153. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +2 -2
  154. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  155. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +40 -24
  156. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  157. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  158. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  159. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  160. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  161. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +27 -21
  162. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  163. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  164. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  165. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  166. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  167. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  168. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  169. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  170. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  171. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  172. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  173. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  174. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -21
  175. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  176. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  177. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +24 -5
  178. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  179. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml.c +5 -0
  180. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +188 -0
  181. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  182. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -3
  183. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +1 -9
  184. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +6 -3
  185. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +28 -2
  186. vendor_llama_cpp_pydist/llama.cpp/pyrightconfig.json +1 -1
  187. vendor_llama_cpp_pydist/llama.cpp/scripts/compare-logprobs.py +281 -0
  188. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  189. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  190. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-mtmd.sh +65 -0
  191. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  192. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  193. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  194. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  195. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +1966 -2248
  196. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +16 -2
  197. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  198. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  199. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +99 -20
  200. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +8 -2
  201. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +75 -7
  202. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +17 -4
  203. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.cpp +3 -9
  204. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +7 -9
  205. vendor_llama_cpp_pydist/llama.cpp/src/llama-impl.cpp +4 -0
  206. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.cpp +85 -31
  207. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +20 -3
  208. vendor_llama_cpp_pydist/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  209. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +130 -28
  210. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.h +5 -1
  211. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +81 -13
  212. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +4 -0
  213. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +380 -68
  214. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +14 -2
  215. vendor_llama_cpp_pydist/llama.cpp/src/llama-quant.cpp +1 -1
  216. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +60 -33
  217. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +3 -0
  218. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +103 -34
  219. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  220. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +680 -1
  221. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  222. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  223. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  224. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  225. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  226. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  227. vendor_llama_cpp_pydist/llama.cpp/src/models/glm4-moe.cpp +28 -11
  228. vendor_llama_cpp_pydist/llama.cpp/src/models/glm4.cpp +27 -4
  229. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  230. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  231. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  232. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +23 -5
  233. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +115 -0
  234. vendor_llama_cpp_pydist/llama.cpp/src/models/nemotron-h.cpp +35 -6
  235. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  236. vendor_llama_cpp_pydist/llama.cpp/src/models/qwen2.cpp +12 -3
  237. vendor_llama_cpp_pydist/llama.cpp/src/models/qwen3next.cpp +81 -266
  238. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  239. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +8 -0
  240. vendor_llama_cpp_pydist/llama.cpp/tests/test-arg-parser.cpp +29 -0
  241. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +200 -61
  242. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +186 -3
  243. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  244. vendor_llama_cpp_pydist/llama.cpp/tests/test-json-schema-to-grammar.cpp +75 -0
  245. vendor_llama_cpp_pydist/llama.cpp/tests/test-state-restore-fragmented.cpp +122 -0
  246. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  247. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  248. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  249. vendor_llama_cpp_pydist/llama.cpp/tools/CMakeLists.txt +1 -0
  250. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  251. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -0
  252. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  253. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +195 -23
  254. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +11 -17
  255. vendor_llama_cpp_pydist/llama.cpp/tools/cvector-generator/cvector-generator.cpp +3 -3
  256. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/CMakeLists.txt +8 -0
  257. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/README.md +55 -0
  258. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +66 -0
  259. vendor_llama_cpp_pydist/llama.cpp/tools/imatrix/imatrix.cpp +3 -3
  260. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/README.md +1 -1
  261. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  262. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +3 -0
  263. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-graph.h +7 -1
  264. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +41 -8
  265. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +56 -2
  266. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +375 -41
  267. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.h +1 -1
  268. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/conformer.cpp +217 -0
  269. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/glm4v.cpp +120 -0
  270. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +20 -0
  271. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +18 -10
  272. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  273. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-audio.cpp +371 -550
  274. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-audio.h +19 -28
  275. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd-cli.cpp +22 -5
  276. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +34 -14
  277. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  278. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/tests.sh +1 -0
  279. vendor_llama_cpp_pydist/llama.cpp/tools/perplexity/perplexity.cpp +3 -3
  280. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  281. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  282. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  283. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +71 -40
  284. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  285. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +10 -17
  286. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  287. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +473 -287
  288. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  289. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  290. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +256 -315
  291. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +35 -28
  292. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  293. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  294. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +33 -11
  295. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  296. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +26 -12
  297. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  298. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_compat_anthropic.py +1 -1
  299. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  300. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  301. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/README.md +6 -5
  302. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/docs/architecture/high-level-architecture-simplified.md +6 -2
  303. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/docs/architecture/high-level-architecture.md +13 -3
  304. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/package-lock.json +10 -10
  305. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/package.json +3 -4
  306. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/app.d.ts +7 -0
  307. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte +1 -1
  308. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte +1 -1
  309. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +28 -3
  310. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte +1 -1
  311. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte +1 -0
  312. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte +53 -8
  313. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +29 -3
  314. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte +391 -0
  315. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +165 -10
  316. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte +26 -48
  317. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte +9 -4
  318. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +21 -4
  319. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenHeader.svelte +6 -1
  320. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +16 -1
  321. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte +68 -2
  322. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte +5 -1
  323. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte +26 -7
  324. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte +263 -167
  325. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte +3 -2
  326. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +199 -185
  327. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte +2 -1
  328. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/ui/switch/index.ts +7 -0
  329. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/ui/switch/switch.svelte +29 -0
  330. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +9 -0
  331. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/enums/chat.ts +4 -0
  332. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/enums/index.ts +2 -0
  333. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  334. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts +162 -0
  335. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/markdown/enhance-links.ts +33 -0
  336. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +12 -8
  337. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/parameter-sync.spec.ts +14 -0
  338. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/parameter-sync.ts +75 -13
  339. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +235 -171
  340. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/conversations.svelte.ts +24 -2
  341. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/server.svelte.ts +4 -0
  342. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +6 -6
  343. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +4 -0
  344. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +1 -1
  345. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/api-key-validation.ts +2 -1
  346. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +259 -0
  347. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/file-preview.ts +0 -9
  348. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/index.ts +11 -2
  349. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/text.ts +7 -0
  350. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +26 -10
  351. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/tests/unit/clipboard.test.ts +423 -0
  352. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/{src/lib/utils → tests/unit}/latex-protection.test.ts +1 -1
  353. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/{src/lib/utils → tests/unit}/model-names.test.ts +1 -1
  354. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/vite.config.ts +2 -2
  355. vendor_llama_cpp_pydist/llama.cpp/tools/tts/tts.cpp +6 -6
  356. vendor_llama_cpp_pydist/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +2 -1
  357. llama_cpp_pydist-0.18.0.dist-info/METADATA +0 -2448
  358. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  359. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/copy.ts +0 -71
  360. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/tests/server/demo.spec.ts +0 -7
  361. {llama_cpp_pydist-0.18.0.dist-info/licenses → llama_cpp_pydist-0.20.0.dist-info}/LICENSE +0 -0
  362. {llama_cpp_pydist-0.18.0.dist-info → llama_cpp_pydist-0.20.0.dist-info}/top_level.txt +0 -0
@@ -7,9 +7,10 @@
7
7
 
8
8
  #include <atomic>
9
9
  #include <chrono>
10
+ #include <cstddef>
10
11
  #include <mutex>
11
- #include <string>
12
12
  #include <stdexcept>
13
+ #include <string>
13
14
 
14
15
  #ifdef _WIN32
15
16
  # include <sal.h>
@@ -36,6 +37,7 @@
36
37
  #include "ggml-hexagon.h"
37
38
  #include "ggml-impl.h"
38
39
  #include "ggml-quants.h"
40
+ #include "op-desc.h"
39
41
  #include "htp-msg.h"
40
42
  #include "htp_iface.h"
41
43
 
@@ -55,9 +57,6 @@ static int opt_opsync = 0; // synchronous ops
55
57
  #define HEX_VERBOSE(...) \
56
58
  if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
57
59
 
58
- #define HEX_PROFILE(...) \
59
- if (opt_profile) GGML_LOG_INFO(__VA_ARGS__)
60
-
61
60
  static inline uint64_t hex_is_aligned(void * addr, uint32_t align) {
62
61
  return ((size_t) addr & (align - 1)) == 0;
63
62
  }
@@ -85,128 +84,30 @@ static const char * status_to_str(uint32_t status) {
85
84
 
86
85
  // ** debug helpers
87
86
 
88
- static inline int hex_format_tensor_dims(char * str, const struct ggml_tensor * t) {
89
- if (t->ne[2] == 1 && t->ne[3] == 1) {
90
- return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
91
- } else {
92
- return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
93
- }
94
- }
95
-
96
- static inline void hex_format_op_dims(char * str, const struct ggml_tensor * t) {
97
- char * p = str;
98
-
99
- // append src0 and src1 (if any)
100
- if (t->src[0]) {
101
- p += hex_format_tensor_dims(p, t->src[0]);
102
-
103
- for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
104
- p += sprintf(p, " x ");
105
- p += hex_format_tensor_dims(p, t->src[i]);
106
- }
87
+ static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
88
+ if (!opt_verbose) return;
107
89
 
108
- p += sprintf(p, " -> ");
109
- }
110
-
111
- // format self dims separately for better visual alignment
112
- char self[64];
113
- hex_format_tensor_dims(self, t);
114
-
115
- p += sprintf(p, "%s", self);
90
+ op_desc desc(op);
91
+ GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
92
+ ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
116
93
  }
117
94
 
118
- static inline int hex_format_tensor_strides(char * str, const struct ggml_tensor * t) {
119
- const char * c = ggml_is_contiguous(t) ? "" : "!";
95
+ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
96
+ if (!opt_verbose) return;
120
97
 
121
- if (t->ne[2] == 1 && t->ne[3] == 1) {
122
- return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
123
- } else {
124
- return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2],
125
- (size_t) t->nb[3], c);
126
- }
98
+ op_desc desc(op);
99
+ GGML_LOG_DEBUG("ggml-hex: %s supports-op %s : %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
100
+ ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
127
101
  }
128
102
 
129
- static inline void hex_format_op_strides(char * str, const struct ggml_tensor * t) {
130
- char * p = str;
103
+ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
104
+ uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
105
+ if (!opt_profile) return;
131
106
 
132
- // append src0 and src1 (if any)
133
- if (t->src[0]) {
134
- p += hex_format_tensor_strides(p, t->src[0]);
135
-
136
- for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
137
- p += sprintf(p, " x ");
138
- p += hex_format_tensor_strides(p, t->src[i]);
139
- }
140
-
141
- p += sprintf(p, " -> ");
142
- }
143
-
144
- // format self dims separately for better visual alignment
145
- char self[64];
146
- hex_format_tensor_strides(self, t);
147
-
148
- p += sprintf(p, "%s", self);
149
- }
150
-
151
- static inline void hex_format_op_types(char * str, const struct ggml_tensor * t) {
152
- char * p = str;
153
-
154
- // append src0 and src1 (if any)
155
- if (t->src[0]) {
156
- p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
157
-
158
- for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
159
- p += sprintf(p, " x ");
160
- p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
161
- }
162
-
163
- p += sprintf(p, " -> ");
164
- }
165
-
166
- p += sprintf(p, "%s", ggml_type_name(t->type));
167
- }
168
-
169
- static inline const char * hex_tensor_buff_name(const struct ggml_tensor * t) {
170
- if (t->buffer) {
171
- return ggml_backend_buffer_name(t->buffer);
172
- }
173
- return "NONE";
174
- }
175
-
176
- static inline void hex_format_op_buffs(char * str, const struct ggml_tensor * t) {
177
- char * p = str;
178
-
179
- // append src0 and src1 (if any)
180
- if (t->src[0]) {
181
- p += sprintf(p, "%s", hex_tensor_buff_name(t->src[0]));
182
-
183
- for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
184
- p += sprintf(p, " x ");
185
- p += sprintf(p, "%s", hex_tensor_buff_name(t->src[i]));
186
- }
187
-
188
- p += sprintf(p, " -> ");
189
- }
190
-
191
- p += sprintf(p, "%s", hex_tensor_buff_name(t));
192
- }
193
-
194
- static inline void hex_format_op_names(char * str, const struct ggml_tensor * t) {
195
- char * p = str;
196
-
197
- // append src0 and src1 (if any)
198
- if (t->src[0]) {
199
- p += sprintf(p, "%s", t->src[0]->name);
200
-
201
- for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
202
- p += sprintf(p, " x ");
203
- p += sprintf(p, "%s", t->src[i]->name);
204
- }
205
-
206
- p += sprintf(p, " -> ");
207
- }
208
-
209
- p += sprintf(p, "%s", t->name);
107
+ op_desc desc(op);
108
+ GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
109
+ ggml_op_name(op->op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
110
+ op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
210
111
  }
211
112
 
212
113
  // ** backend sessions
@@ -221,8 +122,8 @@ struct ggml_hexagon_session {
221
122
  void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
222
123
  void flush();
223
124
 
224
- ggml_backend_buffer_type buffer_type;
225
- ggml_backend_buffer_type repack_buffer_type;
125
+ ggml_backend_buffer_type buffer_type = {};
126
+ ggml_backend_buffer_type repack_buffer_type = {};
226
127
 
227
128
  std::string name;
228
129
  remote_handle64 handle;
@@ -241,23 +142,6 @@ struct ggml_hexagon_session {
241
142
  uint32_t prof_pkts;
242
143
  };
243
144
 
244
- static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
245
- char dims[64 * GGML_MAX_SRC];
246
- char strides[64 * GGML_MAX_SRC];
247
- char types[16 * GGML_MAX_SRC];
248
- char buffs[64 * GGML_MAX_SRC];
249
- char names[64 * GGML_MAX_SRC];
250
-
251
- hex_format_op_dims(dims, op);
252
- hex_format_op_strides(strides, op);
253
- hex_format_op_types(types, op);
254
- hex_format_op_buffs(buffs, op);
255
- hex_format_op_names(names, op);
256
-
257
- HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
258
- names, dims, types, strides, buffs, req_flags);
259
- }
260
-
261
145
  void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
262
146
  // Bump pending flag (cleared in the session::flush once we get the responce)
263
147
  this->op_pending++; // atomic inc
@@ -1598,7 +1482,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
1598
1482
  try {
1599
1483
  ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
1600
1484
  return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
1601
- } catch (std::exception const &exc) {
1485
+ } catch (const std::exception & exc) {
1602
1486
  GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
1603
1487
  return nullptr;
1604
1488
  }
@@ -1610,7 +1494,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffe
1610
1494
  try {
1611
1495
  ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
1612
1496
  return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
1613
- } catch (std::exception const &exc) {
1497
+ } catch (const std::exception & exc) {
1614
1498
  GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
1615
1499
  return nullptr;
1616
1500
  }
@@ -1697,8 +1581,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1697
1581
  }
1698
1582
 
1699
1583
  // Save the IDs
1700
- this->session_id = n.session_id;
1701
- this->domain_id = n.effective_domain_id;
1584
+ this->session_id = n.session_id;
1585
+ this->domain_id = n.effective_domain_id;
1702
1586
  this->valid_session = true;
1703
1587
  }
1704
1588
 
@@ -1751,7 +1635,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1751
1635
  this->valid_handle = true;
1752
1636
 
1753
1637
  GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
1754
- this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
1638
+ this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
1755
1639
 
1756
1640
  // Enable FastRPC QoS mode
1757
1641
  {
@@ -1838,11 +1722,8 @@ void ggml_hexagon_session::release() noexcept(true) {
1838
1722
  }
1839
1723
 
1840
1724
  ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
1841
- buffer_type.context = nullptr;
1842
- repack_buffer_type.context = nullptr;
1843
-
1844
- buffer_type.device = dev;
1845
- repack_buffer_type.device = dev;
1725
+ buffer_type.device = dev;
1726
+ repack_buffer_type.device = dev;
1846
1727
 
1847
1728
  try {
1848
1729
  allocate(dev_id);
@@ -1852,7 +1733,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
1852
1733
 
1853
1734
  repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface;
1854
1735
  repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
1855
- } catch (std::exception const &exc) {
1736
+ } catch (const std::exception & exc) {
1856
1737
  release();
1857
1738
  throw;
1858
1739
  }
@@ -1861,8 +1742,8 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
1861
1742
  ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) {
1862
1743
  release();
1863
1744
 
1864
- delete static_cast<ggml_backend_hexagon_buffer_type_context*>(buffer_type.context);
1865
- delete static_cast<ggml_backend_hexagon_buffer_type_context*>(repack_buffer_type.context);
1745
+ delete static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type.context);
1746
+ delete static_cast<ggml_backend_hexagon_buffer_type_context *>(repack_buffer_type.context);
1866
1747
  }
1867
1748
 
1868
1749
  // ** backend interface
@@ -1930,15 +1811,6 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
1930
1811
  return true;
1931
1812
  }
1932
1813
 
1933
- template <typename... _TTensor>
1934
- static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
1935
- return ([&]() -> bool {
1936
- return !tensors || !tensors->buffer ||
1937
- (ggml_backend_buffer_is_hexagon(tensors->buffer) &&
1938
- ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
1939
- }() && ...);
1940
- }
1941
-
1942
1814
  static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
1943
1815
  const struct ggml_tensor * src0 = dst->src[0];
1944
1816
  const struct ggml_tensor * src1 = dst->src[1];
@@ -1976,7 +1848,8 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1976
1848
  break;
1977
1849
 
1978
1850
  case GGML_TYPE_F16:
1979
- if (!opt_experimental) {
1851
+ if (src0->nb[1] < src0->nb[0]) {
1852
+ GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
1980
1853
  return false;
1981
1854
  }
1982
1855
  break;
@@ -1985,11 +1858,6 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
1985
1858
  return false;
1986
1859
  }
1987
1860
 
1988
- // src0 & src1 & dst must be mapped to the same session
1989
- if (!hex_supported_buffer(sess, src0, src1, dst)) {
1990
- return false;
1991
- }
1992
-
1993
1861
  return true;
1994
1862
  }
1995
1863
 
@@ -2032,12 +1900,6 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
2032
1900
  return false;
2033
1901
  }
2034
1902
 
2035
- // src0 (weights) must be repacked and mapped to the same session
2036
- // src1 & sr2 & dst must be mapped to the same session
2037
- if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
2038
- return false;
2039
- }
2040
-
2041
1903
  return true;
2042
1904
  }
2043
1905
 
@@ -2067,18 +1929,12 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
2067
1929
  return false;
2068
1930
  }
2069
1931
 
2070
- // src0, src1 & dst must be mapped to the same session
2071
- if (!hex_supported_buffer(sess, src0, src1, dst)) {
2072
- return false;
2073
- }
2074
-
2075
1932
  return true;
2076
1933
  }
2077
1934
 
2078
1935
  static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2079
1936
  const struct ggml_tensor * src0 = op->src[0];
2080
1937
  const struct ggml_tensor * src1 = op->src[1];
2081
- const struct ggml_tensor * src2 = op->src[2];
2082
1938
  const struct ggml_tensor * dst = op;
2083
1939
 
2084
1940
  if (!hex_supported_src0_type(src0->type)) {
@@ -2099,11 +1955,6 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
2099
1955
  return false;
2100
1956
  }
2101
1957
 
2102
- // src0, src1 & dst must be mapped to the same session
2103
- if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
2104
- return false;
2105
- }
2106
-
2107
1958
  return true;
2108
1959
  }
2109
1960
 
@@ -2126,11 +1977,6 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
2126
1977
  return false;
2127
1978
  }
2128
1979
 
2129
- // src0 & dst must be mapped to the same session
2130
- if (!hex_supported_buffer(sess, src0, dst)) {
2131
- return false;
2132
- }
2133
-
2134
1980
  return true;
2135
1981
  }
2136
1982
 
@@ -2163,11 +2009,6 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
2163
2009
  }
2164
2010
  }
2165
2011
 
2166
- // src0, src1 & dst must be mapped to the same session
2167
- if (!hex_supported_buffer(sess, src0, src1, dst)) {
2168
- return false;
2169
- }
2170
-
2171
2012
  return true;
2172
2013
  }
2173
2014
 
@@ -2216,11 +2057,6 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
2216
2057
  }
2217
2058
  }
2218
2059
 
2219
- // src0, src1 & dst must be mapped to the same session
2220
- if (!hex_supported_buffer(sess, src0, src1, dst)) {
2221
- return false;
2222
- }
2223
-
2224
2060
  return true;
2225
2061
  }
2226
2062
 
@@ -2271,16 +2107,28 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
2271
2107
  }
2272
2108
  }
2273
2109
 
2274
- // src0, src1, src2 & dst must be mapped to the same session
2275
- if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
2276
- return false;
2277
- }
2278
-
2279
2110
  return true;
2280
2111
  }
2281
2112
 
2113
+ enum dspqbuf_type {
2114
+ DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
2115
+ DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
2116
+ DSPQBUF_TYPE_CONSTANT,
2117
+ };
2118
+
2119
+ static void dspqbuf_dump(dspqueue_buffer * d, const struct ggml_tensor * t, dspqbuf_type type) {
2120
+ if (opt_verbose < 2) return;
2121
+
2122
+ auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2123
+ auto sess = buf->sess;
2124
+
2125
+ GGML_LOG_DEBUG("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
2126
+ t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
2127
+ (unsigned int) d->size);
2128
+ }
2129
+
2282
2130
  // Init hexagon tensor from GGML tensor and Hexagon buffer
2283
- static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
2131
+ static void htp_req_tensor_init(htp_tensor * h, const ggml_tensor * t) {
2284
2132
  h->data = 0; // updated by the receiver
2285
2133
  h->type = t->type;
2286
2134
  h->ne[0] = t->ne[0];
@@ -2293,125 +2141,52 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
2293
2141
  h->nb[3] = t->nb[3];
2294
2142
  }
2295
2143
 
2296
- static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
2144
+ static size_t htp_req_buff_init(htp_tensor *h, dspqueue_buffer * d, const ggml_tensor * t, dspqbuf_type type) {
2297
2145
  if (!t) {
2298
2146
  return 0;
2299
2147
  }
2300
2148
 
2301
- memset(buf, 0, sizeof(*buf));
2302
- auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2303
- buf->fd = tensor_buf->fd;
2304
- buf->ptr = t->data;
2305
- buf->offset = (uint8_t *) t->data - tensor_buf->base;
2306
- buf->size = ggml_nbytes(t);
2307
- buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
2308
- buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
2309
- return 1;
2310
- }
2311
-
2312
- static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
2313
- return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
2314
- }
2315
-
2316
- static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
2317
- auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2318
- auto sess = buf->sess;
2319
-
2320
- HEX_VERBOSE("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
2321
- t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
2322
- (unsigned int) d->size);
2323
- }
2324
-
2325
- static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) {
2326
- const struct ggml_tensor * src0 = op->src[0];
2327
- const struct ggml_tensor * src1 = op->src[1];
2328
- const struct ggml_tensor * dst = op;
2329
-
2330
- uint64_t t1, t2;
2331
- t1 = ggml_time_us();
2332
-
2333
- // Construct HTP message
2334
- htp_general_req req;
2335
- req.op = HTP_OP_MUL_MAT;
2336
- req.flags = flags;
2149
+ auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
2337
2150
 
2338
- init_htp_tensor(&req.src0, src0);
2339
- init_htp_tensor(&req.src1, src1);
2340
- init_htp_tensor(&req.dst, dst);
2151
+ memset(d, 0, sizeof(*d));
2152
+ d->fd = buf->fd;
2153
+ d->ptr = t->data;
2154
+ d->offset = (uint8_t *) t->data - buf->base;
2155
+ d->size = ggml_nbytes(t);
2341
2156
 
2342
- // Use opmask to override flags
2343
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2344
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2345
- }
2346
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2347
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2348
- }
2349
-
2350
- dspqueue_buffer bufs[3];
2351
-
2352
- // First buffer Weights.
2353
- // The content is static, there is no need to do any cache management
2354
- dspqueue_buffers_init(bufs, src0, false, false);
2355
-
2356
- // Second buffer Input Activations. This is a buffer that the CPU
2357
- // writes and the DSP reads, so we'll need to flush CPU caches and
2358
- // invalidate DSP ones. On platforms with I/O coherency support the
2359
- // framework will automatically skip cache operations where possible.
2360
- dspqueue_buffers_init(&bufs[1], src1, true, true);
2361
-
2362
- // Third buffer Output Activations. We'll handle DSP
2363
- // cache maintenance in the response message but need to flush
2364
- // CPU caches to ensure any previously written dirty lines are
2365
- // written out before writes from the DSP start.
2366
- dspqueue_buffers_init(&bufs[2], dst, true, false);
2367
-
2368
- auto * sess = get_session_from_tensor(src0);
2369
-
2370
- if (opt_verbose) {
2371
- hex_print_op_info(op, sess, req.flags);
2372
- if (opt_verbose > 1) {
2373
- hex_dump_dspbuf(src0, &bufs[0]);
2374
- hex_dump_dspbuf(src1, &bufs[1]);
2375
- hex_dump_dspbuf(dst, &bufs[2]);
2376
- }
2157
+ switch (type) {
2158
+ case DSPQBUF_TYPE_DSP_WRITE_CPU_READ:
2159
+ // Flush CPU
2160
+ d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER;
2161
+ break;
2162
+ case DSPQBUF_TYPE_CPU_WRITE_DSP_READ:
2163
+ // Flush CPU, Invalidate DSP
2164
+ d->flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
2165
+ break;
2166
+ default:
2167
+ // Constant buffer, no cache maintenance
2168
+ d->flags = 0;
2169
+ break;
2377
2170
  }
2378
2171
 
2379
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2380
- sess->enqueue(req, bufs, 3, opt_opsync);
2381
- }
2172
+ htp_req_tensor_init(h, t);
2382
2173
 
2383
- t2 = ggml_time_us();
2174
+ dspqbuf_dump(d, t, type);
2384
2175
 
2385
- HEX_PROFILE(
2386
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
2387
- "call-usec %llu\n",
2388
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2389
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2390
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2391
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2392
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2176
+ return 1;
2393
2177
  }
2394
2178
 
2395
- static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) {
2396
- const struct ggml_tensor * src0 = op->src[0];
2397
- const struct ggml_tensor * src1 = op->src[1];
2398
- const struct ggml_tensor * src2 = op->src[2];
2399
- const struct ggml_tensor * dst = op;
2179
+ typedef size_t (*htp_req_init_func_t)(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * op);
2400
2180
 
2401
- uint64_t t1, t2;
2402
- t1 = ggml_time_us();
2181
+ template <htp_req_init_func_t _init_req_func>
2182
+ static inline void ggml_hexagon_dispatch_op(ggml_hexagon_session *sess, const struct ggml_tensor * op, uint32_t flags) {
2183
+ uint64_t t = ggml_time_us();
2403
2184
 
2404
- // Construct HTP message
2185
+ // Construct HTP request
2405
2186
  htp_general_req req;
2406
- req.op = HTP_OP_MUL_MAT_ID;
2407
- req.flags = flags;
2187
+ memset(&req, 0, sizeof(req));
2408
2188
 
2409
- init_htp_tensor(&req.src0, src0);
2410
- init_htp_tensor(&req.src1, src1);
2411
- init_htp_tensor(&req.src2, src2);
2412
- init_htp_tensor(&req.dst, dst);
2413
-
2414
- // Use opmask to override flags
2189
+ req.flags = flags;
2415
2190
  if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2416
2191
  req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2417
2192
  }
@@ -2419,461 +2194,141 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
2419
2194
  req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2420
2195
  }
2421
2196
 
2422
- dspqueue_buffer bufs[4];
2423
- // First buffer Weights.
2424
- // The content is static, there is no need to do any cache management
2425
- dspqueue_buffers_init(bufs, src0, false, false);
2426
-
2427
- // Second buffer Input Activations. This is a buffer that the CPU
2428
- // writes and the DSP reads, so we'll need to flush CPU caches and
2429
- // invalidate DSP ones. On platforms with I/O coherency support the
2430
- // framework will automatically skip cache operations where possible.
2431
- dspqueue_buffers_init(&bufs[1], src1, true, true);
2432
-
2433
- // Third buffer expert IDs. This is a buffer that the CPU
2434
- // writes and the DSP reads, so we'll need to flush CPU caches and
2435
- // invalidate DSP ones. On platforms with I/O coherency support the
2436
- // framework will automatically skip cache operations where possible.
2437
- dspqueue_buffers_init(&bufs[2], src2, true, true);
2438
-
2439
- // Forth buffer Output Activations. We'll handle DSP
2440
- // cache maintenance in the response message but need to flush
2441
- // CPU caches to ensure any previously written dirty lines are
2442
- // written out before writes from the DSP start.
2443
- dspqueue_buffers_init(&bufs[3], dst, true, false);
2444
-
2445
- auto * sess = get_session_from_tensor(src0);
2446
-
2447
- if (opt_verbose) {
2448
- hex_print_op_info(op, sess, req.flags);
2449
- if (opt_verbose > 1) {
2450
- hex_dump_dspbuf(src0, &bufs[0]);
2451
- hex_dump_dspbuf(src1, &bufs[1]);
2452
- hex_dump_dspbuf(src2, &bufs[2]);
2453
- hex_dump_dspbuf(dst, &bufs[3]);
2454
- }
2455
- }
2197
+ ggml_hexagon_dump_op_exec(sess->name, op, req.flags);
2456
2198
 
2457
2199
  if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2458
- sess->enqueue(req, bufs, 4, opt_opsync);
2200
+ dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
2201
+ size_t n_bufs = _init_req_func(&req, bufs, op);
2202
+ sess->enqueue(req, bufs, n_bufs, opt_opsync);
2459
2203
  }
2460
2204
 
2461
- t2 = ggml_time_us();
2205
+ t = ggml_time_us() - t;
2462
2206
 
2463
- HEX_PROFILE(
2464
- "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u "
2465
- "op-cycles %u op-pkts %u (%f) call-usec %llu\n",
2466
- sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
2467
- (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
2468
- (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], (uint32_t) src2->ne[2],
2469
- (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
2470
- (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2471
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2207
+ ggml_hexagon_dump_op_prof(sess->name, op, sess->prof_usecs, sess->prof_cycles, sess->prof_pkts, t);
2472
2208
  }
2473
2209
 
2474
- static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
2475
- const struct ggml_tensor * node = op;
2476
- const struct ggml_tensor * src0 = node->src[0];
2477
- const struct ggml_tensor * src1 = node->src[1];
2478
- const struct ggml_tensor * dst = node;
2479
-
2480
- uint64_t t1 = 0;
2481
- uint64_t t2 = 0;
2482
-
2483
- t1 = ggml_time_us();
2484
-
2485
- // Construct HTP message
2486
- htp_general_req req;
2487
- req.flags = flags;
2488
-
2489
- // Use opmask to override flags
2490
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2491
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2492
- }
2493
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2494
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2495
- }
2496
-
2497
- switch (node->op) {
2210
+ template <bool _is_src0_constant>
2211
+ static inline size_t init_binary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2212
+ switch (t->op) {
2213
+ case GGML_OP_MUL_MAT:
2214
+ req->op = HTP_OP_MUL_MAT;
2215
+ break;
2498
2216
  case GGML_OP_MUL:
2499
- req.op = HTP_OP_MUL;
2217
+ req->op = HTP_OP_MUL;
2500
2218
  break;
2501
2219
  case GGML_OP_ADD:
2502
- req.op = HTP_OP_ADD;
2220
+ req->op = HTP_OP_ADD;
2503
2221
  break;
2504
2222
  case GGML_OP_SUB:
2505
- req.op = HTP_OP_SUB;
2223
+ req->op = HTP_OP_SUB;
2506
2224
  break;
2507
2225
  default:
2508
- GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
2509
- }
2510
-
2511
- init_htp_tensor(&req.src0, src0);
2512
- init_htp_tensor(&req.src1, src1);
2513
- init_htp_tensor(&req.dst, dst);
2514
-
2515
- dspqueue_buffer bufs[3];
2516
- // First buffer = First Operand of Binary op
2517
- // This is a buffer that the CPU writes and the DSP reads, so we'll
2518
- // need to flush CPU caches and invalidate DSP ones. On platforms
2519
- // with I/O coherency support the framework will automatically skip
2520
- // cache operations where possible.
2521
- dspqueue_buffers_init(bufs, src0, true, true);
2522
-
2523
- // Second buffer = Second Operand of Binary op
2524
- // This is a buffer that the CPU writes and the DSP reads, so we'll
2525
- // need to flush CPU caches and invalidate DSP ones. On platforms
2526
- // with I/O coherency support the framework will automatically skip
2527
- // cache operations where possible.
2528
- dspqueue_buffers_init(&bufs[1], src1, true, true);
2529
-
2530
- // Third buffer = Output Activations. We'll handle DSP
2531
- // cache maintenance in the response message but need to flush
2532
- // CPU caches to ensure any previously written dirty lines are
2533
- // written out before writes from the DSP start.
2534
- dspqueue_buffers_init(&bufs[2], dst, true, false);
2535
-
2536
- auto * sess = get_session_from_tensor(src0);
2537
-
2538
- if (opt_verbose) {
2539
- hex_print_op_info(op, sess, req.flags);
2540
- if (opt_verbose > 1) {
2541
- hex_dump_dspbuf(src0, &bufs[0]);
2542
- hex_dump_dspbuf(src1, &bufs[1]);
2543
- hex_dump_dspbuf(dst, &bufs[2]);
2544
- }
2226
+ GGML_ABORT("ggml-hex: binary : unsupported op: %d\n", t->op);
2227
+ break;
2545
2228
  }
2546
2229
 
2547
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2548
- sess->enqueue(req, bufs, 3, opt_opsync);
2549
- }
2230
+ // src0: Weights (mulmat) or First Operand (binary op).
2231
+ // If constant (e.g. weights), no cache management is needed.
2232
+ // src1: Input Activations (mulmat) or Second Operand (binary op).
2550
2233
 
2551
- t2 = ggml_time_us();
2234
+ size_t n_bufs = 0;
2235
+ n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2236
+ n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2237
+ n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2552
2238
 
2553
- HEX_PROFILE(
2554
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
2555
- "call-usec %llu\n",
2556
- sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2557
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2558
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2559
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2560
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2239
+ return n_bufs;
2561
2240
  }
2562
2241
 
2563
- static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
2564
- const struct ggml_tensor * node = op;
2565
- const struct ggml_tensor * src0 = node->src[0];
2566
- const struct ggml_tensor * src1 = node->src[1];
2567
- const struct ggml_tensor * src2 = node->src[2];
2568
- const struct ggml_tensor * dst = node;
2569
-
2570
- uint64_t t1 = 0;
2571
- uint64_t t2 = 0;
2572
-
2573
- t1 = ggml_time_us();
2574
-
2575
- // Construct HTP message
2576
- htp_general_req req;
2577
- req.flags = flags;
2578
-
2579
- // Use opmask to override flags
2580
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2581
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2582
- }
2583
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2584
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2585
- }
2586
-
2587
- switch (node->op) {
2242
+ template <bool _is_src0_constant>
2243
+ static inline size_t init_binary_id_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2244
+ switch (t->op) {
2245
+ case GGML_OP_MUL_MAT_ID:
2246
+ req->op = HTP_OP_MUL_MAT_ID;
2247
+ break;
2588
2248
  case GGML_OP_ADD_ID:
2589
- req.op = HTP_OP_ADD_ID;
2249
+ req->op = HTP_OP_ADD_ID;
2590
2250
  break;
2591
2251
  default:
2592
- GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
2593
- }
2594
-
2595
- init_htp_tensor(&req.src0, src0);
2596
- init_htp_tensor(&req.src1, src1);
2597
- init_htp_tensor(&req.src2, src2);
2598
- init_htp_tensor(&req.dst, dst);
2599
-
2600
- dspqueue_buffer bufs[4];
2601
- // First buffer = input activations
2602
- dspqueue_buffers_init(bufs, src0, true, true);
2603
- // Second buffer = experts bias
2604
- dspqueue_buffers_init(&bufs[1], src1, true, true);
2605
- // Third buffer = activated experts
2606
- dspqueue_buffers_init(&bufs[2], src2, true, true);
2607
- // Forth buffer = output activations
2608
- dspqueue_buffers_init(&bufs[3], dst, true, true);
2609
-
2610
- auto * sess = get_session_from_tensor(src0);
2611
-
2612
- if (opt_verbose) {
2613
- hex_print_op_info(op, sess, req.flags);
2614
- if (opt_verbose > 1) {
2615
- hex_dump_dspbuf(src0, &bufs[0]);
2616
- hex_dump_dspbuf(src1, &bufs[1]);
2617
- hex_dump_dspbuf(src2, &bufs[2]);
2618
- hex_dump_dspbuf(dst, &bufs[3]);
2619
- }
2252
+ GGML_ABORT("ggml-hex: unsupported op: %d\n", t->op);
2620
2253
  }
2621
2254
 
2622
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2623
- sess->enqueue(req, bufs, 4, opt_opsync);
2624
- }
2255
+ // src0: Weights (mulmat) or Input Activations (other op).
2256
+ // If constant, no cache management is needed.
2257
+ // src1: Input Activations (mulmat) or Second Operand (binary op).
2258
+ // src2: Expert IDs (mulmat) or Activated Experts (other op).
2625
2259
 
2626
- t2 = ggml_time_us();
2260
+ size_t n_bufs = 0;
2261
+ n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], _is_src0_constant ? DSPQBUF_TYPE_CONSTANT : DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2262
+ n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2263
+ n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2264
+ n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2627
2265
 
2628
- HEX_PROFILE(
2629
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
2630
- "call-usec %llu\n",
2631
- sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2632
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2633
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2634
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2635
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2266
+ return n_bufs;
2636
2267
  }
2637
2268
 
2638
- static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
2639
- const struct ggml_tensor * src0 = op->src[0];
2640
- const struct ggml_tensor * src1 = op->src[1];
2641
- const struct ggml_tensor * dst = op;
2642
-
2643
- uint64_t t1 = 0;
2644
- uint64_t t2 = 0;
2645
-
2646
- t1 = ggml_time_us();
2647
-
2648
- // Construct HTP message
2649
- htp_general_req req;
2650
-
2651
- memset(&req, 0, sizeof(htp_general_req));
2652
- memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
2653
- req.flags = flags;
2269
+ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2270
+ memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2654
2271
 
2655
2272
  bool supported = false;
2656
2273
 
2657
- switch (op->op) {
2274
+ switch (t->op) {
2658
2275
  case GGML_OP_RMS_NORM:
2659
- req.op = HTP_OP_RMS_NORM;
2276
+ req->op = HTP_OP_RMS_NORM;
2660
2277
  supported = true;
2661
2278
  break;
2662
2279
 
2663
2280
  case GGML_OP_UNARY:
2664
- if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) {
2665
- req.op = HTP_OP_UNARY_SILU;
2281
+ if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
2282
+ req->op = HTP_OP_UNARY_SILU;
2283
+ supported = true;
2284
+ } else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) {
2285
+ req->op = HTP_OP_UNARY_GELU;
2666
2286
  supported = true;
2667
2287
  }
2668
2288
  break;
2669
2289
 
2670
2290
  case GGML_OP_GLU:
2671
- if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU) {
2672
- req.op = HTP_OP_GLU_SWIGLU;
2291
+ if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU) {
2292
+ req->op = HTP_OP_GLU_SWIGLU;
2673
2293
  supported = true;
2674
- } else if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) {
2675
- req.op = HTP_OP_GLU_SWIGLU_OAI;
2294
+ } else if (ggml_get_glu_op(t) == GGML_GLU_OP_SWIGLU_OAI) {
2295
+ req->op = HTP_OP_GLU_SWIGLU_OAI;
2676
2296
  supported = true;
2677
2297
  }
2678
2298
  break;
2679
2299
 
2680
2300
  case GGML_OP_SOFT_MAX:
2681
- req.op = HTP_OP_SOFTMAX;
2301
+ req->op = HTP_OP_SOFTMAX;
2682
2302
  supported = true;
2303
+ break;
2683
2304
 
2684
2305
  default:
2685
2306
  break;
2686
2307
  }
2687
2308
 
2688
2309
  if (!supported) {
2689
- GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op);
2690
- }
2691
-
2692
- init_htp_tensor(&req.dst, dst);
2693
- init_htp_tensor(&req.src0, src0);
2694
- if (src1) {
2695
- init_htp_tensor(&req.src1, src1);
2696
- }
2697
-
2698
- // Use opmask to override flags
2699
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2700
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2701
- }
2702
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2703
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2704
- }
2705
-
2706
- dspqueue_buffer bufs[3];
2707
-
2708
- // First buffer = Only Operand of Unary op
2709
- // This is a buffer that the CPU writes and the DSP reads, so we'll
2710
- // need to flush CPU caches and invalidate DSP ones. On platforms
2711
- // with I/O coherency support the framework will automatically skip
2712
- // cache operations where possible.
2713
- size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
2714
-
2715
- // Second buffer(nullable) = Second Operand of Binary op
2716
- // This is a buffer that the CPU writes and the DSP reads, so we'll
2717
- // need to flush CPU caches and invalidate DSP ones. On platforms
2718
- // with I/O coherency support the framework will automatically skip
2719
- // cache operations where possible.
2720
- n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
2721
-
2722
- // Second or third buffer = Output Activations. We'll handle DSP
2723
- // Second buffer = Output Activations. We'll handle DSP
2724
- // cache maintenance in the response message but need to flush
2725
- // CPU caches to ensure any previously written dirty lines are
2726
- // written out before writes from the DSP start.
2727
- n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
2728
-
2729
- // Primary DSP session from the src0 tensor
2730
- auto * sess = get_session_from_tensor(src0);
2731
-
2732
- if (opt_verbose) {
2733
- hex_print_op_info(op, sess, req.flags);
2734
- if (opt_verbose > 1) {
2735
- hex_dump_dspbuf(src0, &bufs[0]);
2736
- if (src1) {
2737
- hex_dump_dspbuf(src1, &bufs[1]);
2738
- hex_dump_dspbuf(dst, &bufs[2]);
2739
- } else {
2740
- hex_dump_dspbuf(dst, &bufs[1]);
2741
- }
2742
- }
2743
- }
2744
-
2745
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2746
- sess->enqueue(req, bufs, n_bufs, opt_opsync);
2310
+ GGML_ABORT("ggml-hex: unary : unsupported op: %d\n", t->op);
2747
2311
  }
2748
2312
 
2749
- t2 = ggml_time_us();
2313
+ size_t n_bufs = 0;
2314
+ n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2315
+ n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2316
+ n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2750
2317
 
2751
- if (src1) {
2752
- HEX_PROFILE(
2753
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
2754
- "(%f) call-usec %llu\n",
2755
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2756
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2757
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2758
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2759
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2760
- } else {
2761
- HEX_PROFILE(
2762
- "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
2763
- "%llu\n",
2764
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2765
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2766
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2767
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2768
- }
2318
+ return n_bufs;
2769
2319
  }
2770
2320
 
2771
- static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
2772
- const struct ggml_tensor * src0 = op->src[0];
2773
- const struct ggml_tensor * src1 = op->src[1];
2774
- const struct ggml_tensor * src2 = op->src[2];
2775
- const struct ggml_tensor * dst = op;
2776
-
2777
- uint64_t t1 = 0;
2778
- uint64_t t2 = 0;
2321
+ static inline size_t init_rope_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2322
+ memcpy(&req->op_params, &t->op_params, sizeof(t->op_params));
2323
+ req->op = HTP_OP_ROPE;
2779
2324
 
2780
- t1 = ggml_time_us();
2781
-
2782
- // Construct HTP message
2783
- htp_general_req req;
2325
+ size_t n_bufs = 0;
2326
+ n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2327
+ n_bufs += htp_req_buff_init(&req->src1, &bufs[n_bufs], t->src[1], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2328
+ n_bufs += htp_req_buff_init(&req->src2, &bufs[n_bufs], t->src[2], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2329
+ n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2784
2330
 
2785
- memset(&req, 0, sizeof(htp_general_req));
2786
- memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
2787
- req.flags = flags;
2788
- req.op = HTP_OP_ROPE;
2789
-
2790
- init_htp_tensor(&req.dst, dst);
2791
- init_htp_tensor(&req.src0, src0);
2792
- init_htp_tensor(&req.src1, src1);
2793
- if (src2) {
2794
- init_htp_tensor(&req.src2, src2);
2795
- }
2796
-
2797
- // Use opmask to override flags
2798
- if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
2799
- req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
2800
- }
2801
- if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
2802
- req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
2803
- }
2804
-
2805
- dspqueue_buffer bufs[4];
2806
-
2807
- // First buffer
2808
- // This is a buffer that the CPU writes and the DSP reads, so we'll
2809
- // need to flush CPU caches and invalidate DSP ones. On platforms
2810
- // with I/O coherency support the framework will automatically skip
2811
- // cache operations where possible.
2812
- size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
2813
-
2814
- // Second buffer
2815
- // This is a buffer that the CPU writes and the DSP reads, so we'll
2816
- // need to flush CPU caches and invalidate DSP ones. On platforms
2817
- // with I/O coherency support the framework will automatically skip
2818
- // cache operations where possible.
2819
- n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
2820
-
2821
- // Third buffer(nullable)
2822
- // This is a buffer that the CPU writes and the DSP reads, so we'll
2823
- // need to flush CPU caches and invalidate DSP ones. On platforms
2824
- // with I/O coherency support the framework will automatically skip
2825
- // cache operations where possible.
2826
- n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
2827
-
2828
- // Final buffer = Output Activations. We'll handle DSP
2829
- // Second buffer = Output Activations. We'll handle DSP
2830
- // cache maintenance in the response message but need to flush
2831
- // CPU caches to ensure any previously written dirty lines are
2832
- // written out before writes from the DSP start.
2833
- n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
2834
-
2835
- // Primary DSP session from the src0 tensor
2836
- auto * sess = get_session_from_tensor(src0);
2837
-
2838
- if (opt_verbose) {
2839
- hex_print_op_info(op, sess, req.flags);
2840
- if (opt_verbose > 1) {
2841
- hex_dump_dspbuf(src0, &bufs[0]);
2842
- if (src1) {
2843
- hex_dump_dspbuf(src1, &bufs[1]);
2844
- hex_dump_dspbuf(dst, &bufs[2]);
2845
- } else {
2846
- hex_dump_dspbuf(dst, &bufs[1]);
2847
- }
2848
- }
2849
- }
2850
-
2851
- if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2852
- sess->enqueue(req, bufs, n_bufs, opt_opsync);
2853
- }
2854
-
2855
- t2 = ggml_time_us();
2856
-
2857
- if (src2) {
2858
- HEX_PROFILE(
2859
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles "
2860
- "%u op-pkts %u (%f) call-usec %llu\n",
2861
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2862
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2863
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1],
2864
- (uint32_t) src2->ne[2], (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2865
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2866
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2867
- } else {
2868
- HEX_PROFILE(
2869
- "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
2870
- "(%f) call-usec %llu\n",
2871
- sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
2872
- (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
2873
- (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
2874
- (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
2875
- (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
2876
- }
2331
+ return n_bufs;
2877
2332
  }
2878
2333
 
2879
2334
  static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
@@ -2888,7 +2343,7 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
2888
2343
  }
2889
2344
 
2890
2345
  static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
2891
- return (op0 && op0->src[1] == op1->src[1]);
2346
+ return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type));
2892
2347
  }
2893
2348
 
2894
2349
  static inline bool is_compute_op(ggml_tensor *node)
@@ -2938,41 +2393,50 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
2938
2393
 
2939
2394
  switch (node->op) {
2940
2395
  case GGML_OP_MUL_MAT:
2941
- ggml_hexagon_mul_mat(node, flags);
2396
+ if (ggml_is_quantized(node->src[0]->type)) {
2397
+ ggml_hexagon_dispatch_op<init_binary_req<true>>(sess, node, flags);
2398
+ } else {
2399
+ ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
2400
+ }
2942
2401
  prev_quant_op = node;
2943
2402
  break;
2944
2403
  case GGML_OP_MUL_MAT_ID:
2945
- ggml_hexagon_mul_mat_id(node, flags);
2404
+ if (ggml_is_quantized(node->src[0]->type)) {
2405
+ ggml_hexagon_dispatch_op<init_binary_id_req<true>>(sess, node, flags);
2406
+ } else {
2407
+ ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
2408
+ }
2946
2409
  prev_quant_op = node;
2947
2410
  break;
2948
2411
  case GGML_OP_MUL:
2949
2412
  case GGML_OP_ADD:
2950
2413
  case GGML_OP_SUB:
2951
- ggml_hexagon_binary(node, flags);
2414
+ ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
2952
2415
  break;
2953
2416
  case GGML_OP_ADD_ID:
2954
- ggml_hexagon_add_id(node, flags);
2417
+ ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
2955
2418
  break;
2956
2419
  case GGML_OP_RMS_NORM:
2957
- ggml_hexagon_unary(node, flags);
2420
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2958
2421
  break;
2959
2422
  case GGML_OP_UNARY:
2960
- if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
2961
- ggml_hexagon_unary(node, flags);
2423
+ if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
2424
+ (ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
2425
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2962
2426
  }
2963
2427
  break;
2964
2428
  case GGML_OP_GLU:
2965
2429
  if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
2966
- (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
2967
- ggml_hexagon_unary(node, flags);
2430
+ (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
2431
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2968
2432
  }
2969
2433
  break;
2970
2434
  case GGML_OP_SOFT_MAX:
2971
- ggml_hexagon_unary(node, flags);
2435
+ ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2972
2436
  break;
2973
2437
 
2974
2438
  case GGML_OP_ROPE:
2975
- ggml_hexagon_rope(node, flags);
2439
+ ggml_hexagon_dispatch_op<init_rope_req>(sess, node, flags);
2976
2440
  break;
2977
2441
 
2978
2442
  default:
@@ -3101,8 +2565,8 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr
3101
2565
  // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
3102
2566
  for (int i = 0; i < n; i++) {
3103
2567
  node_info node = {
3104
- /*.node =*/ gf->nodes[i],
3105
- /*.fused =*/ {},
2568
+ /*.node =*/gf->nodes[i],
2569
+ /*.fused =*/{},
3106
2570
  };
3107
2571
 
3108
2572
  // fuse only ops that start with these operations
@@ -3253,11 +2717,39 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_
3253
2717
  return &sess->repack_buffer_type;
3254
2718
  }
3255
2719
 
2720
+ static bool ggml_hexagon_supported_buffer(ggml_hexagon_session *sess, const struct ggml_tensor * t) {
2721
+ if (t && t->buffer) {
2722
+ if (ggml_backend_buffer_is_hexagon(t->buffer) == false) return false; // not our buffer
2723
+ if (ggml_backend_hexagon_buffer_get_sess(t->buffer) != sess) return false; // wrong session
2724
+ }
2725
+ return true;
2726
+ }
2727
+
2728
+ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const struct ggml_tensor * t) {
2729
+ // all srcs & dsts must be mapped to the same session
2730
+ if (!ggml_hexagon_supported_buffer(sess, t)) {
2731
+ return false;
2732
+ }
2733
+
2734
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
2735
+ if (!ggml_hexagon_supported_buffer(sess, t->src[i])) {
2736
+ return false;
2737
+ }
2738
+ }
2739
+
2740
+ return true;
2741
+ }
2742
+
3256
2743
  static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
3257
2744
  auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3258
2745
 
3259
- bool supp = false;
2746
+ // all srcs & dsts must be mapped to the same session
2747
+ if (!ggml_hexagon_supported_buffers(sess, op)) {
2748
+ ggml_hexagon_dump_op_supp(sess->name, op, false);
2749
+ return false;
2750
+ }
3260
2751
 
2752
+ bool supp = false;
3261
2753
  switch (op->op) {
3262
2754
  case GGML_OP_NONE:
3263
2755
  case GGML_OP_RESHAPE:
@@ -3294,17 +2786,21 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
3294
2786
  break;
3295
2787
 
3296
2788
  case GGML_OP_UNARY:
3297
- if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) {
3298
- supp = ggml_hexagon_supported_activations(sess, op);
2789
+ {
2790
+ const auto unary_op = ggml_get_unary_op(op);
2791
+ if (unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_GELU) {
2792
+ supp = ggml_hexagon_supported_activations(sess, op);
2793
+ }
2794
+ break;
3299
2795
  }
3300
- break;
3301
-
3302
2796
  case GGML_OP_GLU:
3303
- if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) /* || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) */) {
3304
- supp = ggml_hexagon_supported_activations(sess, op);
2797
+ {
2798
+ const auto glu_op = ggml_get_glu_op(op);
2799
+ if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI)) {
2800
+ supp = ggml_hexagon_supported_activations(sess, op);
2801
+ }
2802
+ break;
3305
2803
  }
3306
- break;
3307
-
3308
2804
  case GGML_OP_ROPE:
3309
2805
  supp = ggml_hexagon_supported_rope(sess, op);
3310
2806
  break;
@@ -3313,26 +2809,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
3313
2809
  break;
3314
2810
  }
3315
2811
 
3316
- if (opt_verbose) {
3317
- char dims[64 * GGML_MAX_SRC];
3318
- char strides[64 * GGML_MAX_SRC];
3319
- char types[16 * GGML_MAX_SRC];
3320
- char buffs[64 * GGML_MAX_SRC];
3321
- char names[64 * GGML_MAX_SRC];
3322
-
3323
- hex_format_op_dims(dims, op);
3324
- hex_format_op_strides(strides, op);
3325
- hex_format_op_types(types, op);
3326
- hex_format_op_buffs(buffs, op);
3327
- hex_format_op_names(names, op);
3328
-
3329
- HEX_VERBOSE("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(),
3330
- ggml_op_name(op->op), names, dims, types, strides, buffs, (int) supp);
3331
- }
3332
-
2812
+ ggml_hexagon_dump_op_supp(sess->name, op, supp);
3333
2813
  return supp;
3334
-
3335
- GGML_UNUSED(dev);
3336
2814
  }
3337
2815
 
3338
2816
  static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
@@ -3401,7 +2879,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
3401
2879
  }
3402
2880
  }
3403
2881
 
3404
- if(opt_arch < 75) {
2882
+ if (opt_arch < 75) {
3405
2883
  opt_ndev = 1;
3406
2884
  GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
3407
2885
  }
@@ -3410,11 +2888,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
3410
2888
 
3411
2889
  // Create devices / sessions
3412
2890
  for (size_t i = 0; i < opt_ndev; i++) {
3413
- devices[i].iface = ggml_backend_hexagon_device_i;
3414
- devices[i].reg = reg;
2891
+ devices[i].iface = ggml_backend_hexagon_device_i;
2892
+ devices[i].reg = reg;
3415
2893
  try {
3416
2894
  devices[i].context = new ggml_hexagon_session(i, &devices[i]);
3417
- } catch (std::exception const &exc) {
2895
+ } catch (const std::exception & exc) {
3418
2896
  GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
3419
2897
  devices[i].context = nullptr;
3420
2898
  }