llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
  3. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
  4. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  7. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
  10. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  12. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
  13. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  14. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  15. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  16. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  17. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  18. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  19. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
  20. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  22. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
  23. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  24. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
  25. vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
  26. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  27. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  28. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  29. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  30. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
  31. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
  32. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  33. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
  34. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
  35. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  36. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  37. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  38. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  39. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  40. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
  41. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
  42. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  113. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
  114. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  115. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  116. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  117. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  118. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  119. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  120. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  121. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  122. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  123. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
  124. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  125. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  126. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
  127. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  128. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
  129. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  130. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
  131. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
  132. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
  133. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
  134. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  135. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  136. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  137. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  138. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  139. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  140. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
  141. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
  142. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  143. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  144. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
  145. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  146. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  147. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  148. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  149. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  150. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
  151. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
  152. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
  153. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
  154. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  155. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
  156. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
  157. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
  158. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
  159. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
  160. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  161. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
  162. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  163. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  164. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  165. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  166. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  167. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  168. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  169. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  170. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  171. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  172. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  173. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  174. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  175. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
  176. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
  177. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  178. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  179. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  180. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  181. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  182. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
  183. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  184. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
  185. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  186. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  187. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  188. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  189. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  190. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  191. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
  192. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  193. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
  194. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
  195. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
  196. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  197. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
  198. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
  199. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
  200. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
  201. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  202. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  203. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  205. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
  206. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  207. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  208. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  209. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  210. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
  211. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  212. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
  213. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  214. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
  215. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  216. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  217. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
  218. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
  219. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  220. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  221. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
  222. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  223. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
  224. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  225. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  226. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  227. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
  228. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
  229. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  230. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  231. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  232. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
  233. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
  234. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
  235. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
  236. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
  237. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
  238. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
  239. llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
  240. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  241. {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  242. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
@@ -32,7 +32,7 @@ Legend:
32
32
  | CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
33
33
  | CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
34
34
  | COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
35
- | COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
35
+ | COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
36
36
  | CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
37
37
  | CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
38
38
  | CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
4
4
  ### GGML Version
5
5
  set(GGML_VERSION_MAJOR 0)
6
6
  set(GGML_VERSION_MINOR 9)
7
- set(GGML_VERSION_PATCH 4)
7
+ set(GGML_VERSION_PATCH 5)
8
8
  set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
9
9
 
10
10
  find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -430,10 +430,22 @@ if (MSVC)
430
430
  configure_msvc_target(ggml-cpu-x64)
431
431
  configure_msvc_target(ggml-cpu-sse42)
432
432
  configure_msvc_target(ggml-cpu-sandybridge)
433
+ # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
434
+ # skipping ggml-cpu-ivybridge
435
+ # skipping ggml-cpu-piledriver
433
436
  configure_msvc_target(ggml-cpu-haswell)
434
437
  configure_msvc_target(ggml-cpu-skylakex)
438
+ configure_msvc_target(ggml-cpu-cannonlake)
439
+ configure_msvc_target(ggml-cpu-cascadelake)
435
440
  configure_msvc_target(ggml-cpu-icelake)
441
+ # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
442
+ # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
443
+ # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
444
+ # skipping ggml-cpu-cooperlake
445
+ # skipping ggml-cpu-zen4
436
446
  configure_msvc_target(ggml-cpu-alderlake)
447
+ # MSVC doesn't support AMX
448
+ # skipping ggml-cpu-sapphirerapids
437
449
 
438
450
  if (GGML_BUILD_EXAMPLES)
439
451
  configure_msvc_target(common-ggml)
@@ -358,7 +358,7 @@ extern "C" {
358
358
  typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
359
359
 
360
360
  // Compare the output of two backends
361
- GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
361
+ GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);
362
362
 
363
363
  // Tensor initialization
364
364
  GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
@@ -357,15 +357,29 @@ if (GGML_CPU_ALL_VARIANTS)
357
357
  endif()
358
358
  if (GGML_SYSTEM_ARCH STREQUAL "x86")
359
359
  ggml_add_cpu_backend_variant(x64)
360
- ggml_add_cpu_backend_variant(sse42 SSE42)
361
- ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
362
- ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
363
- ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
364
- ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
365
- ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
360
+ ggml_add_cpu_backend_variant(sse42 SSE42)
361
+ ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
362
+ if (NOT MSVC)
363
+ # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
364
+ ggml_add_cpu_backend_variant(ivybridge SSE42 AVX F16C)
365
+ ggml_add_cpu_backend_variant(piledriver SSE42 AVX F16C FMA)
366
+ endif()
367
+ ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C FMA AVX2 BMI2)
368
+ ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C FMA AVX2 BMI2 AVX512)
369
+ ggml_add_cpu_backend_variant(cannonlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI)
370
+ ggml_add_cpu_backend_variant(cascadelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI)
371
+ ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI)
372
+ if (NOT MSVC)
373
+ # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
374
+ # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
375
+ # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
376
+ ggml_add_cpu_backend_variant(cooperlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI AVX512_BF16)
377
+ ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
378
+ endif()
379
+ ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
366
380
  if (NOT MSVC)
367
381
  # MSVC doesn't support AMX
368
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
382
+ ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
369
383
  endif()
370
384
  elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
371
385
  if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -387,8 +401,8 @@ if (GGML_CPU_ALL_VARIANTS)
387
401
  ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
388
402
  ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
389
403
  ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
390
- ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
391
- ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
404
+ ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
405
+ ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SVE2 SME)
392
406
  elseif (APPLE)
393
407
  ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
394
408
  ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
@@ -2053,7 +2053,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
2053
2053
  ggml_free(copy.ctx_unallocated);
2054
2054
  }
2055
2055
 
2056
- bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
2056
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes) {
2057
2057
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
2058
2058
  if (copy.buffer == NULL) {
2059
2059
  return false;
@@ -2064,22 +2064,22 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
2064
2064
 
2065
2065
  assert(g1->n_nodes == g2->n_nodes);
2066
2066
 
2067
- if (test_node != nullptr) {
2068
- // Compute the whole graph and only test the output for a specific tensor
2067
+ if (num_test_nodes != 0) {
2068
+ GGML_ASSERT(test_nodes);
2069
+ // Compute the whole graph and only test the output for specific tensors
2069
2070
  ggml_backend_graph_compute(backend1, g1);
2070
2071
  ggml_backend_graph_compute(backend2, g2);
2071
2072
 
2072
- int test_node_idx = -1;
2073
+ bool verified = false;
2073
2074
  for (int i = 0; i < g1->n_nodes; i++) {
2074
- struct ggml_tensor * t1 = g1->nodes[i];
2075
- if (t1 == test_node) {
2076
- test_node_idx = i;
2077
- break;
2075
+ for (size_t j = 0; j < num_test_nodes; ++j) {
2076
+ if (g1->nodes[i] == test_nodes[j]) {
2077
+ callback(i, g1->nodes[i], g2->nodes[i], user_data);
2078
+ verified = true;
2079
+ }
2078
2080
  }
2079
2081
  }
2080
- GGML_ASSERT(test_node_idx != -1);
2081
-
2082
- callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
2082
+ GGML_ASSERT(verified);
2083
2083
  } else {
2084
2084
  for (int i = 0; i < g1->n_nodes; i++) {
2085
2085
  struct ggml_tensor * t1 = g1->nodes[i];
@@ -26,6 +26,7 @@
26
26
  #include "ggml.h"
27
27
 
28
28
  #include <aclnnop/aclnn_add.h>
29
+ #include <aclnnop/aclnn_add_rms_norm.h>
29
30
  #include <aclnnop/aclnn_addcdiv.h>
30
31
  #include <aclnnop/aclnn_argmax.h>
31
32
  #include <aclnnop/aclnn_avgpool2d.h>
@@ -2338,19 +2339,19 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
2338
2339
  // Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor.
2339
2340
  // TODO: acl_yarn_ramp_tensor use rope cache.
2340
2341
  bool yarn_ramp_tensor_updated = false;
2341
- ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
2342
2342
  acl_tensor_ptr acl_yarn_ramp_tensor;
2343
2343
  if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length ||
2344
2344
  ctx.rope_cache.freq_scale != freq_scale)) {
2345
2345
  yarn_ramp_tensor_updated = true;
2346
-
2346
+ if (ctx.rope_cache.yarn_ramp_cache != nullptr) {
2347
+ ACL_CHECK(aclrtFree(ctx.rope_cache.yarn_ramp_cache));
2348
+ }
2349
+ ACL_CHECK(aclrtMalloc(&ctx.rope_cache.yarn_ramp_cache, theta_scale_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
2347
2350
  // -rope_yarn_ramp
2348
2351
  // const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
2349
2352
  // return MIN(1, MAX(0, y)) - 1;
2350
- yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
2351
- void * yarn_ramp_buffer = yarn_ramp_allocator.get();
2352
2353
  acl_yarn_ramp_tensor =
2353
- ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
2354
+ ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
2354
2355
  float zero_value = 0, one_value = 1;
2355
2356
  float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
2356
2357
  acl_scalar_ptr low = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
@@ -2380,8 +2381,10 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
2380
2381
  acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
2381
2382
  GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
2382
2383
  GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
2384
+ } else {
2385
+ acl_yarn_ramp_tensor =
2386
+ ggml_cann_create_tensor(ctx.rope_cache.yarn_ramp_cache, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
2383
2387
  }
2384
-
2385
2388
  // Step 1.3: update theta_scale_tensor according to ext_factor or freq_scale.
2386
2389
  if (ext_factor != 0) {
2387
2390
  if (theta_scale_updated || yarn_ramp_tensor_updated) {
@@ -2988,32 +2991,156 @@ void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2988
2991
  GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
2989
2992
  }
2990
2993
 
2991
- void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
2994
+ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
2992
2995
  ggml_tensor * src0 = dst->src[0];
2993
2996
  ggml_tensor * src1 = dst->src[1];
2994
2997
 
2995
2998
  // stride
2996
- int64_t s0 = ((const int32_t *) (dst->op_params))[0];
2999
+ int64_t s0 = ((const int32_t*)(dst->op_params))[0];
2997
3000
 
2998
- acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
3001
+ acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
2999
3002
  acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
3000
- acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
3003
+ acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
3004
+
3005
+ // get base information of input and kernel
3006
+ int64_t input_len = *(src1->ne);
3007
+ int64_t dst_len = *(dst->ne);
3008
+ int64_t kernel_size = *(src0->ne);
3009
+
3010
+ // set the max kernel size for each conv
3011
+ int64_t max_kernel_size = 255;
3012
+
3013
+ // compute the partition of kernel
3014
+ int64_t part_num = 1;
3015
+ part_num = (kernel_size + max_kernel_size - 1) / max_kernel_size;
3001
3016
 
3002
3017
  int64_t strideVal[1];
3003
- strideVal[0] = s0;
3004
- acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
3005
- int64_t paddingVal[] = { 0 };
3006
- acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
3007
- int64_t dilationVal[] = { 1 };
3008
- acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
3009
- int8_t cubeMathType = 0;
3018
+ strideVal[0] = s0;
3019
+ acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
3020
+ int64_t paddingVal[] = {0};
3021
+ acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
3022
+ int64_t dilationVal[] = {1};
3023
+ acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
3024
+ bool transposed = true;
3025
+ int64_t groups = 1;
3026
+ int8_t cubeMathType = 0;
3010
3027
 
3011
3028
  #ifdef ASCEND_310P
3012
3029
  cubeMathType = 1;
3013
3030
  #endif
3014
3031
 
3015
- GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), acl_weight.get(), nullptr, stride.get(), padding.get(),
3016
- dilation.get(), true, padding.get(), 1, acl_dst.get(), cubeMathType);
3032
+ auto weight_type = ggml_cann_type_mapping(src0->type);
3033
+ auto dst_type = ggml_cann_type_mapping(dst->type);
3034
+
3035
+ // slice the kernel to make each conv available
3036
+ int64_t slice_dim = -1;
3037
+ int64_t slice_start = 0;
3038
+ int64_t slice_end = max_kernel_size;
3039
+ int64_t slice_step = 1;
3040
+ int64_t interval = max_kernel_size;
3041
+
3042
+ int64_t left_pad_len = dilationVal[0] * (max_kernel_size - 1) + 1 - 2 * paddingVal[0];
3043
+ int64_t right_pad_len = 0;
3044
+
3045
+ acl_scalar_ptr alpha = nullptr;
3046
+ float alphaValue = 1.0;
3047
+ alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
3048
+
3049
+ // set zero to destination
3050
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
3051
+
3052
+ for(int k = 0; k < part_num; k++){
3053
+
3054
+ // create part kernel tensor and slice from big kernel
3055
+ slice_start = max_kernel_size * k;
3056
+ if(k == part_num - 1){
3057
+ slice_end = kernel_size;
3058
+ interval = kernel_size - max_kernel_size * k;
3059
+ }else{
3060
+ slice_end = max_kernel_size * (k+1);
3061
+ }
3062
+
3063
+ int64_t part_ne[4];
3064
+ for(int i = 0; i < 4; i++) {
3065
+ part_ne[i] = *(src0->ne + i);
3066
+ }
3067
+ part_ne[0] = interval;
3068
+
3069
+ size_t part_nb[4];
3070
+ part_nb[0] = sizeof(weight_type);
3071
+ for (int i = 1; i < 4; i++) {
3072
+ part_nb[i] = part_nb[i - 1] * part_ne[i - 1];
3073
+ }
3074
+
3075
+ ggml_cann_pool_alloc part_kernel_allocator;
3076
+ part_kernel_allocator.alloc(ctx.pool(), part_nb[3]);
3077
+ void* part_kernel_buf = part_kernel_allocator.get();
3078
+
3079
+ acl_tensor_ptr part_kernel = ggml_cann_create_tensor(part_kernel_buf, weight_type,
3080
+ ggml_element_size(src0), part_ne, part_nb, 3, ACL_FORMAT_NCL);
3081
+
3082
+ GGML_CANN_CALL_ACLNN_OP(ctx, Slice, acl_weight.get(), slice_dim, slice_start, slice_end, slice_step, part_kernel.get());
3083
+
3084
+ // create the part conv result tensor
3085
+ int64_t part_dst_ne[4];
3086
+ for(int i = 0; i < 4; i++){
3087
+ part_dst_ne[i] = *(dst->ne + i);
3088
+ }
3089
+ part_dst_ne[0] = (input_len - 1) * strideVal[0] - 2 * paddingVal[0] + dilationVal[0] * (part_ne[0] - 1) + 1;
3090
+
3091
+ size_t part_dst_nb[4];
3092
+ part_dst_nb[0] = sizeof(weight_type);
3093
+ for (int i = 1; i < 4; i++) {
3094
+ part_dst_nb[i] = part_dst_nb[i - 1] * part_dst_ne[i - 1];
3095
+ }
3096
+ ggml_cann_pool_alloc part_dst_allocator;
3097
+ part_dst_allocator.alloc(ctx.pool(), part_dst_nb[3]);
3098
+ void* part_dst_buf = part_dst_allocator.get();
3099
+
3100
+ acl_tensor_ptr acl_part_dst = ggml_cann_create_tensor(part_dst_buf, dst_type, ggml_element_size(dst),
3101
+ part_dst_ne, part_dst_nb, 3, ACL_FORMAT_NCL);
3102
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_part_dst.get());
3103
+
3104
+ // compute part conv transpose 1d
3105
+ GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), part_kernel.get(), nullptr, stride.get(),
3106
+ padding.get(), dilation.get(), transposed, padding.get(), groups, acl_part_dst.get(), cubeMathType);
3107
+
3108
+ // compute the position of part result in final result
3109
+ int64_t global_start = slice_start;
3110
+ int64_t global_end = std::min((input_len - 1) * strideVal[0] + slice_end, dst_len);
3111
+
3112
+ left_pad_len = global_start;
3113
+ right_pad_len = dst_len - global_end;
3114
+
3115
+ std::vector<int64_t> padDataVal = {left_pad_len,right_pad_len};
3116
+ acl_int_array_ptr padData = ggml_cann_create_int_array(padDataVal.data(), 2);
3117
+
3118
+ acl_scalar_ptr pad_value = nullptr;
3119
+ float pad_valueVal = 0.0;
3120
+ pad_value = ggml_cann_create_scalar(&pad_valueVal, aclDataType::ACL_FLOAT);
3121
+
3122
+ int64_t conv_result_ne[4];
3123
+ for(int i = 0; i < 4; i++){
3124
+ conv_result_ne[i] = *(dst->ne + i);
3125
+ }
3126
+
3127
+ size_t conv_result_nb[4];
3128
+ conv_result_nb[0] = sizeof(weight_type);
3129
+ for (int i = 1; i < 4; i++) {
3130
+ conv_result_nb[i] = conv_result_nb[i - 1] * conv_result_ne[i - 1];
3131
+ }
3132
+
3133
+ ggml_cann_pool_alloc conv_result_allocator;
3134
+ conv_result_allocator.alloc(ctx.pool(), conv_result_nb[3]);
3135
+ void* conv_result_buf = conv_result_allocator.get();
3136
+
3137
+ acl_tensor_ptr conv_result = ggml_cann_create_tensor(conv_result_buf, dst_type, ggml_element_size(dst),
3138
+ conv_result_ne, conv_result_nb, 3, ACL_FORMAT_NCL);
3139
+
3140
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, conv_result.get());
3141
+ GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_part_dst.get(), padData.get(), pad_value.get(), conv_result.get());
3142
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), conv_result.get(), alpha.get());
3143
+ }
3017
3144
  }
3018
3145
 
3019
3146
  void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
@@ -3576,3 +3703,160 @@ void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3576
3703
  break;
3577
3704
  }
3578
3705
  }
3706
+
3707
+ void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
3708
+ ggml_tensor * src0 = dst->src[0]; // conv_x
3709
+ ggml_tensor * src1 = dst->src[1]; // conv1d.weight
3710
+
3711
+ // This op is currently defined only for F32 in ggml_cpu
3712
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
3713
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
3714
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
3715
+
3716
+ // Shapes follow ggml_compute_forward_ssm_conv_f32
3717
+ const int64_t nc = src1->ne[0]; // d_conv
3718
+ const int64_t ncs = src0->ne[0]; // d_conv - 1 + n_t
3719
+ const int64_t nr = src0->ne[1]; // d_inner
3720
+ const int64_t n_s = src0->ne[2]; // n_seqs
3721
+
3722
+ const int64_t n_t = dst->ne[1]; // tokens per sequence
3723
+
3724
+ GGML_ASSERT(dst->ne[0] == nr); // dst: {d_inner, n_t, n_s}
3725
+ GGML_ASSERT(src1->ne[1] == nr); // weight: {d_conv, d_inner}
3726
+ GGML_ASSERT(ncs == nc - 1 + n_t); // conv_x: {d_conv - 1 + n_t, d_inner, n_s}
3727
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
3728
+ GGML_ASSERT(src1->nb[0] == sizeof(float));
3729
+
3730
+ // --- Build CANN tensors ---
3731
+
3732
+ // 1) Input: conv_x as NCL
3733
+ //
3734
+ // src0->ne = { ncs, nr, n_s, 1 } // {L_in, C, N}
3735
+ // Passing ACL_FORMAT_NCL here means:
3736
+ // reversed dims -> [N, C, L_in] = [n_s, nr, ncs]
3737
+ acl_tensor_ptr acl_x = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
3738
+
3739
+ // 2) Weights: depthwise conv kernel, view src1 as {K, 1, C}
3740
+ //
3741
+ // src1 original: ne = { nc, nr, 1, 1 } // [K, C, 1, 1]
3742
+ // we want a view: ne_w = { nc, 1, nr } // [K, 1, C]
3743
+ // so that reversed dims -> [C, 1, K] which matches
3744
+ // [out_channels, in_channels/groups, kernel_size]
3745
+ int64_t w_ne[GGML_MAX_DIMS] = { nc, 1, nr, 1 }; // [K, 1 input ch. per group, C groups]
3746
+ // Layout: src1 data is [K, C] with
3747
+ // offset(k, c) = k*nb0 + c*nb1
3748
+ // We want offset_w(k, 0, c) = k*nb0 + c*nb1,
3749
+ // so we can reuse nb0 and nb1, and set nb2 = nb1.
3750
+ size_t w_nb[GGML_MAX_DIMS] = { src1->nb[0], src1->nb[1], src1->nb[1], src1->nb[3] }; // same as src1
3751
+
3752
+ acl_tensor_ptr acl_w = ggml_cann_create_tensor(
3753
+ src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL);
3754
+
3755
+ // 3) Output: dst is { d_inner, n_t, n_s } (CLN)
3756
+ //
3757
+ // We need an NCL view of the same buffer:
3758
+ // desired NCL logical shape: { L_out = n_t, C = nr, N = n_s }
3759
+ //
3760
+ // Original CLN layout:
3761
+ // dst->ne = { nr, n_t, n_s }
3762
+ // dst->nb[0] = sizeof(float)
3763
+ // dst->nb[1] = nr * sizeof(float)
3764
+ // dst->nb[2] = nr * n_t * sizeof(float)
3765
+ //
3766
+ // We want offset_new(L, C, N) = offset_orig(C, L, N).
3767
+ // Choose:
3768
+ // nb_y[0] = nr * sizeof(float); // step in L
3769
+ // nb_y[1] = sizeof(float); // step in C
3770
+ // nb_y[2] = nr * n_t * sizeof(float); // step in N
3771
+ int64_t y_ne[GGML_MAX_DIMS] = { n_t, nr, n_s, 1 }; // [L_out, C, N]
3772
+ size_t y_nb[GGML_MAX_DIMS] = { dst->ne[0] * sizeof(float), sizeof(float), dst->ne[0] * dst->ne[1] * sizeof(float), dst->nb[3] }; // [nr, 1, nr * n_t]
3773
+
3774
+ acl_tensor_ptr acl_y = ggml_cann_create_tensor(
3775
+ dst->data, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL);
3776
+
3777
+ // --- Conv1d parameters: depthwise, stride 1, no padding ("valid") ---
3778
+ int64_t strideVal[1] = { 1 };
3779
+ int64_t paddingVal[1] = { 0 };
3780
+ int64_t dilationVal[1] = { 1 };
3781
+
3782
+ acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
3783
+ acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
3784
+ acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
3785
+
3786
+ const bool transposed = false;
3787
+ const int64_t groups = nr; // depthwise: one group per inner dim
3788
+ int8_t cubeMathType = 0;
3789
+
3790
+ #ifdef ASCEND_310P
3791
+ cubeMathType = 1;
3792
+ #endif
3793
+
3794
+ GGML_CANN_CALL_ACLNN_OP(ctx,
3795
+ Convolution,
3796
+ acl_x.get(), // input: N, C, L_in = ncs
3797
+ acl_w.get(), // weight: [C, 1, K] with groups=nr
3798
+ nullptr, // bias
3799
+ stride.get(),
3800
+ padding.get(),
3801
+ dilation.get(),
3802
+ transposed,
3803
+ padding.get(), // output padding (unused for non-transposed)
3804
+ groups,
3805
+ acl_y.get(),
3806
+ cubeMathType);
3807
+ }
3808
+
3809
+
3810
+ void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx,
3811
+ ggml_tensor * add_node,
3812
+ ggml_tensor * rms_norm_node) {
3813
+ // Get the two input tensors for ADD operation
3814
+ ggml_tensor * x1 = add_node->src[0];
3815
+ ggml_tensor * x2 = add_node->src[1];
3816
+
3817
+ // Create ACL tensors for the two ADD inputs
3818
+ acl_tensor_ptr acl_x1 = ggml_cann_create_tensor(x1);
3819
+ acl_tensor_ptr acl_x2 = ggml_cann_create_tensor(x2);
3820
+
3821
+ // Get epsilon parameter from rms_norm_tensor
3822
+ float eps;
3823
+ memcpy(&eps, rms_norm_node->op_params, sizeof(float));
3824
+
3825
+ // Build gamma tensor (RMS normalization scaling factor)
3826
+ // Gamma should match the normalized dimensions (last dimension of x1)
3827
+ size_t acl_gamma_nb[GGML_MAX_DIMS];
3828
+ acl_gamma_nb[0] = ggml_type_size(rms_norm_node->type);
3829
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
3830
+ acl_gamma_nb[i] = acl_gamma_nb[i - 1] * x1->ne[i - 1];
3831
+ }
3832
+ acl_tensor_ptr acl_gamma =
3833
+ get_cache_acl_tensor(ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, x1->ne,
3834
+ acl_gamma_nb, rms_norm_node->type,
3835
+ 1, // dims - only the last dimension
3836
+ 1.0f // value
3837
+ );
3838
+
3839
+ // Build rstdOut tensor (output for normalized standard deviation)
3840
+ // Shape should be the dimensions that are NOT normalized
3841
+ int64_t acl_rstd_ne[] = { 1, x1->ne[1], x1->ne[2], x1->ne[3] };
3842
+ size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
3843
+ acl_rstd_nb[0] = sizeof(float);
3844
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
3845
+ acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
3846
+ }
3847
+ acl_tensor_ptr acl_rstd =
3848
+ get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
3849
+ acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS,
3850
+ 0.0f // value
3851
+ );
3852
+
3853
+ acl_tensor_ptr acl_xout = ggml_cann_create_tensor(add_node);
3854
+
3855
+ // Create yOut tensor (final output after RMS normalization)
3856
+ acl_tensor_ptr acl_yout = ggml_cann_create_tensor(rms_norm_node);
3857
+
3858
+ // Call fused ADD + RMS_NORM operator
3859
+ GGML_CANN_CALL_ACLNN_OP(ctx, AddRmsNorm, acl_x1.get(), acl_x2.get(), acl_gamma.get(),
3860
+ eps, // double type
3861
+ acl_yout.get(), acl_rstd.get(), acl_xout.get());
3862
+ }
@@ -47,6 +47,7 @@
47
47
  #include <aclnnop/aclnn_sign.h>
48
48
  #include <aclnnop/aclnn_silu.h>
49
49
  #include <aclnnop/aclnn_sin.h>
50
+ #include <aclnnop/aclnn_slice.h>
50
51
  #include <aclnnop/aclnn_sqrt.h>
51
52
  #include <aclnnop/aclnn_tanh.h>
52
53
 
@@ -934,6 +935,20 @@ template <typename... Args> void register_acl_resources(std::vector<any_acl_reso
934
935
  */
935
936
  void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst);
936
937
 
938
+ /**
939
+ * @brief Performs fused ADD + RMS_NORM operation using the CANN backend.
940
+ *
941
+ * This function fuses the ADD and RMS_NORM operations into a single kernel call
942
+ * for better performance. It first adds two input tensors (x1 + x2), then applies
943
+ * RMS normalization to the result.
944
+ *
945
+ * @param ctx The context for the CANN backend operations.
946
+ * @param add_node The ADD operation node, contains the two input tensors to be added.
947
+ * @param rms_norm_node The RMS_NORM operation node, provides the epsilon parameter
948
+ * and receives the normalized output.
949
+ */
950
+ void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx, ggml_tensor * add_node, ggml_tensor * rms_norm_node);
951
+
937
952
  /**
938
953
  * @brief Check whether a tensor is a weight tensor for matrix multiplication.
939
954
  *
@@ -1032,6 +1047,8 @@ void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTenso
1032
1047
  ggml_backend_cann_context & ctx,
1033
1048
  ggml_tensor * dst);
1034
1049
 
1050
+ void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst);
1051
+
1035
1052
  /**
1036
1053
  * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
1037
1054
  *