@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -0,0 +1,91 @@
1
+ #include <algorithm>
2
+
3
+ #include "conv2d-transpose.cuh"
4
+ #include "ggml.h"
5
+
6
+ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
7
+ float * __restrict__ output, const int in_w, const int in_h, const int out_w,
8
+ const int out_h, const int kernel_w, const int kernel_h, const int stride,
9
+ const int c_in, const int c_out, const int batches) {
10
+ const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
11
+
12
+ const int total_elements = out_w * out_h * c_out * batches;
13
+
14
+ if (global_idx >= total_elements) {
15
+ return;
16
+ }
17
+
18
+ const int out_x_idx = global_idx % out_w;
19
+ const int out_y_idx = (global_idx / out_w) % out_h;
20
+ const int c_idx = (global_idx / (out_w * out_h)) % c_out;
21
+ const int n_idx = global_idx / (out_w * out_h * c_out);
22
+
23
+ float accumulator = 0;
24
+ // For each output idx, find the inputs that contribute to it by checking stride alignment and bounds
25
+
26
+ for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
27
+ for (int kh = 0; kh < kernel_h; ++kh) {
28
+ int in_y = out_y_idx - kh;
29
+ if (in_y < 0 || in_y % stride) continue;
30
+ in_y /= stride;
31
+ if (in_y >= in_h) continue;
32
+
33
+ for (int kw = 0; kw < kernel_w; ++kw) {
34
+ int in_x = out_x_idx - kw;
35
+ if (in_x < 0 || in_x % stride) continue;
36
+ in_x /= stride;
37
+ if (in_x >= in_w) continue;
38
+
39
+ const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
40
+ const int kernel_idx =
41
+ (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
42
+
43
+ float input_val = input[input_idx];
44
+ half kern_val = kernel[kernel_idx];
45
+
46
+ accumulator += input_val * (float) kern_val;
47
+ }
48
+ }
49
+ }
50
+
51
+ output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) *out_y_idx + out_x_idx] = accumulator;
52
+ }
53
+
54
+ //input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in)
55
+ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
56
+ const ggml_tensor * kernel = dst->src[0];
57
+ const ggml_tensor * input = dst->src[1];
58
+
59
+ GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
60
+
61
+ const float * input_data = (const float *) input->data;
62
+ float * output_data = (float *) dst->data;
63
+ const half * kernel_data = (const half *) kernel->data;
64
+
65
+ const int input_w = input->ne[0];
66
+ const int input_h = input->ne[1];
67
+ const int output_w = dst->ne[0];
68
+ const int output_h = dst->ne[1];
69
+ const int channels_in = input->ne[2];
70
+ const int channels_out = kernel->ne[2];
71
+ const int kernel_w = kernel->ne[0];
72
+ const int kernel_h = kernel->ne[1];
73
+ const int stride = dst->op_params[0];
74
+ const int batches = input->ne[3];
75
+
76
+ GGML_ASSERT(channels_in == kernel->ne[3]);
77
+ GGML_ASSERT(stride > 0);
78
+
79
+ cudaStream_t st = ctx.stream();
80
+
81
+ GGML_ASSERT(ggml_is_contiguous(input));
82
+ GGML_ASSERT(ggml_is_contiguous(kernel));
83
+ GGML_ASSERT(ggml_is_contiguous(dst));
84
+
85
+ const int total = (output_w * output_h * channels_out * batches);
86
+ const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;
87
+
88
+ conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
89
+ input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
90
+ channels_in, channels_out, batches);
91
+ }
@@ -0,0 +1,4 @@
1
+ #include "common.cuh"
2
+
3
+ #define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256
4
+ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -652,9 +652,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
652
652
  float KQ_max_scale[cols_per_thread];
653
653
  #pragma unroll
654
654
  for (int col = 0; col < cols_per_thread; ++col) {
655
- KQ_max_scale[col] = expf(KQ_max[col] - KQ_max_new[col]);
655
+ const float KQ_max_diff = KQ_max[col] - KQ_max_new[col];
656
+ KQ_max_scale[col] = expf(KQ_max_diff);
656
657
  KQ_max[col] = KQ_max_new[col];
657
658
 
659
+ *((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD;
660
+
658
661
  // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
659
662
  KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_rowsum_add[col];
660
663
  }
@@ -9,7 +9,11 @@
9
9
  #ifdef FP16_MMA_AVAILABLE
10
10
  #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
11
11
  #include <mma.h>
12
+ #ifdef GGML_USE_MUSA
13
+ namespace wmma = mtmusa::wmma;
14
+ #else // GGML_USE_MUSA
12
15
  namespace wmma = nvcuda::wmma;
16
+ #endif // GGML_USE_MUSA
13
17
  #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
14
18
  #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
15
19
  #include <rocwmma/rocwmma.hpp>
@@ -11,6 +11,8 @@
11
11
  #include "ggml-cuda/clamp.cuh"
12
12
  #include "ggml-cuda/concat.cuh"
13
13
  #include "ggml-cuda/conv-transpose-1d.cuh"
14
+ #include "ggml-cuda/conv2d-dw.cuh"
15
+ #include "ggml-cuda/conv2d-transpose.cuh"
14
16
  #include "ggml-cuda/convert.cuh"
15
17
  #include "ggml-cuda/count-equal.cuh"
16
18
  #include "ggml-cuda/cpy.cuh"
@@ -35,6 +37,7 @@
35
37
  #include "ggml-cuda/ssm-scan.cuh"
36
38
  #include "ggml-cuda/sum.cuh"
37
39
  #include "ggml-cuda/sumrows.cuh"
40
+ #include "ggml-cuda/mean.cuh"
38
41
  #include "ggml-cuda/tsembd.cuh"
39
42
  #include "ggml-cuda/unary.cuh"
40
43
  #include "ggml-cuda/upscale.cuh"
@@ -47,6 +50,7 @@
47
50
  #include <atomic>
48
51
  #include <charconv>
49
52
  #include <cinttypes>
53
+ #include <condition_variable>
50
54
  #include <cstddef>
51
55
  #include <cstdint>
52
56
  #include <float.h>
@@ -54,9 +58,8 @@
54
58
  #include <map>
55
59
  #include <memory>
56
60
  #include <mutex>
57
- #include <stdint.h>
58
- #include <stdio.h>
59
61
  #include <stdarg.h>
62
+ #include <stdio.h>
60
63
  #include <stdlib.h>
61
64
  #include <string>
62
65
  #include <vector>
@@ -97,8 +100,7 @@ int ggml_cuda_get_device() {
97
100
  static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
98
101
  ggml_cuda_set_device(device);
99
102
  cudaError_t err;
100
- if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
101
- {
103
+ if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
102
104
  err = cudaMallocManaged(ptr, size);
103
105
  #if defined(GGML_USE_HIP)
104
106
  if (err == hipSuccess) {
@@ -116,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
116
118
  err = cudaMalloc(ptr, size);
117
119
  }
118
120
  #endif // defined(GGML_USE_HIP)
119
- }
120
- else
121
- {
121
+ } else {
122
122
  err = cudaMalloc(ptr, size);
123
123
  }
124
124
  return err;
@@ -514,6 +514,33 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
514
514
  return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
515
515
  }
516
516
 
517
+ // destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
518
+ // this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured
519
+
520
+ static std::mutex ggml_cuda_lock;
521
+ static std::condition_variable ggml_cuda_lock_cv;
522
+ static std::atomic<int> ggml_cuda_lock_counter;
523
+
524
+ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
525
+ std::unique_lock<std::mutex> lock(ggml_cuda_lock);
526
+ ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
527
+
528
+ if (copy_event != nullptr) {
529
+ CUDA_CHECK(cudaEventDestroy(copy_event));
530
+ }
531
+ for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
532
+ for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
533
+ if (streams[i][j] != nullptr) {
534
+ CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
535
+ }
536
+ }
537
+ if (cublas_handles[i] != nullptr) {
538
+ CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
539
+ }
540
+ }
541
+ }
542
+
543
+
517
544
  // cuda buffer
518
545
 
519
546
  struct ggml_backend_cuda_buffer_context {
@@ -615,9 +642,8 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
615
642
  ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
616
643
 
617
644
  ggml_cuda_set_device(ctx->device);
618
- CUDA_CHECK(cudaDeviceSynchronize());
619
- CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
620
- CUDA_CHECK(cudaDeviceSynchronize());
645
+ CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread));
646
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
621
647
  }
622
648
 
623
649
  static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
@@ -1144,7 +1170,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
1144
1170
  static cudaError_t ggml_cuda_cpy_tensor_2d(
1145
1171
  void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1146
1172
 
1147
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
1148
1173
  const char * src_ptr = (const char *) src->data;
1149
1174
  char * dst_ptr = (char *) dst;
1150
1175
 
@@ -1202,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas(
1202
1227
 
1203
1228
  const int cc = ggml_cuda_info().devices[id].cc;
1204
1229
 
1230
+ const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
1231
+ (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
1232
+
1205
1233
  const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
1206
1234
 
1207
- if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
1235
+ if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
1208
1236
  ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
1209
1237
  if (src1->type != GGML_TYPE_BF16) {
1210
1238
  const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
@@ -1232,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas(
1232
1260
 
1233
1261
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
1234
1262
  to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
1235
- } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
1263
+ } else if (fast_fp16_hardware_available(cc) && use_fp16) {
1236
1264
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
1237
1265
  ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
1238
1266
  if (src0->type != GGML_TYPE_F16) {
@@ -1427,8 +1455,6 @@ static void ggml_cuda_op_mul_mat(
1427
1455
  const int64_t nb2 = dst->nb[2];
1428
1456
  const int64_t nb3 = dst->nb[3];
1429
1457
 
1430
- GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
1431
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
1432
1458
  ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
1433
1459
  ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
1434
1460
 
@@ -1750,7 +1776,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
1750
1776
  GGML_ASSERT(!ggml_is_transposed(src0));
1751
1777
  GGML_ASSERT(!ggml_is_transposed(src1));
1752
1778
 
1753
- GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
1779
+ GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft));
1754
1780
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
1755
1781
 
1756
1782
  // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
@@ -1920,16 +1946,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
1920
1946
  && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
1921
1947
 
1922
1948
  bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
1923
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1924
- && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
1949
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
1925
1950
  bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
1926
1951
  && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
1927
1952
  && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
1928
1953
  bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
1929
1954
  && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
1930
1955
 
1931
- bool any_gpus_with_slow_fp16 = false;
1932
- bool any_gpus_without_fp16_mma = false;
1956
+ bool any_gpus_with_slow_fp16 = false;
1933
1957
 
1934
1958
  if (split) {
1935
1959
  ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1940,16 +1964,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
1940
1964
  continue;
1941
1965
  }
1942
1966
 
1943
- const int cc = ggml_cuda_info().devices[id].cc;
1944
- use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1945
- any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
1946
- any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
1967
+ const int cc = ggml_cuda_info().devices[id].cc;
1968
+ use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1969
+ use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
1970
+ any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
1947
1971
  }
1948
1972
  } else {
1949
- const int cc = ggml_cuda_info().devices[ctx.device].cc;
1950
- use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1951
- any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
1952
- any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
1973
+ const int cc = ggml_cuda_info().devices[ctx.device].cc;
1974
+ use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
1975
+ use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
1976
+ any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
1953
1977
  }
1954
1978
 
1955
1979
  // debug helpers
@@ -1960,7 +1984,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
1960
1984
  //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
1961
1985
  //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
1962
1986
 
1963
- if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
1987
+ if (!split && use_mul_mat_vec) {
1964
1988
  // the custom F16 vector kernel can be used over batched cuBLAS GEMM
1965
1989
  // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
1966
1990
  ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
@@ -2314,6 +2338,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2314
2338
  case GGML_OP_IM2COL:
2315
2339
  ggml_cuda_op_im2col(ctx, dst);
2316
2340
  break;
2341
+ case GGML_OP_CONV_2D_DW:
2342
+ ggml_cuda_op_conv2d_dw(ctx, dst);
2343
+ break;
2344
+ case GGML_OP_CONV_TRANSPOSE_2D:
2345
+ ggml_cuda_conv_2d_transpose_p0(ctx, dst);
2346
+ break;
2317
2347
  case GGML_OP_CONV_TRANSPOSE_1D:
2318
2348
  ggml_cuda_op_conv_transpose_1d(ctx,dst);
2319
2349
  break;
@@ -2326,6 +2356,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
2326
2356
  case GGML_OP_SUM_ROWS:
2327
2357
  ggml_cuda_op_sum_rows(ctx, dst);
2328
2358
  break;
2359
+ case GGML_OP_MEAN:
2360
+ ggml_cuda_op_mean(ctx, dst);
2361
+ break;
2329
2362
  case GGML_OP_SSM_CONV:
2330
2363
  ggml_cuda_op_ssm_conv(ctx, dst);
2331
2364
  break;
@@ -2668,7 +2701,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
2668
2701
  ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
2669
2702
  }
2670
2703
  }
2671
- #endif
2704
+ #else
2705
+ GGML_UNUSED(integrated);
2706
+ #endif // NDEBUG
2672
2707
 
2673
2708
  bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
2674
2709
  if (!ok) {
@@ -2687,6 +2722,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
2687
2722
 
2688
2723
  CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
2689
2724
  graph_evaluated_or_captured = true; // CUDA graph has been captured
2725
+
2726
+ std::lock_guard<std::mutex> lock(ggml_cuda_lock);
2727
+ if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
2728
+ ggml_cuda_lock_cv.notify_all();
2729
+ }
2690
2730
  } else {
2691
2731
  graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
2692
2732
  }
@@ -2762,7 +2802,13 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
2762
2802
  }
2763
2803
  }
2764
2804
 
2765
- if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
2805
+ if (use_cuda_graph && cuda_graph_update_required) {
2806
+ // Start CUDA graph capture
2807
+ {
2808
+ std::lock_guard<std::mutex> lock(ggml_cuda_lock);
2809
+ ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
2810
+ }
2811
+
2766
2812
  CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
2767
2813
  }
2768
2814
 
@@ -3018,9 +3064,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
3018
3064
  return false;
3019
3065
  }
3020
3066
  #ifdef GGML_USE_MUSA
3021
- if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
3022
- !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
3023
- return false;
3067
+ const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3068
+ if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
3069
+ if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
3070
+ a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
3071
+ return false;
3072
+ }
3073
+ if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
3074
+ a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
3075
+ return false;
3076
+ }
3024
3077
  }
3025
3078
  #endif // GGML_USE_MUSA
3026
3079
  switch (a->type) {
@@ -3047,11 +3100,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
3047
3100
  case GGML_TYPE_IQ4_NL:
3048
3101
  case GGML_TYPE_IQ4_XS:
3049
3102
  case GGML_TYPE_BF16:
3050
- #ifdef GGML_USE_MUSA
3051
- if (a->type == GGML_TYPE_Q3_K) {
3052
- return false;
3053
- }
3054
- #endif // GGML_USE_MUSA
3055
3103
  return true;
3056
3104
  default:
3057
3105
  return false;
@@ -3211,9 +3259,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
3211
3259
  return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
3212
3260
  }
3213
3261
  case GGML_OP_IM2COL:
3262
+ case GGML_OP_CONV_2D_DW:
3263
+ case GGML_OP_CONV_TRANSPOSE_2D:
3214
3264
  case GGML_OP_POOL_2D:
3215
3265
  case GGML_OP_SUM:
3216
3266
  case GGML_OP_SUM_ROWS:
3267
+ case GGML_OP_MEAN:
3217
3268
  case GGML_OP_ARGSORT:
3218
3269
  case GGML_OP_ACC:
3219
3270
  return true;
@@ -0,0 +1,19 @@
1
+ #include "mean.cuh"
2
+
3
+ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
4
+ const ggml_tensor * src0 = dst->src[0];
5
+ const float * src0_d = (const float *) src0->data;
6
+ float * dst_d = (float *) dst->data;
7
+ cudaStream_t stream = ctx.stream();
8
+
9
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
10
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
11
+ GGML_ASSERT(ggml_is_contiguous(src0));
12
+
13
+ const int64_t ncols = src0->ne[0];
14
+ const int64_t nrows = ggml_nrows(src0);
15
+
16
+ const dim3 block_dims(WARP_SIZE, 1, 1);
17
+ const dim3 block_nums(nrows, 1, 1);
18
+ reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
19
+ }
@@ -0,0 +1,3 @@
1
+ #include "common.cuh"
2
+
3
+ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst);