@novastera-oss/llamarn 0.2.9 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247)
  1. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  5. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  17. package/cpp/build-info.cpp +2 -2
  18. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  19. package/cpp/llama.cpp/README.md +4 -5
  20. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  21. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  22. package/cpp/llama.cpp/common/arg.cpp +17 -0
  23. package/cpp/llama.cpp/common/chat.cpp +37 -20
  24. package/cpp/llama.cpp/common/chat.h +2 -0
  25. package/cpp/llama.cpp/common/common.h +4 -0
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
  27. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  28. package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
  29. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  30. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  33. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  35. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  43. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  47. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  66. package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  68. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
  69. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
  70. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
  71. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
  73. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
  92. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  93. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  117. package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
  118. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
  120. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  121. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
  122. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  123. package/cpp/llama.cpp/include/llama.h +0 -40
  124. package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
  125. package/cpp/llama.cpp/src/llama-arch.h +18 -1
  126. package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
  127. package/cpp/llama.cpp/src/llama-batch.h +8 -1
  128. package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
  129. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  130. package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
  131. package/cpp/llama.cpp/src/llama-graph.h +47 -60
  132. package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
  133. package/cpp/llama.cpp/src/llama-hparams.h +3 -0
  134. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  138. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  139. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  141. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
  142. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  143. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  144. package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
  145. package/cpp/llama.cpp/src/llama-model.h +18 -0
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
  148. package/cpp/llama.cpp/src/llama-vocab.h +41 -0
  149. package/ios/include/chat.h +2 -0
  150. package/ios/include/common.h +4 -0
  151. package/ios/include/llama.h +0 -40
  152. package/ios/libs/llama.xcframework/Info.plist +19 -19
  153. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
  155. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
  158. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  163. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  164. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  165. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
  172. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  173. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  174. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
  175. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  176. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  177. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  178. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
  179. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  180. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
  183. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  184. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  185. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
  186. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  187. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  188. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  189. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  190. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  191. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  192. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  193. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  194. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  195. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
  196. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  197. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  198. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
  199. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
  202. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
  203. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  204. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  205. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  206. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  207. package/package.json +1 -1
  208. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  209. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  210. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  211. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  212. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  213. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  214. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  215. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  216. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  217. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  218. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  219. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  220. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  221. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  222. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  223. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  224. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  225. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  226. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  227. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  228. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  229. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  230. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  231. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  232. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  233. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  234. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  235. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  236. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  237. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  238. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  239. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  240. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  241. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  242. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  243. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  244. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  245. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  246. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  247. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -128,6 +128,9 @@ models = [
128
128
  {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
129
129
  {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
130
130
  {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
131
+ {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
132
+ {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
133
+ {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
131
134
  ]
132
135
 
133
136
  # some models are known to be broken upstream, so we will skip them as exceptions
@@ -137,6 +140,12 @@ pre_computed_hashes = [
137
140
  {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
138
141
  {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
139
142
  {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
143
+ {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
144
+ # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
145
+ {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
146
+ {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
147
+ {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
148
+ {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
140
149
  ]
141
150
 
142
151
 
@@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
181
181
  option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
182
182
  option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
183
183
  option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
184
- option(GGML_KOMPUTE "ggml: use Kompute" OFF)
185
184
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
186
185
  option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
187
186
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
266
265
  include/ggml-cann.h
267
266
  include/ggml-cpp.h
268
267
  include/ggml-cuda.h
269
- include/ggml-kompute.h
270
268
  include/ggml-opt.h
271
269
  include/ggml-metal.h
272
270
  include/ggml-rpc.h
@@ -360,6 +358,13 @@ write_basic_package_version_file(
360
358
  VERSION ${GGML_INSTALL_VERSION}
361
359
  COMPATIBILITY SameMajorVersion)
362
360
 
361
+ target_compile_definitions(ggml-base PRIVATE
362
+ GGML_VERSION="${GGML_INSTALL_VERSION}"
363
+ GGML_COMMIT="${GGML_BUILD_COMMIT}"
364
+ )
365
+ message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
366
+ message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
367
+
363
368
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
364
369
  ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
365
370
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
@@ -339,7 +339,7 @@ extern "C" {
339
339
  typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
340
340
 
341
341
  // Compare the output of two backends
342
- GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
342
+ GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
343
343
 
344
344
  // Tensor initialization
345
345
  GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
@@ -314,6 +314,13 @@
314
314
  extern "C" {
315
315
  #endif
316
316
 
317
+ // Function type used in fatal error callbacks
318
+ typedef void (*ggml_abort_callback_t)(const char * error_message);
319
+
320
+ // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
321
+ // Returns the old callback for chaining
322
+ GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
323
+
317
324
  GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
318
325
  GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
319
326
 
@@ -482,12 +489,13 @@ extern "C" {
482
489
  GGML_OP_CONV_TRANSPOSE_1D,
483
490
  GGML_OP_IM2COL,
484
491
  GGML_OP_IM2COL_BACK,
492
+ GGML_OP_CONV_2D,
485
493
  GGML_OP_CONV_2D_DW,
486
494
  GGML_OP_CONV_TRANSPOSE_2D,
487
495
  GGML_OP_POOL_1D,
488
496
  GGML_OP_POOL_2D,
489
497
  GGML_OP_POOL_2D_BACK,
490
- GGML_OP_UPSCALE, // nearest interpolate
498
+ GGML_OP_UPSCALE,
491
499
  GGML_OP_PAD,
492
500
  GGML_OP_PAD_REFLECT_1D,
493
501
  GGML_OP_ROLL,
@@ -520,6 +528,8 @@ extern "C" {
520
528
  GGML_OP_CROSS_ENTROPY_LOSS_BACK,
521
529
  GGML_OP_OPT_STEP_ADAMW,
522
530
 
531
+ GGML_OP_GLU,
532
+
523
533
  GGML_OP_COUNT,
524
534
  };
525
535
 
@@ -543,6 +553,16 @@ extern "C" {
543
553
  GGML_UNARY_OP_COUNT,
544
554
  };
545
555
 
556
+ enum ggml_glu_op {
557
+ GGML_GLU_OP_REGLU,
558
+ GGML_GLU_OP_GEGLU,
559
+ GGML_GLU_OP_SWIGLU,
560
+ GGML_GLU_OP_GEGLU_ERF,
561
+ GGML_GLU_OP_GEGLU_QUICK,
562
+
563
+ GGML_GLU_OP_COUNT,
564
+ };
565
+
546
566
  enum ggml_object_type {
547
567
  GGML_OBJECT_TYPE_TENSOR,
548
568
  GGML_OBJECT_TYPE_GRAPH,
@@ -628,6 +648,9 @@ extern "C" {
628
648
 
629
649
  // misc
630
650
 
651
+ GGML_API const char * ggml_version(void);
652
+ GGML_API const char * ggml_commit(void);
653
+
631
654
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
632
655
  GGML_API int64_t ggml_time_ms(void);
633
656
  GGML_API int64_t ggml_time_us(void);
@@ -658,6 +681,7 @@ extern "C" {
658
681
  GGML_API const char * ggml_op_symbol(enum ggml_op op);
659
682
 
660
683
  GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
684
+ GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op);
661
685
  GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
662
686
 
663
687
  GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@@ -762,6 +786,7 @@ extern "C" {
762
786
  GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
763
787
 
764
788
  GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
789
+ GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor);
765
790
 
766
791
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
767
792
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
@@ -1090,6 +1115,89 @@ extern "C" {
1090
1115
  struct ggml_context * ctx,
1091
1116
  struct ggml_tensor * a);
1092
1117
 
1118
+ // gated linear unit ops
1119
+ // A: n columns, r rows,
1120
+ // result is n / 2 columns, r rows,
1121
+ // expects gate in second half of row, unless swapped is true
1122
+ GGML_API struct ggml_tensor * ggml_glu(
1123
+ struct ggml_context * ctx,
1124
+ struct ggml_tensor * a,
1125
+ enum ggml_glu_op op,
1126
+ bool swapped);
1127
+
1128
+ GGML_API struct ggml_tensor * ggml_reglu(
1129
+ struct ggml_context * ctx,
1130
+ struct ggml_tensor * a);
1131
+
1132
+ GGML_API struct ggml_tensor * ggml_reglu_swapped(
1133
+ struct ggml_context * ctx,
1134
+ struct ggml_tensor * a);
1135
+
1136
+ GGML_API struct ggml_tensor * ggml_geglu(
1137
+ struct ggml_context * ctx,
1138
+ struct ggml_tensor * a);
1139
+
1140
+ GGML_API struct ggml_tensor * ggml_geglu_swapped(
1141
+ struct ggml_context * ctx,
1142
+ struct ggml_tensor * a);
1143
+
1144
+ GGML_API struct ggml_tensor * ggml_swiglu(
1145
+ struct ggml_context * ctx,
1146
+ struct ggml_tensor * a);
1147
+
1148
+ GGML_API struct ggml_tensor * ggml_swiglu_swapped(
1149
+ struct ggml_context * ctx,
1150
+ struct ggml_tensor * a);
1151
+
1152
+ GGML_API struct ggml_tensor * ggml_geglu_erf(
1153
+ struct ggml_context * ctx,
1154
+ struct ggml_tensor * a);
1155
+
1156
+ GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
1157
+ struct ggml_context * ctx,
1158
+ struct ggml_tensor * a);
1159
+
1160
+ GGML_API struct ggml_tensor * ggml_geglu_quick(
1161
+ struct ggml_context * ctx,
1162
+ struct ggml_tensor * a);
1163
+
1164
+ GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
1165
+ struct ggml_context * ctx,
1166
+ struct ggml_tensor * a);
1167
+
1168
+ // A: n columns, r rows,
1169
+ // B: n columns, r rows,
1170
+ GGML_API struct ggml_tensor * ggml_glu_split(
1171
+ struct ggml_context * ctx,
1172
+ struct ggml_tensor * a,
1173
+ struct ggml_tensor * b,
1174
+ enum ggml_glu_op op);
1175
+
1176
+ GGML_API struct ggml_tensor * ggml_reglu_split(
1177
+ struct ggml_context * ctx,
1178
+ struct ggml_tensor * a,
1179
+ struct ggml_tensor * b);
1180
+
1181
+ GGML_API struct ggml_tensor * ggml_geglu_split(
1182
+ struct ggml_context * ctx,
1183
+ struct ggml_tensor * a,
1184
+ struct ggml_tensor * b);
1185
+
1186
+ GGML_API struct ggml_tensor * ggml_swiglu_split(
1187
+ struct ggml_context * ctx,
1188
+ struct ggml_tensor * a,
1189
+ struct ggml_tensor * b);
1190
+
1191
+ GGML_API struct ggml_tensor * ggml_geglu_erf_split(
1192
+ struct ggml_context * ctx,
1193
+ struct ggml_tensor * a,
1194
+ struct ggml_tensor * b);
1195
+
1196
+ GGML_API struct ggml_tensor * ggml_geglu_quick_split(
1197
+ struct ggml_context * ctx,
1198
+ struct ggml_tensor * a,
1199
+ struct ggml_tensor * b);
1200
+
1093
1201
  // normalize along rows
1094
1202
  GGML_API struct ggml_tensor * ggml_norm(
1095
1203
  struct ggml_context * ctx,
@@ -1189,6 +1297,19 @@ extern "C" {
1189
1297
  struct ggml_tensor * a,
1190
1298
  float s);
1191
1299
 
1300
+ // x = s * a + b
1301
+ GGML_API struct ggml_tensor * ggml_scale_bias(
1302
+ struct ggml_context * ctx,
1303
+ struct ggml_tensor * a,
1304
+ float s,
1305
+ float b);
1306
+
1307
+ GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
1308
+ struct ggml_context * ctx,
1309
+ struct ggml_tensor * a,
1310
+ float s,
1311
+ float b);
1312
+
1192
1313
  // b -> view(a,offset,nb1,nb2,3), return modified a
1193
1314
  GGML_API struct ggml_tensor * ggml_set(
1194
1315
  struct ggml_context * ctx,
@@ -1433,8 +1554,14 @@ extern "C" {
1433
1554
  struct ggml_context * ctx,
1434
1555
  struct ggml_tensor * a);
1435
1556
 
1557
+ // a [ne0, ne01, ne02, ne03]
1558
+ // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
1559
+ //
1560
+ // broadcast:
1561
+ // ne02 % ne12 == 0
1562
+ // ne03 % ne13 == 0
1563
+ //
1436
1564
  // fused soft_max(a*scale + mask*(ALiBi slope))
1437
- // mask is optional
1438
1565
  // max_bias = 0.0f for no ALiBi
1439
1566
  GGML_API struct ggml_tensor * ggml_soft_max_ext(
1440
1567
  struct ggml_context * ctx,
@@ -1744,6 +1871,17 @@ extern "C" {
1744
1871
  struct ggml_tensor * b,
1745
1872
  int stride);
1746
1873
 
1874
+ GGML_API struct ggml_tensor * ggml_conv_2d_direct(
1875
+ struct ggml_context * ctx,
1876
+ struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
1877
+ struct ggml_tensor * b, // input data [W, H, C, N]
1878
+ int s0, // stride dimension 0
1879
+ int s1, // stride dimension 1
1880
+ int p0, // padding dimension 0
1881
+ int p1, // padding dimension 1
1882
+ int d0, // dilation dimension 0
1883
+ int d1); // dilation dimension 1
1884
+
1747
1885
  enum ggml_op_pool {
1748
1886
  GGML_OP_POOL_MAX,
1749
1887
  GGML_OP_POOL_AVG,
@@ -1786,6 +1924,12 @@ extern "C" {
1786
1924
  enum ggml_scale_mode {
1787
1925
  GGML_SCALE_MODE_NEAREST = 0,
1788
1926
  GGML_SCALE_MODE_BILINEAR = 1,
1927
+
1928
+ GGML_SCALE_MODE_COUNT
1929
+ };
1930
+
1931
+ enum ggml_scale_flag {
1932
+ GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
1789
1933
  };
1790
1934
 
1791
1935
  // interpolate
@@ -1798,14 +1942,26 @@ extern "C" {
1798
1942
 
1799
1943
  // interpolate
1800
1944
  // interpolate scale to specified dimensions
1801
- GGML_API struct ggml_tensor * ggml_upscale_ext(
1945
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
1802
1946
  struct ggml_context * ctx,
1803
1947
  struct ggml_tensor * a,
1804
1948
  int ne0,
1805
1949
  int ne1,
1806
1950
  int ne2,
1807
1951
  int ne3,
1808
- enum ggml_scale_mode mode);
1952
+ enum ggml_scale_mode mode),
1953
+ "use ggml_interpolate instead");
1954
+
1955
+ // Up- or downsamples the input to the specified size.
1956
+ // 2D scale modes (eg. bilinear) are applied to the first two dimensions.
1957
+ GGML_API struct ggml_tensor * ggml_interpolate(
1958
+ struct ggml_context * ctx,
1959
+ struct ggml_tensor * a,
1960
+ int64_t ne0,
1961
+ int64_t ne1,
1962
+ int64_t ne2,
1963
+ int64_t ne3,
1964
+ uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...]
1809
1965
 
1810
1966
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
1811
1967
  GGML_API struct ggml_tensor * ggml_pad(
@@ -1868,11 +2024,17 @@ extern "C" {
1868
2024
 
1869
2025
  #define GGML_KQ_MASK_PAD 64
1870
2026
 
1871
- // q: [n_embd_k, n_batch, n_head, 1]
1872
- // k: [n_embd_k, n_kv, n_head_kv, 1]
1873
- // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
1874
- // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1875
- // res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
2027
+ // q: [n_embd_k, n_batch, n_head, ne3 ]
2028
+ // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
2029
+ // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
2030
+ // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
2031
+ // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
2032
+ //
2033
+ // broadcast:
2034
+ // n_head % n_head_kv == 0
2035
+ // n_head % ne32 == 0
2036
+ // ne3 % ne33 == 0
2037
+ //
1876
2038
  GGML_API struct ggml_tensor * ggml_flash_attn_ext(
1877
2039
  struct ggml_context * ctx,
1878
2040
  struct ggml_tensor * q,
@@ -1911,7 +2073,8 @@ extern "C" {
1911
2073
  struct ggml_tensor * dt,
1912
2074
  struct ggml_tensor * A,
1913
2075
  struct ggml_tensor * B,
1914
- struct ggml_tensor * C);
2076
+ struct ggml_tensor * C,
2077
+ struct ggml_tensor * ids);
1915
2078
 
1916
2079
  // partition into non-overlapping windows with padding if needed
1917
2080
  // example:
@@ -365,7 +365,6 @@ ggml_add_backend(BLAS)
365
365
  ggml_add_backend(CANN)
366
366
  ggml_add_backend(CUDA)
367
367
  ggml_add_backend(HIP)
368
- ggml_add_backend(Kompute)
369
368
  ggml_add_backend(METAL)
370
369
  ggml_add_backend(MUSA)
371
370
  ggml_add_backend(RPC)
@@ -61,10 +61,6 @@
61
61
  #include "ggml-cann.h"
62
62
  #endif
63
63
 
64
- #ifdef GGML_USE_KOMPUTE
65
- #include "ggml-kompute.h"
66
- #endif
67
-
68
64
  // disable C++17 deprecation warning for std::codecvt_utf8
69
65
  #if defined(__clang__)
70
66
  # pragma clang diagnostic push
@@ -189,9 +185,6 @@ struct ggml_backend_registry {
189
185
  #ifdef GGML_USE_RPC
190
186
  register_backend(ggml_backend_rpc_reg());
191
187
  #endif
192
- #ifdef GGML_USE_KOMPUTE
193
- register_backend(ggml_backend_kompute_reg());
194
- #endif
195
188
  #ifdef GGML_USE_CPU
196
189
  register_backend(ggml_backend_cpu_reg());
197
190
  #endif
@@ -575,7 +568,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
575
568
  ggml_backend_load_best("cann", silent, dir_path);
576
569
  ggml_backend_load_best("cuda", silent, dir_path);
577
570
  ggml_backend_load_best("hip", silent, dir_path);
578
- ggml_backend_load_best("kompute", silent, dir_path);
579
571
  ggml_backend_load_best("metal", silent, dir_path);
580
572
  ggml_backend_load_best("rpc", silent, dir_path);
581
573
  ggml_backend_load_best("sycl", silent, dir_path);
@@ -817,8 +817,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
817
817
  }
818
818
  if (sched->debug > 1) {
819
819
  ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
820
- GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
821
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
820
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
821
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
822
+ graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
822
823
  for (int j = 0; j < GGML_MAX_SRC; j++) {
823
824
  struct ggml_tensor * src = node->src[j];
824
825
  if (src == NULL) {
@@ -1826,7 +1827,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
1826
1827
  ggml_free(copy.ctx_unallocated);
1827
1828
  }
1828
1829
 
1829
- bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
1830
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
1830
1831
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
1831
1832
  if (copy.buffer == NULL) {
1832
1833
  return false;
@@ -1837,28 +1838,45 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
1837
1838
 
1838
1839
  assert(g1->n_nodes == g2->n_nodes);
1839
1840
 
1840
- for (int i = 0; i < g1->n_nodes; i++) {
1841
- struct ggml_tensor * t1 = g1->nodes[i];
1842
- struct ggml_tensor * t2 = g2->nodes[i];
1841
+ if (test_node != nullptr) {
1842
+ // Compute the whole graph and only test the output for a specific tensor
1843
+ ggml_backend_graph_compute(backend1, g1);
1844
+ ggml_backend_graph_compute(backend2, g2);
1843
1845
 
1844
- assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
1846
+ int test_node_idx = -1;
1847
+ for (int i = 0; i < g1->n_nodes; i++) {
1848
+ struct ggml_tensor * t1 = g1->nodes[i];
1849
+ if (t1 == test_node) {
1850
+ test_node_idx = i;
1851
+ break;
1852
+ }
1853
+ }
1854
+ GGML_ASSERT(test_node_idx != -1);
1845
1855
 
1846
- struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
1847
- struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
1856
+ callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
1857
+ } else {
1858
+ for (int i = 0; i < g1->n_nodes; i++) {
1859
+ struct ggml_tensor * t1 = g1->nodes[i];
1860
+ struct ggml_tensor * t2 = g2->nodes[i];
1848
1861
 
1849
- ggml_backend_graph_compute(backend1, &g1v);
1850
- ggml_backend_graph_compute(backend2, &g2v);
1862
+ assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
1851
1863
 
1852
- if (ggml_is_view_op(t1->op)) {
1853
- continue;
1854
- }
1864
+ struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
1865
+ struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
1855
1866
 
1856
- // compare results, calculate rms etc
1857
- if (!callback(i, t1, t2, user_data)) {
1858
- break;
1867
+ ggml_backend_graph_compute(backend1, &g1v);
1868
+ ggml_backend_graph_compute(backend2, &g2v);
1869
+
1870
+ if (ggml_is_view_op(t1->op)) {
1871
+ continue;
1872
+ }
1873
+
1874
+ // compare results, calculate rms etc
1875
+ if (!callback(i, t1, t2, user_data)) {
1876
+ break;
1877
+ }
1859
1878
  }
1860
1879
  }
1861
-
1862
1880
  ggml_backend_graph_copy_free(copy);
1863
1881
 
1864
1882
  return true;
@@ -65,8 +65,9 @@
65
65
  #include <aclnnop/aclnn_eq_tensor.h>
66
66
  #include <aclnnop/aclnn_gt_scalar.h>
67
67
  #include <aclnnop/aclnn_pow.h>
68
- #include <aclnnop/aclnn_grouped_matmul_v2.h>
68
+ #include <aclnnop/aclnn_grouped_matmul_v3.h>
69
69
  #include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
70
+ #include <aclnnop/aclnn_zero.h>
70
71
  #include <float.h>
71
72
 
72
73
  #include <cmath>
@@ -804,10 +805,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
804
805
  nb[i] = nb[i - 1] * ne[i - 1];
805
806
  }
806
807
 
807
- ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
808
808
  aclTensor* zero =
809
809
  ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
810
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
810
811
  return zero;
812
+ GGML_UNUSED(n_bytes);
811
813
  }
812
814
 
813
815
  /**
@@ -2654,6 +2656,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
2654
2656
  memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
2655
2657
  }
2656
2658
 
2659
+ #ifdef ASCEND_310P
2660
+ ggml_tensor src0_row = *src0;
2661
+ ggml_tensor src1_row = *src1;
2662
+ ggml_tensor dst_row = *dst;
2663
+
2664
+ if (src0->type == GGML_TYPE_F16) {
2665
+ src0_row.type = GGML_TYPE_F32;
2666
+ }
2667
+
2668
+ // src0_row [D, M, 1, 1] weight without permute
2669
+ src0_row.ne[2] = 1;
2670
+ src0_row.ne[3] = 1;
2671
+ src0_row.nb[0] = ori_src0_nb[0];
2672
+ src0_row.nb[1] = ori_src0_nb[1];
2673
+ src0_row.nb[2] = ori_src0_nb[1];
2674
+ src0_row.nb[3] = ori_src0_nb[1];
2675
+
2676
+ // src1_row [D, 1, 1, 1] -> input
2677
+ src1_row.ne[1] = 1;
2678
+ src1_row.ne[2] = 1;
2679
+ src1_row.ne[3] = 1;
2680
+ src1_row.nb[2] = nb11;
2681
+ src1_row.nb[3] = nb11;
2682
+
2683
+ // dst_row [M, 1, 1, 1] -> out
2684
+ dst_row.ne[1] = 1;
2685
+ dst_row.ne[2] = 1;
2686
+ dst_row.ne[3] = 1;
2687
+ dst_row.nb[2] = nb1;
2688
+ dst_row.nb[3] = nb1;
2689
+
2690
+ //create weight for one row
2691
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2692
+ for (int64_t id = 0; id < n_ids; id++) {
2693
+ // expert index
2694
+ int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2695
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
2696
+
2697
+ // If B = 1 (broadcast), always use 0; otherwise, use id.
2698
+ int64_t i11 = (ne11 == 1 ? 0 : id);
2699
+ int64_t i12 = iid1;
2700
+
2701
+ int64_t i1 = id;
2702
+ int64_t i2 = i12;
2703
+
2704
+ void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
2705
+ void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
2706
+ void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
2707
+
2708
+ src0_row.data = src0_tmp_ptr;
2709
+ src1_row.data = src1_tmp_ptr;
2710
+ dst_row.data = dst_tmp_ptr;
2711
+ dst_row.src[0] = &src0_row;
2712
+ dst_row.src[1] = &src1_row;
2713
+
2714
+ ggml_cann_mul_mat(ctx, &dst_row);
2715
+ }
2716
+ }
2717
+ return;
2718
+ #endif
2719
+
2657
2720
  std::vector<aclTensor*> src0_tensor_vec;
2658
2721
  std::vector<aclTensor*> src1_tensor_vec;
2659
2722
  std::vector<aclTensor*> dst_tensor_vec;
@@ -2701,9 +2764,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
2701
2764
  }
2702
2765
 
2703
2766
  size_t GROUP_SIZE = 128;
2704
- // GroupedMatmulV2 required tensor_list.size < 128
2767
+ // GroupedMatmulV3 required tensor_list.size < 128
2705
2768
  for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
2706
- // split and call GroupedMatmulV2
2769
+ // split and call GroupedMatmulV3
2707
2770
  size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
2708
2771
  std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
2709
2772
  std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@@ -2713,7 +2776,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
2713
2776
  aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
2714
2777
  aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
2715
2778
 
2716
- GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
2779
+ GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
2717
2780
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
2718
2781
 
2719
2782
  ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
@@ -2086,6 +2086,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2086
2086
  return false;
2087
2087
  }
2088
2088
  } break;
2089
+ case GGML_OP_SET_ROWS:
2090
+ {
2091
+ // TODO: add support
2092
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14274
2093
+ return false;
2094
+ } break;
2089
2095
  case GGML_OP_CPY: {
2090
2096
  ggml_tensor *src = op->src[0];
2091
2097
  if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
@@ -2182,12 +2188,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2182
2188
  case GGML_OP_MUL:
2183
2189
  case GGML_OP_DIV:
2184
2190
  case GGML_OP_RMS_NORM:
2185
- case GGML_OP_SCALE:
2186
2191
  case GGML_OP_SQR:
2187
2192
  case GGML_OP_SQRT:
2188
2193
  case GGML_OP_CLAMP:
2189
2194
  case GGML_OP_DIAG_MASK_INF:
2190
- case GGML_OP_SOFT_MAX:
2191
2195
  case GGML_OP_SUM_ROWS:
2192
2196
  case GGML_OP_ARGSORT:
2193
2197
  case GGML_OP_ACC:
@@ -2205,6 +2209,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2205
2209
  case GGML_OP_PAD_REFLECT_1D:
2206
2210
  case GGML_OP_COUNT_EQUAL:
2207
2211
  return true;
2212
+ case GGML_OP_SCALE:
2213
+ float bias;
2214
+ memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
2215
+ return bias == 0.0f; // TODO: support bias != 0.0f
2216
+ case GGML_OP_SOFT_MAX:
2217
+ // TODO: support broadcast
2218
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14435
2219
+ return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
2208
2220
  case GGML_OP_FLASH_ATTN_EXT:{
2209
2221
  // derived from [ggml-cuda.cu]
2210
2222
  if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
@@ -2227,6 +2239,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
2227
2239
  // DeepSeek MLA
2228
2240
  return false;
2229
2241
  }
2242
+ // TODO: support broadcast
2243
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14435
2230
2244
  if (op->src[0]->ne[3] != 1) {
2231
2245
  return false;
2232
2246
  }
@@ -5,7 +5,7 @@ function(ggml_add_cpu_backend_features cpu_name arch)
5
5
  # build, using set_source_files_properties() to set the arch flags is not possible
6
6
  set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
7
7
  add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
8
- target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
8
+ target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
9
9
  target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
10
10
  target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
11
11
  set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -589,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
589
589
  if (EMSCRIPTEN)
590
590
  set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
591
591
  endif()
592
+
593
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
594
+ # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
595
+ target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
596
+ endif()
592
597
  endfunction()