@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -271,6 +271,14 @@ static std::string var_to_str(ggml_op_pool pool) {
     }
 }
 
+static std::string var_to_str(ggml_scale_mode mode) {
+    switch (mode) {
+        case GGML_SCALE_MODE_NEAREST: return "nearest";
+        case GGML_SCALE_MODE_BILINEAR: return "bilinear";
+        default: return std::to_string(mode);
+    }
+}
+
 #define VAR_TO_STR(x) (#x "=" + var_to_str(x))
 
 #define VARS_TO_STR1(a) VAR_TO_STR(a)
@@ -815,7 +823,7 @@ struct test_case {
 
         ggml_build_forward_expand(gf, out);
         ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx.get(), ctx.get(), gb, false);
+        ggml_build_backward_expand(ctx.get(), gb, nullptr);
         if (expect.size() != 1 || expect[0] != 0.0f) {
             GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) {
@@ -1018,7 +1026,7 @@ struct test_example : public test_case {
         // Step 3: return the output tensor.
         return out;
     }
-    // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
+    // In order to also check the gradients for your op, add calls like ggml_set_param(a)
     // immediately after you create the tensors.
     // This is optional and only makes sense if a backward pass has actually been implemented for the new op.
 };
@@ -1050,7 +1058,7 @@ struct test_unary : public test_case {
             auto ne = ne_a; ne[0] *= 3;
             a = ggml_new_tensor(ctx, type, 4, ne.data());
             if (grad_supported) {
-                ggml_set_param(ctx, a);
+                ggml_set_param(a);
             }
             ggml_set_name(a, "a");
 
@@ -1059,7 +1067,7 @@ struct test_unary : public test_case {
         } else {
             a = ggml_new_tensor(ctx, type, 4, ne_a.data());
             if (grad_supported) {
-                ggml_set_param(ctx, a);
+                ggml_set_param(a);
             }
             ggml_set_name(a, "a");
         }
@@ -1125,7 +1133,7 @@ struct test_get_rows : public test_case {
 
         const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows);
         if (grad_supported) {
-            ggml_set_param(ctx, in);
+            ggml_set_param(in);
             // rows is a constant input -> no gradients
         }
 
@@ -1314,7 +1322,7 @@ struct test_repeat : public test_case {
         ggml_set_name(target, "target");
 
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, src);
+        ggml_set_param(src);
         ggml_set_name(src, "src");
 
         ggml_tensor * out = ggml_repeat(ctx, src, target);
@@ -1398,7 +1406,7 @@ struct test_dup : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, src);
+        ggml_set_param(src);
         ggml_set_name(src, "src");
 
         if (_use_permute) {
@@ -1434,7 +1442,7 @@ struct test_set : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-        ggml_set_param(ctx, src);
+        ggml_set_param(src);
         ggml_set_name(src, "src");
 
         auto ne_dst = ne;
@@ -1442,7 +1450,7 @@ struct test_set : public test_case {
             ne_dst[i] *= 2;
         }
         ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
-        ggml_set_param(ctx, dst);
+        ggml_set_param(dst);
         ggml_set_name(dst, "dst");
 
         size_t offset = 0;
@@ -1490,7 +1498,7 @@ struct test_cpy : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-        ggml_set_param(ctx, src);
+        ggml_set_param(src);
         ggml_set_name(src, "src");
 
         if (_src_use_permute) {
@@ -1528,7 +1536,7 @@ struct test_cont : public test_case {
 
    ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, src);
+        ggml_set_param(src);
         ggml_set_name(src, "src");
 
         src = ggml_transpose(ctx, src);
@@ -1575,8 +1583,8 @@ struct test_bin_bcast : public test_case {
         // The backward pass supports broadcasting only for GGML_ADD:
         const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
         if (grad_supported) {
-            ggml_set_param(ctx, a);
-            ggml_set_param(ctx, b);
+            ggml_set_param(a);
+            ggml_set_param(b);
         }
 
         ggml_tensor * out = op(ctx, a, b);
@@ -1624,11 +1632,11 @@ struct test_add1 : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
-        // ggml_set_param(ctx, b); // TODO: implement
+        // ggml_set_param(b); // TODO: implement
         ggml_set_name(b, "b");
 
         ggml_tensor * out = ggml_add1(ctx, a, b);
@@ -1659,7 +1667,7 @@ struct test_scale : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_scale(ctx, a, scale);
@@ -1754,7 +1762,7 @@ struct test_rms_norm : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         if (v) {
@@ -1973,7 +1981,7 @@ struct test_mul_mat : public test_case {
     const std::array<int64_t, 2> bs; // dims 3 and 4
     const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
     const std::array<int64_t, 4> per; // permutation of dimensions
-    const bool v; // whether a is a non-contiguous view
+    const bool v; // whether a and b are non-contiguous views
 
     std::string vars() override {
         return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
@@ -2020,9 +2028,9 @@ struct test_mul_mat : public test_case {
             b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
-                    ggml_set_param(ctx, a);
+                    ggml_set_param(a);
                 }
-                ggml_set_param(ctx, b);
+                ggml_set_param(b);
             }
             ggml_set_name(a, "a");
             ggml_set_name(b, "b");
@@ -2032,19 +2040,29 @@ struct test_mul_mat : public test_case {
             ggml_set_name(a, "a_permuted");
             ggml_set_name(b, "b_permuted");
         } else {
-
             if (v) {
-                a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
-                a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
+                a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
+                b = ggml_new_tensor_4d(ctx, type_b, k*2, n, bs[0]*nr[0], bs[1]*nr[1]);
+
+                if (!ggml_is_quantized(type_a)) {
+                    if (bs[1] == 1 && nr[1] == 1) {
+                        ggml_set_param(a);
+                    }
+                    ggml_set_param(b);
+                }
+
+                a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
+                b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0);
             } else {
                 a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
-            }
-            b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
-            if (!ggml_is_quantized(type_a)) {
-                if (bs[1] == 1 && nr[1] == 1) {
-                    ggml_set_param(ctx, a);
+                b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
+
+                if (!ggml_is_quantized(type_a)) {
+                    if (bs[1] == 1 && nr[1] == 1) {
+                        ggml_set_param(a);
+                    }
+                    ggml_set_param(b);
                 }
-                ggml_set_param(ctx, b);
             }
             ggml_set_name(a, "a");
             ggml_set_name(b, "b");
@@ -2063,7 +2081,7 @@ struct test_mul_mat_id : public test_case {
     const ggml_type type_b;
     const int n_mats;
     const int n_used;
-    const bool b; // brodcast b matrix
+    const bool b; // broadcast b matrix
     const int64_t m;
     const int64_t n;
     const int64_t k;
@@ -2193,7 +2211,7 @@ struct test_sqr : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_sqr(ctx, a);
@@ -2222,7 +2240,7 @@ struct test_sqrt : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_sqrt(ctx, a);
@@ -2262,7 +2280,7 @@ struct test_log : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_log(ctx, a);
@@ -2298,7 +2316,7 @@ struct test_sin : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_sin(ctx, a);
@@ -2341,7 +2359,7 @@ struct test_cos : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_cos(ctx, a);
@@ -2421,7 +2439,7 @@ struct test_diag_mask_inf : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
@@ -2460,7 +2478,7 @@ struct test_soft_max : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * mask = nullptr;
@@ -2542,7 +2560,7 @@ struct test_rope : public test_case {
             auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
             a = ggml_new_tensor(ctx, type, 4, ne.data());
             if (forward) {
-                ggml_set_param(ctx, a);
+                ggml_set_param(a);
             }
             ggml_set_name(a, "a");
 
@@ -2551,7 +2569,7 @@ struct test_rope : public test_case {
         } else {
             a = ggml_new_tensor(ctx, type, 4, ne_a.data());
             if (forward) {
-                ggml_set_param(ctx, a);
+                ggml_set_param(a);
             }
             ggml_set_name(a, "a");
         }
@@ -2598,6 +2616,8 @@ struct test_rope : public test_case {
             } else {
                 out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
            }
+
+            // TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp
         }
         ggml_set_name(out, "out");
 
@@ -2663,7 +2683,7 @@ struct test_pool2d : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
-        ggml_set_param(ctx, input);
+        ggml_set_param(input);
         ggml_set_name(input, "input");
 
         ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
@@ -2739,7 +2759,7 @@ struct test_im2col : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
-        ggml_set_param(ctx, input);
+        ggml_set_param(input);
         ggml_set_name(input, "input");
 
         ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
@@ -2752,6 +2772,48 @@ struct test_im2col : public test_case {
     }
 };
 
+// GGML_OP_CONV_2D_DW
+struct test_conv_2d_dw : public test_case {
+    const std::array<int64_t, 4> ne_input;
+    const std::array<int64_t, 4> ne_kernel;
+    const int stride;
+    const int padding;
+    const int dilation;
+    const bool cwhn;
+
+    std::string vars() override {
+        return VARS_TO_STR6(ne_input, ne_kernel, stride, padding, dilation, cwhn);
+    }
+
+    test_conv_2d_dw(std::array<int64_t, 4> ne_input = {64, 64, 16, 1},
+            std::array<int64_t, 4> ne_kernel = {3, 3, 1, 16},
+            int stride = 1, int padding = 0, int dilation = 1, bool cwhn = false)
+        : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), padding(padding), dilation(dilation), cwhn(cwhn) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
+        ggml_set_name(input, "input");
+
+        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
+        ggml_set_name(kernel, "kernel");
+
+        if (cwhn) {
+            // change memory layout to channel-most-contiguous (CWHN),
+            // then permute it back so NE matches the original input
+            input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3));
+            input = ggml_permute(ctx, input, 2, 0, 1, 3);
+            kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0));
+            kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1);
+        }
+
+        ggml_tensor * out = ggml_conv_2d_dw_direct(
+            ctx, kernel, input,
+            stride, stride, padding, padding, dilation, dilation);
+        ggml_set_name(out, "out");
+        return out;
+    }
+};
+
 // GGML_OP_CONCAT
 struct test_concat : public test_case {
     const ggml_type type;
@@ -2874,7 +2936,7 @@ struct test_sum : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_sum(ctx, a);
@@ -2903,7 +2965,7 @@ struct test_sum_rows : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_sum_rows(ctx, a);
@@ -2928,7 +2990,7 @@ struct test_mean : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * out = ggml_mean(ctx, a);
@@ -2948,15 +3010,16 @@ struct test_upscale : public test_case {
     const std::array<int64_t, 4> ne;
     const int32_t scale_factor;
     const bool transpose;
+    const ggml_scale_mode mode;
 
     std::string vars() override {
-        return VARS_TO_STR4(type, ne, scale_factor, transpose);
+        return VARS_TO_STR5(type, ne, scale_factor, mode, transpose);
     }
 
     test_upscale(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {512, 512, 3, 1},
-            int32_t scale_factor = 2, bool transpose = false)
-        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
+            int32_t scale_factor = 2, ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST, bool transpose = false)
+        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose), mode(mode) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
@@ -2967,7 +3030,7 @@ struct test_upscale : public test_case {
             ggml_set_name(a, "a_transposed");
         }
 
-        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
+        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor, mode);
         ggml_set_name(out, "out");
 
         return out;
@@ -2979,21 +3042,23 @@ struct test_upscale_ext : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const std::array<int64_t, 4> ne_tgt;
+    const ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST;
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, ne_tgt);
+        return VARS_TO_STR4(type, ne, ne_tgt, mode);
     }
 
     test_upscale_ext(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {2, 5, 7, 11},
-            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
-        : type(type), ne(ne), ne_tgt(ne_tgt) {}
+            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13},
+            ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST)
+        : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_name(a, "a");
 
-        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
+        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3], mode);
         ggml_set_name(out, "out");
 
         return out;
@@ -3071,11 +3136,11 @@ struct test_acc : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        ggml_set_param(ctx, a);
+        ggml_set_param(a);
         ggml_set_name(a, "a");
 
         ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
-        ggml_set_param(ctx, b);
+        ggml_set_param(b);
         ggml_set_name(b, "b");
 
         ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
@@ -3217,7 +3282,8 @@ struct test_leaky_relu : public test_case {
 
 // GGML_OP_FLASH_ATTN_EXT
 struct test_flash_attn_ext : public test_case {
-    const int64_t hs; // head size
+    const int64_t hsk; // K head size
+    const int64_t hsv; // V head size
     const int64_t nh; // num heads
     const int64_t nr; // repeat in Q, tests for grouped-query attention
     const int64_t kv; // kv size
@@ -3233,7 +3299,7 @@ struct test_flash_attn_ext : public test_case {
     std::array<int32_t, 4> permute;
 
     std::string vars() override {
-        return VARS_TO_STR11(hs, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV, permute);
+        return VARS_TO_STR12(hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV, permute);
     }
 
     double max_nmse_err() override {
@@ -3243,17 +3309,18 @@ struct test_flash_attn_ext : public test_case {
     uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
         // Just counting matmul costs:
-        // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
-        return 2 * 2 * nh*nr * nb * hs * kv;
+        // Q*K^T is nb x hsk x kv, P*V is nb x kv x hsv, per head
+        return 2 * nh*nr * nb * (hsk + hsv) * kv;
     }
 
-    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t nr = 1, int64_t kv = 96, int64_t nb = 8,
+    test_flash_attn_ext(int64_t hsk = 128, int64_t hsv = 128, int64_t nh = 32, int64_t nr = 1, int64_t kv = 96, int64_t nb = 8,
                         bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_prec prec = GGML_PREC_F32,
                         ggml_type type_KV = GGML_TYPE_F16, std::array<int32_t, 4> permute = {0, 1, 2, 3})
-        : hs(hs), nh(nh), nr(nr), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), prec(prec), type_KV(type_KV), permute(permute) {}
+        : hsk(hsk), hsv(hsv), nh(nh), nr(nr), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), prec(prec), type_KV(type_KV), permute(permute) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
+        const int64_t hsk_padded = GGML_PAD(hsk, ggml_blck_size(type_KV));
+        const int64_t hsv_padded = GGML_PAD(hsv, ggml_blck_size(type_KV));
 
         auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) -> ggml_tensor * {
             int64_t ne[4] = {ne0, ne1, ne2, ne3};
@@ -3268,13 +3335,13 @@ struct test_flash_attn_ext : public test_case {
             return t;
         };
 
-        ggml_tensor * q = create_permuted(GGML_TYPE_F32, hs_padded, nb, nh*nr, 1);
+        ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh*nr, 1);
         ggml_set_name(q, "q");
 
-        ggml_tensor * k = create_permuted(type_KV, hs_padded, kv, nh, 1);
+        ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, 1);
         ggml_set_name(k, "k");
 
-        ggml_tensor * v = create_permuted(type_KV, hs_padded, kv, nh, 1);
+        ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, 1);
         ggml_set_name(v, "v");
 
         ggml_tensor * m = nullptr;
@@ -3283,7 +3350,7 @@ struct test_flash_attn_ext : public test_case {
             ggml_set_name(m, "m");
         }
 
-        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hsk), max_bias, logit_softcap);
         ggml_flash_attn_ext_set_prec(out, prec);
         ggml_set_name(out, "out");
 
@@ -3310,7 +3377,7 @@ struct test_cross_entropy_loss : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(ctx, logits);
+        ggml_set_param(logits);
         ggml_set_name(logits, "logits");
 
         ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
@@ -3392,7 +3459,7 @@ struct test_opt_step_adamw : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
-        ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
+        ggml_set_param(a); // Despite tensor a having gradients the output tensor will not.
         ggml_set_name(a, "a");
 
         ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
@@ -3957,6 +4024,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
     // test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
 
+    test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, false));
+    test_cases.emplace_back(new test_conv_2d_dw({17, 34, 9, 1}, {3, 3, 1, 9}, 1, 0, 1, true));
+    test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false));
+    test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true));
+
     test_cases.emplace_back(new test_conv_transpose_1d());
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
@@ -4169,6 +4241,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+
+            // test cases with large ne00/ne10 to cover stream-k fixup
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 1024, {3, 2}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 1024, {3, 2}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 1024, {3, 2}, {1, 1}));
         }
     }
     for (ggml_type type_a : other_types) {
@@ -4204,6 +4281,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
 
     for (auto bs : {1,2,4,8}) {
         for (auto nr : {1,4}) {
@@ -4395,12 +4474,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
     }
 
+    for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) {
+        test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode));
+        test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true));
+        test_cases.emplace_back(new test_upscale_ext(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode));
+    }
+
     test_cases.emplace_back(new test_sum());
     test_cases.emplace_back(new test_sum_rows());
     test_cases.emplace_back(new test_mean());
-    test_cases.emplace_back(new test_upscale());
-    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
-    test_cases.emplace_back(new test_upscale_ext());
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());
@@ -4410,27 +4492,33 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());
 
-    for (int hs : { 64, 80, 128, 256, }) {
-        for (bool mask : { true, false } ) {
-            for (float max_bias : { 0.0f, 8.0f }) {
-                if (!mask && max_bias > 0.0f) continue;
-                for (float logit_softcap : {0.0f, 10.0f}) {
-                    if (hs != 128 && logit_softcap != 0.0f) continue;
-                    for (int nh : { 4, }) {
-                        for (int nr : { 1, 4, 16 }) {
-                            if (nr == 16 && hs != 128) continue;
-                            for (int kv : { 512, 1024, }) {
-                                if (nr != 1 && kv != 512) continue;
-                                for (int nb : { 1, 3, 32, 35, }) {
-                                    for (ggml_prec prec : {GGML_PREC_F32, GGML_PREC_DEFAULT}) {
-                                        if (hs != 128 && prec == GGML_PREC_DEFAULT) continue;
-                                        for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
-                                            test_cases.emplace_back(new test_flash_attn_ext(
-                                                hs, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV));
-                                            // run fewer test cases permuted
-                                            if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) {
+    for (int hsk : { 64, 80, 128, 192, 256, 576 }) {
+        for (int hsv : { 64, 80, 128, 192, 256, 512 }) {
+            if (hsk != 192 && hsk != 576 && hsk != hsv) continue;
+            if (hsk == 192 && (hsv != 128 && hsv != 192)) continue;
+            if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA
+
+            for (bool mask : { true, false } ) {
+                for (float max_bias : { 0.0f, 8.0f }) {
+                    if (!mask && max_bias > 0.0f) continue;
+                    for (float logit_softcap : {0.0f, 10.0f}) {
+                        if (hsk != 128 && logit_softcap != 0.0f) continue;
+                        for (int nh : { 4, }) {
+                            for (int nr : { 1, 4, 16 }) {
+                                if (nr == 16 && hsk != 128) continue;
+                                for (int kv : { 512, 1024, }) {
+                                    if (nr != 1 && kv != 512) continue;
+                                    for (int nb : { 1, 3, 32, 35, }) {
+                                        for (ggml_prec prec : {GGML_PREC_F32, GGML_PREC_DEFAULT}) {
+                                            if (hsk != 128 && prec == GGML_PREC_DEFAULT) continue;
+                                            for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
                                                 test_cases.emplace_back(new test_flash_attn_ext(
-                                                hs, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV, {0, 2, 1, 3}));
+                                                    hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV));
+                                                // run fewer test cases permuted
+                                                if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) {
+                                                    test_cases.emplace_back(new test_flash_attn_ext(
+                                                        hsk, hsv, nh, nr, kv, nb, mask, max_bias, logit_softcap, prec, type_KV, {0, 2, 1, 3}));
+                                                }
                                             }
                                         }
                                     }
@@ -4507,6 +4595,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
         }
     }
 
+    for (int kv : { 4096, 8192, 16384, }) {
+        for (int hs : { 64, 128, }) {
+            for (int nr : { 1, 4, }) {
+                test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, nr, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
+            }
+        }
+    }
+
+    test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, false));
+    test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true));
+
     return test_cases;
 }