whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -1,11 +1,11 @@
1
- #include "llama-quant.h"
1
+ #include "llama.h"
2
2
  #include "llama-impl.h"
3
3
  #include "llama-model.h"
4
4
  #include "llama-model-loader.h"
5
5
 
6
- #include <algorithm>
7
6
  #include <cmath>
8
7
  #include <cstring>
8
+ #include <string>
9
9
  #include <cinttypes>
10
10
  #include <fstream>
11
11
  #include <mutex>
@@ -13,10 +13,28 @@
13
13
  #include <thread>
14
14
  #include <unordered_map>
15
15
 
16
- // Quantization types. Changes to this struct must be replicated in quantize.cpp
17
- struct tensor_quantization {
16
+ // result of parsing --tensor-type option
17
+ // (changes to this struct must be reflected in tools/quantize/quantize.cpp)
18
+ struct tensor_type_option {
18
19
  std::string name;
19
- ggml_type quant = GGML_TYPE_COUNT;
20
+ ggml_type type = GGML_TYPE_COUNT;
21
+ };
22
+
23
+ // tensor categorization - used to avoid repeated string matching in quantization logic.
24
+ // this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
25
+ enum class tensor_category {
26
+ TOKEN_EMBD,
27
+ ATTENTION_Q,
28
+ ATTENTION_V,
29
+ ATTENTION_K,
30
+ ATTENTION_QKV,
31
+ ATTENTION_KV_B,
32
+ ATTENTION_OUTPUT,
33
+ FFN_UP,
34
+ FFN_GATE,
35
+ FFN_DOWN,
36
+ OUTPUT,
37
+ OTHER
20
38
  };
21
39
 
22
40
  static void zeros(std::ofstream & file, size_t n) {
@@ -54,7 +72,7 @@ static std::string remap_layer(const std::string & orig_name, const std::vector<
54
72
  return orig_name;
55
73
  }
56
74
 
57
- static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
75
+ static std::string remap_imatrix(const std::string & orig_name, const std::map<int, std::string> & mapped) {
58
76
  if (mapped.empty()) {
59
77
  return orig_name;
60
78
  }
@@ -76,6 +94,73 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map<
76
94
  return orig_name;
77
95
  }
78
96
 
97
+ //
98
+ // helper functions for tensor name matching
99
+ //
100
+
101
+ static bool tensor_name_match_token_embd(const char * tensor_name) {
102
+ return std::strcmp(tensor_name, "token_embd.weight") == 0 ||
103
+ std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0;
104
+ }
105
+
106
+ static bool tensor_name_match_output_weight(const char * tensor_name) {
107
+ return std::strcmp(tensor_name, "output.weight") == 0;
108
+ }
109
+
110
+ //
111
+ // tensor categorization for quantization
112
+ //
113
+ // (this is different from LLM_TN - we want broad categories, not specific tensor names per arch)
114
+ //
115
+
116
+ static tensor_category tensor_get_category(const std::string & tensor_name) {
117
+ if (tensor_name_match_output_weight(tensor_name.c_str())) {
118
+ return tensor_category::OUTPUT;
119
+ }
120
+ if (tensor_name_match_token_embd(tensor_name.c_str())) {
121
+ return tensor_category::TOKEN_EMBD;
122
+ }
123
+ if (tensor_name.find("attn_qkv.weight") != std::string::npos) {
124
+ return tensor_category::ATTENTION_QKV;
125
+ }
126
+ if (tensor_name.find("attn_kv_b.weight") != std::string::npos) {
127
+ return tensor_category::ATTENTION_KV_B;
128
+ }
129
+ if (tensor_name.find("attn_v.weight") != std::string::npos) {
130
+ return tensor_category::ATTENTION_V;
131
+ }
132
+ if (tensor_name.find("attn_k.weight") != std::string::npos) {
133
+ return tensor_category::ATTENTION_K;
134
+ }
135
+ if (tensor_name.find("attn_q.weight") != std::string::npos) {
136
+ return tensor_category::ATTENTION_Q;
137
+ }
138
+ if (tensor_name.find("attn_output.weight") != std::string::npos) {
139
+ return tensor_category::ATTENTION_OUTPUT;
140
+ }
141
+ if (tensor_name.find("ffn_up") != std::string::npos) {
142
+ return tensor_category::FFN_UP;
143
+ }
144
+ if (tensor_name.find("ffn_gate") != std::string::npos) {
145
+ return tensor_category::FFN_GATE;
146
+ }
147
+ if (tensor_name.find("ffn_down") != std::string::npos) {
148
+ return tensor_category::FFN_DOWN;
149
+ }
150
+ return tensor_category::OTHER;
151
+ }
152
+
153
+ // check if category is for attention-v-like tensors (more sensitive to quantization)
154
+ static bool category_is_attn_v(tensor_category cat) {
155
+ return cat == tensor_category::ATTENTION_V ||
156
+ cat == tensor_category::ATTENTION_QKV ||
157
+ cat == tensor_category::ATTENTION_KV_B;
158
+ }
159
+
160
+ //
161
+ // quantization state
162
+ //
163
+
79
164
  struct quantize_state_impl {
80
165
  const llama_model & model;
81
166
  const llama_model_quantize_params * params;
@@ -89,20 +174,42 @@ struct quantize_state_impl {
89
174
  int i_ffn_gate = 0;
90
175
  int i_ffn_up = 0;
91
176
 
92
- int n_k_quantized = 0;
93
177
  int n_fallback = 0;
94
178
 
95
179
  bool has_imatrix = false;
96
180
 
97
- // used to figure out if a model shares tok_embd with the output weight
98
- bool has_output = false;
181
+ // used to figure out if a model has tied embeddings (tok_embd shares weights with output)
182
+ bool has_tied_embeddings = true; // assume tied until we see output.weight
183
+
184
+ // tensor type override patterns (compiled once, used twice)
185
+ std::vector<std::pair<std::regex, ggml_type>> tensor_type_patterns;
99
186
 
100
- quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
101
- : model(model)
102
- , params(params)
103
- {}
187
+ quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params):
188
+ model(model), params(params)
189
+ {
190
+ // compile regex patterns once - they are expensive
191
+ if (params->tensor_types) {
192
+ const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
193
+ for (const auto & [tname, qtype] : tensor_types) {
194
+ tensor_type_patterns.emplace_back(std::regex(tname), qtype);
195
+ }
196
+ }
197
+ }
104
198
  };
105
199
 
200
+ // per-tensor metadata, computed in the preliminary loop and used in the main loop
201
+ struct tensor_metadata {
202
+ ggml_type target_type;
203
+ tensor_category category;
204
+ std::string remapped_imatrix_name;
205
+ bool allows_quantization;
206
+ bool requires_imatrix;
207
+ };
208
+
209
+ //
210
+ // dequantization
211
+ //
212
+
106
213
  static void llama_tensor_dequantize_impl(
107
214
  ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
108
215
  const size_t nelements, const int nthread
@@ -175,12 +282,132 @@ static void llama_tensor_dequantize_impl(
175
282
  workers.clear();
176
283
  }
177
284
 
178
- static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
285
+ //
286
+ // do we allow this tensor to be quantized?
287
+ //
288
+
289
+ static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) {
290
+ // trivial checks first -- no string ops needed
291
+ if (params->only_copy) return false;
292
+
293
+ // quantize only 2D and 3D tensors (experts)
294
+ if (ggml_n_dims(tensor) < 2) return false;
295
+
296
+ const std::string name = ggml_get_name(tensor);
297
+
298
+ // This used to be a regex, but <regex> has an extreme cost to compile times.
299
+ bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
300
+
301
+ // do not quantize norm tensors
302
+ quantize &= name.find("_norm.weight") == std::string::npos;
303
+
304
+ quantize &= params->quantize_output_tensor || name != "output.weight";
305
+
306
+ // do not quantize expert gating tensors
307
+ // NOTE: can't use LLM_TN here because the layer number is not known
308
+ quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
309
+
310
+ // these are very small (e.g. 4x4)
311
+ quantize &= name.find("altup") == std::string::npos;
312
+ quantize &= name.find("laurel") == std::string::npos;
313
+
314
+ // these are not too big so keep them as it is
315
+ quantize &= name.find("per_layer_model_proj") == std::string::npos;
316
+
317
+ // do not quantize positional embeddings and token types (BERT)
318
+ quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
319
+ quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
320
+
321
+ // do not quantize Mamba/Kimi's small conv1d weights
322
+ // NOTE: can't use LLM_TN here because the layer number is not known
323
+ quantize &= name.find("ssm_conv1d") == std::string::npos;
324
+ quantize &= name.find("shortconv.conv.weight") == std::string::npos;
325
+
326
+ // do not quantize RWKV's small yet 2D weights
327
+ quantize &= name.find("time_mix_first.weight") == std::string::npos;
328
+ quantize &= name.find("time_mix_w0.weight") == std::string::npos;
329
+ quantize &= name.find("time_mix_w1.weight") == std::string::npos;
330
+ quantize &= name.find("time_mix_w2.weight") == std::string::npos;
331
+ quantize &= name.find("time_mix_v0.weight") == std::string::npos;
332
+ quantize &= name.find("time_mix_v1.weight") == std::string::npos;
333
+ quantize &= name.find("time_mix_v2.weight") == std::string::npos;
334
+ quantize &= name.find("time_mix_a0.weight") == std::string::npos;
335
+ quantize &= name.find("time_mix_a1.weight") == std::string::npos;
336
+ quantize &= name.find("time_mix_a2.weight") == std::string::npos;
337
+ quantize &= name.find("time_mix_g1.weight") == std::string::npos;
338
+ quantize &= name.find("time_mix_g2.weight") == std::string::npos;
339
+ quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
340
+ quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
341
+ quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
342
+
343
+ // do not quantize relative position bias (T5)
344
+ quantize &= name.find("attn_rel_b.weight") == std::string::npos;
345
+
346
+ // do not quantize specific multimodal tensors
347
+ quantize &= name.find(".position_embd.") == std::string::npos;
348
+
349
+ return quantize;
350
+ }
351
+
352
+ //
353
+ // tensor type selection
354
+ //
355
+
356
+ // incompatible tensor shapes are handled here - fallback to a compatible type
357
+ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) {
358
+ ggml_type return_type = target_type;
359
+
360
+ const int64_t ncols = t->ne[0];
361
+ const int64_t qk_k = ggml_blck_size(target_type);
362
+
363
+ if (ncols % qk_k != 0) { // this tensor's shape is incompatible with this quant
364
+ LLAMA_LOG_WARN("warning: %-36s - ncols %6" PRId64 " not divisible by %3" PRId64 " (required for type %7s) ",
365
+ t->name, ncols, qk_k, ggml_type_name(target_type));
366
+ ++qs.n_fallback;
367
+
368
+ switch (target_type) {
369
+ // types on the left: block size 256
370
+ case GGML_TYPE_IQ1_S:
371
+ case GGML_TYPE_IQ1_M:
372
+ case GGML_TYPE_IQ2_XXS:
373
+ case GGML_TYPE_IQ2_XS:
374
+ case GGML_TYPE_IQ2_S:
375
+ case GGML_TYPE_IQ3_XXS:
376
+ case GGML_TYPE_IQ3_S: // types on the right: block size 32
377
+ case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break;
378
+ case GGML_TYPE_Q2_K:
379
+ case GGML_TYPE_Q3_K:
380
+ case GGML_TYPE_TQ1_0:
381
+ case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break;
382
+ case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break;
383
+ case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break;
384
+ case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break;
385
+ default:
386
+ throw std::runtime_error(format("no tensor type fallback is defined for type %s",
387
+ ggml_type_name(target_type)));
388
+ }
389
+ if (ncols % ggml_blck_size(return_type) != 0) {
390
+ //
391
+ // the fallback return type is still not compatible for this tensor!
392
+ //
393
+ // most likely, this tensor's first dimension is not divisible by 32.
394
+ // this is very rare. we can either abort the quantization, or
395
+ // fallback to F16 / F32.
396
+ //
397
+ LLAMA_LOG_WARN("(WARNING: must use F16 due to unusual shape) ");
398
+ return_type = GGML_TYPE_F16;
399
+ }
400
+ LLAMA_LOG_WARN("-> falling back to %7s\n", ggml_type_name(return_type));
401
+ }
402
+ return return_type;
403
+ }
404
+
405
+ // internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
406
+ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
179
407
  const std::string name = ggml_get_name(tensor);
180
408
 
181
409
  // TODO: avoid hardcoded tensor names - use the TN_* constants
182
410
  const llm_arch arch = qs.model.arch;
183
- const auto tn = LLM_TN(arch);
184
411
 
185
412
  auto use_more_bits = [](int i_layer, int n_layers) -> bool {
186
413
  return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
@@ -204,7 +431,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
204
431
 
205
432
  // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
206
433
  // with the quantization of the output tensor
207
- if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
434
+ if (category == tensor_category::OUTPUT || (qs.has_tied_embeddings && category == tensor_category::TOKEN_EMBD)) {
208
435
  if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
209
436
  new_type = qs.params->output_tensor_type;
210
437
  } else {
@@ -234,7 +461,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
234
461
  } else {
235
462
  new_type = GGML_TYPE_Q8_0;
236
463
  }
237
- } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
464
+ } else if (category == tensor_category::TOKEN_EMBD) {
238
465
  if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
239
466
  new_type = qs.params->token_embedding_type;
240
467
  } else {
@@ -254,21 +481,21 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
254
481
  }
255
482
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
256
483
  ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
257
- if (name.find("attn_v.weight") != std::string::npos) {
484
+ if (category_is_attn_v(category)) {
258
485
  if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
259
486
  else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
260
487
  ++qs.i_attention_wv;
261
488
  }
262
- else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
489
+ else if (qs.model.hparams.n_expert == 8 && category == tensor_category::ATTENTION_K) {
263
490
  new_type = GGML_TYPE_Q4_K;
264
491
  }
265
- else if (name.find("ffn_down") != std::string::npos) {
492
+ else if (category == tensor_category::FFN_DOWN) {
266
493
  if (qs.i_ffn_down < qs.n_ffn_down/8) {
267
494
  new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
268
495
  }
269
496
  ++qs.i_ffn_down;
270
497
  }
271
- else if (name.find("attn_output.weight") != std::string::npos) {
498
+ else if (category == tensor_category::ATTENTION_OUTPUT) {
272
499
  if (qs.model.hparams.n_expert == 8) {
273
500
  new_type = GGML_TYPE_Q5_K;
274
501
  } else {
@@ -276,7 +503,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
276
503
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
277
504
  }
278
505
  }
279
- } else if (name.find("attn_v.weight") != std::string::npos) {
506
+ } else if (category_is_attn_v(category)) {
280
507
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
281
508
  new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
282
509
  }
@@ -314,7 +541,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
314
541
  new_type = GGML_TYPE_Q8_0;
315
542
  }
316
543
  ++qs.i_attention_wv;
317
- } else if (name.find("attn_k.weight") != std::string::npos) {
544
+ } else if (category == tensor_category::ATTENTION_K) {
318
545
  if (qs.model.hparams.n_expert == 8) {
319
546
  // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
320
547
  // TODO: explore better strategies
@@ -326,14 +553,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
326
553
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
327
554
  new_type = GGML_TYPE_IQ2_S;
328
555
  }
329
- } else if (name.find("attn_q.weight") != std::string::npos) {
556
+ } else if (category == tensor_category::ATTENTION_Q) {
330
557
  if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
331
558
  new_type = GGML_TYPE_IQ3_XXS;
332
559
  }
333
560
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
334
561
  new_type = GGML_TYPE_IQ2_S;
335
562
  }
336
- } else if (name.find("ffn_down") != std::string::npos) {
563
+ } else if (category == tensor_category::FFN_DOWN) {
337
564
  auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
338
565
  int i_layer = info.first, n_layer = info.second;
339
566
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -378,7 +605,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
378
605
  new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
379
606
  }
380
607
  ++qs.i_ffn_down;
381
- } else if (name.find("attn_output.weight") != std::string::npos) {
608
+ } else if (category == tensor_category::ATTENTION_OUTPUT) {
382
609
  if (arch != LLM_ARCH_FALCON) {
383
610
  if (qs.model.hparams.n_expert == 8) {
384
611
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -398,14 +625,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
398
625
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
399
626
  }
400
627
  }
401
- else if (name.find("attn_qkv.weight") != std::string::npos) {
628
+ else if (category == tensor_category::ATTENTION_QKV) {
402
629
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
403
630
  new_type = GGML_TYPE_Q4_K;
404
631
  }
405
632
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
406
633
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
407
634
  }
408
- else if (name.find("ffn_gate") != std::string::npos) {
635
+ else if (category == tensor_category::FFN_GATE) {
409
636
  auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
410
637
  int i_layer = info.first, n_layer = info.second;
411
638
  if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
@@ -413,7 +640,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
413
640
  }
414
641
  ++qs.i_ffn_gate;
415
642
  }
416
- else if (name.find("ffn_up") != std::string::npos) {
643
+ else if (category == tensor_category::FFN_UP) {
417
644
  auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
418
645
  int i_layer = info.first, n_layer = info.second;
419
646
  if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
@@ -422,60 +649,58 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
422
649
  ++qs.i_ffn_up;
423
650
  }
424
651
 
425
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
426
- //}
427
- // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
428
- //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
429
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
430
- //}
431
- // This can be used to reduce the size of the Q5_K_S model.
432
- // The associated PPL increase is fully in line with the size reduction
433
- //else {
434
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
435
- //}
436
- bool convert_incompatible_tensor = false;
437
- {
438
- const int64_t nx = tensor->ne[0];
439
- const int64_t ny = tensor->ne[1];
440
- const int64_t qk_k = ggml_blck_size(new_type);
652
+ return new_type;
653
+ }
441
654
 
442
- if (nx % qk_k != 0) {
443
- LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
444
- convert_incompatible_tensor = true;
445
- } else {
446
- ++qs.n_k_quantized;
447
- }
655
+ // outer wrapper: determine the ggml_type that this tensor should be quantized to
656
+ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
657
+ if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
658
+ return tensor->type;
659
+ }
660
+ if (params->token_embedding_type < GGML_TYPE_COUNT && tm.category == tensor_category::TOKEN_EMBD) {
661
+ return params->token_embedding_type;
662
+ }
663
+ if (params->output_tensor_type < GGML_TYPE_COUNT && tm.category == tensor_category::OUTPUT) {
664
+ return params->output_tensor_type;
448
665
  }
449
666
 
450
- if (convert_incompatible_tensor) {
451
- switch (new_type) {
452
- case GGML_TYPE_TQ1_0:
453
- case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
454
- case GGML_TYPE_IQ2_XXS:
455
- case GGML_TYPE_IQ2_XS:
456
- case GGML_TYPE_IQ2_S:
457
- case GGML_TYPE_IQ3_XXS:
458
- case GGML_TYPE_IQ3_S:
459
- case GGML_TYPE_IQ1_S:
460
- case GGML_TYPE_IQ1_M:
461
- case GGML_TYPE_Q2_K:
462
- case GGML_TYPE_Q3_K:
463
- case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
464
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
465
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
466
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
467
- default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
667
+ ggml_type new_type = default_type;
668
+
669
+ // get more optimal quantization type based on the tensor shape, layer, etc.
670
+ if (!params->pure && ggml_is_quantized(default_type)) {
671
+ // if the user provided tensor types - use those
672
+ bool manual = false;
673
+ if (!qs.tensor_type_patterns.empty()) {
674
+ const std::string tensor_name(tensor->name);
675
+ for (const auto & [pattern, qtype] : qs.tensor_type_patterns) {
676
+ if (std::regex_search(tensor_name, pattern)) {
677
+ if (qtype != new_type) {
678
+ LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
679
+ __func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
680
+ new_type = qtype;
681
+ manual = true;
682
+ break;
683
+ }
684
+ }
685
+ }
468
686
  }
469
- if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
470
- new_type = GGML_TYPE_F16;
687
+
688
+ // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
689
+ if (!manual) {
690
+ new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, tm.category);
471
691
  }
472
- LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
473
- ++qs.n_fallback;
692
+
693
+ // incompatible tensor shapes are handled here - fallback to a compatible type
694
+ new_type = tensor_type_fallback(qs, tensor, new_type);
474
695
  }
475
696
 
476
697
  return new_type;
477
698
  }
478
699
 
700
+ //
701
+ // quantization implementation
702
+ //
703
+
479
704
  static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
480
705
  if (nthread < 2) {
481
706
  // single-thread
@@ -530,50 +755,85 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
530
755
  return new_size;
531
756
  }
532
757
 
533
- static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
534
- ggml_type default_type;
535
- llama_ftype ftype = params->ftype;
758
+ //
759
+ // imatrix requirement check
760
+ //
536
761
 
537
- switch (params->ftype) {
538
- case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
539
- case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
540
- case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
541
- case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
542
- case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
543
- case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
544
- case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
545
- case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
762
+ static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type dst_type, const llama_ftype ftype) {
763
+ if (tensor_name_match_token_embd(tensor_name) || tensor_name_match_output_weight(tensor_name)) {
764
+ return false;
765
+ }
766
+ switch (dst_type) {
767
+ case GGML_TYPE_IQ3_XXS:
768
+ case GGML_TYPE_IQ2_XXS:
769
+ case GGML_TYPE_IQ2_XS:
770
+ case GGML_TYPE_IQ2_S:
771
+ case GGML_TYPE_IQ1_M:
772
+ case GGML_TYPE_IQ1_S:
773
+ return true;
774
+ case GGML_TYPE_Q2_K:
775
+ // as a general rule, the k-type quantizations don't require imatrix data.
776
+ // the only exception is Q2_K tensors that are part of a Q2_K_S file.
777
+ return ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S;
778
+ default:
779
+ return false;
780
+ }
781
+ }
546
782
 
547
- case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
783
+ //
784
+ // given a file type, get the default tensor type
785
+ //
786
+
787
+ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
788
+ switch (ftype) {
789
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
790
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
791
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
792
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
793
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0;
794
+ case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16;
795
+ case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16;
796
+ case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32;
797
+
798
+ case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4;
548
799
 
549
800
  // K-quants
550
801
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
551
- case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
552
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break;
802
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return GGML_TYPE_Q2_K;
803
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return GGML_TYPE_IQ3_S;
553
804
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
554
805
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
555
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
806
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K;
556
807
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
557
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
808
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K;
558
809
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
559
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
560
- case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
561
- case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
562
- case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
563
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
564
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
565
- case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
566
- case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
567
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
568
- case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
569
- case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
570
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
571
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
572
- case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
573
- case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
810
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K;
811
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return GGML_TYPE_Q6_K;
812
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return GGML_TYPE_TQ1_0;
813
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return GGML_TYPE_TQ2_0;
814
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS;
815
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: return GGML_TYPE_IQ2_XS;
816
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return GGML_TYPE_IQ2_XS;
817
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return GGML_TYPE_IQ2_S;
818
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return GGML_TYPE_IQ3_XXS;
819
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: return GGML_TYPE_IQ1_S;
820
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: return GGML_TYPE_IQ1_M;
821
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return GGML_TYPE_IQ4_NL;
822
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
823
+ case LLAMA_FTYPE_MOSTLY_IQ3_S:
824
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
574
825
 
575
826
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
576
827
  }
828
+ }
829
+
830
+ //
831
+ // main quantization driver
832
+ //
833
+
834
+ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
835
+ ggml_type default_type;
836
+ llama_ftype ftype = params->ftype;
577
837
 
578
838
  int nthread = params->nthread;
579
839
 
@@ -581,6 +841,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
581
841
  nthread = std::thread::hardware_concurrency();
582
842
  }
583
843
 
844
+ default_type = llama_ftype_get_default_type(ftype);
845
+
584
846
  // mmap consistently increases speed on Linux, and also increases speed on Windows with
585
847
  // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
586
848
  #if defined(__linux__) || defined(_WIN32)
@@ -596,7 +858,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
596
858
  }
597
859
 
598
860
  std::vector<std::string> splits = {};
599
- llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
861
+ llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
862
+ fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
600
863
  ml.init_mappings(false); // no prefetching
601
864
 
602
865
  llama_model model(llama_model_default_params());
@@ -614,7 +877,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
614
877
  if (params->imatrix) {
615
878
  imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
616
879
  if (imatrix_data) {
617
- LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
880
+ LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
881
+ __func__, (int)imatrix_data->size());
618
882
  qs.has_imatrix = true;
619
883
  // check imatrix for nans or infs
620
884
  for (const auto & kv : *imatrix_data) {
@@ -636,7 +900,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
636
900
  }
637
901
 
638
902
  // copy the KV pairs from the input file
639
- gguf_set_kv (ctx_out.get(), ml.meta.get());
903
+ gguf_set_kv (ctx_out.get(), ml.metadata);
640
904
  gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
641
905
  gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
642
906
 
@@ -697,35 +961,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
697
961
  });
698
962
  }
699
963
 
700
- for (const auto * it : tensors) {
701
- const struct ggml_tensor * tensor = it->tensor;
702
-
703
- const std::string name = ggml_get_name(tensor);
704
-
705
- // TODO: avoid hardcoded tensor names - use the TN_* constants
706
- if (name.find("attn_v.weight") != std::string::npos ||
707
- name.find("attn_qkv.weight") != std::string::npos ||
708
- name.find("attn_kv_b.weight")!= std::string::npos) {
709
- ++qs.n_attention_wv;
710
- } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
711
- qs.has_output = true;
712
- }
713
- }
714
-
715
- qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
716
-
717
- size_t total_size_org = 0;
718
- size_t total_size_new = 0;
719
-
720
- std::vector<std::thread> workers;
721
- workers.reserve(nthread);
722
-
723
964
  int idx = 0;
724
-
725
- std::vector<no_init<uint8_t>> read_data;
726
- std::vector<no_init<uint8_t>> work;
727
- std::vector<no_init<float>> f32_conv_buf;
728
-
729
965
  uint16_t n_split = 1;
730
966
 
731
967
  // Assume split index is continuous
@@ -737,14 +973,68 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
737
973
  std::vector<gguf_context_ptr> ctx_outs(n_split);
738
974
  ctx_outs[0] = std::move(ctx_out);
739
975
 
740
- // populate the original tensors so we get an initial meta data
741
- for (const auto * it : tensors) {
976
+ // compute tensor metadata once and cache it
977
+ std::vector<tensor_metadata> metadata(tensors.size());
978
+
979
+ // initialize quantization state before preliminary loop (counters for use_more_bits)
980
+ {
981
+ for (size_t i = 0; i < tensors.size(); ++i) {
982
+ const auto cat = tensor_get_category(tensors[i]->tensor->name);
983
+ if (category_is_attn_v(cat)) {
984
+ ++qs.n_attention_wv;
985
+ }
986
+ if (cat == tensor_category::OUTPUT) {
987
+ qs.has_tied_embeddings = false;
988
+ }
989
+ metadata[i].category = cat; // save and re-use the category while we're at it
990
+ }
991
+ // these also need to be set to n_layer by default
992
+ qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
993
+ }
994
+
995
+ // flag for --dry-run
996
+ bool will_require_imatrix = false;
997
+
998
+ //
999
+ // preliminary iteration over all weights
1000
+ //
1001
+
1002
+ for (size_t i = 0; i < tensors.size(); ++i) {
1003
+ const auto * it = tensors[i];
1004
+ const struct ggml_tensor * tensor = it->tensor;
1005
+ const std::string name = ggml_get_name(tensor);
1006
+
742
1007
  uint16_t i_split = params->keep_split ? it->idx : 0;
743
- ggml_tensor * tensor = it->tensor;
744
1008
  if (!ctx_outs[i_split]) {
745
1009
  ctx_outs[i_split].reset(gguf_init_empty());
746
1010
  }
747
1011
  gguf_add_tensor(ctx_outs[i_split].get(), tensor);
1012
+
1013
+ metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
1014
+
1015
+ if (metadata[i].allows_quantization) {
1016
+ metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
1017
+ } else {
1018
+ metadata[i].target_type = tensor->type;
1019
+ }
1020
+
1021
+ metadata[i].requires_imatrix = tensor_requires_imatrix(tensor->name, metadata[i].target_type, ftype);
1022
+
1023
+ if (params->imatrix) {
1024
+ metadata[i].remapped_imatrix_name = remap_imatrix(tensor->name, mapped);
1025
+ } else if (metadata[i].allows_quantization && metadata[i].requires_imatrix) {
1026
+ if (params->dry_run) {
1027
+ will_require_imatrix = true;
1028
+ } else {
1029
+ LLAMA_LOG_ERROR("\n============================================================================\n"
1030
+ " ERROR: this quantization requires an importance matrix!\n"
1031
+ " - offending tensor: %s\n"
1032
+ " - target type: %s\n"
1033
+ "============================================================================\n\n",
1034
+ name.c_str(), ggml_type_name(metadata[i].target_type));
1035
+ throw std::runtime_error("this quantization requires an imatrix!");
1036
+ }
1037
+ }
748
1038
  }
749
1039
 
750
1040
  // Set split info if needed
@@ -756,6 +1046,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
756
1046
  }
757
1047
  }
758
1048
 
1049
+ size_t total_size_org = 0;
1050
+ size_t total_size_new = 0;
1051
+
1052
+ std::vector<std::thread> workers;
1053
+ workers.reserve(nthread);
1054
+
1055
+ std::vector<no_init<uint8_t>> read_data;
1056
+ std::vector<no_init<uint8_t>> work;
1057
+ std::vector<no_init<float>> f32_conv_buf;
1058
+
759
1059
  int cur_split = -1;
760
1060
  std::ofstream fout;
761
1061
  auto close_ofstream = [&]() {
@@ -785,251 +1085,182 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
785
1085
  ::zeros(fout, meta_size);
786
1086
  };
787
1087
 
788
- const auto tn = LLM_TN(model.arch);
789
- new_ofstream(0);
790
- for (const auto * it : tensors) {
791
- const auto & weight = *it;
1088
+ // no output file for --dry-run
1089
+ if (!params->dry_run) {
1090
+ new_ofstream(0);
1091
+ }
1092
+
1093
+ //
1094
+ // main loop: iterate over all weights
1095
+ //
1096
+
1097
+ for (size_t i = 0; i < tensors.size(); ++i) {
1098
+ const auto & weight = *tensors[i];
1099
+ const auto & tm = metadata[i];
792
1100
  ggml_tensor * tensor = weight.tensor;
793
- if (weight.idx != cur_split && params->keep_split) {
1101
+
1102
+ if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
794
1103
  close_ofstream();
795
1104
  new_ofstream(weight.idx);
796
1105
  }
797
1106
 
798
1107
  const std::string name = ggml_get_name(tensor);
1108
+ const size_t tensor_size = ggml_nbytes(tensor);
799
1109
 
800
- if (!ml.use_mmap) {
801
- if (read_data.size() < ggml_nbytes(tensor)) {
802
- read_data.resize(ggml_nbytes(tensor));
1110
+ if (!params->dry_run) {
1111
+ if (!ml.use_mmap) {
1112
+ if (read_data.size() < tensor_size) {
1113
+ read_data.resize(tensor_size);
1114
+ }
1115
+ tensor->data = read_data.data();
803
1116
  }
804
- tensor->data = read_data.data();
1117
+ ml.load_data_for(tensor);
805
1118
  }
806
- ml.load_data_for(tensor);
807
1119
 
808
- LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
1120
+ LLAMA_LOG_INFO("[%4d/%4d] %-36s - [%s], type = %6s, ",
809
1121
  ++idx, ml.n_tensors,
810
1122
  ggml_get_name(tensor),
811
1123
  llama_format_tensor_shape(tensor).c_str(),
812
1124
  ggml_type_name(tensor->type));
813
1125
 
814
- // This used to be a regex, but <regex> has an extreme cost to compile times.
815
- bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
816
-
817
- // quantize only 2D and 3D tensors (experts)
818
- quantize &= (ggml_n_dims(tensor) >= 2);
819
-
820
- // do not quantize norm tensors
821
- quantize &= name.find("_norm.weight") == std::string::npos;
822
-
823
- quantize &= params->quantize_output_tensor || name != "output.weight";
824
- quantize &= !params->only_copy;
825
-
826
- // do not quantize expert gating tensors
827
- // NOTE: can't use LLM_TN here because the layer number is not known
828
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
829
-
830
- // these are very small (e.g. 4x4)
831
- quantize &= name.find("altup") == std::string::npos;
832
- quantize &= name.find("laurel") == std::string::npos;
833
-
834
- // these are not too big so keep them as it is
835
- quantize &= name.find("per_layer_model_proj") == std::string::npos;
836
-
837
- // do not quantize positional embeddings and token types (BERT)
838
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
839
- quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
840
-
841
- // do not quantize Mamba's small yet 2D weights
842
- // NOTE: can't use LLM_TN here because the layer number is not known
843
- quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
844
- quantize &= name.find("shortconv.conv.weight") == std::string::npos;
845
-
846
- // do not quantize RWKV's small yet 2D weights
847
- quantize &= name.find("time_mix_first.weight") == std::string::npos;
848
- quantize &= name.find("time_mix_w0.weight") == std::string::npos;
849
- quantize &= name.find("time_mix_w1.weight") == std::string::npos;
850
- quantize &= name.find("time_mix_w2.weight") == std::string::npos;
851
- quantize &= name.find("time_mix_v0.weight") == std::string::npos;
852
- quantize &= name.find("time_mix_v1.weight") == std::string::npos;
853
- quantize &= name.find("time_mix_v2.weight") == std::string::npos;
854
- quantize &= name.find("time_mix_a0.weight") == std::string::npos;
855
- quantize &= name.find("time_mix_a1.weight") == std::string::npos;
856
- quantize &= name.find("time_mix_a2.weight") == std::string::npos;
857
- quantize &= name.find("time_mix_g1.weight") == std::string::npos;
858
- quantize &= name.find("time_mix_g2.weight") == std::string::npos;
859
- quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
860
- quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
861
- quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
862
-
863
- // do not quantize relative position bias (T5)
864
- quantize &= name.find("attn_rel_b.weight") == std::string::npos;
865
-
866
- // do not quantize specific multimodal tensors
867
- quantize &= name.find(".position_embd.") == std::string::npos;
868
-
869
- ggml_type new_type;
1126
+ const ggml_type cur_type = tensor->type;
1127
+ const ggml_type new_type = tm.target_type;
1128
+
1129
+ // If we've decided to quantize to the same type the tensor is already
1130
+ // in then there's nothing to do.
1131
+ bool quantize = cur_type != new_type;
1132
+
870
1133
  void * new_data;
871
1134
  size_t new_size;
872
1135
 
873
- if (quantize) {
874
- new_type = default_type;
875
-
876
- // get more optimal quantization type based on the tensor shape, layer, etc.
877
- if (!params->pure && ggml_is_quantized(default_type)) {
878
- int fallback = qs.n_fallback;
879
- new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
880
- // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
881
- if (params->tensor_types && qs.n_fallback - fallback == 0) {
882
- const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
883
- const std::string tensor_name(tensor->name);
884
- for (const auto & [tname, qtype] : tensor_types) {
885
- if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
886
- if (qtype != new_type) {
887
- LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
888
- new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
889
- }
890
- }
891
- }
1136
+ if (params->dry_run) {
1137
+ // the --dry-run option calculates the final quantization size without quantizing
1138
+ if (quantize) {
1139
+ new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
1140
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
1141
+ tensor_size/1024.0/1024.0,
1142
+ new_size/1024.0/1024.0,
1143
+ ggml_type_name(new_type));
1144
+ if (!will_require_imatrix && tm.requires_imatrix) {
1145
+ will_require_imatrix = true;
892
1146
  }
1147
+ } else {
1148
+ new_size = tensor_size;
1149
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
893
1150
  }
894
- if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
895
- new_type = params->token_embedding_type;
896
- }
897
- if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
898
- new_type = params->output_tensor_type;
899
- }
900
-
901
- // If we've decided to quantize to the same type the tensor is already
902
- // in then there's nothing to do.
903
- quantize = tensor->type != new_type;
904
- }
905
-
906
- if (!quantize) {
907
- new_type = tensor->type;
908
- new_data = tensor->data;
909
- new_size = ggml_nbytes(tensor);
910
- LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
1151
+ total_size_org += tensor_size;
1152
+ total_size_new += new_size;
1153
+ continue;
911
1154
  } else {
912
- const int64_t nelements = ggml_nelements(tensor);
1155
+ // no --dry-run, perform quantization
1156
+ if (!quantize) {
1157
+ new_data = tensor->data;
1158
+ new_size = tensor_size;
1159
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
1160
+ } else {
1161
+ const int64_t nelements = ggml_nelements(tensor);
913
1162
 
914
- const float * imatrix = nullptr;
915
- if (imatrix_data) {
916
- auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
917
- if (it == imatrix_data->end()) {
918
- LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
919
- } else {
920
- if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
921
- imatrix = it->second.data();
1163
+ const float * imatrix = nullptr;
1164
+ if (imatrix_data) {
1165
+ auto it = imatrix_data->find(tm.remapped_imatrix_name);
1166
+ if (it == imatrix_data->end()) {
1167
+ LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
922
1168
  } else {
923
- LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
924
- int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
925
-
926
- // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
927
- // this is a significant error and it may be good idea to abort the process if this happens,
928
- // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
929
- // tok_embd should be ignored in this case, since it always causes this warning
930
- if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
931
- throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
932
- int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
1169
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
1170
+ imatrix = it->second.data();
1171
+ } else {
1172
+ LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
1173
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
1174
+
1175
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
1176
+ // this is a significant error and it may be good idea to abort the process if this happens,
1177
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
1178
+ // tok_embd should be ignored in this case, since it always causes this warning
1179
+ if (!tensor_name_match_token_embd(tensor->name)) {
1180
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
1181
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
1182
+ }
933
1183
  }
934
1184
  }
935
1185
  }
936
- }
937
- if ((new_type == GGML_TYPE_IQ2_XXS ||
938
- new_type == GGML_TYPE_IQ2_XS ||
939
- new_type == GGML_TYPE_IQ2_S ||
940
- new_type == GGML_TYPE_IQ1_S ||
941
- (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
942
- (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
943
- LLAMA_LOG_ERROR("\n\n============================================================\n");
944
- LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
945
- LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
946
- LLAMA_LOG_ERROR("============================================================\n\n");
947
- throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
948
- }
1186
+ if (!imatrix && tm.requires_imatrix) {
1187
+ LLAMA_LOG_ERROR("\n\n============================================================\n");
1188
+ LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
1189
+ LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
1190
+ LLAMA_LOG_ERROR("============================================================\n\n");
1191
+ throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
1192
+ }
949
1193
 
950
- float * f32_data;
1194
+ float * f32_data;
951
1195
 
952
- if (tensor->type == GGML_TYPE_F32) {
953
- f32_data = (float *) tensor->data;
954
- } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
955
- throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
956
- } else {
957
- llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
958
- f32_data = (float *) f32_conv_buf.data();
959
- }
1196
+ if (tensor->type == GGML_TYPE_F32) {
1197
+ f32_data = (float *) tensor->data;
1198
+ } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
1199
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
1200
+ } else {
1201
+ llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
1202
+ f32_data = (float *) f32_conv_buf.data();
1203
+ }
960
1204
 
961
- LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
962
- fflush(stdout);
1205
+ LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
1206
+ fflush(stdout);
963
1207
 
964
- if (work.size() < (size_t)nelements * 4) {
965
- work.resize(nelements * 4); // upper bound on size
966
- }
967
- new_data = work.data();
968
-
969
- const int64_t n_per_row = tensor->ne[0];
970
- const int64_t nrows = tensor->ne[1];
971
-
972
- static const int64_t min_chunk_size = 32 * 512;
973
- const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
974
-
975
- const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
976
- const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
977
- const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
978
-
979
- // quantize each expert separately since they have different importance matrices
980
- new_size = 0;
981
- for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
982
- const float * f32_data_03 = f32_data + i03 * nelements_matrix;
983
- void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
984
- const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
985
-
986
- new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
987
-
988
- // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
989
- #if 0
990
- if (new_type == GGML_TYPE_MXFP4) {
991
- auto * x = f32_data_03;
992
-
993
- //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
994
- std::vector<float> deq(nrows*n_per_row);
995
- const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
996
- qtype->to_float(new_data_03, deq.data(), deq.size());
997
-
998
- double err = 0.0f;
999
- for (int i = 0; i < (int) deq.size(); ++i) {
1000
- err += fabsf(deq[i] - x[i]);
1001
- //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
1002
- if (deq[i] != x[i]) {
1003
- LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
1004
- }
1005
- }
1006
- //LLAMA_LOG_INFO("err = %f\n", err);
1007
- GGML_ASSERT(err == 0.00000);
1208
+ if (work.size() < (size_t)nelements * 4) {
1209
+ work.resize(nelements * 4); // upper bound on size
1008
1210
  }
1009
- #endif
1010
- }
1011
- LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
1012
- }
1013
- total_size_org += ggml_nbytes(tensor);
1014
- total_size_new += new_size;
1211
+ new_data = work.data();
1212
+
1213
+ const int64_t n_per_row = tensor->ne[0];
1214
+ const int64_t nrows = tensor->ne[1];
1015
1215
 
1016
- // update the gguf meta data as we go
1017
- gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
1018
- GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
1019
- gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
1216
+ static const int64_t min_chunk_size = 32 * 512;
1217
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
1020
1218
 
1021
- // write tensor data + padding
1022
- fout.write((const char *) new_data, new_size);
1023
- zeros(fout, GGML_PAD(new_size, align) - new_size);
1219
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
1220
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
1221
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
1222
+
1223
+ // quantize each expert separately since they have different importance matrices
1224
+ new_size = 0;
1225
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
1226
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
1227
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
1228
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
1229
+
1230
+ new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
1231
+ }
1232
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
1233
+ }
1234
+ total_size_org += tensor_size;
1235
+ total_size_new += new_size;
1236
+
1237
+ // update the gguf meta data as we go
1238
+ gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
1239
+ GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
1240
+ gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
1241
+
1242
+ // write tensor data + padding
1243
+ fout.write((const char *) new_data, new_size);
1244
+ zeros(fout, GGML_PAD(new_size, align) - new_size);
1245
+ } // no --dry-run
1246
+ } // main loop
1247
+
1248
+ if (!params->dry_run) {
1249
+ close_ofstream();
1024
1250
  }
1025
- close_ofstream();
1026
1251
 
1027
- LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
1028
- LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
1252
+ LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements);
1253
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements);
1254
+
1255
+ if (!params->imatrix && params->dry_run && will_require_imatrix) {
1256
+ LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n",
1257
+ __func__
1258
+ );
1259
+ }
1029
1260
 
1030
1261
  if (qs.n_fallback > 0) {
1031
1262
  LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
1032
- __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
1263
+ __func__, qs.n_fallback, ml.n_tensors);
1033
1264
  }
1034
1265
  }
1035
1266
 
@@ -1048,6 +1279,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
1048
1279
  /*.only_copy =*/ false,
1049
1280
  /*.pure =*/ false,
1050
1281
  /*.keep_split =*/ false,
1282
+ /*.dry_run =*/ false,
1051
1283
  /*.imatrix =*/ nullptr,
1052
1284
  /*.kv_overrides =*/ nullptr,
1053
1285
  /*.tensor_type =*/ nullptr,