toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,4836 @@
1
+ #define GGML_COMMON_IMPL_CPP
2
+ #define GGML_COMMON_DECL_CPP
3
+ #include "ggml-common.h"
4
+ #include "ggml-backend-impl.h"
5
+
6
+ #include "ggml-impl.h"
7
+ #include "ggml-cpu.h"
8
+ #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
10
+ #include "traits.h"
11
+
12
+ #include "arch-fallback.h"
13
+
14
+ #include <cmath>
15
+ #include <cstring>
16
+ #include <cassert>
17
+ #include <cstdio> // for GGML_ASSERT
18
+
19
+ #include "repack.h"
20
+
21
+ #if defined(__GNUC__)
22
+ #pragma GCC diagnostic ignored "-Woverlength-strings"
23
+ #endif
24
+
25
+ #define UNUSED GGML_UNUSED
26
+
27
// Round fval to an integer by reading it out of the float's mantissa.
// Adding 12582912.f (1.5 * 2^23) moves the value into a range where one unit
// in the last place equals 1, so the rounded integer (biased by 0x00400000)
// sits in the low 23 bits. Only valid for |fval| <= 4194303 (asserted).
static inline int nearest_int(float fval) {
    assert(fabsf(fval) <= 4194303.f);

    const float shifted = fval + 12582912.f;

    int bits;
    memcpy(&bits, &shifted, sizeof(bits));

    const int mantissa = bits & 0x007fffff;
    return mantissa - 0x00400000;
}
33
+
34
+ // Functions to create the interleaved data layout formats
35
+
36
+ // interleave 4 block_q4_0s in blocks of blck_size_interleave
37
+ // returns an interleaved block_q4_0x4
38
+ // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
39
+ // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
40
+ //
41
+ // - in : an array of block_q4_0 pointers
42
+ // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
43
+ // blck_size_interleave bytes
44
+ // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
45
+ // from bias offset form to pure sign form (this saves subtract
46
+ // operations during unpacking)
47
+ //
48
+
49
+ extern "C" {
50
+
51
+ #if defined __riscv_zvfh
52
+ void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
53
+ assert(QK8_0 == 32);
54
+ assert(k % QK8_0 == 0);
55
+ const int nb = k / QK8_0;
56
+
57
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
58
+
59
+ // scalar
60
+ const int blck_size_interleave = 1;
61
+ float srcv[4][QK8_0];
62
+ float id[4];
63
+
64
+ for (int i = 0; i < nb; i++) {
65
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
66
+ float amax = 0.0f; // absolute max
67
+
68
+ for (int j = 0; j < QK8_0; j++) {
69
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
70
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
71
+ }
72
+
73
+ const float d = amax / ((1 << 7) - 1);
74
+ id[row_iter] = d ? 1.0f / d : 0.0f;
75
+
76
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
77
+ }
78
+
79
+ for (int j = 0; j < QK8_0 * 4; j++) {
80
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
81
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
82
+ src_offset += (j % blck_size_interleave);
83
+
84
+ float x0 = srcv[src_id][src_offset] * id[src_id];
85
+ y[i].qs[j] = roundf(x0);
86
+ }
87
+ }
88
+ }
89
+
90
+ void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
91
+ assert(QK_K == 256);
92
+ assert(k % QK_K == 0);
93
+ const int nb = k / QK_K;
94
+
95
+ block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
96
+
97
+ const int blck_size_interleave = 1;
98
+ float srcv[4][QK_K];
99
+ float iscale[4];
100
+
101
+ for (int i = 0; i < nb; i++) {
102
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
103
+ float amax = 0.0f; // absolute max
104
+ float max = 0;
105
+
106
+ for (int j = 0; j < QK_K; j++) {
107
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
108
+ // Update the maximum value of the corresponding super block
109
+ if(amax < fabsf(srcv[row_iter][j])) {
110
+ amax = fabsf(srcv[row_iter][j]);
111
+ max = srcv[row_iter][j];
112
+ }
113
+ }
114
+
115
+ iscale[row_iter] = amax ? -127.f/max : 0;
116
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
117
+ }
118
+
119
+ for (int j = 0; j < QK_K / 4; j++) {
120
+ y[i].bsums[j] = 0;
121
+ }
122
+ for (int j = 0; j < QK_K * 4; j++) {
123
+ int src_id = j % 4;
124
+ int src_offset = j / 4;
125
+ int index = ((j >> 6) << 2) + (j & 3);
126
+
127
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
128
+ y[i].qs[j] = nearest_int(x0);
129
+ y[i].bsums[index] += y[i].qs[j];
130
+ }
131
+ }
132
+ }
133
+ #endif
134
+
135
+ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
136
+ assert(QK8_0 == 32);
137
+ assert(k % QK8_0 == 0);
138
+ const int nb = k / QK8_0;
139
+
140
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
141
+
142
+ // scalar
143
+ const int blck_size_interleave = 4;
144
+ float srcv[4][QK8_0];
145
+ float id[4];
146
+
147
+ for (int i = 0; i < nb; i++) {
148
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
149
+ float amax = 0.0f; // absolute max
150
+
151
+ for (int j = 0; j < QK8_0; j++) {
152
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
153
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
154
+ }
155
+
156
+ const float d = amax / ((1 << 7) - 1);
157
+ id[row_iter] = d ? 1.0f / d : 0.0f;
158
+
159
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
160
+ }
161
+
162
+ for (int j = 0; j < QK8_0 * 4; j++) {
163
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
164
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
165
+ src_offset += (j % blck_size_interleave);
166
+
167
+ float x0 = srcv[src_id][src_offset] * id[src_id];
168
+ y[i].qs[j] = roundf(x0);
169
+ }
170
+ }
171
+ }
172
+
173
+ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
174
+ assert(QK8_0 == 32);
175
+ assert(k % QK8_0 == 0);
176
+ const int nb = k / QK8_0;
177
+
178
+ block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
179
+
180
+ // scalar
181
+ const int blck_size_interleave = 8;
182
+ float srcv[4][QK8_0];
183
+ float id[4];
184
+
185
+ for (int i = 0; i < nb; i++) {
186
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
187
+ float amax = 0.0f; // absolute max
188
+
189
+ for (int j = 0; j < QK8_0; j++) {
190
+ srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
191
+ amax = MAX(amax, fabsf(srcv[row_iter][j]));
192
+ }
193
+
194
+ const float d = amax / ((1 << 7) - 1);
195
+ id[row_iter] = d ? 1.0f / d : 0.0f;
196
+
197
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
198
+ }
199
+
200
+ for (int j = 0; j < QK8_0 * 4; j++) {
201
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
202
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
203
+ src_offset += (j % blck_size_interleave);
204
+
205
+ float x0 = srcv[src_id][src_offset] * id[src_id];
206
+ y[i].qs[j] = roundf(x0);
207
+ }
208
+ }
209
+ }
210
+
211
+ void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
212
+ assert(QK_K == 256);
213
+ assert(k % QK_K == 0);
214
+ const int nb = k / QK_K;
215
+
216
+ block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
217
+
218
+ // scalar
219
+ const int blck_size_interleave = 4;
220
+ float srcv[4][QK_K];
221
+ float iscale[4];
222
+
223
+ for (int i = 0; i < nb; i++) {
224
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
225
+ float amax = 0.0f; // absolute max
226
+ float max = 0;
227
+
228
+ for (int j = 0; j < QK_K; j++) {
229
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
230
+ // Update the maximum value of the corresponding super block
231
+ if(amax < fabsf(srcv[row_iter][j])) {
232
+ amax = fabsf(srcv[row_iter][j]);
233
+ max = srcv[row_iter][j];
234
+ }
235
+ }
236
+
237
+ iscale[row_iter] = amax ? -127.f/max : 0;
238
+
239
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
240
+ }
241
+
242
+ for (int j = 0; j < QK_K / 4; j++) {
243
+ y[i].bsums[j] = 0;
244
+ }
245
+
246
+ // Quants values are interleaved in sequence of four bytes from corresponding super blocks
247
+ // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
248
+ // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
249
+ for (int j = 0; j < QK_K * 4; j++) {
250
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
251
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
252
+ src_offset += (j % blck_size_interleave);
253
+ int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
254
+
255
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
256
+ y[i].qs[j] = nearest_int(x0);
257
+ y[i].bsums[index] += y[i].qs[j];
258
+ }
259
+ }
260
+ }
261
+
262
+ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
263
+ assert(QK_K == 256);
264
+ assert(k % QK_K == 0);
265
+ const int nb = k / QK_K;
266
+
267
+ block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
268
+
269
+ // scalar
270
+ const int blck_size_interleave = 8;
271
+ float srcv[4][QK_K];
272
+ float iscale[4];
273
+
274
+ for (int i = 0; i < nb; i++) {
275
+ for (int row_iter = 0; row_iter < 4; row_iter++) {
276
+ float amax = 0.0f; // absolute max
277
+ float max = 0;
278
+
279
+ for (int j = 0; j < QK_K; j++) {
280
+ srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
281
+ // Update the maximum value of the corresponding super block
282
+ if(amax < fabsf(srcv[row_iter][j])) {
283
+ amax = fabsf(srcv[row_iter][j]);
284
+ max = srcv[row_iter][j];
285
+ }
286
+ }
287
+
288
+ iscale[row_iter] = amax ? -127.f/max : 0;
289
+
290
+ y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
291
+ }
292
+
293
+ for (int j = 0; j < QK_K / 4; j++) {
294
+ y[i].bsums[j] = 0;
295
+ }
296
+
297
+ // Quants values are interleaved in sequence of eight bytes from corresponding super blocks
298
+ // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
299
+ // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
300
+ for (int j = 0; j < QK_K * 4; j++) {
301
+ int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
302
+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
303
+ src_offset += (j % blck_size_interleave);
304
+ int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
305
+
306
+ float x0 = srcv[src_id][src_offset] * iscale[src_id];
307
+ y[i].qs[j] = nearest_int(x0);
308
+ y[i].bsums[index] += y[i].qs[j];
309
+ }
310
+ }
311
+ }
312
+
313
+ } // extern "C"
314
+
315
+ template <int64_t INTER_SIZE, ggml_type PARAM_TYPE>
316
+ void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
317
+
318
+ template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
319
+ assert(nrow == 4);
320
+ UNUSED(nrow);
321
+ ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
322
+ }
323
+
324
+ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
325
+ assert(nrow == 4);
326
+ UNUSED(nrow);
327
+ ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
328
+ }
329
+
330
+ template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
331
+ assert(nrow == 4);
332
+ UNUSED(nrow);
333
+ ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row);
334
+ }
335
+
336
+ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
337
+ assert(nrow == 4);
338
+ UNUSED(nrow);
339
+ ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
340
+ }
341
+
342
+ #if defined __riscv_zvfh
343
+ template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
344
+ assert(nrow == 4);
345
+ UNUSED(nrow);
346
+ ggml_quantize_mat_q8_0_4x1(x, vy, n_per_row);
347
+ }
348
+
349
+ template <> void ggml_quantize_mat_t<1, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
350
+ assert(nrow == 4);
351
+ UNUSED(nrow);
352
+ ggml_quantize_mat_q8_K_4x1(x, vy, n_per_row);
353
+ }
354
+ #endif
355
+
356
+ template <int M, int N>
357
+ static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n,
358
+ float * GGML_RESTRICT s,
359
+ size_t bs,
360
+ const void * GGML_RESTRICT vx,
361
+ const void * GGML_RESTRICT vy,
362
+ int nr,
363
+ int nc) {
364
+ constexpr int blocklen = M;
365
+ constexpr int ncols_interleaved = N;
366
+ const int qk = QK_K;
367
+ const int nb = n / qk;
368
+ const int blocks_per_half = 64 / blocklen;
369
+
370
+ assert(n % qk == 0);
371
+ assert(nc % ncols_interleaved == 0);
372
+
373
+ UNUSED(bs);
374
+ UNUSED(nr);
375
+
376
+ float sumf[8];
377
+
378
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
379
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
380
+ const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
381
+
382
+ for (int j = 0; j < ncols_interleaved; j++) {
383
+ sumf[j] = 0.0f;
384
+ }
385
+
386
+ for (int l = 0; l < nb; l++) {
387
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
388
+ const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
389
+ const int base_h = base_l + 64;
390
+
391
+ const int scale_idx_l = base_l / 16;
392
+ const int scale_idx_h = base_h / 16;
393
+
394
+ const int qh_shift_l = ((base_l % 128) / 32) * 2;
395
+ const int qh_shift_h = ((base_h % 128) / 32) * 2;
396
+
397
+ const int qh_half_l = (base_l / 128) * 32;
398
+ const int qh_half_h = (base_h / 128) * 32;
399
+
400
+ for (int j = 0; j < ncols_interleaved; j++) {
401
+ const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
402
+ const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
403
+
404
+ int sumi_l = 0;
405
+ int sumi_h = 0;
406
+
407
+ for (int i = 0; i < blocklen; i++) {
408
+ const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
409
+ const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
410
+ const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
411
+
412
+ const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
413
+ const int qh_chunk_l = qh_idx_l / blocklen;
414
+ const int qh_pos_l = qh_idx_l % blocklen;
415
+ const int qh_offset_l = qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
416
+ const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
417
+
418
+ const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
419
+ const int qh_chunk_h = qh_idx_h / blocklen;
420
+ const int qh_pos_h = qh_idx_h % blocklen;
421
+ const int qh_offset_h = qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
422
+ const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
423
+
424
+ const int q_l = ((hi_2_l << 4) | l_4) - 32;
425
+ const int q_h = ((hi_2_h << 4) | hi_4) - 32;
426
+
427
+ const int8_t a_l = a_ptr[l].qs[base_l + i];
428
+ const int8_t a_h = a_ptr[l].qs[base_h + i];
429
+
430
+ sumi_l += q_l * a_l;
431
+ sumi_h += q_h * a_h;
432
+ }
433
+
434
+ sumf[j] +=
435
+ (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
436
+ }
437
+ }
438
+ }
439
+
440
+ for (int j = 0; j < ncols_interleaved; j++) {
441
+ s[x * ncols_interleaved + j] = sumf[j];
442
+ }
443
+ }
444
+ }
445
+
446
+ template <int M, int N>
447
+ static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n,
448
+ float * GGML_RESTRICT s,
449
+ size_t bs,
450
+ const void * GGML_RESTRICT vx,
451
+ const void * GGML_RESTRICT vy,
452
+ int nr,
453
+ int nc) {
454
+ constexpr int blocklen = M;
455
+ constexpr int ncols_interleaved = N;
456
+ const int qk = QK_K;
457
+ const int nb = n / qk;
458
+ const int blocks_per_half = 64 / blocklen;
459
+ const int q8_half_stride = 512;
460
+ const int q8_low_high_step = 256;
461
+
462
+ assert(n % qk == 0);
463
+ assert(nr % 4 == 0);
464
+ assert(nc % ncols_interleaved == 0);
465
+
466
+ UNUSED(bs);
467
+
468
+ float sumf[4][8];
469
+
470
+ for (int y = 0; y < nr / 4; y++) {
471
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
472
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
473
+ const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
474
+
475
+ for (int m = 0; m < 4; m++) {
476
+ for (int j = 0; j < ncols_interleaved; j++) {
477
+ sumf[m][j] = 0.0f;
478
+ }
479
+ }
480
+
481
+ for (int l = 0; l < nb; l++) {
482
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
483
+ const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
484
+ const int base_h = base_l + 64;
485
+
486
+ const int scale_idx_l = base_l / 16;
487
+ const int scale_idx_h = base_h / 16;
488
+
489
+ const int qh_shift_l = ((base_l % 128) / 32) * 2;
490
+ const int qh_shift_h = ((base_h % 128) / 32) * 2;
491
+
492
+ const int qh_half_l = (base_l / 128) * 32;
493
+ const int qh_half_h = (base_h / 128) * 32;
494
+
495
+ const int q8_base = (k / blocks_per_half) * q8_half_stride + (k % blocks_per_half) * (blocklen * 4);
496
+
497
+ for (int m = 0; m < 4; m++) {
498
+ for (int j = 0; j < ncols_interleaved; j++) {
499
+ const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
500
+ const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
501
+
502
+ int sumi_l = 0;
503
+ int sumi_h = 0;
504
+
505
+ for (int i = 0; i < blocklen; i++) {
506
+ const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
507
+ const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
508
+ const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
509
+
510
+ const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
511
+ const int qh_chunk_l = qh_idx_l / blocklen;
512
+ const int qh_pos_l = qh_idx_l % blocklen;
513
+ const int qh_offset_l =
514
+ qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
515
+ const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
516
+
517
+ const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
518
+ const int qh_chunk_h = qh_idx_h / blocklen;
519
+ const int qh_pos_h = qh_idx_h % blocklen;
520
+ const int qh_offset_h =
521
+ qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
522
+ const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
523
+
524
+ const int q_l = ((hi_2_l << 4) | l_4) - 32;
525
+ const int q_h = ((hi_2_h << 4) | hi_4) - 32;
526
+
527
+ const int8_t q8_l = a_ptr[l].qs[q8_base + m * blocklen + i];
528
+ const int8_t q8_h = a_ptr[l].qs[q8_base + m * blocklen + i + q8_low_high_step];
529
+
530
+ sumi_l += q_l * q8_l;
531
+ sumi_h += q_h * q8_h;
532
+ }
533
+
534
+ sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
535
+ a_ptr[l].d[m];
536
+ }
537
+ }
538
+ }
539
+ }
540
+
541
+ for (int m = 0; m < 4; m++) {
542
+ for (int j = 0; j < ncols_interleaved; j++) {
543
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
544
+ }
545
+ }
546
+ }
547
+ }
548
+ }
549
+
550
+ template <int M, int N>
551
+ static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n,
552
+ float * GGML_RESTRICT s,
553
+ size_t bs,
554
+ const void * GGML_RESTRICT vx,
555
+ const void * GGML_RESTRICT vy,
556
+ int nr,
557
+ int nc) {
558
+ constexpr int blocklen = M;
559
+ constexpr int ncols_interleaved = N;
560
+ const int qk = QK_K;
561
+ const int nb = n / qk;
562
+ static const uint32_t kmask1 = 0x3f3f3f3f;
563
+ static const uint32_t kmask2 = 0x0f0f0f0f;
564
+ static const uint32_t kmask3 = 0x03030303;
565
+
566
+ assert(n % qk == 0);
567
+ assert(nc % ncols_interleaved == 0);
568
+
569
+ UNUSED(bs);
570
+ UNUSED(nr);
571
+
572
+ float sumf[ncols_interleaved];
573
+ float sum_minf[ncols_interleaved];
574
+ uint32_t utmp[32];
575
+ int sumi1;
576
+ int sumi2;
577
+ int sumi;
578
+
579
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
580
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
581
+ const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
582
+
583
+ for (int j = 0; j < ncols_interleaved; j++) {
584
+ sumf[j] = 0.0;
585
+ sum_minf[j] = 0.0;
586
+ }
587
+ for (int l = 0; l < nb; l++) {
588
+ for (int sb = 0; sb < 8; sb++) {
589
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
590
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
591
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
592
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
593
+ utmp[sb * 4 + 2] = uaux_0;
594
+ utmp[sb * 4 + 0] &= kmask1;
595
+ }
596
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
597
+ constexpr int scale_stride = 32;
598
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
599
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
600
+
601
+ const int qh_shift = (k / (32 / blocklen)) * 2;
602
+ for (int j = 0; j < ncols_interleaved; j++) {
603
+ sumi1 = 0;
604
+ sumi2 = 0;
605
+ sumi = 0;
606
+ for (int i = 0; i < blocklen; ++i) {
607
+ const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
608
+
609
+ const int qh_idx = (k * blocklen + i) % 32;
610
+ const int qh_chunk = qh_idx / blocklen;
611
+ const int qh_pos = qh_idx % blocklen;
612
+ const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
613
+
614
+ const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
615
+ const uint8_t h0 = (qh_val >> qh_shift) & 1;
616
+ const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
617
+
618
+ const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
619
+ const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
620
+
621
+ const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i;
622
+
623
+ sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
624
+ sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
625
+ sumi1 = sumi1 * scales_0[j];
626
+ sumi2 = sumi2 * scales_1[j];
627
+ sumi += sumi1 + sumi2;
628
+ }
629
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
630
+ }
631
+ }
632
+ for (int sb = 0; sb < 8; sb++) {
633
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
634
+ for (int j = 0; j < ncols_interleaved; j++) {
635
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
636
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
637
+ }
638
+ }
639
+ }
640
+ for (int j = 0; j < ncols_interleaved; j++) {
641
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
642
+ }
643
+ }
644
+ }
645
+
646
+ template <int M, int N>
647
+ static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n,
648
+ float * GGML_RESTRICT s,
649
+ size_t bs,
650
+ const void * GGML_RESTRICT vx,
651
+ const void * GGML_RESTRICT vy,
652
+ int nr,
653
+ int nc) {
654
+ constexpr int blocklen = M;
655
+ constexpr int ncols_interleaved = N;
656
+ const int qk = QK_K;
657
+ const int nb = n / qk;
658
+ static const uint32_t kmask1 = 0x3f3f3f3f;
659
+ static const uint32_t kmask2 = 0x0f0f0f0f;
660
+ static const uint32_t kmask3 = 0x03030303;
661
+
662
+ assert(n % qk == 0);
663
+ assert(nr % 4 == 0);
664
+ assert(nc % ncols_interleaved == 0);
665
+
666
+ float sumf[4][ncols_interleaved];
667
+ float sum_minf[4][ncols_interleaved];
668
+ uint32_t utmp[32];
669
+ int sumi1;
670
+ int sumi2;
671
+ int sumi;
672
+
673
+ for (int y = 0; y < nr / 4; y++) {
674
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
675
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
676
+ const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
677
+ for (int m = 0; m < 4; m++) {
678
+ for (int j = 0; j < ncols_interleaved; j++) {
679
+ sumf[m][j] = 0.0;
680
+ sum_minf[m][j] = 0.0;
681
+ }
682
+ }
683
+ for (int l = 0; l < nb; l++) {
684
+ for (int sb = 0; sb < 8; sb++) {
685
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
686
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
687
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
688
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
689
+ utmp[sb * 4 + 2] = uaux_0;
690
+ utmp[sb * 4 + 0] &= kmask1;
691
+ }
692
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
693
+ constexpr int scale_stride = 32;
694
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
695
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
696
+
697
+ const int qh_shift = (k / (32 / blocklen)) * 2;
698
+ for (int m = 0; m < 4; m++) {
699
+ for (int j = 0; j < ncols_interleaved; j++) {
700
+ sumi1 = 0;
701
+ sumi2 = 0;
702
+ sumi = 0;
703
+ for (int i = 0; i < blocklen; ++i) {
704
+ const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
705
+
706
+ const int qh_idx = (k * blocklen + i) % 32;
707
+ const int qh_chunk = qh_idx / blocklen;
708
+ const int qh_pos = qh_idx % blocklen;
709
+ const int b_qh_offset =
710
+ qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
711
+
712
+ const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
713
+ const uint8_t h0 = (qh_val >> qh_shift) & 1;
714
+ const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
715
+
716
+ const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
717
+ const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
718
+
719
+ const int q8_offset = (k / (32 / blocklen)) * 256 +
720
+ (k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i;
721
+
722
+ sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
723
+ sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
724
+ sumi1 = sumi1 * scales_0[j];
725
+ sumi2 = sumi2 * scales_1[j];
726
+ sumi += sumi1 + sumi2;
727
+ }
728
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
729
+ }
730
+ }
731
+ }
732
+ for (int sb = 0; sb < 8; sb++) {
733
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
734
+ for (int m = 0; m < 4; m++) {
735
+ const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
736
+ for (int j = 0; j < ncols_interleaved; j++) {
737
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
738
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
739
+ }
740
+ }
741
+ }
742
+ }
743
+ for (int m = 0; m < 4; m++) {
744
+ for (int j = 0; j < ncols_interleaved; j++) {
745
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
746
+ }
747
+ }
748
+ }
749
+ }
750
+ }
751
+
752
+ extern "C" {
753
+
754
+ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
755
+ const int qk = QK8_0;
756
+ const int nb = n / qk;
757
+ const int ncols_interleaved = 4;
758
+ const int blocklen = 4;
759
+
760
+ assert(nr == 1);
761
+ assert(n % qk == 0);
762
+ assert(nc % ncols_interleaved == 0);
763
+
764
+ UNUSED(s);
765
+ UNUSED(bs);
766
+ UNUSED(vx);
767
+ UNUSED(vy);
768
+ UNUSED(nr);
769
+ UNUSED(nc);
770
+ UNUSED(nb);
771
+ UNUSED(ncols_interleaved);
772
+ UNUSED(blocklen);
773
+
774
+ float sumf[4];
775
+ int sumi;
776
+
777
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
778
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
779
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
780
+
781
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
782
+ for (int l = 0; l < nb; l++) {
783
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
784
+ for (int j = 0; j < ncols_interleaved; j++) {
785
+ sumi = 0;
786
+ for (int i = 0; i < blocklen; ++i) {
787
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
788
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
789
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
790
+ }
791
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
792
+ }
793
+ }
794
+ }
795
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
796
+ }
797
+ }
798
+
799
+ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
800
+ const int qk = QK8_0;
801
+ const int nb = n / qk;
802
+ const int ncols_interleaved = 4;
803
+ const int blocklen = 8;
804
+
805
+ assert (n % qk == 0);
806
+ assert (nc % ncols_interleaved == 0);
807
+
808
+ UNUSED(s);
809
+ UNUSED(bs);
810
+ UNUSED(vx);
811
+ UNUSED(vy);
812
+ UNUSED(nr);
813
+ UNUSED(nc);
814
+ UNUSED(nb);
815
+ UNUSED(ncols_interleaved);
816
+ UNUSED(blocklen);
817
+
818
+ float sumf[4];
819
+ int sumi;
820
+
821
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
822
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
823
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
824
+
825
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
826
+ for (int l = 0; l < nb; l++) {
827
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
828
+ for (int j = 0; j < ncols_interleaved; j++) {
829
+ sumi = 0;
830
+ for (int i = 0; i < blocklen; ++i) {
831
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
832
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
833
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
834
+ }
835
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
836
+ }
837
+ }
838
+ }
839
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
840
+ }
841
+ }
842
+
843
+ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
844
+ const int qk = QK8_0;
845
+ const int nb = n / qk;
846
+ const int ncols_interleaved = 8;
847
+ const int blocklen = 8;
848
+
849
+ assert (n % qk == 0);
850
+ assert (nc % ncols_interleaved == 0);
851
+
852
+ UNUSED(s);
853
+ UNUSED(bs);
854
+ UNUSED(vx);
855
+ UNUSED(vy);
856
+ UNUSED(nr);
857
+ UNUSED(nc);
858
+ UNUSED(nb);
859
+ UNUSED(ncols_interleaved);
860
+ UNUSED(blocklen);
861
+
862
+ float sumf[8];
863
+ int sumi;
864
+
865
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
866
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
867
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
868
+
869
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
870
+ for (int l = 0; l < nb; l++) {
871
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
872
+ for (int j = 0; j < ncols_interleaved; j++) {
873
+ sumi = 0;
874
+ for (int i = 0; i < blocklen; ++i) {
875
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
876
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
877
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
878
+ }
879
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
880
+ }
881
+ }
882
+ }
883
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
884
+ }
885
+ }
886
+
887
+ void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
888
+ const int qk = QK_K;
889
+ const int nb = n / qk;
890
+ const int ncols_interleaved = 8;
891
+ const int blocklen = 4;
892
+ static const uint32_t kmask1 = 0x3f3f3f3f;
893
+ static const uint32_t kmask2 = 0x0f0f0f0f;
894
+ static const uint32_t kmask3 = 0x03030303;
895
+
896
+ assert (n % qk == 0);
897
+ assert (nc % ncols_interleaved == 0);
898
+
899
+ UNUSED(bs);
900
+ UNUSED(nr);
901
+
902
+ float sumf[8];
903
+ float sum_minf[8];
904
+ uint32_t utmp[32];
905
+ int sumi1;
906
+ int sumi2;
907
+ int sumi;
908
+
909
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
910
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
911
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
912
+
913
+ for (int j = 0; j < ncols_interleaved; j++) {
914
+ sumf[j] = 0.0;
915
+ sum_minf[j] = 0.0;
916
+ }
917
+ for (int l = 0; l < nb; l++) {
918
+ for (int sb = 0; sb < 8; sb++) {
919
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
920
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
921
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
922
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
923
+ utmp[sb * 4 + 2] = uaux_0;
924
+ utmp[sb * 4 + 0] &= kmask1;
925
+ }
926
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
927
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
928
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
929
+ for (int j = 0; j < ncols_interleaved; j++) {
930
+ sumi1 = 0;
931
+ sumi2 = 0;
932
+ sumi = 0;
933
+ for (int i = 0; i < blocklen; ++i) {
934
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
935
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
936
+ sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
937
+ sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
938
+ sumi1 = sumi1 * scales_0[j];
939
+ sumi2 = sumi2 * scales_1[j];
940
+ sumi += sumi1 + sumi2;
941
+ }
942
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
943
+ }
944
+ }
945
+ for (int sb = 0; sb < 8; sb++) {
946
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
947
+ for (int j = 0; j < ncols_interleaved; j++) {
948
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
949
+ }
950
+ }
951
+ }
952
+ for (int j = 0; j < ncols_interleaved; j++) {
953
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
954
+ }
955
+ }
956
+ }
957
+
958
+ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
959
+ const int qk = QK_K;
960
+ const int nb = n / qk;
961
+ const int ncols_interleaved = 8;
962
+ const int blocklen = 8;
963
+ static const uint32_t kmask1 = 0x3f3f3f3f;
964
+ static const uint32_t kmask2 = 0x0f0f0f0f;
965
+ static const uint32_t kmask3 = 0x03030303;
966
+
967
+ assert (n % qk == 0);
968
+ assert (nc % ncols_interleaved == 0);
969
+
970
+ UNUSED(bs);
971
+ UNUSED(nr);
972
+
973
+ float sumf[8];
974
+ float sum_minf[8];
975
+ uint32_t utmp[32];
976
+ int sumi1;
977
+ int sumi2;
978
+ int sumi;
979
+
980
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
981
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
982
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
983
+
984
+ for (int j = 0; j < ncols_interleaved; j++) {
985
+ sumf[j] = 0.0;
986
+ sum_minf[j] = 0.0;
987
+ }
988
+ for (int l = 0; l < nb; l++) {
989
+ for (int sb = 0; sb < 8; sb++) {
990
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
991
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
992
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
993
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
994
+ utmp[sb * 4 + 2] = uaux_0;
995
+ utmp[sb * 4 + 0] &= kmask1;
996
+ }
997
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
998
+ uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
999
+ uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
1000
+ for (int j = 0; j < ncols_interleaved; j++) {
1001
+ sumi1 = 0;
1002
+ sumi2 = 0;
1003
+ sumi = 0;
1004
+ for (int i = 0; i < blocklen; ++i) {
1005
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1006
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1007
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
1008
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
1009
+ sumi1 = sumi1 * scales_0[j];
1010
+ sumi2 = sumi2 * scales_1[j];
1011
+ sumi += sumi1 + sumi2;
1012
+ }
1013
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1014
+ }
1015
+ }
1016
+ for (int sb = 0; sb < 8; sb++) {
1017
+ uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1018
+ for (int j = 0; j < ncols_interleaved; j++) {
1019
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1020
+ }
1021
+ }
1022
+ }
1023
+ for (int j = 0; j < ncols_interleaved; j++) {
1024
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1025
+ }
1026
+ }
1027
+ }
1028
+
1029
+ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1030
+ const int qk = QK_K;
1031
+ const int nb = n / qk;
1032
+ const int ncols_interleaved = 8;
1033
+ const int blocklen = 8;
1034
+
1035
+ assert (n % qk == 0);
1036
+ assert (nc % ncols_interleaved == 0);
1037
+
1038
+ UNUSED(s);
1039
+ UNUSED(bs);
1040
+ UNUSED(vx);
1041
+ UNUSED(vy);
1042
+ UNUSED(nr);
1043
+ UNUSED(nc);
1044
+ UNUSED(nb);
1045
+ UNUSED(ncols_interleaved);
1046
+ UNUSED(blocklen);
1047
+
1048
+ float sumf[8];
1049
+ float sum_minf[8];
1050
+ int sumi1,sumi2,sumi3,sumi4;
1051
+ int sumi;
1052
+
1053
+ const block_q8_K * a_ptr = (const block_q8_K *)vy;
1054
+ for(int x = 0; x < nc / ncols_interleaved; x++) {
1055
+ const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
1056
+ for (int j = 0; j < ncols_interleaved; j++) {
1057
+ sumf[j] = 0.0;
1058
+ sum_minf[j] = 0.0;
1059
+ }
1060
+ for (int l = 0; l < nb; l++) {
1061
+ for (int k = 0; k < (qk / (4 * blocklen)); k++) {
1062
+ const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
1063
+ const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
1064
+ const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
1065
+ const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
1066
+ for (int j = 0; j < ncols_interleaved; j++) {
1067
+ sumi1 = 0;
1068
+ sumi2 = 0;
1069
+ sumi3 = 0;
1070
+ sumi4 = 0;
1071
+ sumi = 0;
1072
+ int offset = ((k / 2) % 2) + j * 2;
1073
+ for (int i = 0; i < blocklen; ++i){
1074
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
1075
+ const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
1076
+ const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
1077
+ const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
1078
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
1079
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
1080
+ sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
1081
+ sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
1082
+
1083
+ sumi1 = sumi1 * (scales_0[offset] & 0xF);
1084
+ sumi2 = sumi2 * (scales_1[offset] & 0xF);
1085
+ sumi3 = sumi3 * (scales_2[offset] & 0xF);
1086
+ sumi4 = sumi4 * (scales_3[offset] & 0xF);
1087
+ sumi += sumi1 + sumi2 + sumi3 + sumi4;
1088
+ }
1089
+ sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1090
+ }
1091
+ }
1092
+ for(int sb = 0; sb < 8; sb++) {
1093
+ const uint8_t *mins = b_ptr[l].scales + sb * 16;
1094
+ for(int j = 0; j < ncols_interleaved; j++){
1095
+ sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1096
+ }
1097
+ }
1098
+ }
1099
+ for (int j = 0; j < ncols_interleaved; j++) {
1100
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1101
+ }
1102
+ }
1103
+ }
1104
+
1105
+ void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1106
+ ggml_gemv_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
1107
+ }
1108
+
1109
+ void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1110
+ ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
1111
+ }
1112
+
1113
+
1114
+ void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1115
+ ggml_gemv_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
1116
+ }
1117
+
1118
+ void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1119
+ ggml_gemv_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
1120
+ }
1121
+
1122
+ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1123
+ const int qk = QK8_0;
1124
+ const int nb = n / qk;
1125
+ const int ncols_interleaved = 4;
1126
+ const int blocklen = 4;
1127
+
1128
+ assert(nr == 1);
1129
+ assert(n % qk == 0);
1130
+ assert(nc % ncols_interleaved == 0);
1131
+
1132
+ UNUSED(bs);
1133
+ UNUSED(nr);
1134
+
1135
+ float sumf[4];
1136
+ int sumi;
1137
+
1138
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1139
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1140
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
1141
+
1142
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1143
+ for (int l = 0; l < nb; l++) {
1144
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1145
+ for (int j = 0; j < ncols_interleaved; j++) {
1146
+ sumi = 0;
1147
+ for (int i = 0; i < blocklen; ++i) {
1148
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1149
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1150
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1151
+ }
1152
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1153
+ }
1154
+ }
1155
+ }
1156
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1157
+ }
1158
+ }
1159
+
1160
+ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1161
+ const int qk = QK8_0;
1162
+ const int nb = n / qk;
1163
+ const int ncols_interleaved = 8;
1164
+ const int blocklen = 8;
1165
+
1166
+ assert(nr == 1);
1167
+ assert(n % qk == 0);
1168
+ assert(nc % ncols_interleaved == 0);
1169
+
1170
+ UNUSED(bs);
1171
+ UNUSED(nr);
1172
+
1173
+ float sumf[8];
1174
+ int sumi;
1175
+
1176
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1177
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1178
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
1179
+
1180
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1181
+ for (int l = 0; l < nb; l++) {
1182
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1183
+ for (int j = 0; j < ncols_interleaved; j++) {
1184
+ sumi = 0;
1185
+ for (int i = 0; i < blocklen; ++i) {
1186
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1187
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1188
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1189
+ }
1190
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1191
+ }
1192
+ }
1193
+ }
1194
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1195
+ }
1196
+ }
1197
+
1198
+ void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1199
+ const int qk = QK8_0;
1200
+ const int nb = n / qk;
1201
+ const int ncols_interleaved = 4;
1202
+ const int blocklen = 4;
1203
+
1204
+ assert(nr == 1);
1205
+ assert(n % qk == 0);
1206
+ assert(nc % ncols_interleaved == 0);
1207
+
1208
+ UNUSED(bs);
1209
+ UNUSED(nr);
1210
+
1211
+ float sumf[4];
1212
+ int sumi;
1213
+
1214
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1215
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1216
+ const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
1217
+
1218
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1219
+ for (int l = 0; l < nb; l++) {
1220
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1221
+ for (int j = 0; j < ncols_interleaved; j++) {
1222
+ sumi = 0;
1223
+ for (int i = 0; i < blocklen; ++i) {
1224
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1225
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1226
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1227
+ }
1228
+ sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1229
+ }
1230
+ }
1231
+ }
1232
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1233
+ }
1234
+ }
1235
+
1236
+ void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1237
+ const int qk = QK8_0;
1238
+ const int nb = n / qk;
1239
+ const int ncols_interleaved = 8;
1240
+ const int blocklen = 8;
1241
+
1242
+ assert(nr == 1);
1243
+ assert(n % qk == 0);
1244
+ assert(nc % ncols_interleaved == 0);
1245
+
1246
+ UNUSED(bs);
1247
+ UNUSED(nr);
1248
+
1249
+ float sumf[8];
1250
+ int sumi;
1251
+
1252
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1253
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1254
+ const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
1255
+
1256
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1257
+ for (int l = 0; l < nb; l++) {
1258
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1259
+ for (int j = 0; j < ncols_interleaved; j++) {
1260
+ sumi = 0;
1261
+ for (int i = 0; i < blocklen; ++i) {
1262
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1263
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1264
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1265
+ }
1266
+ sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1267
+ }
1268
+ }
1269
+ }
1270
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1271
+ }
1272
+ }
1273
+
1274
+ void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
1275
+ float * GGML_RESTRICT s,
1276
+ size_t bs,
1277
+ const void * GGML_RESTRICT vx,
1278
+ const void * GGML_RESTRICT vy,
1279
+ int nr,
1280
+ int nc) {
1281
+ const int qk = QK8_0;
1282
+ const int nb = n / qk;
1283
+ const int ncols_interleaved = 4;
1284
+ const int blocklen = 4;
1285
+
1286
+ assert(nr == 1);
1287
+ assert(n % qk == 0);
1288
+ assert(nc % ncols_interleaved == 0);
1289
+
1290
+ UNUSED(bs);
1291
+ UNUSED(nr);
1292
+
1293
+ float sumf[4];
1294
+ int sumi;
1295
+
1296
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1297
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1298
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1299
+
1300
+ for (int j = 0; j < ncols_interleaved; j++) {
1301
+ sumf[j] = 0.0;
1302
+ }
1303
+ for (int l = 0; l < nb; l++) {
1304
+ for (int k = 0; k < (qk / blocklen); k++) {
1305
+ for (int j = 0; j < ncols_interleaved; j++) {
1306
+ sumi = 0;
1307
+ for (int i = 0; i < blocklen; ++i) {
1308
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1309
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1310
+ }
1311
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1312
+ }
1313
+ }
1314
+ }
1315
+ for (int j = 0; j < ncols_interleaved; j++) {
1316
+ s[x * ncols_interleaved + j] = sumf[j];
1317
+ }
1318
+ }
1319
+ }
1320
+
1321
+ void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
1322
+ float * GGML_RESTRICT s,
1323
+ size_t bs,
1324
+ const void * GGML_RESTRICT vx,
1325
+ const void * GGML_RESTRICT vy,
1326
+ int nr,
1327
+ int nc) {
1328
+ const int qk = QK8_0;
1329
+ const int nb = n / qk;
1330
+ const int ncols_interleaved = 4;
1331
+ const int blocklen = 8;
1332
+
1333
+ assert(nr == 1);
1334
+ assert(n % qk == 0);
1335
+ assert(nc % ncols_interleaved == 0);
1336
+
1337
+ UNUSED(bs);
1338
+ UNUSED(nr);
1339
+
1340
+ float sumf[4];
1341
+ int sumi;
1342
+
1343
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1344
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1345
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
1346
+
1347
+ for (int j = 0; j < ncols_interleaved; j++) {
1348
+ sumf[j] = 0.0;
1349
+ }
1350
+ for (int l = 0; l < nb; l++) {
1351
+ for (int k = 0; k < (qk / blocklen); k++) {
1352
+ for (int j = 0; j < ncols_interleaved; j++) {
1353
+ sumi = 0;
1354
+ for (int i = 0; i < blocklen; ++i) {
1355
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1356
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1357
+ }
1358
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1359
+ }
1360
+ }
1361
+ }
1362
+ for (int j = 0; j < ncols_interleaved; j++) {
1363
+ s[x * ncols_interleaved + j] = sumf[j];
1364
+ }
1365
+ }
1366
+ }
1367
+
1368
+ // Only enable these for RISC-V.
1369
+ #if defined __riscv_zvfh
1370
+ void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1371
+ const int qk = QK8_0;
1372
+ const int nb = n / qk;
1373
+ const int ncols_interleaved = 16;
1374
+ const int blocklen = 1;
1375
+
1376
+ assert (n % qk == 0);
1377
+ assert (nc % ncols_interleaved == 0);
1378
+
1379
+ UNUSED(s);
1380
+ UNUSED(bs);
1381
+ UNUSED(vx);
1382
+ UNUSED(vy);
1383
+ UNUSED(nr);
1384
+ UNUSED(nc);
1385
+ UNUSED(nb);
1386
+ UNUSED(ncols_interleaved);
1387
+ UNUSED(blocklen);
1388
+
1389
+ float sumf[16];
1390
+ int sumi;
1391
+
1392
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1393
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1394
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
1395
+
1396
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1397
+ for (int l = 0; l < nb; l++) {
1398
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1399
+ for (int j = 0; j < ncols_interleaved; j++) {
1400
+ sumi = 0;
1401
+ for (int i = 0; i < blocklen; ++i) {
1402
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1403
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1404
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
1405
+ }
1406
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1407
+ }
1408
+ }
1409
+ }
1410
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1411
+ }
1412
+ }
1413
+
1414
+ void ggml_gemv_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1415
+ const int qk = QK_K;
1416
+ const int nb = n / qk;
1417
+ const int ncols_interleaved = 16;
1418
+ const int blocklen = 1;
1419
+ assert (n % qk == 0);
1420
+ assert (nc % ncols_interleaved == 0);
1421
+ UNUSED(s);
1422
+ UNUSED(bs);
1423
+ UNUSED(vx);
1424
+ UNUSED(vy);
1425
+ UNUSED(nr);
1426
+ UNUSED(nc);
1427
+ UNUSED(nb);
1428
+ UNUSED(ncols_interleaved);
1429
+ UNUSED(blocklen);
1430
+ float sumf[16];
1431
+ float sum_minf[16];
1432
+ uint8_t scales[128];
1433
+ uint8_t mins[128];
1434
+ int sumi1;
1435
+ int sumi2;
1436
+ int sumi;
1437
+ const block_q8_K * a_ptr = (const block_q8_K *) vy;
1438
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1439
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
1440
+ for (int j = 0; j < ncols_interleaved; j++) {
1441
+ sumf[j] = 0.0f;
1442
+ sum_minf[j] = 0.0f;
1443
+ }
1444
+ for (int l = 0; l < nb; l++) {
1445
+ for (int i = 0; i < 128; i++) {
1446
+ scales[i] = b_ptr[l].scales[i] & 0x0F;
1447
+ mins[i] = b_ptr[l].scales[i] >> 4;
1448
+ }
1449
+ for (int i = 0; i < 64; i++) {
1450
+ scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
1451
+ mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
1452
+ scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
1453
+ mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
1454
+ }
1455
+ for (int sb = 0; sb < 8; sb++) {
1456
+ uint8_t *min = &mins[sb * 16];
1457
+ for (int j = 0; j < ncols_interleaved; j++) {
1458
+ sum_minf[j] += min[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1459
+ }
1460
+ }
1461
+ for (int sb = 0; sb < 8; sb += 2) {
1462
+ uint8_t *scales_0 = &scales[sb * 16];
1463
+ uint8_t *scales_1 = &scales[(sb + 1) * 16];
1464
+ for (int i = 0; i < QK4_0; i++) {
1465
+ for (int j = 0; j < ncols_interleaved; j++) {
1466
+ sumi1 = 0;
1467
+ sumi2 = 0;
1468
+ sumi = 0;
1469
+ const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
1470
+ const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
1471
+ sumi1 = (v0 * a_ptr[l].qs[sb * 32 + i]);
1472
+ sumi2 = (v1 * a_ptr[l].qs[sb * 32 + 32 + i]);
1473
+ sumi1 = sumi1 * scales_0[j];
1474
+ sumi2 = sumi2 * scales_1[j];
1475
+ sumi += sumi1 + sumi2;
1476
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1477
+ }
1478
+ }
1479
+ }
1480
+ }
1481
+ for (int j = 0; j < ncols_interleaved; j++) {
1482
+ s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
1483
+ }
1484
+ }
1485
+ }
1486
+
1487
+ void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1488
+ const int qk = QK8_0;
1489
+ const int nb = n / qk;
1490
+ const int ncols_interleaved = 16;
1491
+ const int blocklen = 1;
1492
+
1493
+ assert(nr == 1);
1494
+ assert(n % qk == 0);
1495
+ assert(nc % ncols_interleaved == 0);
1496
+
1497
+ UNUSED(bs);
1498
+ UNUSED(nr);
1499
+
1500
+ float sumf[16];
1501
+ int sumi;
1502
+
1503
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1504
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1505
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
1506
+
1507
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1508
+ for (int l = 0; l < nb; l++) {
1509
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1510
+ for (int j = 0; j < ncols_interleaved; j++) {
1511
+ sumi = 0;
1512
+ for (int i = 0; i < blocklen; ++i) {
1513
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1514
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1515
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1516
+ }
1517
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1518
+ }
1519
+ }
1520
+ }
1521
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1522
+ }
1523
+ }
1524
+
1525
+ void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1526
+ const int qk = QK8_0;
1527
+ const int nb = n / qk;
1528
+ const int ncols_interleaved = 16;
1529
+ const int blocklen = 1;
1530
+
1531
+ assert(nr == 1);
1532
+ assert(n % qk == 0);
1533
+ assert(nc % ncols_interleaved == 0);
1534
+
1535
+ UNUSED(bs);
1536
+ UNUSED(nr);
1537
+
1538
+ float sumf[16];
1539
+ int sumi;
1540
+
1541
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1542
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1543
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
1544
+
1545
+ for (int j = 0; j < ncols_interleaved; j++) {
1546
+ sumf[j] = 0.0;
1547
+ }
1548
+ for (int l = 0; l < nb; l++) {
1549
+ for (int k = 0; k < (qk / blocklen); k++) {
1550
+ for (int j = 0; j < ncols_interleaved; j++) {
1551
+ sumi = 0;
1552
+ for (int i = 0; i < blocklen; ++i) {
1553
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
1554
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
1555
+ }
1556
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1557
+ }
1558
+ }
1559
+ }
1560
+ for (int j = 0; j < ncols_interleaved; j++) {
1561
+ s[x * ncols_interleaved + j] = sumf[j];
1562
+ }
1563
+ }
1564
+ }
1565
+
1566
+ void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1567
+ assert(n % QK_K == 0);
1568
+ assert(nr == 1);
1569
+ assert(nc % 16 == 0);
1570
+
1571
+ UNUSED(bs);
1572
+ UNUSED(nr);
1573
+
1574
+ const int nb = n / QK_K;
1575
+ const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
1576
+ const block_q8_K * y = (const block_q8_K *)vy;
1577
+
1578
+ // Layout: Even-Low(0,2,4,6), Odd-Low(1,3,5,7), Even-High(8...), Odd-High(9...)
1579
+ const int sb_perm[16] = {
1580
+ 0, 4, 1, 5, 2, 6, 3, 7, // 0-7
1581
+ 8, 12, 9, 13, 10, 14, 11, 15 // 8-15
1582
+ };
1583
+
1584
+ for (int col_tile = 0; col_tile < nc; col_tile += 16) {
1585
+ const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
1586
+ const block_q8_K * y_ptr = y;
1587
+
1588
+ float sumf[16] = {0};
1589
+
1590
+ // Loop over K-blocks
1591
+ for (int k_block = 0; k_block < nb; ++k_block) {
1592
+ int32_t isum[16] = {0};
1593
+ int32_t summs[16] = {0};
1594
+
1595
+ const uint8_t * qs_rhs = x_ptr[k_block].qs;
1596
+ const uint8_t * sc_rhs = x_ptr[k_block].scales;
1597
+ const int8_t * qs_lhs = y_ptr[k_block].qs;
1598
+ const int16_t * bs_lhs = y_ptr[k_block].bsums;
1599
+
1600
+ // Iterate over sub-blocks 0..15
1601
+ for (int sb = 0; sb < 16; ++sb) {
1602
+ // Correction Term
1603
+ int16_t bsum = bs_lhs[sb];
1604
+ int scale_offset = sb_perm[sb] * 16;
1605
+
1606
+ for (int col = 0; col < 16; ++col) {
1607
+ uint8_t sc_val = sc_rhs[scale_offset + col];
1608
+ summs[col] += bsum * (sc_val >> 4); // Min is high 4 bits
1609
+ }
1610
+
1611
+ // Main Dot Product
1612
+ // Calculate base offsets for Q2 unpacking based on SB
1613
+ int byte_base;
1614
+ if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
1615
+ else byte_base = (sb % 2 == 0) ? 32 : 48;
1616
+
1617
+ int shift = ((sb / 2) % 4) * 2;
1618
+
1619
+ for (int col = 0; col < 16; ++col) {
1620
+ uint8_t sc_val = sc_rhs[scale_offset + col];
1621
+ int32_t d_sb = sc_val & 0xF; // Scale is low 4 bits
1622
+
1623
+ // Process 16 elements (l=0..15)
1624
+ for (int l = 0; l < 16; ++l) {
1625
+ // Q2: Interleaved by column. Byte `l` contains 4 k-values.
1626
+ int qs_idx = (byte_base + l) * 16 + col;
1627
+ uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
1628
+
1629
+ // Q8: Linear access
1630
+ int k = sb * 16 + l;
1631
+ int8_t q8_val = qs_lhs[k];
1632
+
1633
+ isum[col] += q8_val * q2_val * d_sb;
1634
+ }
1635
+ }
1636
+ }
1637
+
1638
+ // Finalize K-Block
1639
+ for (int col = 0; col < 16; ++col) {
1640
+ float d_lhs = y_ptr[k_block].d;
1641
+ float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
1642
+ float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
1643
+
1644
+ float d_all = d_lhs * d_rhs;
1645
+ float d_min = d_lhs * dm_rhs;
1646
+
1647
+ sumf[col] += (isum[col] * d_all) - (summs[col] * d_min);
1648
+ }
1649
+ }
1650
+
1651
+ for (int col = 0; col < 16; ++col) {
1652
+ s[col_tile + col] = sumf[col];
1653
+ }
1654
+ }
1655
+ }
1656
+ #endif
1657
+
1658
+ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1659
+ const int qk = QK8_0;
1660
+ const int nb = n / qk;
1661
+ const int ncols_interleaved = 4;
1662
+ const int blocklen = 4;
1663
+
1664
+ assert (n % qk == 0);
1665
+ assert (nr % 4 == 0);
1666
+ assert (nc % ncols_interleaved == 0);
1667
+
1668
+ UNUSED(s);
1669
+ UNUSED(bs);
1670
+ UNUSED(vx);
1671
+ UNUSED(vy);
1672
+ UNUSED(nr);
1673
+ UNUSED(nc);
1674
+ UNUSED(nb);
1675
+ UNUSED(ncols_interleaved);
1676
+ UNUSED(blocklen);
1677
+
1678
+ {
1679
+ float sumf[4][4];
1680
+ int sumi;
1681
+
1682
+ for (int y = 0; y < nr / 4; y++) {
1683
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1684
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1685
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
1686
+ for (int m = 0; m < 4; m++) {
1687
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1688
+ }
1689
+ for (int l = 0; l < nb; l++) {
1690
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1691
+ for (int m = 0; m < 4; m++) {
1692
+ for (int j = 0; j < ncols_interleaved; j++) {
1693
+ sumi = 0;
1694
+ for (int i = 0; i < blocklen; ++i) {
1695
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1696
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1697
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1698
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1699
+ }
1700
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1701
+ }
1702
+ }
1703
+ }
1704
+ }
1705
+ for (int m = 0; m < 4; m++) {
1706
+ for (int j = 0; j < ncols_interleaved; j++)
1707
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1708
+ }
1709
+ }
1710
+ }
1711
+ }
1712
+ }
1713
+
1714
+ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1715
+ const int qk = QK8_0;
1716
+ const int nb = n / qk;
1717
+ const int ncols_interleaved = 4;
1718
+ const int blocklen = 8;
1719
+
1720
+ assert (n % qk == 0);
1721
+ assert (nr % 4 == 0);
1722
+ assert (nc % ncols_interleaved == 0);
1723
+
1724
+ UNUSED(s);
1725
+ UNUSED(bs);
1726
+ UNUSED(vx);
1727
+ UNUSED(vy);
1728
+ UNUSED(nr);
1729
+ UNUSED(nc);
1730
+ UNUSED(nb);
1731
+ UNUSED(ncols_interleaved);
1732
+ UNUSED(blocklen);
1733
+
1734
+ float sumf[4][4];
1735
+ int sumi;
1736
+
1737
+ for (int y = 0; y < nr / 4; y++) {
1738
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1739
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1740
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
1741
+ for (int m = 0; m < 4; m++) {
1742
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1743
+ }
1744
+ for (int l = 0; l < nb; l++) {
1745
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1746
+ for (int m = 0; m < 4; m++) {
1747
+ for (int j = 0; j < ncols_interleaved; j++) {
1748
+ sumi = 0;
1749
+ for (int i = 0; i < blocklen; ++i) {
1750
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1751
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1752
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1753
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1754
+ }
1755
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1756
+ }
1757
+ }
1758
+ }
1759
+ }
1760
+ for (int m = 0; m < 4; m++) {
1761
+ for (int j = 0; j < ncols_interleaved; j++)
1762
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1763
+ }
1764
+ }
1765
+ }
1766
+ }
1767
+
1768
+ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1769
+ const int qk = QK8_0;
1770
+ const int nb = n / qk;
1771
+ const int ncols_interleaved = 8;
1772
+ const int blocklen = 8;
1773
+
1774
+ assert (n % qk == 0);
1775
+ assert (nr % 4 == 0);
1776
+ assert (nc % ncols_interleaved == 0);
1777
+
1778
+ UNUSED(s);
1779
+ UNUSED(bs);
1780
+ UNUSED(vx);
1781
+ UNUSED(vy);
1782
+ UNUSED(nr);
1783
+ UNUSED(nc);
1784
+ UNUSED(nb);
1785
+ UNUSED(ncols_interleaved);
1786
+ UNUSED(blocklen);
1787
+
1788
+ float sumf[4][8];
1789
+ int sumi;
1790
+
1791
+ for (int y = 0; y < nr / 4; y++) {
1792
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1793
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1794
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
1795
+ for (int m = 0; m < 4; m++) {
1796
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1797
+ }
1798
+ for (int l = 0; l < nb; l++) {
1799
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1800
+ for (int m = 0; m < 4; m++) {
1801
+ for (int j = 0; j < ncols_interleaved; j++) {
1802
+ sumi = 0;
1803
+ for (int i = 0; i < blocklen; ++i) {
1804
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1805
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1806
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1807
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1808
+ }
1809
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1810
+ }
1811
+ }
1812
+ }
1813
+ }
1814
+ for (int m = 0; m < 4; m++) {
1815
+ for (int j = 0; j < ncols_interleaved; j++)
1816
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1817
+ }
1818
+ }
1819
+ }
1820
+ }
1821
+
1822
+ void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1823
+ const int qk = QK_K;
1824
+ const int nb = n / qk;
1825
+ const int ncols_interleaved = 8;
1826
+ const int blocklen = 4;
1827
+ static const uint32_t kmask1 = 0x3f3f3f3f;
1828
+ static const uint32_t kmask2 = 0x0f0f0f0f;
1829
+ static const uint32_t kmask3 = 0x03030303;
1830
+
1831
+ assert (n % qk == 0);
1832
+ assert (nr % 4 == 0);
1833
+ assert (nc % ncols_interleaved == 0);
1834
+
1835
+ UNUSED(nb);
1836
+ UNUSED(ncols_interleaved);
1837
+ UNUSED(blocklen);
1838
+
1839
+ float sumf[4][8];
1840
+ float sum_minf[4][8];
1841
+ uint32_t utmp[32];
1842
+ int sumi1;
1843
+ int sumi2;
1844
+ int sumi;
1845
+
1846
+ for (int y = 0; y < nr / 4; y++) {
1847
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
1848
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1849
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
1850
+ for (int m = 0; m < 4; m++) {
1851
+ for (int j = 0; j < ncols_interleaved; j++) {
1852
+ sumf[m][j] = 0.0;
1853
+ sum_minf[m][j] = 0.0;
1854
+ }
1855
+ }
1856
+ for (int l = 0; l < nb; l++) {
1857
+ for (int sb = 0; sb < 8; sb++) {
1858
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
1859
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
1860
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
1861
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
1862
+ utmp[sb * 4 + 2] = uaux_0;
1863
+ utmp[sb * 4 + 0] &= kmask1;
1864
+ }
1865
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1866
+ uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
1867
+ uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
1868
+ for (int m = 0; m < 4; m++) {
1869
+ for (int j = 0; j < ncols_interleaved; j++) {
1870
+ sumi1 = 0;
1871
+ sumi2 = 0;
1872
+ sumi = 0;
1873
+ for (int i = 0; i < blocklen; ++i) {
1874
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1875
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1876
+ sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
1877
+ sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
1878
+ sumi1 = sumi1 * scales_0[j];
1879
+ sumi2 = sumi2 * scales_1[j];
1880
+ sumi += sumi1 + sumi2;
1881
+ }
1882
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1883
+ }
1884
+ }
1885
+ }
1886
+ for (int sb = 0; sb < 8; sb++) {
1887
+ uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
1888
+ for(int m = 0; m < 4; m++) {
1889
+ const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1890
+ for(int j = 0; j < ncols_interleaved; j++) {
1891
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
1892
+ }
1893
+ }
1894
+ }
1895
+ }
1896
+ for (int m = 0; m < 4; m++) {
1897
+ for (int j = 0; j < ncols_interleaved; j++) {
1898
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
1899
+ }
1900
+ }
1901
+ }
1902
+ }
1903
+ }
1904
+
1905
+ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1906
+ const int qk = QK_K;
1907
+ const int nb = n / qk;
1908
+ const int ncols_interleaved = 8;
1909
+ const int blocklen = 8;
1910
+ static const uint32_t kmask1 = 0x3f3f3f3f;
1911
+ static const uint32_t kmask2 = 0x0f0f0f0f;
1912
+ static const uint32_t kmask3 = 0x03030303;
1913
+
1914
+ assert (n % qk == 0);
1915
+ assert (nr % 4 == 0);
1916
+ assert (nc % ncols_interleaved == 0);
1917
+
1918
+ UNUSED(bs);
1919
+
1920
+ float sumf[4][8];
1921
+ float sum_minf[4][8];
1922
+ uint32_t utmp[32];
1923
+ int sumi1;
1924
+ int sumi2;
1925
+ int sumi;
1926
+
1927
+ for (int y = 0; y < nr / 4; y++) {
1928
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
1929
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1930
+ const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
1931
+ for (int m = 0; m < 4; m++) {
1932
+ for (int j = 0; j < ncols_interleaved; j++) {
1933
+ sumf[m][j] = 0.0;
1934
+ sum_minf[m][j] = 0.0;
1935
+ }
1936
+ }
1937
+ for (int l = 0; l < nb; l++) {
1938
+ for (int sb = 0; sb < 8; sb++) {
1939
+ memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
1940
+ utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
1941
+ const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
1942
+ utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
1943
+ utmp[sb * 4 + 2] = uaux_0;
1944
+ utmp[sb * 4 + 0] &= kmask1;
1945
+ }
1946
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1947
+ uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
1948
+ uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
1949
+ for (int m = 0; m < 4; m++) {
1950
+ for (int j = 0; j < ncols_interleaved; j++) {
1951
+ sumi1 = 0;
1952
+ sumi2 = 0;
1953
+ sumi = 0;
1954
+ for (int i = 0; i < blocklen; ++i) {
1955
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
1956
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
1957
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
1958
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
1959
+ sumi1 = sumi1 * scales_0[j];
1960
+ sumi2 = sumi2 * scales_1[j];
1961
+ sumi += sumi1 + sumi2;
1962
+ }
1963
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
1964
+ }
1965
+ }
1966
+ }
1967
+ for (int sb = 0; sb < 8; sb++) {
1968
+ uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1969
+ for(int m = 0; m < 4; m++) {
1970
+ const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
1971
+ for(int j = 0; j < ncols_interleaved; j++) {
1972
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
1973
+ }
1974
+ }
1975
+ }
1976
+ }
1977
+ for (int m = 0; m < 4; m++) {
1978
+ for (int j = 0; j < ncols_interleaved; j++) {
1979
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
1980
+ }
1981
+ }
1982
+ }
1983
+ }
1984
+ }
1985
+
1986
+ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1987
+ const int qk = QK_K;
1988
+ const int nb = n / qk;
1989
+ const int ncols_interleaved = 8;
1990
+ const int blocklen = 8;
1991
+
1992
+ assert (n % qk == 0);
1993
+ assert (nr % 4 == 0);
1994
+ assert (nc % ncols_interleaved == 0);
1995
+
1996
+ UNUSED(s);
1997
+ UNUSED(bs);
1998
+ UNUSED(vx);
1999
+ UNUSED(vy);
2000
+ UNUSED(nr);
2001
+ UNUSED(nc);
2002
+ UNUSED(nb);
2003
+ UNUSED(ncols_interleaved);
2004
+ UNUSED(blocklen);
2005
+
2006
+ float sumf[4][8];
2007
+ float sum_minf[4][8];
2008
+ int sumi1, sumi2, sumi3, sumi4;
2009
+ int sumi;
2010
+
2011
+ for (int y = 0; y < nr / 4; y++) {
2012
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2013
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2014
+ const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
2015
+ for (int m = 0; m < 4; m++) {
2016
+ for (int j = 0; j < ncols_interleaved; j++) {
2017
+ sumf[m][j] = 0.0;
2018
+ sum_minf[m][j] = 0.0;
2019
+ }
2020
+ }
2021
+ for (int l = 0; l < nb; l++) {
2022
+ for (int k = 0; k < (qk / (4 * blocklen)); k++) {
2023
+
2024
+ const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
2025
+ const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
2026
+ const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
2027
+ const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
2028
+ for (int m = 0; m < 4; m++) {
2029
+ for (int j = 0; j < ncols_interleaved; j++) {
2030
+ sumi1 = 0;
2031
+ sumi2 = 0;
2032
+ sumi3 = 0;
2033
+ sumi4 = 0;
2034
+ sumi = 0;
2035
+ int offset = ((k / 2) % 2) + j * 2;
2036
+ for (int i = 0; i < blocklen; ++i){
2037
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
2038
+ const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
2039
+ const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
2040
+ const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
2041
+ sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
2042
+ sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
2043
+ sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
2044
+ sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
2045
+ sumi1 = sumi1 * (scales_0[offset] & 0xF);
2046
+ sumi2 = sumi2 * (scales_1[offset] & 0xF);
2047
+ sumi3 = sumi3 * (scales_2[offset] & 0xF);
2048
+ sumi4 = sumi4 * (scales_3[offset] & 0xF);
2049
+ sumi += sumi1 + sumi2 + sumi3 + sumi4;
2050
+ }
2051
+ sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
2052
+ }
2053
+ }
2054
+ }
2055
+ for(int sb = 0; sb < 8; sb++) {
2056
+ const uint8_t *mins = b_ptr[l].scales + sb * 16;
2057
+ for(int m = 0; m < 4; m++) {
2058
+ const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
2059
+ for(int j = 0; j < ncols_interleaved; j++) {
2060
+ int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
2061
+ sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2062
+ }
2063
+ }
2064
+ }
2065
+ }
2066
+
2067
+ for (int m = 0; m < 4; m++) {
2068
+ for (int j = 0; j < ncols_interleaved; j++) {
2069
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
2070
+ }
2071
+ }
2072
+ }
2073
+ }
2074
+ }
2075
+
2076
// Thin wrapper: forwards all seven arguments unchanged to
// ggml_gemm_q5_K_NxM_q8_K_generic_impl instantiated with <4, 8>.
// NOTE(review): the template arguments look like <block length, interleaved columns>
// judging by the sibling 8x8 wrapper; confirm against the template's declaration.
+ void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2077
+ ggml_gemm_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
2078
+ }
2079
+
2080
// Thin wrapper: forwards all seven arguments unchanged to
// ggml_gemm_q5_K_NxM_q8_K_generic_impl instantiated with <8, 8>.
// NOTE(review): the template arguments look like <block length, interleaved columns>
// judging by the sibling 8x4 wrapper; confirm against the template's declaration.
+ void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2081
+ ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
2082
+ }
2083
+
2084
// Thin wrapper: forwards all seven arguments unchanged to
// ggml_gemm_q6_K_NxM_q8_K_generic_impl instantiated with <4, 8>.
// NOTE(review): the template arguments look like <block length, interleaved columns>
// judging by the sibling 8x8 wrapper; confirm against the template's declaration.
+ void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2085
+ ggml_gemm_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
2086
+ }
2087
+
2088
// Thin wrapper: forwards all seven arguments unchanged to
// ggml_gemm_q6_K_NxM_q8_K_generic_impl instantiated with <8, 8>.
// NOTE(review): the template arguments look like <block length, interleaved columns>
// judging by the sibling 8x4 wrapper; confirm against the template's declaration.
+ void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2089
+ ggml_gemm_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
2090
+ }
2091
+
2092
+ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2093
+ const int qk = QK8_0;
2094
+ const int nb = n / qk;
2095
+ const int ncols_interleaved = 4;
2096
+ const int blocklen = 4;
2097
+
2098
+ assert (n % qk == 0);
2099
+ assert (nr % 4 == 0);
2100
+ assert (nc % ncols_interleaved == 0);
2101
+
2102
+ UNUSED(s);
2103
+ UNUSED(bs);
2104
+ UNUSED(vx);
2105
+ UNUSED(vy);
2106
+ UNUSED(nr);
2107
+ UNUSED(nc);
2108
+ UNUSED(nb);
2109
+ UNUSED(ncols_interleaved);
2110
+ UNUSED(blocklen);
2111
+
2112
+ {
2113
+ float sumf[4][4];
2114
+ int sumi;
2115
+
2116
+ for (int y = 0; y < nr / 4; y++) {
2117
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2118
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2119
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
2120
+ for (int m = 0; m < 4; m++) {
2121
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2122
+ }
2123
+ for (int l = 0; l < nb; l++) {
2124
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2125
+ for (int m = 0; m < 4; m++) {
2126
+ for (int j = 0; j < ncols_interleaved; j++) {
2127
+ sumi = 0;
2128
+ for (int i = 0; i < blocklen; ++i) {
2129
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2130
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
2131
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2132
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2133
+ }
2134
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2135
+ }
2136
+ }
2137
+ }
2138
+ }
2139
+ for (int m = 0; m < 4; m++) {
2140
+ for (int j = 0; j < ncols_interleaved; j++)
2141
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2142
+ }
2143
+ }
2144
+ }
2145
+ }
2146
+ }
2147
+
2148
+ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2149
+ const int qk = QK8_0;
2150
+ const int nb = n / qk;
2151
+ const int ncols_interleaved = 8;
2152
+ const int blocklen = 8;
2153
+
2154
+ assert(n % qk == 0);
2155
+ assert(nr % 4 == 0);
2156
+ assert(nc % ncols_interleaved == 0);
2157
+
2158
+ float sumf[4][8];
2159
+ int sumi;
2160
+
2161
+ for (int y = 0; y < nr / 4; y++) {
2162
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2163
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2164
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
2165
+ for (int m = 0; m < 4; m++) {
2166
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2167
+ }
2168
+ for (int l = 0; l < nb; l++) {
2169
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2170
+ for (int m = 0; m < 4; m++) {
2171
+ for (int j = 0; j < ncols_interleaved; j++) {
2172
+ sumi = 0;
2173
+ for (int i = 0; i < blocklen; ++i) {
2174
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2175
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
2176
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2177
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2178
+ }
2179
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2180
+ }
2181
+ }
2182
+ }
2183
+ }
2184
+ for (int m = 0; m < 4; m++) {
2185
+ for (int j = 0; j < ncols_interleaved; j++)
2186
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2187
+ }
2188
+ }
2189
+ }
2190
+ }
2191
+
2192
+ void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2193
+ const int qk = QK8_0;
2194
+ const int nb = n / qk;
2195
+ const int ncols_interleaved = 4;
2196
+ const int blocklen = 4;
2197
+
2198
+ assert(n % qk == 0);
2199
+ assert(nr % 4 == 0);
2200
+ assert(nc % ncols_interleaved == 0);
2201
+
2202
+ float sumf[4][4];
2203
+ int sumi;
2204
+
2205
+ for (int y = 0; y < nr / 4; y++) {
2206
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2207
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2208
+ const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
2209
+ for (int m = 0; m < 4; m++) {
2210
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2211
+ }
2212
+ for (int l = 0; l < nb; l++) {
2213
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2214
+ for (int m = 0; m < 4; m++) {
2215
+ for (int j = 0; j < ncols_interleaved; j++) {
2216
+ sumi = 0;
2217
+ for (int i = 0; i < blocklen; ++i) {
2218
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2219
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
2220
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2221
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2222
+ }
2223
+ sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2224
+ }
2225
+ }
2226
+ }
2227
+ }
2228
+ for (int m = 0; m < 4; m++) {
2229
+ for (int j = 0; j < ncols_interleaved; j++)
2230
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2231
+ }
2232
+ }
2233
+ }
2234
+ }
2235
+
2236
+ void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2237
+ const int qk = QK8_0;
2238
+ const int nb = n / qk;
2239
+ const int ncols_interleaved = 8;
2240
+ const int blocklen = 8;
2241
+
2242
+ assert(n % qk == 0);
2243
+ assert(nr % 4 == 0);
2244
+ assert(nc % ncols_interleaved == 0);
2245
+
2246
+ float sumf[4][8];
2247
+ int sumi;
2248
+
2249
+ for (int y = 0; y < nr / 4; y++) {
2250
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2251
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2252
+ const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
2253
+ for (int m = 0; m < 4; m++) {
2254
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2255
+ }
2256
+ for (int l = 0; l < nb; l++) {
2257
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2258
+ for (int m = 0; m < 4; m++) {
2259
+ for (int j = 0; j < ncols_interleaved; j++) {
2260
+ sumi = 0;
2261
+ for (int i = 0; i < blocklen; ++i) {
2262
+ const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2263
+ const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
2264
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2265
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2266
+ }
2267
+ sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2268
+ }
2269
+ }
2270
+ }
2271
+ }
2272
+ for (int m = 0; m < 4; m++) {
2273
+ for (int j = 0; j < ncols_interleaved; j++)
2274
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2275
+ }
2276
+ }
2277
+ }
2278
+ }
2279
+
2280
+ void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
2281
+ float * GGML_RESTRICT s,
2282
+ size_t bs,
2283
+ const void * GGML_RESTRICT vx,
2284
+ const void * GGML_RESTRICT vy,
2285
+ int nr,
2286
+ int nc) {
2287
+ const int qk = QK8_0;
2288
+ const int nb = n / qk;
2289
+ const int ncols_interleaved = 4;
2290
+ const int blocklen = 4;
2291
+
2292
+ assert(n % qk == 0);
2293
+ assert(nr % 4 == 0);
2294
+ assert(nc % ncols_interleaved == 0);
2295
+
2296
+ float sumf[4][4];
2297
+ int sumi;
2298
+
2299
+ for (int y = 0; y < nr / 4; y++) {
2300
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2301
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2302
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
2303
+ for (int m = 0; m < 4; m++) {
2304
+ for (int j = 0; j < ncols_interleaved; j++) {
2305
+ sumf[m][j] = 0.0;
2306
+ }
2307
+ }
2308
+ for (int l = 0; l < nb; l++) {
2309
+ for (int k = 0; k < (qk / blocklen); k++) {
2310
+ for (int m = 0; m < 4; m++) {
2311
+ for (int j = 0; j < ncols_interleaved; j++) {
2312
+ sumi = 0;
2313
+ for (int i = 0; i < blocklen; ++i) {
2314
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
2315
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
2316
+ }
2317
+ sumf[m][j] +=
2318
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2319
+ }
2320
+ }
2321
+ }
2322
+ }
2323
+ for (int m = 0; m < 4; m++) {
2324
+ for (int j = 0; j < ncols_interleaved; j++) {
2325
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2326
+ }
2327
+ }
2328
+ }
2329
+ }
2330
+ }
2331
+
2332
+
2333
+
2334
+ void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
2335
+ float * GGML_RESTRICT s,
2336
+ size_t bs,
2337
+ const void * GGML_RESTRICT vx,
2338
+ const void * GGML_RESTRICT vy,
2339
+ int nr,
2340
+ int nc) {
2341
+ const int qk = QK8_0;
2342
+ const int nb = n / qk;
2343
+ const int ncols_interleaved = 4;
2344
+ const int blocklen = 8;
2345
+
2346
+ assert(n % qk == 0);
2347
+ assert(nr % 4 == 0);
2348
+ assert(nc % ncols_interleaved == 0);
2349
+
2350
+ float sumf[4][4];
2351
+ int sumi;
2352
+
2353
+ for (int y = 0; y < nr / 4; y++) {
2354
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2355
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2356
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
2357
+ for (int m = 0; m < 4; m++) {
2358
+ for (int j = 0; j < ncols_interleaved; j++) {
2359
+ sumf[m][j] = 0.0;
2360
+ }
2361
+ }
2362
+ for (int l = 0; l < nb; l++) {
2363
+ for (int k = 0; k < (qk / blocklen); k++) {
2364
+ for (int m = 0; m < 4; m++) {
2365
+ for (int j = 0; j < ncols_interleaved; j++) {
2366
+ sumi = 0;
2367
+ for (int i = 0; i < blocklen; ++i) {
2368
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
2369
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
2370
+ }
2371
+ sumf[m][j] +=
2372
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2373
+ }
2374
+ }
2375
+ }
2376
+ }
2377
+ for (int m = 0; m < 4; m++) {
2378
+ for (int j = 0; j < ncols_interleaved; j++) {
2379
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2380
+ }
2381
+ }
2382
+ }
2383
+ }
2384
+ }
2385
+
2386
+ // Only enable these for RISC-V.
2387
+ #if defined __riscv_zvfh
2388
+ void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2389
+ const int qk = QK8_0;
2390
+ const int nb = n / qk;
2391
+ const int ncols_interleaved = 16;
2392
+ const int blocklen = 1;
2393
+
2394
+ assert (n % qk == 0);
2395
+ assert (nr % 4 == 0);
2396
+ assert (nc % ncols_interleaved == 0);
2397
+
2398
+ UNUSED(s);
2399
+ UNUSED(bs);
2400
+ UNUSED(vx);
2401
+ UNUSED(vy);
2402
+ UNUSED(nr);
2403
+ UNUSED(nc);
2404
+ UNUSED(nb);
2405
+ UNUSED(ncols_interleaved);
2406
+ UNUSED(blocklen);
2407
+
2408
+ float sumf[4][16];
2409
+ int sumi;
2410
+
2411
+ for (int y = 0; y < nr / 4; y++) {
2412
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2413
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2414
+ const block_q4_0x16 * b_ptr = (const block_q4_0x16 *) vx + (x * nb);
2415
+ for (int m = 0; m < 4; m++) {
2416
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2417
+ }
2418
+ for (int l = 0; l < nb; l++) {
2419
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2420
+ for (int m = 0; m < 4; m++) {
2421
+ for (int j = 0; j < ncols_interleaved; j++) {
2422
+ sumi = 0;
2423
+ for (int i = 0; i < blocklen; ++i) {
2424
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
2425
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2426
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2427
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2428
+ }
2429
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2430
+ }
2431
+ }
2432
+ }
2433
+ }
2434
+ for (int m = 0; m < 4; m++) {
2435
+ for (int j = 0; j < ncols_interleaved; j++)
2436
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2437
+ }
2438
+ }
2439
+ }
2440
+ }
2441
+
2442
+ void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2443
+ const int qk = QK_K;
2444
+ const int nb = n / qk;
2445
+ const int ncols_interleaved = 16;
2446
+ const int blocklen = 1;
2447
+
2448
+ assert (n % qk == 0);
2449
+ assert (nr % 4 == 0);
2450
+ assert (nc % ncols_interleaved == 0);
2451
+
2452
+ UNUSED(s);
2453
+ UNUSED(bs);
2454
+ UNUSED(vx);
2455
+ UNUSED(vy);
2456
+ UNUSED(nr);
2457
+ UNUSED(nc);
2458
+ UNUSED(nb);
2459
+ UNUSED(ncols_interleaved);
2460
+ UNUSED(blocklen);
2461
+
2462
+ float sumf[4][16];
2463
+ float sum_minf[4][16];
2464
+ uint8_t scales[128];
2465
+ uint8_t mins[128];
2466
+ int sumi1;
2467
+ int sumi2;
2468
+ int sumi;
2469
+
2470
+ for (int y = 0; y < nr / 4; y++) {
2471
+ const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
2472
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2473
+ const block_q4_Kx16 * b_ptr = (const block_q4_Kx16 *) vx + (x * nb);
2474
+ for (int m = 0; m < 4; m++) {
2475
+ for (int j = 0; j < ncols_interleaved; j++) {
2476
+ sumf[m][j] = 0.0;
2477
+ sum_minf[m][j] = 0.0;
2478
+ }
2479
+ }
2480
+ for (int l = 0; l < nb; l++) {
2481
+ for (int i = 0; i < 128; i++) {
2482
+ scales[i] = b_ptr[l].scales[i] & 0x0F;
2483
+ mins[i] = b_ptr[l].scales[i] >> 4;
2484
+ }
2485
+ for (int i = 0; i < 64; i++) {
2486
+ scales[i] |= (b_ptr[l].scales[128 + i] & 0x03) << 4;
2487
+ mins[i] |= (b_ptr[l].scales[128 + i] & 0x0C) << 2;
2488
+ scales[i + 64] |= (b_ptr[l].scales[128 + i] & 0x30);
2489
+ mins[i + 64] |= (b_ptr[l].scales[128 + i] & 0xC0) >> 2;
2490
+ }
2491
+
2492
+ for (int sb = 0; sb < 8; sb++) {
2493
+ uint8_t *min = &mins[sb * 16];
2494
+ for(int m = 0; m < 4; m++) {
2495
+ const int16_t bsums = a_ptr[l].bsums[sb * 8 + m] + a_ptr[l].bsums[sb * 8 + m + 4];
2496
+ for(int j = 0; j < ncols_interleaved; j++) {
2497
+ sum_minf[m][j] += min[j] * bsums * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
2498
+ }
2499
+ }
2500
+ }
2501
+
2502
+ for (int sb = 0; sb < 8; sb += 2) {
2503
+ uint8_t *scales_0 = &scales[sb * 16];
2504
+ uint8_t *scales_1 = &scales[(sb + 1) * 16];
2505
+
2506
+ for (int i = 0; i < QK4_0; i++) {
2507
+ for (int m = 0; m < 4; m++) {
2508
+ for (int j = 0; j < ncols_interleaved; j++) {
2509
+ sumi1 = 0;
2510
+ sumi2 = 0;
2511
+ sumi = 0;
2512
+
2513
+ const int v0 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] & 0xF);
2514
+ const int v1 = (int8_t) (b_ptr[l].qs[sb * 256 + i * 16 + j] >> 4);
2515
+ sumi1 = (v0 * a_ptr[l].qs[sb * 4 * 32 + i * 4 + m]);
2516
+ sumi2 = (v1 * a_ptr[l].qs[sb * 4 * 32 + 32 * 4 + i * 4 + m]);
2517
+ sumi1 = sumi1 * scales_0[j];
2518
+ sumi2 = sumi2 * scales_1[j];
2519
+ sumi += sumi1 + sumi2;
2520
+
2521
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
2522
+ }
2523
+ }
2524
+ }
2525
+ }
2526
+ }
2527
+ for (int m = 0; m < 4; m++) {
2528
+ for (int j = 0; j < ncols_interleaved; j++) {
2529
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
2530
+ }
2531
+ }
2532
+ }
2533
+ }
2534
+ }
2535
+
2536
+ void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2537
+ const int qk = QK8_0;
2538
+ const int nb = n / qk;
2539
+ const int ncols_interleaved = 16;
2540
+ const int blocklen = 1;
2541
+
2542
+ assert(n % qk == 0);
2543
+ assert(nr % 4 == 0);
2544
+ assert(nc % ncols_interleaved == 0);
2545
+
2546
+ float sumf[4][16];
2547
+ int sumi;
2548
+
2549
+ for (int y = 0; y < nr / 4; y++) {
2550
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2551
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2552
+ const block_iq4_nlx16 * b_ptr = (const block_iq4_nlx16 *) vx + (x * nb);
2553
+ for (int m = 0; m < 4; m++) {
2554
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2555
+ }
2556
+ for (int l = 0; l < nb; l++) {
2557
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2558
+ for (int m = 0; m < 4; m++) {
2559
+ for (int j = 0; j < ncols_interleaved; j++) {
2560
+ sumi = 0;
2561
+ for (int i = 0; i < blocklen; ++i) {
2562
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
2563
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
2564
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2565
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + (qk / 2) * 4]));
2566
+ }
2567
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2568
+ }
2569
+ }
2570
+ }
2571
+ }
2572
+ for (int m = 0; m < 4; m++) {
2573
+ for (int j = 0; j < ncols_interleaved; j++)
2574
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2575
+ }
2576
+ }
2577
+ }
2578
+ }
2579
+
2580
+ void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2581
+ const int qk = QK8_0;
2582
+ const int nb = n / qk;
2583
+ const int ncols_interleaved = 16;
2584
+ const int blocklen = 1;
2585
+
2586
+ assert(n % qk == 0);
2587
+ assert(nr % 4 == 0);
2588
+ assert(nc % ncols_interleaved == 0);
2589
+
2590
+ float sumf[4][16];
2591
+ int sumi;
2592
+
2593
+ for (int y = 0; y < nr / 4; y++) {
2594
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2595
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
2596
+ const block_q8_0x16 * b_ptr = (const block_q8_0x16 *) vx + (x * nb);
2597
+ for (int m = 0; m < 4; m++) {
2598
+ for (int j = 0; j < ncols_interleaved; j++) {
2599
+ sumf[m][j] = 0.0;
2600
+ }
2601
+ }
2602
+ for (int l = 0; l < nb; l++) {
2603
+ for (int k = 0; k < (qk / blocklen); k++) {
2604
+ for (int m = 0; m < 4; m++) {
2605
+ for (int j = 0; j < ncols_interleaved; j++) {
2606
+ sumi = 0;
2607
+ for (int i = 0; i < blocklen; ++i) {
2608
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
2609
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
2610
+ }
2611
+ sumf[m][j] +=
2612
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2613
+ }
2614
+ }
2615
+ }
2616
+ }
2617
+ for (int m = 0; m < 4; m++) {
2618
+ for (int j = 0; j < ncols_interleaved; j++) {
2619
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2620
+ }
2621
+ }
2622
+ }
2623
+ }
2624
+ }
2625
+
2626
+
2627
+ void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
2628
+ assert(n % QK_K == 0);
2629
+ assert(nr % 4 == 0);
2630
+ assert(nc % 16 == 0);
2631
+ const int nb = n / QK_K;
2632
+ const block_q2_Kx16 * x = (const block_q2_Kx16 *)vx;
2633
+ const block_q8_Kx4 * y = (const block_q8_Kx4 *)vy;
2634
+
2635
+ const int sb_perm[16] = {
2636
+ 0, 4, 1, 5, 2, 6, 3, 7,
2637
+ 8, 12, 9, 13, 10, 14, 11, 15
2638
+ };
2639
+
2640
+ // Iterate Rows in tiles of 4
2641
+ for (int row_tile = 0; row_tile < nr; row_tile += 4) {
2642
+ // Iterate Columns in tiles of 16
2643
+ for (int col_tile = 0; col_tile < nc; col_tile += 16) {
2644
+
2645
+ const block_q2_Kx16 * x_ptr = x + (col_tile / 16) * nb;
2646
+ const block_q8_Kx4 * y_ptr = y + (row_tile / 4) * nb;
2647
+
2648
+ float sumf[4][16];
2649
+ memset(sumf, 0, sizeof(sumf));
2650
+
2651
+ for (int k_block = 0; k_block < nb; ++k_block) {
2652
+ int32_t isum[4][16];
2653
+ int32_t summs[4][16];
2654
+ memset(isum, 0, sizeof(isum));
2655
+ memset(summs, 0, sizeof(summs));
2656
+
2657
+ const uint8_t * qs_rhs = x_ptr[k_block].qs;
2658
+ const uint8_t * sc_rhs = x_ptr[k_block].scales;
2659
+ const int8_t * qs_lhs = y_ptr[k_block].qs;
2660
+ const int16_t * bs_lhs = y_ptr[k_block].bsums;
2661
+
2662
+ for (int sb = 0; sb < 16; ++sb) {
2663
+ int scale_offset = sb_perm[sb] * 16;
2664
+
2665
+ int byte_base;
2666
+ if (sb < 8) byte_base = (sb % 2 == 0) ? 0 : 16;
2667
+ else byte_base = (sb % 2 == 0) ? 32 : 48;
2668
+ int shift = ((sb / 2) % 4) * 2;
2669
+
2670
+ for (int col = 0; col < 16; ++col) {
2671
+ uint8_t sc_val = sc_rhs[scale_offset + col];
2672
+ int32_t d_sb = sc_val & 0xF;
2673
+ int32_t m_sb = sc_val >> 4;
2674
+
2675
+ // Correction Term
2676
+ for (int r = 0; r < 4; ++r) {
2677
+ int bsum_idx = (sb / 4) * 16 + r * 4 + (sb % 4);
2678
+ summs[r][col] += bs_lhs[bsum_idx] * m_sb;
2679
+ }
2680
+
2681
+ // Main Dot Product
2682
+ for (int l = 0; l < 16; ++l) {
2683
+ int qs_idx = (byte_base + l) * 16 + col;
2684
+ uint8_t q2_val = (qs_rhs[qs_idx] >> shift) & 3;
2685
+
2686
+ // Calculate Q8 index for this specific k and row
2687
+ int k = sb * 16 + l;
2688
+ int q8_idx = (k / 4) * 16 + (k % 4);
2689
+
2690
+ for (int r = 0; r < 4; ++r) {
2691
+ // Add r*4 to jump to the correct row within the 4x4 chunk
2692
+ int8_t q8_val = qs_lhs[q8_idx + r * 4];
2693
+ isum[r][col] += q8_val * q2_val * d_sb;
2694
+ }
2695
+ }
2696
+ }
2697
+ }
2698
+
2699
+ // Finalize K-Block
2700
+ for (int col = 0; col < 16; ++col) {
2701
+ float d_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].d[col]);
2702
+ float dm_rhs = GGML_FP16_TO_FP32(x_ptr[k_block].dmin[col]);
2703
+
2704
+ for (int r = 0; r < 4; ++r) {
2705
+ float d_lhs = y_ptr[k_block].d[r];
2706
+ float d_all = d_lhs * d_rhs;
2707
+ float d_min = d_lhs * dm_rhs;
2708
+ sumf[r][col] += (isum[r][col] * d_all) - (summs[r][col] * d_min);
2709
+ }
2710
+ }
2711
+ }
2712
+
2713
+ for (int r = 0; r < 4; ++r) {
2714
+ for (int col = 0; col < 16; ++col) {
2715
+ s[(row_tile + r) * bs + (col_tile + col)] = sumf[r][col];
2716
+ }
2717
+ }
2718
+ }
2719
+ }
2720
+ }
2721
+ #endif
2722
+
2723
+ } // extern "C"
2724
+
2725
+ static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
2726
+ block_q8_0x4 out;
2727
+
2728
+ for (int i = 0; i < 4; i++) {
2729
+ out.d[i] = in[i].d;
2730
+ }
2731
+
2732
+ const int end = QK8_0 * 4 / blck_size_interleave;
2733
+ for (int i = 0; i < end; ++i) {
2734
+ int src_id = i % 4;
2735
+ int src_offset = (i / 4) * blck_size_interleave;
2736
+ int dst_offset = i * blck_size_interleave;
2737
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
2738
+ }
2739
+ return out;
2740
+ }
2741
+
2742
+ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
2743
+ block_q4_0x4 out;
2744
+
2745
+ for (int i = 0; i < 4; i++) {
2746
+ out.d[i] = in[i].d;
2747
+ }
2748
+
2749
+ const int end = QK4_0 * 2 / blck_size_interleave;
2750
+
2751
+ if (blck_size_interleave == 8) {
2752
+ const uint64_t xor_mask = 0x8888888888888888ULL;
2753
+ for (int i = 0; i < end; ++i) {
2754
+ int src_id = i % 4;
2755
+ int src_offset = (i / 4) * blck_size_interleave;
2756
+ int dst_offset = i * blck_size_interleave;
2757
+
2758
+ uint64_t elems;
2759
+ // Using memcpy to avoid unaligned memory accesses
2760
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
2761
+ elems ^= xor_mask;
2762
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
2763
+ }
2764
+ } else if (blck_size_interleave == 4) {
2765
+ const uint32_t xor_mask = 0x88888888;
2766
+ for (int i = 0; i < end; ++i) {
2767
+ int src_id = i % 4;
2768
+ int src_offset = (i / 4) * blck_size_interleave;
2769
+ int dst_offset = i * blck_size_interleave;
2770
+
2771
+ uint32_t elems;
2772
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
2773
+ elems ^= xor_mask;
2774
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
2775
+ }
2776
+ } else {
2777
+ GGML_ASSERT(false);
2778
+ }
2779
+
2780
+ return out;
2781
+ }
2782
+
2783
+ // interleave 8 block_q4_0s in blocks of blck_size_interleave
2784
+ // returns an interleaved block_q4_0x8
2785
+ // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
2786
+ // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
2787
+ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
2788
+ block_q4_0x8 out;
2789
+
2790
+ for (int i = 0; i < 8; i++) {
2791
+ out.d[i] = in[i].d;
2792
+ }
2793
+
2794
+ const int end = QK4_0 * 4 / blck_size_interleave;
2795
+ const uint64_t xor_mask = 0x8888888888888888ULL;
2796
+
2797
+ for (int i = 0; i < end; ++i) {
2798
+ int src_id = i % 8;
2799
+ int src_offset = (i / 8) * blck_size_interleave;
2800
+ int dst_offset = i * blck_size_interleave;
2801
+
2802
+ uint64_t elems;
2803
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
2804
+ elems ^= xor_mask;
2805
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
2806
+ }
2807
+
2808
+ return out;
2809
+ }
2810
+
2811
+ static block_q4_0x16 make_block_q4_0x16(block_q4_0 * in, unsigned int blck_size_interleave) {
2812
+ block_q4_0x16 out;
2813
+
2814
+ for (int i = 0; i < 16; i++) {
2815
+ out.d[i] = in[i].d;
2816
+ }
2817
+
2818
+ const int end = QK4_0 * 8 / blck_size_interleave;
2819
+
2820
+ if (blck_size_interleave == 1) {
2821
+ const uint8_t xor_mask = 0x88;
2822
+ for (int i = 0; i < end; ++i) {
2823
+ int src_id = i % 16;
2824
+ int src_offset = i / 16;
2825
+ int dst_offset = i;
2826
+
2827
+ out.qs[dst_offset] = in[src_id].qs[src_offset] ^ xor_mask;
2828
+ }
2829
+ } else {
2830
+ GGML_ASSERT(false);
2831
+ }
2832
+
2833
+ return out;
2834
+ }
2835
+
2836
+ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
2837
+ block_q4_Kx8 out;
2838
+ //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
2839
+ for (int i = 0; i < 8; i++) {
2840
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2841
+ }
2842
+
2843
+ for (int i = 0; i < 8; i++) {
2844
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2845
+ }
2846
+
2847
+ const int end = QK_K * 4 / blck_size_interleave;
2848
+
2849
+ // Interleave Q4_K quants by taking 8 bytes at a time
2850
+ for (int i = 0; i < end; ++i) {
2851
+ int src_id = i % 8;
2852
+ int src_offset = (i / 8) * blck_size_interleave;
2853
+ int dst_offset = i * blck_size_interleave;
2854
+
2855
+ // buffer large enough for the max interleave block size (8 bytes)
2856
+ uint64_t elems;
2857
+ memcpy(&elems, &in[src_id].qs[src_offset], blck_size_interleave);
2858
+ memcpy(&out.qs[dst_offset], &elems, blck_size_interleave);
2859
+ }
2860
+
2861
+ // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
2862
+ // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
2863
+ // The output Q4_Kx8 structure has 96 bytes
2864
+ // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
2865
+ // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
2866
+ uint8_t s[8], m[8];
2867
+
2868
+ for (int i = 0; i < 4; i++) {
2869
+ for (int j = 0; j < 8; j++) {
2870
+ s[j] = in[j].scales[i] & 63;
2871
+ m[j] = in[j].scales[i + 4] & 63;
2872
+ }
2873
+
2874
+ out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
2875
+ out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
2876
+ out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
2877
+ out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
2878
+ out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
2879
+ out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
2880
+ out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
2881
+ out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
2882
+ out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
2883
+ out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
2884
+ out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
2885
+ out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
2886
+
2887
+ }
2888
+
2889
+ for (int i = 0; i < 4; i++) {
2890
+ for (int j = 0; j < 8; j++) {
2891
+ s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
2892
+ m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
2893
+ }
2894
+
2895
+ out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
2896
+ out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
2897
+ out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
2898
+ out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
2899
+ out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
2900
+ out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
2901
+ out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
2902
+ out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
2903
+ out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
2904
+ out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
2905
+ out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
2906
+ out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
2907
+
2908
+ }
2909
+
2910
+ return out;
2911
+ }
2912
+
2913
+ static block_q4_Kx16 make_block_q4_Kx16(block_q4_K * in, unsigned int blck_size_interleave) {
2914
+ block_q4_Kx16 out;
2915
+ //Delta(scale) and dmin values of the 16 Q4_K structures are copied onto the output interleaved structure
2916
+ for (int i = 0; i < 16; i++) {
2917
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2918
+ }
2919
+
2920
+ for (int i = 0; i < 16; i++) {
2921
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2922
+ }
2923
+
2924
+ const int end = QK_K * 8 / blck_size_interleave;
2925
+
2926
+ if (blck_size_interleave == 1) {
2927
+ for (int i = 0; i < end; ++i) {
2928
+ int src_id = i % 16;
2929
+ int src_offset = i / 16;
2930
+ int dst_offset = i;
2931
+
2932
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
2933
+ }
2934
+
2935
+ // RVV repacking.
2936
+ //
2937
+ // Extract sums and mins for all 8 sub-blocks for each block of Q4_K.
2938
+ uint8_t s[128], m[128];
2939
+ for (int i = 0; i < 4; i++) {
2940
+ for (int j = 0; j < 16; j++) {
2941
+ s[i * 16 + j] = in[j].scales[i] & 63;
2942
+ m[i * 16 + j] = in[j].scales[i + 4] & 63;
2943
+ }
2944
+ }
2945
+ for (int i = 0; i < 4; i++) {
2946
+ for (int j = 0; j < 16; j++) {
2947
+ s[64 + i * 16 + j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
2948
+ m[64 + i * 16 + j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
2949
+ }
2950
+ }
2951
+
2952
+ for (int i = 0; i < 128; i++) {
2953
+ out.scales[i] = (s[i] & 15) | ((m[i] & 15) << 4);
2954
+ }
2955
+ for (int i = 0; i < 64; i++) {
2956
+ out.scales[128 + i] = ((s[i] & 48) >> 4) | ((m[i] & 48) >> 2) | (s[64 + i] & 48) | ((m[64 + i] & 48) << 2);
2957
+ }
2958
+ } else {
2959
+ GGML_ASSERT(false);
2960
+ }
2961
+
2962
+ return out;
2963
+ }
2964
+
2965
+ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
2966
+ block_q2_Kx8 out;
2967
+
2968
+ // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
2969
+ for (int i = 0; i < 8; i++) {
2970
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
2971
+ }
2972
+
2973
+ for (int i = 0; i < 8; i++) {
2974
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
2975
+ }
2976
+
2977
+ const int end = QK_K * 2 / blck_size_interleave;
2978
+
2979
+ // Interleave Q2_K quants by taking 8 bytes at a time
2980
+ for (int i = 0; i < end; ++i) {
2981
+ int src_id = i % 8;
2982
+ int src_offset = (i / 8) * blck_size_interleave;
2983
+ int dst_offset = i * blck_size_interleave;
2984
+
2985
+ uint64_t elems;
2986
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
2987
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
2988
+ }
2989
+
2990
+ // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
2991
+ // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
2992
+ // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
2993
+ // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
2994
+ // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
2995
+
2996
+ for (int i = 0; i < 128; i++) {
2997
+ // Index for selecting which q2k super block
2998
+ int src1 = (i % 16) / 2;
2999
+ // Index for selecting scale
3000
+ int src2 = ((i / 16) * 2) + (i % 2);
3001
+
3002
+ out.scales[i] = in[src1].scales[src2];
3003
+ }
3004
+ return out;
3005
+ }
3006
+
3007
+ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_interleave) {
3008
+ block_q5_Kx8 out;
3009
+ //Delta(scale) and dmin values of the eight Q5_K structures are copied onto the output interleaved structure
3010
+ for (int i = 0; i < 8; i++) {
3011
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
3012
+ }
3013
+
3014
+ for (int i = 0; i < 8; i++) {
3015
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
3016
+ }
3017
+
3018
+ const int end = QK_K * 4 / blck_size_interleave;
3019
+
3020
+ // Interleave Q5_K quants by taking blck_size_interleave bytes at a time
3021
+ for (int i = 0; i < end; ++i) {
3022
+ int src_id = i % 8;
3023
+ int src_offset = (i / 8) * blck_size_interleave;
3024
+ int dst_offset = i * blck_size_interleave;
3025
+
3026
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
3027
+ }
3028
+
3029
+ // Repeat for high bits with the same chunk size, since
3030
+ // the high bits are interleaved in Q5_K and the index is
3031
+ // qh_idx = (qs_idx % 32);
3032
+ // qh_val = qh[qh_idx] >> (qs_idx / 32);
3033
+ for (int i = 0; i < end / 4; ++i) {
3034
+ int src_id = i % 8;
3035
+ int src_offset = (i / 8) * blck_size_interleave;
3036
+ int dst_offset = i * blck_size_interleave;
3037
+
3038
+ memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave);
3039
+ }
3040
+
3041
+ // The below logic is copied over from Q4_K
3042
+ // The point is to unpack all the scales and mins for each sub block every time we load 12 bytes.
3043
+ // Currently the Q5_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
3044
+ // The output Q5_Kx8 structure has 96 bytes
3045
+ // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q5_K structure
3046
+ // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q5_K structures
3047
+ uint8_t s[8], m[8];
3048
+
3049
+ for (int i = 0; i < 4; i++) {
3050
+ for (int j = 0; j < 8; j++) {
3051
+ s[j] = in[j].scales[i] & 63;
3052
+ m[j] = in[j].scales[i + 4] & 63;
3053
+ }
3054
+
3055
+ out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
3056
+ out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
3057
+ out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
3058
+ out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
3059
+ out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
3060
+ out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
3061
+ out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
3062
+ out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
3063
+ out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
3064
+ out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
3065
+ out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
3066
+ out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
3067
+ }
3068
+
3069
+ for (int i = 0; i < 4; i++) {
3070
+ for (int j = 0; j < 8; j++) {
3071
+ s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i + 8] & 15);
3072
+ m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i + 8] & 240) >> 4);
3073
+ }
3074
+
3075
+ out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
3076
+ out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
3077
+ out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
3078
+ out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
3079
+ out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
3080
+ out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
3081
+ out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
3082
+ out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
3083
+ out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
3084
+ out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
3085
+ out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
3086
+ out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
3087
+ }
3088
+
3089
+ return out;
3090
+ }
3091
+
3092
+ static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_interleave) {
3093
+ block_q6_Kx8 out;
3094
+ constexpr int n_blocks = 8; // Kx8
3095
+ for (int i = 0; i < n_blocks; i++) {
3096
+ out.d[i] = in[i].d;
3097
+ }
3098
+
3099
+ const int end_ls = QK_K * 4 / blck_size_interleave;
3100
+ // Interleave Q6_K quants by taking blck_size_interleave bytes at a time
3101
+ for (int i = 0; i < end_ls; ++i) {
3102
+ int src_id = i % n_blocks;
3103
+ int src_offset = (i / n_blocks) * blck_size_interleave;
3104
+ int dst_offset = i * blck_size_interleave;
3105
+
3106
+ uint64_t elem_ls;
3107
+ memcpy(&elem_ls, &in[src_id].ql[src_offset], blck_size_interleave);
3108
+ memcpy(&out.ql[dst_offset], &elem_ls, blck_size_interleave);
3109
+ }
3110
+
3111
+ // Interleave high bits using same chunk size as low bits
3112
+ const int end_hs = end_ls / 2;
3113
+ for (int i = 0; i < end_hs; ++i) {
3114
+ int src_id = i % n_blocks;
3115
+ int src_offset = (i / n_blocks) * blck_size_interleave;
3116
+ int dst_offset = i * blck_size_interleave;
3117
+
3118
+ uint64_t elem_hs;
3119
+ memcpy(&elem_hs, &in[src_id].qh[src_offset], blck_size_interleave);
3120
+ memcpy(&out.qh[dst_offset], &elem_hs, blck_size_interleave);
3121
+ }
3122
+
3123
+ // The below logic is designed so as to unpack and rearrange scales in Q6_K
3124
+ // The output Q6_Kx8 structure interleaves the 8 bit scales in the same fashion as the quants
3125
+ // Q6_K structure has an 8-bit scale per 16 elements -> 16 scales
3126
+ // scales: [0 bl0 0 bl1 ... 0 bl7][1 bl0 ... 1 bl7] ... [15 bl0 ... 15 bl7] (bl = block)
3127
+ constexpr int n_scales = QK_K / 16;
3128
+
3129
+ for (int i = 0; i < n_blocks; i++) {
3130
+ for (int j = 0; j < n_scales; j++) {
3131
+ out.scales[j * n_blocks + i] = in[i].scales[j];
3132
+ }
3133
+ }
3134
+
3135
+ return out;
3136
+ }
3137
+
3138
+ static block_q2_Kx16 make_block_q2_Kx16(const block_q2_K * in, unsigned int blck_size_interleave) {
3139
+ block_q2_Kx16 out;
3140
+ constexpr int N_COLS = 16;
3141
+
3142
+ // 1. Copy Super-Scales (d) and Super-Mins (dmin)
3143
+ for (int i = 0; i < N_COLS; i++) {
3144
+ out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
3145
+ out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
3146
+ }
3147
+
3148
+ // 2. Interleave Q2_K Data
3149
+ const int bytes_per_col = 64;
3150
+ const int total_bytes = N_COLS * bytes_per_col;
3151
+ const int end = total_bytes / blck_size_interleave;
3152
+
3153
+ for (int i = 0; i < end; ++i) {
3154
+ int src_col_id = i % N_COLS;
3155
+ int src_offset = (i / N_COLS) * blck_size_interleave;
3156
+ int dst_offset = i * blck_size_interleave;
3157
+ memcpy(&out.qs[dst_offset], &in[src_col_id].qs[src_offset], blck_size_interleave);
3158
+ }
3159
+
3160
+ // 3. Repack Scales into the Optimized "Sequential-Parallel" Layout
3161
+ int out_idx = 0;
3162
+
3163
+ // Arrays define the sub-block order for each group
3164
+ const int even_low_sbs[] = {0, 2, 4, 6};
3165
+ const int odd_low_sbs[] = {1, 3, 5, 7};
3166
+ const int even_high_sbs[] = {8, 10, 12, 14};
3167
+ const int odd_high_sbs[] = {9, 11, 13, 15};
3168
+
3169
+ // Pack Group 1: Even-Low
3170
+ for (int sb : even_low_sbs) {
3171
+ for (int col = 0; col < N_COLS; col++) {
3172
+ out.scales[out_idx++] = in[col].scales[sb];
3173
+ }
3174
+ }
3175
+
3176
+ // Pack Group 2: Odd-Low
3177
+ for (int sb : odd_low_sbs) {
3178
+ for (int col = 0; col < N_COLS; col++) {
3179
+ out.scales[out_idx++] = in[col].scales[sb];
3180
+ }
3181
+ }
3182
+
3183
+ // Pack Group 3: Even-High
3184
+ for (int sb : even_high_sbs) {
3185
+ for (int col = 0; col < N_COLS; col++) {
3186
+ out.scales[out_idx++] = in[col].scales[sb];
3187
+ }
3188
+ }
3189
+
3190
+ // Pack Group 4: Odd-High
3191
+ for (int sb : odd_high_sbs) {
3192
+ for (int col = 0; col < N_COLS; col++) {
3193
+ out.scales[out_idx++] = in[col].scales[sb];
3194
+ }
3195
+ }
3196
+
3197
+ return out;
3198
+ }
3199
+
3200
+ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3201
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
3202
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3203
+ constexpr int nrows_interleaved = 4;
3204
+
3205
+ block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
3206
+ const block_q4_0 * src = (const block_q4_0 *)data;
3207
+ block_q4_0 dst_tmp[4];
3208
+ int nrow = ggml_nrows(t);
3209
+ int nblocks = t->ne[0] / QK4_0;
3210
+
3211
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
3212
+
3213
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3214
+ return -1;
3215
+ }
3216
+
3217
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3218
+ for (int64_t x = 0; x < nblocks; x++) {
3219
+ for (int i = 0; i < nrows_interleaved; i++) {
3220
+ dst_tmp[i] = src[x + i * nblocks];
3221
+ }
3222
+ *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
3223
+ }
3224
+ src += nrows_interleaved * nblocks;
3225
+ }
3226
+ return 0;
3227
+
3228
+ GGML_UNUSED(data_size);
3229
+ }
3230
+
3231
+ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3232
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
3233
+ GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
3234
+ constexpr int nrows_interleaved = 8;
3235
+
3236
+ block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
3237
+ const block_q4_K * src = (const block_q4_K*) data;
3238
+ block_q4_K dst_tmp[8];
3239
+ int nrow = ggml_nrows(t);
3240
+ int nblocks = t->ne[0] / QK_K;
3241
+
3242
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
3243
+
3244
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3245
+ return -1;
3246
+ }
3247
+
3248
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3249
+ for (int64_t x = 0; x < nblocks; x++) {
3250
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3251
+ dst_tmp[i] = src[x + i * nblocks];
3252
+ }
3253
+ *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
3254
+ }
3255
+ src += nrows_interleaved * nblocks;
3256
+ }
3257
+ return 0;
3258
+
3259
+ GGML_UNUSED(data_size);
3260
+ }
3261
+
3262
+ static int repack_q4_K_to_q4_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3263
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
3264
+ constexpr int nrows_interleaved = 16;
3265
+
3266
+ block_q4_Kx16 * dst = (block_q4_Kx16*)t->data;
3267
+ const block_q4_K * src = (const block_q4_K*) data;
3268
+ block_q4_K dst_tmp[16];
3269
+ int nrow = ggml_nrows(t);
3270
+ int nblocks = t->ne[0] / QK_K;
3271
+
3272
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
3273
+
3274
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3275
+ return -1;
3276
+ }
3277
+
3278
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3279
+ for (int64_t x = 0; x < nblocks; x++) {
3280
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3281
+ dst_tmp[i] = src[x + i * nblocks];
3282
+ }
3283
+ *dst++ = make_block_q4_Kx16(dst_tmp, interleave_block);
3284
+ }
3285
+ src += nrows_interleaved * nblocks;
3286
+ }
3287
+ return 0;
3288
+
3289
+ GGML_UNUSED(data_size);
3290
+ }
3291
+
3292
+ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3293
+ GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
3294
+ GGML_ASSERT(interleave_block == 8);
3295
+ constexpr int nrows_interleaved = 8;
3296
+
3297
+ block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
3298
+ const block_q2_K * src = (const block_q2_K*) data;
3299
+ block_q2_K dst_tmp[8];
3300
+ int nrow = ggml_nrows(t);
3301
+ int nblocks = t->ne[0] / QK_K;
3302
+
3303
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
3304
+
3305
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3306
+ return -1;
3307
+ }
3308
+
3309
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3310
+ for (int64_t x = 0; x < nblocks; x++) {
3311
+ for (int i = 0; i < nrows_interleaved; i++) {
3312
+ dst_tmp[i] = src[x + i * nblocks];
3313
+ }
3314
+ *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
3315
+ }
3316
+ src += nrows_interleaved * nblocks;
3317
+ }
3318
+ return 0;
3319
+
3320
+ GGML_UNUSED(data_size);
3321
+ }
3322
+
3323
+ static int repack_q2_K_to_q2_K_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3324
+ GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
3325
+ constexpr int nrows_interleaved = 16;
3326
+
3327
+ block_q2_Kx16 * dst = (block_q2_Kx16*)t->data;
3328
+ const block_q2_K * src = (const block_q2_K*) data;
3329
+
3330
+ block_q2_K dst_tmp[nrows_interleaved];
3331
+
3332
+ int nrow = ggml_nrows(t);
3333
+ int nblocks = t->ne[0] / QK_K;
3334
+
3335
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
3336
+
3337
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3338
+ return -1;
3339
+ }
3340
+
3341
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3342
+ for (int64_t x = 0; x < nblocks; x++) {
3343
+ // This loop gathers 16 separate blocks (one from each column)
3344
+ // that correspond to the same K-dimension chunk.
3345
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3346
+ dst_tmp[i] = src[x + i * nblocks];
3347
+ }
3348
+
3349
+ *dst++ = make_block_q2_Kx16(dst_tmp, interleave_block);
3350
+ }
3351
+ src += nrows_interleaved * nblocks;
3352
+ }
3353
+ return 0;
3354
+
3355
+ GGML_UNUSED(data_size);
3356
+ }
3357
+
3358
+ static int repack_q4_0_to_q4_0_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3359
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
3360
+ constexpr int nrows_interleaved = 16;
3361
+
3362
+ block_q4_0x16 * dst = (block_q4_0x16*)t->data;
3363
+ const block_q4_0 * src = (const block_q4_0*) data;
3364
+ block_q4_0 dst_tmp[16];
3365
+ int nrow = ggml_nrows(t);
3366
+ int nblocks = t->ne[0] / QK4_0;
3367
+
3368
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
3369
+
3370
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3371
+ return -1;
3372
+ }
3373
+
3374
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3375
+ for (int64_t x = 0; x < nblocks; x++) {
3376
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3377
+ dst_tmp[i] = src[x + i * nblocks];
3378
+ }
3379
+ *dst++ = make_block_q4_0x16(dst_tmp, interleave_block);
3380
+ }
3381
+ src += nrows_interleaved * nblocks;
3382
+ }
3383
+ return 0;
3384
+
3385
+ GGML_UNUSED(data_size);
3386
+ }
3387
+
3388
+ static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
3389
+ int interleave_block,
3390
+ const void * GGML_RESTRICT data,
3391
+ size_t data_size) {
3392
+ GGML_ASSERT(t->type == GGML_TYPE_Q5_K);
3393
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3394
+ constexpr int nrows_interleaved = 8;
3395
+
3396
+ block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data;
3397
+ const block_q5_K * src = (const block_q5_K *) data;
3398
+ block_q5_K dst_tmp[8];
3399
+ int nrow = ggml_nrows(t);
3400
+ int nblocks = t->ne[0] / QK_K;
3401
+
3402
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q5_K));
3403
+
3404
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3405
+ return -1;
3406
+ }
3407
+
3408
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3409
+ for (int64_t x = 0; x < nblocks; x++) {
3410
+ for (int i = 0; i < nrows_interleaved; i++) {
3411
+ dst_tmp[i] = src[x + i * nblocks];
3412
+ }
3413
+ *dst++ = make_block_q5_Kx8(dst_tmp, interleave_block);
3414
+ }
3415
+ src += nrows_interleaved * nblocks;
3416
+ }
3417
+ return 0;
3418
+ }
3419
+
3420
+ static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3421
+ GGML_ASSERT(t->type == GGML_TYPE_Q6_K);
3422
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3423
+ constexpr int nrows_interleaved = 8;
3424
+
3425
+ block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data;
3426
+ const block_q6_K * src = (const block_q6_K *) data;
3427
+ block_q6_K dst_tmp[8];
3428
+ int nrow = ggml_nrows(t);
3429
+ int nblocks = t->ne[0] / QK_K;
3430
+
3431
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q6_K));
3432
+
3433
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3434
+ return -1;
3435
+ }
3436
+
3437
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3438
+ for (int64_t x = 0; x < nblocks; x++) {
3439
+ for (int i = 0; i < nrows_interleaved; i++) {
3440
+ dst_tmp[i] = src[x + i * nblocks];
3441
+ }
3442
+ *dst++ = make_block_q6_Kx8(dst_tmp, interleave_block);
3443
+ }
3444
+ src += nrows_interleaved * nblocks;
3445
+ }
3446
+ return 0;
3447
+ }
3448
+
3449
+ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3450
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
3451
+ GGML_ASSERT(interleave_block == 8);
3452
+ constexpr int nrows_interleaved = 8;
3453
+
3454
+ block_q4_0x8 * dst = (block_q4_0x8*)t->data;
3455
+ const block_q4_0 * src = (const block_q4_0*) data;
3456
+ block_q4_0 dst_tmp[8];
3457
+ int nrow = ggml_nrows(t);
3458
+ int nblocks = t->ne[0] / QK4_0;
3459
+
3460
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
3461
+
3462
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3463
+ return -1;
3464
+ }
3465
+
3466
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3467
+ for (int64_t x = 0; x < nblocks; x++) {
3468
+ for (int i = 0; i < nrows_interleaved; i++ ) {
3469
+ dst_tmp[i] = src[x + i * nblocks];
3470
+ }
3471
+ *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
3472
+ }
3473
+ src += nrows_interleaved * nblocks;
3474
+ }
3475
+ return 0;
3476
+
3477
+ GGML_UNUSED(data_size);
3478
+ }
3479
+
3480
+ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
3481
+ int interleave_block,
3482
+ const void * GGML_RESTRICT data,
3483
+ size_t data_size) {
3484
+ GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
3485
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
3486
+ constexpr int nrows_interleaved = 4;
3487
+
3488
+ block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
3489
+ const block_q8_0 * src = (const block_q8_0 *) data;
3490
+ block_q8_0 dst_tmp[4];
3491
+ int nrow = ggml_nrows(t);
3492
+ int nblocks = t->ne[0] / QK8_0;
3493
+
3494
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
3495
+
3496
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3497
+ return -1;
3498
+ }
3499
+
3500
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3501
+ for (int64_t x = 0; x < nblocks; x++) {
3502
+ for (int i = 0; i < nrows_interleaved; i++) {
3503
+ dst_tmp[i] = src[x + i * nblocks];
3504
+ }
3505
+ *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
3506
+ }
3507
+ src += nrows_interleaved * nblocks;
3508
+ }
3509
+ return 0;
3510
+ }
3511
+
3512
+ static block_q8_0x16 make_block_q8_0x16(block_q8_0 * in, unsigned int blck_size_interleave) {
3513
+ block_q8_0x16 out;
3514
+
3515
+ for (int i = 0; i < 16; i++) {
3516
+ out.d[i] = in[i].d;
3517
+ }
3518
+
3519
+ const int end = QK8_0 * 16 / blck_size_interleave;
3520
+
3521
+ if (blck_size_interleave == 1) {
3522
+ for (int i = 0; i < end; ++i) {
3523
+ int src_id = i % 16;
3524
+ int src_offset = i / 16;
3525
+ int dst_offset = i;
3526
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
3527
+ }
3528
+ } else {
3529
+ GGML_ASSERT(false);
3530
+ }
3531
+
3532
+ return out;
3533
+ }
3534
+
3535
+ static int repack_q8_0_to_q8_0_16_bl(struct ggml_tensor * t,
3536
+ int interleave_block,
3537
+ const void * GGML_RESTRICT data,
3538
+ size_t data_size) {
3539
+ GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
3540
+ constexpr int nrows_interleaved = 16;
3541
+
3542
+ block_q8_0x16 * dst = (block_q8_0x16 *) t->data;
3543
+ const block_q8_0 * src = (const block_q8_0 *) data;
3544
+ block_q8_0 dst_tmp[16];
3545
+ int nrow = ggml_nrows(t);
3546
+ int nblocks = t->ne[0] / QK8_0;
3547
+
3548
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
3549
+
3550
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3551
+ return -1;
3552
+ }
3553
+
3554
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3555
+ for (int64_t x = 0; x < nblocks; x++) {
3556
+ for (int i = 0; i < nrows_interleaved; i++) {
3557
+ dst_tmp[i] = src[x + i * nblocks];
3558
+ }
3559
+ *dst++ = make_block_q8_0x16(dst_tmp, interleave_block);
3560
+ }
3561
+ src += nrows_interleaved * nblocks;
3562
+ }
3563
+ return 0;
3564
+ }
3565
+
3566
+ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
3567
+ block_iq4_nlx4 out;
3568
+
3569
+ for (int i = 0; i < 4; i++) {
3570
+ out.d[i] = in[i].d;
3571
+ }
3572
+
3573
+ const int end = QK4_NL * 2 / blck_size_interleave;
3574
+
3575
+ // TODO: this branch seems wrong
3576
+ //if (blck_size_interleave == 8) {
3577
+ // for (int i = 0; i < end; ++i) {
3578
+ // int src_id = i % 4;
3579
+ // int src_offset = (i / 4) * blck_size_interleave;
3580
+ // int dst_offset = i * blck_size_interleave;
3581
+
3582
+ // // Using memcpy to avoid unaligned memory accesses
3583
+ // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
3584
+ // }
3585
+ //} else
3586
+ if (blck_size_interleave == 4) {
3587
+ for (int i = 0; i < end; ++i) {
3588
+ int src_id = i % 4;
3589
+ int src_offset = (i / 4) * blck_size_interleave;
3590
+ int dst_offset = i * blck_size_interleave;
3591
+
3592
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
3593
+ }
3594
+ } else {
3595
+ GGML_ASSERT(false);
3596
+ }
3597
+
3598
+ return out;
3599
+ }
3600
+
3601
+ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3602
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
3603
+ GGML_ASSERT(interleave_block == 4);
3604
+
3605
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
3606
+ block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
3607
+
3608
+ block_iq4_nl dst_tmp[4];
3609
+
3610
+ int nrow = ggml_nrows(t);
3611
+ int nrows_interleaved = 4;
3612
+ int nblocks = t->ne[0] / QK4_NL;
3613
+
3614
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
3615
+
3616
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3617
+ return -1;
3618
+ }
3619
+
3620
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3621
+ for (int64_t x = 0; x < nblocks; x++) {
3622
+ for (int i = 0; i < nrows_interleaved; i++) {
3623
+ dst_tmp[i] = src[x + i * nblocks];
3624
+ }
3625
+ *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
3626
+ }
3627
+ src += nrows_interleaved * nblocks;
3628
+ }
3629
+ return 0;
3630
+
3631
+ GGML_UNUSED(data_size);
3632
+ }
3633
+
3634
+ static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
3635
+ block_iq4_nlx8 out;
3636
+
3637
+ for (int i = 0; i < 8; i++) {
3638
+ out.d[i] = in[i].d;
3639
+ }
3640
+
3641
+ const int end = QK4_NL * 4 / blck_size_interleave;
3642
+
3643
+ if (blck_size_interleave == 8) {
3644
+ for (int i = 0; i < end; ++i) {
3645
+ int src_id = i % 8;
3646
+ int src_offset = (i / 8) * blck_size_interleave;
3647
+ int dst_offset = i * blck_size_interleave;
3648
+
3649
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
3650
+ }
3651
+ } else {
3652
+ GGML_ASSERT(false);
3653
+ }
3654
+
3655
+ return out;
3656
+ }
3657
+
3658
+ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3659
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
3660
+ GGML_ASSERT(interleave_block == 8);
3661
+
3662
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
3663
+ block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
3664
+
3665
+ block_iq4_nl dst_tmp[8];
3666
+
3667
+ int nrow = ggml_nrows(t);
3668
+ int nrows_interleaved = 8;
3669
+ int nblocks = t->ne[0] / QK4_NL;
3670
+
3671
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
3672
+
3673
+ if (t->ne[1] % nrows_interleaved != 0) {
3674
+ return -1;
3675
+ }
3676
+
3677
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3678
+ for (int64_t x = 0; x < nblocks; x++) {
3679
+ for (int i = 0; i < nrows_interleaved; i++) {
3680
+ dst_tmp[i] = src[x + i * nblocks];
3681
+ }
3682
+ *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
3683
+ }
3684
+ src += nrows_interleaved * nblocks;
3685
+ }
3686
+ return 0;
3687
+
3688
+ GGML_UNUSED(data_size);
3689
+ }
3690
+
3691
+ static block_iq4_nlx16 make_block_iq4_nlx16(block_iq4_nl * in, unsigned int blck_size_interleave) {
3692
+ block_iq4_nlx16 out;
3693
+
3694
+ for (int i = 0; i < 16; i++) {
3695
+ out.d[i] = in[i].d;
3696
+ }
3697
+
3698
+ const int end = QK4_NL * 8 / blck_size_interleave;
3699
+
3700
+ if (blck_size_interleave == 1) {
3701
+ for (int i = 0; i < end; ++i) {
3702
+ int src_id = i % 16;
3703
+ int src_offset = i / 16;
3704
+ int dst_offset = i;
3705
+
3706
+ out.qs[dst_offset] = in[src_id].qs[src_offset];
3707
+ }
3708
+ } else {
3709
+ GGML_ASSERT(false);
3710
+ }
3711
+
3712
+ return out;
3713
+ }
3714
+
3715
+ static int repack_iq4_nl_to_iq4_nl_16_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3716
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
3717
+ GGML_ASSERT(interleave_block == 1);
3718
+
3719
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
3720
+ block_iq4_nlx16 * dst = ( block_iq4_nlx16 *)t->data;
3721
+
3722
+ block_iq4_nl dst_tmp[16];
3723
+
3724
+ int nrow = ggml_nrows(t);
3725
+ int nrows_interleaved = 16;
3726
+ int nblocks = t->ne[0] / QK4_NL;
3727
+
3728
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
3729
+
3730
+ if (t->ne[1] % nrows_interleaved != 0) {
3731
+ return -1;
3732
+ }
3733
+
3734
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3735
+ for (int64_t x = 0; x < nblocks; x++) {
3736
+ for (int i = 0; i < nrows_interleaved; i++) {
3737
+ dst_tmp[i] = src[x + i * nblocks];
3738
+ }
3739
+ *dst++ = make_block_iq4_nlx16(dst_tmp, interleave_block);
3740
+ }
3741
+ src += nrows_interleaved * nblocks;
3742
+ }
3743
+ return 0;
3744
+
3745
+ GGML_UNUSED(data_size);
3746
+ }
3747
+
3748
+ static block_mxfp4x4 make_block_mxfp4x4(block_mxfp4 * in, unsigned int blck_size_interleave) {
3749
+ block_mxfp4x4 out;
3750
+
3751
+ for (int i = 0; i < 4; i++) {
3752
+ out.e[i] = in[i].e;
3753
+ }
3754
+
3755
+ const int end = QK_MXFP4 * 2 / blck_size_interleave;
3756
+
3757
+ if (blck_size_interleave == 4) {
3758
+ for (int i = 0; i < end; ++i) {
3759
+ int src_id = i % 4;
3760
+ int src_offset = (i / 4) * blck_size_interleave;
3761
+ int dst_offset = i * blck_size_interleave;
3762
+
3763
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
3764
+ }
3765
+ } else {
3766
+ GGML_ASSERT(false);
3767
+ }
3768
+
3769
+ return out;
3770
+ }
3771
+
3772
+ static int repack_mxfp4_to_mxfp4_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3773
+ GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
3774
+ GGML_ASSERT(interleave_block == 4);
3775
+
3776
+ const block_mxfp4 * src = (const block_mxfp4 *)data;
3777
+ block_mxfp4x4 * dst = ( block_mxfp4x4 *)t->data;
3778
+
3779
+ block_mxfp4 dst_tmp[4];
3780
+
3781
+ int nrow = ggml_nrows(t);
3782
+ int nrows_interleaved = 4;
3783
+ int nblocks = t->ne[0] / QK_MXFP4;
3784
+
3785
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
3786
+
3787
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
3788
+ return -1;
3789
+ }
3790
+
3791
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3792
+ for (int64_t x = 0; x < nblocks; x++) {
3793
+ for (int i = 0; i < nrows_interleaved; i++) {
3794
+ dst_tmp[i] = src[x + i * nblocks];
3795
+ }
3796
+ *dst++ = make_block_mxfp4x4(dst_tmp, interleave_block);
3797
+ }
3798
+ src += nrows_interleaved * nblocks;
3799
+ }
3800
+ return 0;
3801
+
3802
+ GGML_UNUSED(data_size);
3803
+ }
3804
+
3805
+ static block_mxfp4x8 make_block_mxfp4x8(block_mxfp4 * in, unsigned int blck_size_interleave) {
3806
+ block_mxfp4x8 out;
3807
+
3808
+ for (int i = 0; i < 8; i++) {
3809
+ out.e[i] = in[i].e;
3810
+ }
3811
+
3812
+ const int end = QK_MXFP4 * 4 / blck_size_interleave;
3813
+
3814
+ if (blck_size_interleave == 8) {
3815
+ for (int i = 0; i < end; ++i) {
3816
+ int src_id = i % 8;
3817
+ int src_offset = (i / 8) * blck_size_interleave;
3818
+ int dst_offset = i * blck_size_interleave;
3819
+
3820
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
3821
+ }
3822
+ } else {
3823
+ GGML_ASSERT(false);
3824
+ }
3825
+
3826
+ return out;
3827
+ }
3828
+
3829
+ static int repack_mxfp4_to_mxfp4_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
3830
+ GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
3831
+ GGML_ASSERT(interleave_block == 8);
3832
+
3833
+ const block_mxfp4 * src = (const block_mxfp4 *)data;
3834
+ block_mxfp4x8 * dst = ( block_mxfp4x8 *)t->data;
3835
+
3836
+ block_mxfp4 dst_tmp[8];
3837
+
3838
+ int nrow = ggml_nrows(t);
3839
+ int nrows_interleaved = 8;
3840
+ int nblocks = t->ne[0] / QK_MXFP4;
3841
+
3842
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
3843
+
3844
+ if (t->ne[1] % nrows_interleaved != 0) {
3845
+ return -1;
3846
+ }
3847
+
3848
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
3849
+ for (int64_t x = 0; x < nblocks; x++) {
3850
+ for (int i = 0; i < nrows_interleaved; i++) {
3851
+ dst_tmp[i] = src[x + i * nblocks];
3852
+ }
3853
+ *dst++ = make_block_mxfp4x8(dst_tmp, interleave_block);
3854
+ }
3855
+ src += nrows_interleaved * nblocks;
3856
+ }
3857
+ return 0;
3858
+
3859
+ GGML_UNUSED(data_size);
3860
+ }
3861
+
3862
+ namespace ggml::cpu::repack {
3863
+ // repack
3864
// Primary template: repack `data` (data_size bytes) into tensor `t`.
// Declared only; each supported <BLOC_TYPE, INTER_SIZE, NB_COLS> combination
// is provided as an explicit specialization.
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
int repack(struct ggml_tensor * t, const void * data, size_t data_size);
3866
+
3867
+ // TODO: generalise.
3868
+ template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3869
+ return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
3870
+ }
3871
+
3872
+ template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3873
+ return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
3874
+ }
3875
+
3876
+ template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3877
+ return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
3878
+ }
3879
+
3880
+ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3881
+ return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
3882
+ }
3883
+
3884
+ template <> int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3885
+ return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size);
3886
+ }
3887
+
3888
+ template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3889
+ return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
3890
+ }
3891
+
3892
+ template <> int repack<block_q5_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3893
+ return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size);
3894
+ }
3895
+
3896
+ template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3897
+ return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
3898
+ }
3899
+
3900
+ template <> int repack<block_q6_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3901
+ return repack_q6_K_to_q6_K_8_bl(t, 4, data, data_size);
3902
+ }
3903
+
3904
+ template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3905
+ return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size);
3906
+ }
3907
+
3908
+ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3909
+ return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
3910
+ }
3911
+
3912
+ // TODO: needs to be revisited
3913
+ //template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3914
+ // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
3915
+ //}
3916
+
3917
+ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3918
+ return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
3919
+ }
3920
+
3921
+ template <> int repack<block_mxfp4, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3922
+ return repack_mxfp4_to_mxfp4_4_bl(t, 4, data, data_size);
3923
+ }
3924
+
3925
+ template <> int repack<block_mxfp4, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
3926
+ return repack_mxfp4_to_mxfp4_8_bl(t, 8, data, data_size);
3927
+ }
3928
+
3929
+ template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3930
+ return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
3931
+ }
3932
+
3933
+ template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
3934
+ return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
3935
+ }
3936
+
3937
+ #if defined __riscv_zvfh
3938
+ template <> int repack<block_q4_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3939
+ return repack_q4_0_to_q4_0_16_bl(t, 1, data, data_size);
3940
+ }
3941
+
3942
+ template <> int repack<block_q4_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3943
+ return repack_q4_K_to_q4_K_16_bl(t, 1, data, data_size);
3944
+ }
3945
+
3946
+ template <> int repack<block_iq4_nl, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3947
+ return repack_iq4_nl_to_iq4_nl_16_bl(t, 1, data, data_size);
3948
+ }
3949
+
3950
+ template <> int repack<block_q8_0, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3951
+ return repack_q8_0_to_q8_0_16_bl(t, 1, data, data_size);
3952
+ }
3953
+
3954
+ template <> int repack<block_q2_K, 1, 16>(struct ggml_tensor * t, const void * data, size_t data_size) {
3955
+ return repack_q2_K_to_q2_K_16_bl(t, 1, data, data_size);
3956
+ }
3957
+ #endif
3958
+
3959
+ // gemv
3960
+ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
3961
+ void gemv(int, float *, size_t, const void *, const void *, int, int);
3962
+
3963
+ template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3964
+ ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
3965
+ }
3966
+
3967
+ template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3968
+ ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
3969
+ }
3970
+
3971
+ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3972
+ ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
3973
+ }
3974
+
3975
+ template <>
3976
+ void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n,
3977
+ float * s,
3978
+ size_t bs,
3979
+ const void * vx,
3980
+ const void * vy,
3981
+ int nr,
3982
+ int nc) {
3983
+ ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
3984
+ }
3985
+
3986
+ template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3987
+ ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
3988
+ }
3989
+
3990
+ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3991
+ ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
3992
+ }
3993
+
3994
+ template <> void gemv<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3995
+ ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
3996
+ }
3997
+
3998
+ template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
3999
+ ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4000
+ }
4001
+
4002
+ template <> void gemv<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4003
+ ggml_gemv_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4004
+ }
4005
+
4006
+ template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4007
+ ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4008
+ }
4009
+
4010
+ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4011
+ ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4012
+ }
4013
+
4014
+ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4015
+ ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4016
+ }
4017
+
4018
+ template <> void gemv<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4019
+ ggml_gemv_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4020
+ }
4021
+
4022
+ template <> void gemv<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4023
+ ggml_gemv_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4024
+ }
4025
+
4026
+ template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4027
+ ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4028
+ }
4029
+
4030
+ template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4031
+ ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
4032
+ }
4033
+
4034
+ #if defined __riscv_zvfh
4035
+ template <> void gemv<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4036
+ ggml_gemv_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4037
+ }
4038
+
4039
+ template <> void gemv<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4040
+ ggml_gemv_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4041
+ }
4042
+
4043
+ template <> void gemv<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4044
+ ggml_gemv_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4045
+ }
4046
+
4047
+ template <> void gemv<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4048
+ ggml_gemv_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4049
+ }
4050
+
4051
+ template <> void gemv<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4052
+ ggml_gemv_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4053
+ }
4054
+ #endif
4055
+
4056
+ // gemm
4057
+ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
4058
+ void gemm(int, float *, size_t, const void *, const void *, int, int);
4059
+
4060
+ template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4061
+ ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4062
+ }
4063
+
4064
+ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4065
+ ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
4066
+ }
4067
+
4068
+ template <>
4069
+ void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n,
4070
+ float * s,
4071
+ size_t bs,
4072
+ const void * vx,
4073
+ const void * vy,
4074
+ int nr,
4075
+ int nc) {
4076
+ ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4077
+ }
4078
+
4079
+ template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4080
+ ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4081
+ }
4082
+
4083
+ template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4084
+ ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4085
+ }
4086
+
4087
+ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4088
+ ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4089
+ }
4090
+
4091
+ template <> void gemm<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4092
+ ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4093
+ }
4094
+
4095
+ template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4096
+ ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4097
+ }
4098
+
4099
+ template <> void gemm<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4100
+ ggml_gemm_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
4101
+ }
4102
+
4103
+ template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4104
+ ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
4105
+ }
4106
+
4107
+ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4108
+ ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4109
+ }
4110
+
4111
+ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4112
+ ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4113
+ }
4114
+
4115
+ template <> void gemm<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4116
+ ggml_gemm_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4117
+ }
4118
+
4119
+ template <> void gemm<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4120
+ ggml_gemm_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
4121
+ }
4122
+
4123
+ template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4124
+ ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
4125
+ }
4126
+
4127
+ template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4128
+ ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
4129
+ }
4130
+
4131
+ #if defined __riscv_zvfh
4132
+ template <> void gemm<block_q4_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4133
+ ggml_gemm_q4_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4134
+ }
4135
+
4136
+ template <> void gemm<block_q4_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4137
+ ggml_gemm_q4_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4138
+ }
4139
+
4140
+ template <> void gemm<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4141
+ ggml_gemm_iq4_nl_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4142
+ }
4143
+
4144
+ template <> void gemm<block_q8_0, 1, 16, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4145
+ ggml_gemm_q8_0_16x1_q8_0(n, s, bs, vx, vy, nr, nc);
4146
+ }
4147
+
4148
+ template <> void gemm<block_q2_K, 1, 16, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
4149
+ ggml_gemm_q2_K_16x1_q8_K(n, s, bs, vx, vy, nr, nc);
4150
+ }
4151
+ #endif
4152
+
4153
+ class tensor_traits_base : public ggml::cpu::tensor_traits {
4154
+ public:
4155
+ virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
4156
+ };
4157
+
4158
+ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
4159
+
4160
+ bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
4161
+ // not realy a GGML_TYPE_Q8_0 but same size.
4162
+ switch (op->op) {
4163
+ case GGML_OP_MUL_MAT:
4164
+ {
4165
+ size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
4166
+ return true;
4167
+ }
4168
+ case GGML_OP_MUL_MAT_ID:
4169
+ {
4170
+ size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
4171
+ size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
4172
+
4173
+ const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
4174
+ const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
4175
+
4176
+ const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
4177
+
4178
+ size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
4179
+
4180
+ return true;
4181
+ }
4182
+ default:
4183
+ // GGML_ABORT("fatal error");
4184
+ break;
4185
+ }
4186
+ return false;
4187
+ }
4188
+
4189
+ bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
4190
+ switch (op->op) {
4191
+ case GGML_OP_MUL_MAT:
4192
+ forward_mul_mat(params, op);
4193
+ return true;
4194
+ case GGML_OP_MUL_MAT_ID:
4195
+ forward_mul_mat_id(params, op);
4196
+ return true;
4197
+ default:
4198
+ // GGML_ABORT("fatal error");
4199
+ break;
4200
+ }
4201
+ return false;
4202
+ }
4203
+
4204
+ void forward_mul_mat_one_chunk(ggml_compute_params * params,
4205
+ ggml_tensor * op,
4206
+ int64_t src0_start,
4207
+ int64_t src0_end,
4208
+ int64_t src1_start,
4209
+ int64_t src1_end) {
4210
+ const ggml_tensor * src0 = op->src[0];
4211
+ const ggml_tensor * src1 = op->src[1];
4212
+ ggml_tensor * dst = op;
4213
+
4214
+ GGML_TENSOR_BINARY_OP_LOCALS
4215
+
4216
+ const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
4217
+
4218
+ GGML_ASSERT(ne03 == 1 && ne13 == 1);
4219
+ GGML_ASSERT(ne12 % ne02 == 0);
4220
+ const int64_t r2 = ne12 / ne02;
4221
+
4222
+ const int64_t i12 = src1_start / ne1;
4223
+ const int64_t i11 = src1_start - i12 * ne1;
4224
+
4225
+ // Determine batch index
4226
+ const int64_t i02 = i12 / r2;
4227
+
4228
+ const int64_t i1 = i11;
4229
+ const int64_t i2 = i12;
4230
+
4231
+ const char * src0_ptr = (const char *) src0->data + i02 * nb02;
4232
+ const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride;
4233
+ char * dst_ptr = ((char *) dst->data + (i1 * nb1 + i2 * nb2));
4234
+
4235
+ const int64_t nrows = src1_end - src1_start;
4236
+ const int64_t ncols = src0_end - src0_start;
4237
+
4238
+ GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);
4239
+
4240
+ // If there are more than three rows in src1, use gemm; otherwise, use gemv.
4241
+ if (nrows > 3) {
4242
+ gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
4243
+ src0_ptr + src0_start * nb01, src1_ptr,
4244
+ nrows - (nrows % 4), ncols);
4245
+ }
4246
+ for (int iter = nrows - (nrows % 4); iter < nrows; iter++) {
4247
+ gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start,
4248
+ ne01, src0_ptr + src0_start * nb01,
4249
+ src1_ptr + (src1_col_stride * iter), 1 /* nrows */, ncols);
4250
+ }
4251
+ }
4252
+
4253
+ void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
4254
+ const ggml_tensor * src0 = op->src[0];
4255
+ const ggml_tensor * src1 = op->src[1];
4256
+ ggml_tensor * dst = op;
4257
+
4258
+ GGML_TENSOR_BINARY_OP_LOCALS
4259
+
4260
+ const int ith = params->ith;
4261
+ const int nth = params->nth;
4262
+
4263
+ GGML_ASSERT(ne0 == ne01);
4264
+ GGML_ASSERT(ne1 == ne11);
4265
+ GGML_ASSERT(ne2 == ne12);
4266
+ GGML_ASSERT(ne3 == ne13);
4267
+
4268
+ // dst cannot be transposed or permuted
4269
+ GGML_ASSERT(nb0 == sizeof(float));
4270
+ GGML_ASSERT(nb0 <= nb1);
4271
+ GGML_ASSERT(nb1 <= nb2);
4272
+ GGML_ASSERT(nb2 <= nb3);
4273
+
4274
+ // TODO: General batched mul mat for 4D tensors
4275
+ // Currently only supports 3D tensors
4276
+ GGML_ASSERT(ne03 == 1);
4277
+ GGML_ASSERT(ne13 == 1);
4278
+ GGML_ASSERT(ne3 == 1);
4279
+
4280
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
4281
+
4282
+ GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
4283
+ // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
4284
+
4285
+ char * wdata = static_cast<char *>(params->wdata);
4286
+ const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
4287
+ const size_t nbw2 = nbw1 * ne11;
4288
+
4289
+ assert(params->wsize >= nbw2 * ne12);
4290
+
4291
+ const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
4292
+
4293
+ // INFO: Quantization is done in planes to avoid extra complexity in chunking.
4294
+ // Flattening dimensions not multiple of INTER_SIZE would require extra handling depending on how
4295
+ // the planes are broadcast.
4296
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
4297
+ char * data_ptr = (char *) src1->data + i12 * nb12;
4298
+ char * wdata_ptr = wdata + i12 * nbw2;
4299
+
4300
+ for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
4301
+ ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11),
4302
+ (void *) (wdata_ptr + i11 * nbw1), 4, ne10);
4303
+ }
4304
+
4305
+ const int64_t i11_processed = ne11 - ne11 % 4;
4306
+ for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
4307
+ from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10);
4308
+ }
4309
+ }
4310
+
4311
+ // disable for NUMA
4312
+ const bool disable_chunking = ggml_is_numa();
4313
+
4314
+ // 4x chunks per thread
4315
+ const int64_t nr0 = ggml_nrows(op->src[0]);
4316
+
4317
+ int nth_scaled = nth * 4;
4318
+ int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;
4319
+ int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0;
4320
+
4321
+ // src1 is chunked only by full planes.
4322
+ // When we flatten we need to address dimensions not multiple of the q8 INTER_SIZE
4323
+ // to route them thorugh GEMV.
4324
+ // nchunk1 = ne12 also avoids messing the chunking for models with no 3d tensors
4325
+ // to avoid affecting their performance
4326
+ int64_t nchunk1 = ne12;
4327
+
4328
+ // Ensure minimum chunk size to avoid alignment issues with high thread counts
4329
+ // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
4330
+ const int64_t min_chunk_size = NB_COLS;
4331
+ if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
4332
+ nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
4333
+ }
4334
+
4335
+ int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
4336
+ // Only increase nchunk0 to nth if it won't make chunks too small
4337
+ if (nth == 1 || ((nchunk0 < nth || disable_chunking) && (nr0 + nth - 1) / nth >= min_chunk_size)) {
4338
+ nchunk0 = nth;
4339
+ dr0 = (nr0 + nchunk0 - 1) / nchunk0;
4340
+ }
4341
+
4342
+ // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
4343
+ // This prevents creating too many tiny chunks that could overlap after alignment
4344
+ const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
4345
+ nchunk0 = MIN(nchunk0, max_nchunk);
4346
+
4347
+ if (ith == 0) {
4348
+ // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
4349
+ ggml_threadpool_chunk_set(params->threadpool, nth);
4350
+ }
4351
+
4352
+ ggml_barrier(params->threadpool);
4353
+
4354
+ // The first chunk comes from our thread_id, the rest will get auto-assigned.
4355
+ int current_chunk = ith;
4356
+
4357
+ while (current_chunk < nchunk0 * nchunk1) {
4358
+ const int64_t ith0 = current_chunk % nchunk0;
4359
+ const int64_t ith1 = current_chunk / nchunk0;
4360
+
4361
+ int64_t src0_start = dr0 * ith0;
4362
+ int64_t src0_end = MIN(src0_start + dr0, nr0);
4363
+
4364
+ // full-plane range for src1
4365
+ int64_t src1_start = ith1 * ne11;
4366
+ int64_t src1_end = (ith1 + 1) * ne11;
4367
+
4368
+ // Align boundaries to NB_COLS - round up to ensure all data is included
4369
+ // The chunk size limiting above ensures chunks are large enough to prevent overlaps
4370
+ src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
4371
+ src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
4372
+ src0_end = MIN(src0_end, ne01);
4373
+
4374
+ // Make sure current plane is the last one before exiting
4375
+ if (src0_start >= src0_end) {
4376
+ current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
4377
+ continue;
4378
+ }
4379
+
4380
+ forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end);
4381
+
4382
+ current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
4383
+ }
4384
+ }
4385
+
4386
+ void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
4387
+ const ggml_tensor * src0 = op->src[0];
4388
+ const ggml_tensor * src1 = op->src[1];
4389
+ const ggml_tensor * ids = op->src[2];
4390
+ ggml_tensor * dst = op;
4391
+
4392
+ GGML_TENSOR_BINARY_OP_LOCALS
4393
+
4394
+ const int ith = params->ith;
4395
+ const int nth = params->nth;
4396
+
4397
+ const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
4398
+
4399
+ // we don't support permuted src0 or src1
4400
+ GGML_ASSERT(nb00 == ggml_type_size(src0->type));
4401
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
4402
+
4403
+ // dst cannot be transposed or permuted
4404
+ GGML_ASSERT(nb0 == sizeof(float));
4405
+ GGML_ASSERT(nb0 <= nb1);
4406
+ GGML_ASSERT(nb1 <= nb2);
4407
+ GGML_ASSERT(nb2 <= nb3);
4408
+
4409
+ GGML_ASSERT(ne03 == 1);
4410
+ GGML_ASSERT(ne13 == 1);
4411
+ GGML_ASSERT(ne3 == 1);
4412
+
4413
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
4414
+
4415
+ // row groups
4416
+ const int n_ids = ids->ne[0]; // n_expert_used
4417
+ const int n_as = ne02; // n_expert
4418
+
4419
+ const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
4420
+ const size_t nbw2 = nbw1*ne11;
4421
+ const size_t nbw3 = nbw2*ne12;
4422
+
4423
+ struct mmid_row_mapping {
4424
+ int32_t i1;
4425
+ int32_t i2;
4426
+ };
4427
+
4428
+ GGML_ASSERT(params->wsize >=
4429
+ (GGML_PAD(nbw3, sizeof(int64_t)) +
4430
+ n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
4431
+ );
4432
+
4433
+ auto * wdata = (char *)params->wdata;
4434
+ auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
4435
+
4436
+ // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
4437
+ auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
4438
+ struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
4439
+
4440
+ // src1: float32 => param type
4441
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
4442
+ for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
4443
+ from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
4444
+ (void *) (wdata + i12 * nbw2 + i11 * nbw1),
4445
+ ne10);
4446
+ }
4447
+ }
4448
+
4449
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
4450
+
4451
+ if (ith == 0) {
4452
+ // initialize matrix_row_counts
4453
+ memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
4454
+
4455
+ // group rows by src0 matrix
4456
+ for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
4457
+ for (int32_t id = 0; id < n_ids; ++id) {
4458
+ const int32_t i02 =
4459
+ *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
4460
+
4461
+ GGML_ASSERT(i02 >= 0 && i02 < n_as);
4462
+
4463
+ MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
4464
+ matrix_row_counts[i02] += 1;
4465
+ }
4466
+ }
4467
+ }
4468
+
4469
+ ggml_barrier(params->threadpool);
4470
+
4471
+ // compute each matrix multiplication in sequence
4472
+ for (int cur_a = 0; cur_a < n_as; ++cur_a) {
4473
+ const int64_t cne1 = matrix_row_counts[cur_a];
4474
+
4475
+ if (cne1 == 0) {
4476
+ continue;
4477
+ }
4478
+
4479
+ const auto * src0_cur = (const char *) src0->data + cur_a*nb02;
4480
+
4481
+ //const int64_t nr0 = ne01; // src0 rows
4482
+ const int64_t nr1 = cne1; // src1 rows
4483
+
4484
+ int64_t src0_cur_start = (ith * ne01) / nth;
4485
+ int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
4486
+
4487
+ // Align boundaries to NB_COLS - round up to ensure all data is included
4488
+ src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
4489
+ src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
4490
+ if (src0_cur_end > ne01) {
4491
+ src0_cur_end = ne01;
4492
+ }
4493
+
4494
+ if (src0_cur_start >= src0_cur_end) {
4495
+ return;
4496
+ }
4497
+
4498
+ for (int ir1 = 0; ir1 < nr1; ir1++) {
4499
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
4500
+
4501
+ const int id = row_mapping.i1; // selected expert index
4502
+
4503
+ const int64_t i11 = id % ne11;
4504
+ const int64_t i12 = row_mapping.i2; // row index in src1
4505
+
4506
+ const int64_t i1 = id; // selected expert index
4507
+ const int64_t i2 = i12; // row
4508
+
4509
+ const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
4510
+
4511
+ gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
4512
+ ne00, (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
4513
+ src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
4514
+ }
4515
+ }
4516
+ #undef MMID_MATRIX_ROW
4517
+ }
4518
+
4519
+ int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
4520
+ GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
4521
+ (int) NB_COLS, (int) INTER_SIZE);
4522
+ return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
4523
+ }
4524
+ };
4525
+
4526
+ } // namespace ggml::cpu::repack
4527
+
4528
+ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
4529
+ // instance for Q4
4530
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
4531
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
4532
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
4533
+
4534
+ // instance for Q4_K
4535
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
4536
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
4537
+
4538
+ // instance for Q5_K
4539
+ static const ggml::cpu::repack::tensor_traits<block_q5_K, 4, 8, GGML_TYPE_Q8_K> q5_K_8x4_q8_K;
4540
+ static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
4541
+
4542
+ // instance for Q6_K
4543
+ static const ggml::cpu::repack::tensor_traits<block_q6_K, 4, 8, GGML_TYPE_Q8_K> q6_K_8x4_q8_K;
4544
+ static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K;
4545
+
4546
+ // instance for Q2
4547
+ static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
4548
+
4549
+ // instance for IQ4
4550
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
4551
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
4552
+
4553
+ // instance for MXFP4
4554
+ static const ggml::cpu::repack::tensor_traits<block_mxfp4, 4, 4, GGML_TYPE_Q8_0> mxfp4_4x4_q8_0;
4555
+ static const ggml::cpu::repack::tensor_traits<block_mxfp4, 8, 8, GGML_TYPE_Q8_0> mxfp4_8x8_q8_0;
4556
+
4557
+ // instance for Q8_0
4558
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
4559
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
4560
+
4561
+ // instances for RISC-V
4562
+ //
4563
+ // These implement outer-product style matrix multiplication kernels with
4564
+ // an interleave of 1.
4565
+ #if defined __riscv_zvfh
4566
+ static const ggml::cpu::repack::tensor_traits<block_q4_0, 1, 16, GGML_TYPE_Q8_0> q4_0_16x1_q8_0;
4567
+ static const ggml::cpu::repack::tensor_traits<block_q4_K, 1, 16, GGML_TYPE_Q8_K> q4_K_16x1_q8_K;
4568
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 1, 16, GGML_TYPE_Q8_0> iq4_nl_16x1_q8_0;
4569
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 1, 16, GGML_TYPE_Q8_0> q8_0_16x1_q8_0;
4570
+ static const ggml::cpu::repack::tensor_traits<block_q2_K, 1, 16, GGML_TYPE_Q8_K> q2_K_16x1_q8_K;
4571
+ #endif
4572
+
4573
+ if (cur->type == GGML_TYPE_Q4_0) {
4574
+ if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
4575
+ if (cur->ne[1] % 8 == 0) {
4576
+ return &q4_0_8x8_q8_0;
4577
+ }
4578
+ }
4579
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4580
+ if (cur->ne[1] % 4 == 0) {
4581
+ return &q4_0_4x8_q8_0;
4582
+ }
4583
+ }
4584
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4585
+ if (cur->ne[1] % 4 == 0) {
4586
+ return &q4_0_4x4_q8_0;
4587
+ }
4588
+ }
4589
+ if (ggml_cpu_has_riscv_v()) {
4590
+ #if defined __riscv_zvfh
4591
+ switch (__riscv_vlenb() * 8) {
4592
+ case 128: { break; } // TODO
4593
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q4_0_16x1_q8_0; } break; }
4594
+ case 512: { break; } // TODO
4595
+ case 1024: { break; } // TODO
4596
+ default: { return nullptr; }
4597
+ }
4598
+ #endif
4599
+ }
4600
+ } else if (cur->type == GGML_TYPE_Q4_K) {
4601
+ if (ggml_cpu_has_avx2()) {
4602
+ if (cur->ne[1] % 8 == 0) {
4603
+ return &q4_K_8x8_q8_K;
4604
+ }
4605
+ }
4606
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4607
+ if (cur->ne[1] % 8 == 0) {
4608
+ return &q4_K_8x8_q8_K;
4609
+ }
4610
+ }
4611
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4612
+ if (cur->ne[1] % 8 == 0) {
4613
+ return &q4_K_8x4_q8_K;
4614
+ }
4615
+ }
4616
+ if (ggml_cpu_has_riscv_v()) {
4617
+ #if defined __riscv_zvfh
4618
+ switch (__riscv_vlenb() * 8) {
4619
+ case 128: { break; } // TODO
4620
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q4_K_16x1_q8_K; } break; }
4621
+ case 512: { break; } // TODO
4622
+ case 1024: { break; } // TODO
4623
+ default: { return nullptr; }
4624
+ }
4625
+ #endif
4626
+ }
4627
+ } else if (cur->type == GGML_TYPE_Q2_K) {
4628
+ if (ggml_cpu_has_avx512()) {
4629
+ if (cur->ne[1] % 8 == 0) {
4630
+ return &q2_K_8x8_q8_K;
4631
+ }
4632
+ }
4633
+ if (ggml_cpu_has_riscv_v()) {
4634
+ #if defined __riscv_zvfh
4635
+ switch (__riscv_vlenb() * 8) {
4636
+ case 128: { break; } // TODO
4637
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q2_K_16x1_q8_K; } break; }
4638
+ case 512: { break; } // TODO
4639
+ case 1024: { break; } // TODO
4640
+ default: { return nullptr; }
4641
+ }
4642
+ #endif
4643
+ }
4644
+ } else if (cur->type == GGML_TYPE_Q5_K) {
4645
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4646
+ if (cur->ne[1] % 8 == 0) {
4647
+ return &q5_K_8x8_q8_K;
4648
+ }
4649
+ }
4650
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4651
+ if (cur->ne[1] % 8 == 0) {
4652
+ return &q5_K_8x4_q8_K;
4653
+ }
4654
+ }
4655
+ } else if (cur->type == GGML_TYPE_Q6_K) {
4656
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4657
+ if (cur->ne[1] % 8 == 0) {
4658
+ return &q6_K_8x8_q8_K;
4659
+ }
4660
+ }
4661
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4662
+ if (cur->ne[1] % 8 == 0) {
4663
+ return &q6_K_8x4_q8_K;
4664
+ }
4665
+ }
4666
+ } else if (cur->type == GGML_TYPE_IQ4_NL) {
4667
+ if (ggml_cpu_has_avx2()) {
4668
+ if (cur->ne[1] % 8 == 0) {
4669
+ return &iq4_nl_8x8_q8_0;
4670
+ }
4671
+ }
4672
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4673
+ if (cur->ne[1] % 4 == 0) {
4674
+ return &iq4_nl_4x4_q8_0;
4675
+ }
4676
+ }
4677
+ if (ggml_cpu_has_riscv_v()) {
4678
+ #if defined __riscv_zvfh
4679
+ switch (__riscv_vlenb() * 8) {
4680
+ case 128: { break; } // TODO
4681
+ case 256: { if (cur->ne[1] % 16 == 0) { return &iq4_nl_16x1_q8_0; } break; }
4682
+ case 512: { break; } // TODO
4683
+ case 1024: { break; } // TODO
4684
+ default: { return nullptr; }
4685
+ }
4686
+ #endif
4687
+ }
4688
+ } else if (cur->type == GGML_TYPE_MXFP4) {
4689
+ if (ggml_cpu_has_avx2()) {
4690
+ if (cur->ne[1] % 8 == 0) {
4691
+ return &mxfp4_8x8_q8_0;
4692
+ }
4693
+ }
4694
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4695
+ if (cur->ne[1] % 4 == 0) {
4696
+ return &mxfp4_4x4_q8_0;
4697
+ }
4698
+ }
4699
+ } else if (cur->type == GGML_TYPE_Q8_0) {
4700
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
4701
+ if (cur->ne[1] % 4 == 0) {
4702
+ return &q8_0_4x8_q8_0;
4703
+ }
4704
+ }
4705
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
4706
+ if (cur->ne[1] % 4 == 0) {
4707
+ return &q8_0_4x4_q8_0;
4708
+ }
4709
+ }
4710
+ if (ggml_cpu_has_riscv_v()) {
4711
+ #if defined __riscv_zvfh
4712
+ switch (__riscv_vlenb() * 8) {
4713
+ case 128: { break; } // TODO
4714
+ case 256: { if (cur->ne[1] % 16 == 0) { return &q8_0_16x1_q8_0; } break; }
4715
+ case 512: { break; } // TODO
4716
+ case 1024: { break; } // TODO
4717
+ default: { return nullptr; }
4718
+ }
4719
+ #endif
4720
+ }
4721
+ }
4722
+
4723
+ return nullptr;
4724
+ }
4725
+
4726
+ static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
4727
+ tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));
4728
+
4729
+ GGML_UNUSED(buffer);
4730
+ return GGML_STATUS_SUCCESS;
4731
+ }
4732
+
4733
+ static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
4734
+ const void * data, size_t offset, size_t size) {
4735
+ GGML_ASSERT(offset == 0);
4736
+ GGML_ASSERT(size == ggml_nbytes(tensor));
4737
+
4738
+ auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
4739
+ auto OK = tensor_traits->repack(tensor, data, size);
4740
+
4741
+ GGML_ASSERT(OK == 0);
4742
+ GGML_UNUSED(buffer);
4743
+ }
4744
+
4745
+ static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
4746
+ return "CPU_REPACK";
4747
+
4748
+ GGML_UNUSED(buft);
4749
+ }
4750
+
4751
+ static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
4752
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
4753
+
4754
+ if (buffer == nullptr) {
4755
+ return nullptr;
4756
+ }
4757
+
4758
+ buffer->buft = buft;
4759
+ buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
4760
+ buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor;
4761
+ buffer->iface.get_tensor = nullptr;
4762
+ buffer->iface.cpy_tensor = nullptr;
4763
+ return buffer;
4764
+ }
4765
+
4766
+ static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
4767
+ return TENSOR_ALIGNMENT;
4768
+
4769
+ GGML_UNUSED(buft);
4770
+ }
4771
+
4772
+ namespace ggml::cpu::repack {
4773
+ class extra_buffer_type : ggml::cpu::extra_buffer_type {
4774
+ bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
4775
+ if ( op->op == GGML_OP_MUL_MAT &&
4776
+ op->src[0]->buffer &&
4777
+ (ggml_n_dims(op->src[0]) == 2) &&
4778
+ op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() &&
4779
+ ggml_repack_get_optimal_repack_type(op->src[0])
4780
+ ) {
4781
+ if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
4782
+ return false;
4783
+ }
4784
+ if (op->src[1]->type == GGML_TYPE_F32) {
4785
+ return true;
4786
+ }
4787
+ //if (op->src[1]->type == GGML_TYPE_Q8_0) {
4788
+ // return true;
4789
+ //}
4790
+ // may be possible if Q8_0 packed...
4791
+ } else if (op->op == GGML_OP_MUL_MAT_ID
4792
+ && op->src[0]->buffer
4793
+ && (ggml_n_dims(op->src[0]) == 3)
4794
+ && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
4795
+ && ggml_repack_get_optimal_repack_type(op->src[0])
4796
+ ) {
4797
+ if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
4798
+ return false;
4799
+ }
4800
+ if (op->src[1]->type == GGML_TYPE_F32) {
4801
+ return true;
4802
+ }
4803
+ //if (op->src[1]->type == GGML_TYPE_Q8_0) {
4804
+ // return true;
4805
+ //}
4806
+ }
4807
+ return false;
4808
+ }
4809
+
4810
+ ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
4811
+ if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
4812
+ if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
4813
+ return (ggml::cpu::tensor_traits *) op->src[0]->extra;
4814
+ }
4815
+ }
4816
+ return nullptr;
4817
+ }
4818
+ };
4819
+ } // namespace ggml::cpu::repack
4820
+
4821
+ ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
4822
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = {
4823
+ /* .iface = */ {
4824
+ /* .get_name = */ ggml_backend_cpu_repack_buffer_type_get_name,
4825
+ /* .alloc_buffer = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer,
4826
+ /* .get_alignment = */ ggml_backend_cpu_repack_buffer_type_get_alignment,
4827
+ /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
4828
+ /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
4829
+ /* .is_host = */ nullptr,
4830
+ },
4831
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
4832
+ /* .context = */ new ggml::cpu::repack::extra_buffer_type(),
4833
+ };
4834
+
4835
+ return &ggml_backend_cpu_buffer_type_repack;
4836
+ }