toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,4553 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+ #include "ggml-quants.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
7
+
8
+ #include "../../quants.h"
9
+ #include "../../ggml-cpu-impl.h"
10
+
11
+ #include <math.h>
12
+ #include <string.h>
13
+ #include <assert.h>
14
+ #include <float.h>
15
+ #include <stdlib.h> // for qsort
16
+ #include <stdio.h> // for GGML_ASSERT
17
+
18
+ #ifdef _MSC_VER
19
+ #define NOINLINE __declspec(noinline)
20
+ #else
21
+ #define NOINLINE __attribute__((__noinline__))
22
+ #endif
23
+
24
+ #define GROUP_MAX_EPS 1e-15f
25
+ #define GROUP_MAX_EPS_IQ3_XXS 1e-8f
26
+ #define GROUP_MAX_EPS_IQ2_S 1e-8f
27
+ #define GROUP_MAX_EPS_IQ1_M 1e-7f
28
+ #define GROUP_MAX_EPS_IQ1_S 1e-12f
29
+
30
+ #define UNUSED GGML_UNUSED
31
+
32
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
33
+ assert(QK8_0 == 32);
34
+ assert(k % QK8_0 == 0);
35
+ const int nb = k / QK8_0;
36
+
37
+ block_q8_0 * GGML_RESTRICT y = vy;
38
+
39
+ #if defined(__riscv_v)
40
+
41
+ size_t vl = QK8_0;
42
+
43
+ for (int i = 0; i < nb; i++) {
44
+ // load elements
45
+ vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_0, vl);
46
+
47
+ vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl);
48
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
49
+ vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl);
50
+ float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
51
+
52
+ const float d = amax / ((1 << 7) - 1);
53
+ const float id = d ? 1.0f/d : 0.0f;
54
+
55
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
56
+
57
+ vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
58
+
59
+ // convert to integer
60
+ vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl);
61
+ vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl);
62
+
63
+ // store result
64
+ __riscv_vse8_v_i8m2(y[i].qs , vs, vl);
65
+ }
66
+ #else
67
+ GGML_UNUSED(nb);
68
+ // scalar
69
+ quantize_row_q8_0_ref(x, y, k);
70
+ #endif
71
+ }
72
+
73
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
74
+ assert(k % QK8_1 == 0);
75
+ const int nb = k / QK8_1;
76
+
77
+ block_q8_1 * GGML_RESTRICT y = vy;
78
+
79
+ #if defined(__riscv_v)
80
+
81
+ size_t vl = QK8_1;
82
+
83
+ for (int i = 0; i < nb; i++) {
84
+ // load elements
85
+ vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_1, vl);
86
+
87
+ vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl);
88
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl);
89
+ vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl);
90
+ float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
91
+
92
+ const float d = amax / ((1 << 7) - 1);
93
+ const float id = d ? 1.0f/d : 0.0f;
94
+
95
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
96
+
97
+ vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
98
+
99
+ // convert to integer
100
+ vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl);
101
+ vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl);
102
+
103
+ // store result
104
+ __riscv_vse8_v_i8m2(y[i].qs , vs, vl);
105
+
106
+ // compute sum for y[i].s
107
+ vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
108
+ vint16m1_t vwrs = __riscv_vwredsum_vs_i8m2_i16m1(vs, tmp2, vl);
109
+
110
+ // set y[i].s
111
+ int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
112
+ y[i].s = GGML_CPU_FP32_TO_FP16(sum*d);
113
+ }
114
+
115
+ #else
116
+ GGML_UNUSED(nb);
117
+ // scalar
118
+ quantize_row_q8_1_ref(x, y, k);
119
+ #endif
120
+ }
121
+
122
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
123
+ assert(k % QK_K == 0);
124
+ size_t nb = k / QK_K;
125
+
126
+ #if defined __riscv_v_intrinsic
127
+ block_q8_K * y_blocks = (block_q8_K *)y;
128
+ const size_t vlmax_f32m8 = __riscv_vsetvlmax_e32m8();
129
+
130
+ for (size_t i = 0; i < nb; i++) {
131
+ const float* x_block = x + i * QK_K;
132
+ block_q8_K* y_block = &y_blocks[i];
133
+
134
+ // 1. Calculate Min/Max
135
+ vfloat32m8_t max_v = __riscv_vfmv_v_f_f32m8(-__builtin_inff(), vlmax_f32m8);
136
+ vfloat32m8_t min_v = __riscv_vfmv_v_f_f32m8(__builtin_inff(), vlmax_f32m8);
137
+
138
+ size_t rem = QK_K;
139
+ size_t offset = 0;
140
+ while (rem > 0) {
141
+ size_t vl = __riscv_vsetvl_e32m8(rem);
142
+ vfloat32m8_t v_curr = __riscv_vle32_v_f32m8(x_block + offset, vl);
143
+ max_v = __riscv_vfmax_vv_f32m8(max_v, v_curr, vl);
144
+ min_v = __riscv_vfmin_vv_f32m8(min_v, v_curr, vl);
145
+ rem -= vl;
146
+ offset += vl;
147
+ }
148
+
149
+ vfloat32m1_t v_init_max = __riscv_vfmv_s_f_f32m1(-__builtin_inff(), 1);
150
+ vfloat32m1_t v_init_min = __riscv_vfmv_s_f_f32m1(__builtin_inff(), 1);
151
+
152
+ vfloat32m1_t v_scalar_max = __riscv_vfredmax_vs_f32m8_f32m1(max_v, v_init_max, vlmax_f32m8);
153
+ vfloat32m1_t v_scalar_min = __riscv_vfredmin_vs_f32m8_f32m1(min_v, v_init_min, vlmax_f32m8);
154
+
155
+ float max_val = __riscv_vfmv_f_s_f32m1_f32(v_scalar_max);
156
+ float min_val = __riscv_vfmv_f_s_f32m1_f32(v_scalar_min);
157
+
158
+ float amax = fabsf(max_val) > fabsf(min_val) ? fabsf(max_val) : fabsf(min_val);
159
+
160
+ if (amax == 0.0f) {
161
+ y_block->d = 0.0f;
162
+ memset(y_block->qs, 0, QK_K);
163
+ memset(y_block->bsums, 0, sizeof(y_block->bsums));
164
+ continue;
165
+ }
166
+
167
+ const float iscale = -127.f / (fabsf(max_val) > fabsf(min_val) ? max_val : min_val);
168
+ y_block->d = 1.0f / iscale;
169
+
170
+ // 2. Quantize and Calculate Sums
171
+ offset = 0;
172
+ rem = QK_K;
173
+ vint16m1_t v_zero_sum = __riscv_vmv_v_x_i16m1(0, 1);
174
+
175
+ while (rem > 0) {
176
+ size_t vl = __riscv_vsetvl_e32m8(rem);
177
+ vfloat32m8_t v_f = __riscv_vle32_v_f32m8(x_block + offset, vl);
178
+
179
+ v_f = __riscv_vfmul_vf_f32m8(v_f, iscale, vl);
180
+
181
+ vint32m8_t v_i32 = __riscv_vfcvt_x_f_v_i32m8_rm(v_f, __RISCV_FRM_RNE, vl);
182
+ vint16m4_t v_i16 = __riscv_vnclip_wx_i16m4(v_i32, 0, __RISCV_VXRM_RNE, vl);
183
+ vint8m2_t v_q = __riscv_vnclip_wx_i8m2(v_i16, 0, __RISCV_VXRM_RNE, vl);
184
+
185
+ __riscv_vse8_v_i8m2(y_block->qs + offset, v_q, vl);
186
+
187
+ // first iteration clear
188
+
189
+ int sum_idx;
190
+ vint8m1_t chunk_m1;
191
+ vint16m1_t v_sum;
192
+ sum_idx = offset / 16;
193
+ chunk_m1 = __riscv_vget_v_i8m2_i8m1(v_q, 0);
194
+ v_sum = __riscv_vwredsum_vs_i8m1_i16m1(chunk_m1, v_zero_sum, 16);
195
+ y_block->bsums[sum_idx] = (int16_t)__riscv_vmv_x_s_i16m1_i16(v_sum);
196
+
197
+ // remaining iterations
198
+ vint8m2_t slid_q = v_q;
199
+ for (size_t k = 16; k < vl; k += 16) {
200
+ slid_q = __riscv_vslidedown_vx_i8m2(slid_q, 16, vl);
201
+
202
+ sum_idx = (offset + k) / 16;
203
+ chunk_m1 = __riscv_vget_v_i8m2_i8m1(slid_q, 0);
204
+
205
+ v_sum = __riscv_vwredsum_vs_i8m1_i16m1(chunk_m1, v_zero_sum, 16);
206
+ y_block->bsums[sum_idx] =(int16_t)__riscv_vmv_x_s_i16m1_i16(v_sum);
207
+ }
208
+
209
+ rem -= vl;
210
+ offset += vl;
211
+ }
212
+ }
213
+ #else
214
+ GGML_UNUSED(nb);
215
+ // scalar
216
+ quantize_row_q8_K_ref(x, y, k);
217
+ #endif
218
+ }
219
+
220
+ //===================================== Dot products =================================
221
+
222
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
223
+ #if defined(__riscv_v)
224
+ const int qk = QK8_0;
225
+ const int nb = n / qk;
226
+
227
+ assert(n % qk == 0);
228
+ assert(nrc == 1);
229
+ UNUSED(nrc);
230
+ UNUSED(bx);
231
+ UNUSED(by);
232
+ UNUSED(bs);
233
+
234
+ const block_q4_0 * GGML_RESTRICT x = vx;
235
+ const block_q8_0 * GGML_RESTRICT y = vy;
236
+
237
+ int ib = 0;
238
+ float sumf = 0;
239
+
240
+ size_t vl = qk / 2;
241
+
242
+ for (; ib < nb; ++ib) {
243
+ // load elements
244
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl);
245
+
246
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl);
247
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl);
248
+
249
+ // mask and store lower part of x, and then upper part
250
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
251
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
252
+
253
+ vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
254
+ vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
255
+
256
+ // subtract offset
257
+ vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
258
+ vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
259
+
260
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
261
+ vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl);
262
+
263
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
264
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
265
+
266
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
267
+
268
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
269
+ }
270
+
271
+ *s = sumf;
272
+ #else
273
+ ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
274
+ #endif
275
+ }
276
+
277
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
278
+ #if defined(__riscv_v)
279
+ const int qk = QK8_1;
280
+ const int nb = n / qk;
281
+
282
+ assert(n % qk == 0);
283
+ assert(nrc == 1);
284
+ UNUSED(nrc);
285
+ UNUSED(bx);
286
+ UNUSED(by);
287
+ UNUSED(bs);
288
+
289
+ const block_q4_1 * GGML_RESTRICT x = vx;
290
+ const block_q8_1 * GGML_RESTRICT y = vy;
291
+
292
+ int ib = 0;
293
+ float sumf = 0;
294
+
295
+ size_t vl = qk / 2;
296
+
297
+ for (; ib < nb; ++ib) {
298
+ // load elements
299
+ vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl);
300
+
301
+ vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl);
302
+ vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl);
303
+
304
+ // mask and store lower part of x, and then upper part
305
+ vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
306
+ vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
307
+
308
+ vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
309
+ vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
310
+
311
+ vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
312
+ vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl);
313
+
314
+ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
315
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
316
+
317
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
318
+
319
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
320
+ }
321
+
322
+ *s = sumf;
323
+ #else
324
+ ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
325
+ #endif
326
+ }
327
+
328
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
329
+ #if defined(__riscv_v)
330
+ const int qk = QK8_0;
331
+ const int nb = n / qk;
332
+
333
+ int ib = 0;
334
+ float sumf = 0;
335
+
336
+ assert(n % qk == 0);
337
+ assert(qk == QK5_0);
338
+ assert(nrc == 1);
339
+ UNUSED(nrc);
340
+ UNUSED(bx);
341
+ UNUSED(by);
342
+ UNUSED(bs);
343
+
344
+ const block_q5_0 * GGML_RESTRICT x = vx;
345
+ const block_q8_0 * GGML_RESTRICT y = vy;
346
+
347
+ size_t vl;
348
+ size_t vlenb = __riscv_vlenb();
349
+
350
+ for (; ib < nb; ++ib) {
351
+ vl = qk / 2;
352
+ vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl);
353
+ vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl));
354
+ vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl));
355
+ vint8m2_t v0c;
356
+ if (vlenb == 16) {
357
+ v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h);
358
+ } else {
359
+ v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32);
360
+ v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l);
361
+ }
362
+
363
+ vl = qk;
364
+ vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl);
365
+ qh = __riscv_vmnand_mm_b4(qh, qh, vl);
366
+ vint8m2_t v0f = __riscv_vsub_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl);
367
+ vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl);
368
+ vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl);
369
+ vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl);
370
+ vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
371
+ int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
372
+
373
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
374
+ }
375
+
376
+ *s = sumf;
377
+ #else
378
+ ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
379
+ #endif
380
+ }
381
+
382
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
383
+ #if defined(__riscv_v)
384
+ const int qk = QK8_1;
385
+ const int nb = n / qk;
386
+
387
+ int ib = 0;
388
+ float sumf = 0;
389
+
390
+ assert(n % qk == 0);
391
+ assert(qk == QK5_1);
392
+ assert(nrc == 1);
393
+ UNUSED(nrc);
394
+ UNUSED(bx);
395
+ UNUSED(by);
396
+ UNUSED(bs);
397
+
398
+ const block_q5_1 * GGML_RESTRICT x = vx;
399
+ const block_q8_1 * GGML_RESTRICT y = vy;
400
+
401
+ size_t vl;
402
+ size_t vlenb = __riscv_vlenb();
403
+
404
+ for (; ib < nb; ++ib) {
405
+ vl = qk / 2;
406
+ vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl);
407
+ vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl));
408
+ vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl));
409
+ vint8m2_t v0c;
410
+ if (vlenb == 16) {
411
+ v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h);
412
+ } else {
413
+ v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32);
414
+ v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l);
415
+ }
416
+
417
+ vl = qk;
418
+ vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl);
419
+ vint8m2_t v0f = __riscv_vor_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl);
420
+ vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl);
421
+ vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl);
422
+ vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl);
423
+ vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
424
+ int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
425
+
426
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
427
+ }
428
+
429
+ *s = sumf;
430
+ #else
431
+ ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
432
+ #endif
433
+ }
434
+
435
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
436
+ const int qk = QK8_0;
437
+ const int nb = n / qk;
438
+
439
+ assert(n % qk == 0);
440
+ assert(nrc == 1);
441
+ UNUSED(nrc);
442
+ UNUSED(bx);
443
+ UNUSED(by);
444
+ UNUSED(bs);
445
+
446
+ const block_q8_0 * GGML_RESTRICT x = vx;
447
+ const block_q8_0 * GGML_RESTRICT y = vy;
448
+
449
+ int ib = 0;
450
+ float sumf = 0;
451
+
452
+ #if defined(__riscv_v)
453
+ size_t vl = qk;
454
+
455
+ for (; ib < nb; ++ib) {
456
+ // load elements
457
+ vint8m2_t bx_0 = __riscv_vle8_v_i8m2(x[ib].qs, vl);
458
+ vint8m2_t by_0 = __riscv_vle8_v_i8m2(y[ib].qs, vl);
459
+
460
+ vint16m4_t vw_mul = __riscv_vwmul_vv_i16m4(bx_0, by_0, vl);
461
+
462
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
463
+ vint32m1_t v_sum = __riscv_vwredsum_vs_i16m4_i32m1(vw_mul, v_zero, vl);
464
+
465
+ int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
466
+
467
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
468
+ }
469
+
470
+ *s = sumf;
471
+ #else
472
+
473
+ UNUSED(nb);
474
+ UNUSED(x);
475
+ UNUSED(y);
476
+ UNUSED(ib);
477
+ UNUSED(sumf);
478
+
479
+ ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
480
+ #endif
481
+ }
482
+
483
+ #if defined(__riscv_v)
484
+ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
485
+ const int qk = QK1_0;
486
+ const int nb = n / qk;
487
+ assert(n % qk == 0);
488
+
489
+ const block_q1_0 * GGML_RESTRICT x = vx;
490
+ const block_q8_0 * GGML_RESTRICT y = vy;
491
+
492
+ //LMUL = 1, VLMAX = 32
493
+ const size_t vl32 = __riscv_vsetvl_e8m1(32);
494
+ assert(vl32 == 32);
495
+
496
+ const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
497
+
498
+ float sumf = 0;
499
+
500
+ for (int ib = 0; ib < nb; ++ib) {
501
+ const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
502
+
503
+ float acc = 0;
504
+
505
+ for (int k = 0; k < 4; ++k) {
506
+ const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
507
+ const vbool8_t is_not_zero = __riscv_vlm_v_b8(x[ib].qs + 4 * k, vl32);
508
+
509
+ const vint8m1_t qy = __riscv_vle8_v_i8m1(yb->qs, vl32);
510
+ const vint8m1_t neg_qy = __riscv_vneg_v_i8m1(qy, vl32);
511
+ const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(neg_qy, qy, is_not_zero, vl32);
512
+
513
+ const vint16m1_t red = __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl32);
514
+ acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
515
+ }
516
+
517
+ sumf += d0 * acc;
518
+ }
519
+
520
+ *s = sumf;
521
+ }
522
+
523
+ static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
524
+ const int qk = QK1_0;
525
+ const int nb = n / qk;
526
+ assert(n % qk == 0);
527
+
528
+ const block_q1_0 * GGML_RESTRICT x = vx;
529
+ const block_q8_0 * GGML_RESTRICT y = vy;
530
+
531
+ //LMUL = 2, VLMAX = 32
532
+ const size_t vl32 = __riscv_vsetvl_e8m2(32);
533
+ assert(vl32 == 32);
534
+
535
+ const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
536
+
537
+ float sumf = 0;
538
+
539
+ for (int ib = 0; ib < nb; ++ib) {
540
+ const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
541
+
542
+ float acc = 0;
543
+
544
+ for (int k = 0; k < 4; ++k) {
545
+ const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
546
+ const vbool4_t is_not_zero = __riscv_vlm_v_b4(x[ib].qs + 4 * k, vl32);
547
+
548
+ const vint8m2_t qy = __riscv_vle8_v_i8m2(yb->qs, vl32);
549
+ const vint8m2_t neg_qy =__riscv_vneg_v_i8m2(qy, vl32);
550
+ const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(neg_qy, qy, is_not_zero, vl32);
551
+
552
+ const vint16m1_t red = __riscv_vwredsum_vs_i8m2_i16m1(sy, zero, vl32);
553
+ acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
554
+ }
555
+
556
+ sumf += d0 * acc;
557
+ }
558
+
559
+ *s = sumf;
560
+ }
561
+ #endif
562
+
563
+ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
564
+ #if defined(__riscv_v)
565
+ assert(nrc == 1);
566
+
567
+ const size_t vlen_bits = __riscv_vlenb() * 8;
568
+
569
+ if (vlen_bits >= 256) {
570
+ ggml_vec_dot_q1_0_q8_0_vl256(n, s, vx, vy);
571
+ } else if (vlen_bits >= 128) {
572
+ ggml_vec_dot_q1_0_q8_0_vl128(n, s, vx, vy);
573
+ } else {
574
+ ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
575
+ }
576
+ #else
577
+ ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
578
+ #endif
579
+ }
580
+
581
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
582
+ assert(nrc == 1);
583
+ UNUSED(nrc);
584
+ UNUSED(bx);
585
+ UNUSED(by);
586
+ UNUSED(bs);
587
+
588
+ const block_q2_K * GGML_RESTRICT x = vx;
589
+ const block_q8_K * GGML_RESTRICT y = vy;
590
+
591
+ const int nb = n / QK_K;
592
+
593
+ #if defined __riscv_xtheadvector
594
+
595
+ float sumf = 0;
596
+ uint8_t atmp[16];
597
+
598
+ for (int i = 0; i < nb; ++i) {
599
+ const uint8_t * q2 = x[i].qs;
600
+ const int8_t * q8 = y[i].qs;
601
+ const uint8_t * sc = x[i].scales;
602
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
603
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
604
+ uint8_t *patmp = atmp;
605
+ int vsums;
606
+ int tmp;
607
+ __asm__ __volatile__(
608
+ "th.vsetvli zero, %[vl16], e8, m1\n\t"
609
+ "th.vmv.v.x v8, zero\n\t"
610
+ "th.vlb.v v1, (%[sc])\n\t"
611
+ "th.vand.vi v0, v1, 0xF\n\t"
612
+ "th.vsrl.vi v1, v1, 4\n\t"
613
+ "th.vsb.v v0, (%[scale])\n\t"
614
+ "th.vwaddu.vx v16, v1, zero\n\t"
615
+ "th.vsetvli zero, %[vl16], e16, m2\n\t"
616
+ "th.vlh.v v2, (%[bsums])\n\t"
617
+ "th.vwmul.vv v4, v16, v2\n\t"
618
+ "th.vsetvli zero, %[vl16], e32, m4\n\t"
619
+ "th.vredsum.vs v8, v4, v8\n\t"
620
+ "th.vmv.x.s %[vsums], v8"
621
+ : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
622
+ : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
623
+ , [vl16] "r" (16)
624
+ : "memory"
625
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
626
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
627
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
628
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
629
+ );
630
+ sumf += dmin * vsums;
631
+ int isum = 0;
632
+
633
+ for (int j = 0; j < QK_K/128; ++j) {
634
+ __asm__ __volatile__(
635
+ "th.vsetvli zero, %[vl32], e8, m2\n\t"
636
+ "th.vlb.v v0, (%[q2])\n\t"
637
+ "th.vsrl.vi v2, v0, 2\n\t"
638
+ "th.vsrl.vi v4, v0, 4\n\t"
639
+ "th.vsrl.vi v6, v0, 6\n\t"
640
+ "th.vand.vi v0, v0, 0x3\n\t"
641
+ "th.vand.vi v2, v2, 0x3\n\t"
642
+ "th.vand.vi v4, v4, 0x3\n\t"
643
+ "th.vsetvli zero, %[vl128], e8, m8\n\t"
644
+ "th.vlb.v v8, (%[q8])\n\t"
645
+ "th.vsetvli zero, %[vl64], e8, m4\n\t"
646
+ "th.vwmul.vv v16, v0, v8\n\t"
647
+ "th.vwmul.vv v24, v4, v12\n\t"
648
+ "th.vsetvli zero, %[vl16], e16, m2\n\t"
649
+ "th.vmv.v.x v0, zero\n\t"
650
+ "th.vwredsum.vs v10, v16, v0\n\t"
651
+ "th.vwredsum.vs v9, v18, v0\n\t"
652
+ "th.vwredsum.vs v8, v20, v0\n\t"
653
+ "th.vwredsum.vs v7, v22, v0\n\t"
654
+ "th.vwredsum.vs v11, v24, v0\n\t"
655
+ "th.vwredsum.vs v12, v26, v0\n\t"
656
+ "th.vwredsum.vs v13, v28, v0\n\t"
657
+ "th.vwredsum.vs v14, v30, v0\n\t"
658
+ "li %[tmp], 4\n\t"
659
+ "th.vsetvli zero, %[tmp], e32, m1\n\t"
660
+ "th.vslideup.vi v10, v9, 1\n\t"
661
+ "th.vslideup.vi v8, v7, 1\n\t"
662
+ "th.vslideup.vi v11, v12, 1\n\t"
663
+ "th.vslideup.vi v13, v14, 1\n\t"
664
+ "th.vslideup.vi v10, v8, 2\n\t"
665
+ "th.vslideup.vi v11, v13, 2\n\t"
666
+ "li %[tmp], 8\n\t"
667
+ "th.vsetvli zero, %[tmp], e32, m2\n\t"
668
+ "th.vlbu.v v12, (%[scale])\n\t"
669
+ "th.vmul.vv v10, v10, v12\n\t"
670
+ "th.vredsum.vs v0, v10, v0\n\t"
671
+ "th.vmv.x.s %[tmp], v0\n\t"
672
+ "add %[isum], %[isum], %[tmp]"
673
+ : [tmp] "=&r" (tmp), [isum] "+&r" (isum)
674
+ : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
675
+ , [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
676
+ : "memory"
677
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
678
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
679
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
680
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
681
+ );
682
+ q2 += 32; q8 += 128; patmp += 8;
683
+ }
684
+
685
+ sumf += dall * isum;
686
+ }
687
+
688
+ *s = sumf;
689
+
690
+ #elif defined __riscv_v
691
+
692
+ float sumf = 0;
693
+ uint8_t atmp[16];
694
+
695
+ const int vector_length = __riscv_vlenb() * 8;
696
+ uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
697
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
698
+
699
+ switch (vector_length) {
700
+ case 256:
701
+ for (int i = 0; i < nb; ++i) {
702
+ const uint8_t * q2 = x[i].qs;
703
+ const int8_t * q8 = y[i].qs;
704
+ const uint8_t * sc = x[i].scales;
705
+
706
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
707
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
708
+
709
+ size_t vl = 16;
710
+
711
+ vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
712
+ vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
713
+
714
+ vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
715
+
716
+ vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
717
+ vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
718
+ vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
719
+ vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
720
+ vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
721
+
722
+ sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
723
+
724
+ vl = 32;
725
+
726
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
727
+ vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
728
+
729
+ uint8_t is = 0;
730
+ int isum = 0;
731
+
732
+ for (int j = 0; j < QK_K / 128; ++j) {
733
+ // load Q2
734
+ vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
735
+
736
+ vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
737
+ vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03, vl);
738
+ vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03, vl);
739
+ vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03, vl);
740
+
741
+ // duplicate scale elements for product
742
+ vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0 + is, vl), vl);
743
+ vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2 + is, vl), vl);
744
+ vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4 + is, vl), vl);
745
+ vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6 + is, vl), vl);
746
+
747
+ vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
748
+ vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
749
+ vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
750
+ vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
751
+
752
+ // load Q8
753
+ vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
754
+ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8 + 32, vl);
755
+ vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8 + 64, vl);
756
+ vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8 + 96, vl);
757
+
758
+ vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
759
+ vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
760
+ vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
761
+ vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
762
+
763
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
764
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
765
+
766
+ isum += __riscv_vmv_x_s_i32m1_i32(isum1);
767
+
768
+ q2 += 32;
769
+ q8 += 128;
770
+ is = 8;
771
+ }
772
+
773
+ sumf += dall * isum;
774
+ }
775
+ break;
776
+ case 128:
777
+ for (int i = 0; i < nb; ++i) {
778
+ const uint8_t * q2 = x[i].qs;
779
+ const int8_t * q8 = y[i].qs;
780
+ const uint8_t * sc = x[i].scales;
781
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
782
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
783
+ uint8_t *patmp = atmp;
784
+ int vsums;
785
+ int tmp, t1, t2, t3, t4, t5, t6, t7;
786
+ __asm__ __volatile__(
787
+ "vsetivli zero, 16, e8, m1\n\t"
788
+ "vmv.v.x v8, zero\n\t"
789
+ "lb zero, 15(%[sc])\n\t"
790
+ "vle8.v v1, (%[sc])\n\t"
791
+ "vle8.v v2, (%[bsums])\n\t"
792
+ "addi %[tmp], %[bsums], 16\n\t"
793
+ "vand.vi v0, v1, 0xF\n\t"
794
+ "vsrl.vi v1, v1, 4\n\t"
795
+ "vle8.v v3, (%[tmp])\n\t"
796
+ "vse8.v v0, (%[scale])\n\t"
797
+ "vsetivli zero, 16, e16, m2\n\t"
798
+ "vzext.vf2 v0, v1\n\t"
799
+ "vwmul.vv v4, v0, v2\n\t"
800
+ "vsetivli zero, 16, e32, m4\n\t"
801
+ "vredsum.vs v8, v4, v8\n\t"
802
+ "vmv.x.s %[vsums], v8"
803
+ : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
804
+ : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
805
+ : "memory"
806
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
807
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
808
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
809
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
810
+ );
811
+ sumf += dmin * vsums;
812
+ int isum = 0;
813
+
814
+ for (int j = 0; j < QK_K/128; ++j) {
815
+ __asm__ __volatile__(
816
+ "lb zero, 31(%[q2])\n\t"
817
+ "addi %[tmp], %[q2], 16\n\t"
818
+ "addi %[t1], %[q8], 16\n\t"
819
+ "vsetivli zero, 16, e8, m1\n\t"
820
+ "vle8.v v0, (%[q2])\n\t"
821
+ "vle8.v v1, (%[tmp])\n\t"
822
+ "vsrl.vi v2, v0, 2\n\t"
823
+ "vsrl.vi v3, v1, 2\n\t"
824
+ "vsrl.vi v4, v0, 4\n\t"
825
+ "addi %[tmp], %[q8], 32\n\t"
826
+ "vle8.v v8, (%[q8])\n\t"
827
+ "vle8.v v9, (%[t1])\n\t"
828
+ "addi %[t1], %[t1], 32\n\t"
829
+ "vsrl.vi v5, v1, 4\n\t"
830
+ "vsrl.vi v6, v0, 6\n\t"
831
+ "vsrl.vi v7, v1, 6\n\t"
832
+ "vle8.v v10, (%[tmp])\n\t"
833
+ "vle8.v v11, (%[t1])\n\t"
834
+ "addi %[tmp], %[tmp], 32\n\t"
835
+ "addi %[t1], %[t1], 32\n\t"
836
+ "vand.vi v0, v0, 0x3\n\t"
837
+ "vand.vi v1, v1, 0x3\n\t"
838
+ "vand.vi v2, v2, 0x3\n\t"
839
+ "vle8.v v12, (%[tmp])\n\t"
840
+ "vle8.v v13, (%[t1])\n\t"
841
+ "addi %[tmp], %[tmp], 32\n\t"
842
+ "addi %[t1], %[t1], 32\n\t"
843
+ "vand.vi v3, v3, 0x3\n\t"
844
+ "vand.vi v4, v4, 0x3\n\t"
845
+ "vand.vi v5, v5, 0x3\n\t"
846
+ "vle8.v v14, (%[tmp])\n\t"
847
+ "vle8.v v15, (%[t1])\n\t"
848
+ "vwmul.vv v16, v0, v8\n\t"
849
+ "vwmul.vv v18, v1, v9\n\t"
850
+ "vwmul.vv v20, v2, v10\n\t"
851
+ "vwmul.vv v22, v3, v11\n\t"
852
+ "vwmul.vv v24, v4, v12\n\t"
853
+ "vwmul.vv v26, v5, v13\n\t"
854
+ "vwmul.vv v28, v6, v14\n\t"
855
+ "vwmul.vv v30, v7, v15\n\t"
856
+ "vsetivli zero, 8, e16, m1\n\t"
857
+ "vmv.v.x v0, zero\n\t"
858
+ "lbu %[tmp], 0(%[scale])\n\t"
859
+ "vwredsum.vs v8, v16, v0\n\t"
860
+ "vwredsum.vs v9, v18, v0\n\t"
861
+ "lbu %[t1], 1(%[scale])\n\t"
862
+ "vwredsum.vs v10, v20, v0\n\t"
863
+ "vwredsum.vs v11, v22, v0\n\t"
864
+ "lbu %[t2], 2(%[scale])\n\t"
865
+ "vwredsum.vs v12, v24, v0\n\t"
866
+ "vwredsum.vs v13, v26, v0\n\t"
867
+ "lbu %[t3], 3(%[scale])\n\t"
868
+ "vwredsum.vs v14, v28, v0\n\t"
869
+ "vwredsum.vs v15, v30, v0\n\t"
870
+ "lbu %[t4], 4(%[scale])\n\t"
871
+ "vwredsum.vs v8, v17, v8\n\t"
872
+ "vwredsum.vs v9, v19, v9\n\t"
873
+ "lbu %[t5], 5(%[scale])\n\t"
874
+ "vwredsum.vs v10, v21, v10\n\t"
875
+ "vwredsum.vs v11, v23, v11\n\t"
876
+ "lbu %[t6], 6(%[scale])\n\t"
877
+ "vwredsum.vs v12, v25, v12\n\t"
878
+ "vwredsum.vs v13, v27, v13\n\t"
879
+ "lbu %[t7], 7(%[scale])\n\t"
880
+ "vwredsum.vs v14, v29, v14\n\t"
881
+ "vwredsum.vs v15, v31, v15\n\t"
882
+ "vsetivli zero, 4, e32, m1\n\t"
883
+ "vmul.vx v0, v8, %[tmp]\n\t"
884
+ "vmul.vx v1, v9, %[t1]\n\t"
885
+ "vmacc.vx v0, %[t2], v10\n\t"
886
+ "vmacc.vx v1, %[t3], v11\n\t"
887
+ "vmacc.vx v0, %[t4], v12\n\t"
888
+ "vmacc.vx v1, %[t5], v13\n\t"
889
+ "vmacc.vx v0, %[t6], v14\n\t"
890
+ "vmacc.vx v1, %[t7], v15\n\t"
891
+ "vmv.x.s %[tmp], v0\n\t"
892
+ "vmv.x.s %[t1], v1\n\t"
893
+ "add %[isum], %[isum], %[tmp]\n\t"
894
+ "add %[isum], %[isum], %[t1]"
895
+ : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
896
+ , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
897
+ , [isum] "+&r" (isum)
898
+ : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
899
+ : "memory"
900
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
901
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
902
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
903
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
904
+ );
905
+ q2 += 32; q8 += 128; patmp += 8;
906
+ }
907
+
908
+ sumf += dall * isum;
909
+ }
910
+ break;
911
+ default:
912
+ assert(false && "Unsupported vector length");
913
+ break;
914
+ }
915
+
916
+ *s = sumf;
917
+
918
+ #else
919
+
920
+ UNUSED(x);
921
+ UNUSED(y);
922
+ UNUSED(nb);
923
+
924
+ ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
925
+ #endif
926
+ }
927
+
928
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
929
+ assert(n % QK_K == 0);
930
+ assert(nrc == 1);
931
+ UNUSED(nrc);
932
+ UNUSED(bx);
933
+ UNUSED(by);
934
+ UNUSED(bs);
935
+
936
+ const uint32_t kmask1 = 0x03030303;
937
+ const uint32_t kmask2 = 0x0f0f0f0f;
938
+
939
+ const block_q3_K * GGML_RESTRICT x = vx;
940
+ const block_q8_K * GGML_RESTRICT y = vy;
941
+
942
+ const int nb = n / QK_K;
943
+
944
+ #if defined __riscv_xtheadvector
945
+
946
+ uint32_t utmp[4];
947
+ float sumf = 0;
948
+
949
+ for (int i = 0; i < nb; ++i) {
950
+ const uint8_t * restrict q3 = x[i].qs;
951
+ const uint8_t * restrict qh = x[i].hmask;
952
+ const int8_t * restrict q8 = y[i].qs;
953
+
954
+ int8_t * scale = (int8_t *)utmp;
955
+ int tmp;
956
+ __asm__ __volatile__(
957
+ "li %[tmp], 12\n\t"
958
+ "th.vsetvli zero, %[tmp], e8, m1\n\t"
959
+ "th.vlb.v v0, (%[s6b])\n\t"
960
+ "th.vmv.v.v v2, v0\n\t"
961
+ "li %[tmp], 2\n\t"
962
+ "th.vsetvli zero, %[tmp], e64, m1\n\t"
963
+ "th.vmv.v.x v9, %[sh]\n\t"\
964
+ "th.vslidedown.vi v1, v0, 1\n\t"
965
+ "th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
966
+ "th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
967
+ "li %[tmp], 4\n\t"
968
+ "th.vsetvli zero, %[tmp], e32, m1\n\t"
969
+ "th.vid.v v9\n\t"
970
+ "th.vmv.x.s %[tmp], v1\n\t"
971
+ "th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
972
+ "th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
973
+ "th.vsrl.vv v4, v1, v9\n\t"
974
+ "th.vsrl.vv v2, v0, v8\n\t"
975
+ "th.vand.vx v5, v4, %[kmask1]\n\t"
976
+ "th.vand.vx v3, v2, %[kmask2]\n\t"
977
+ "th.vsll.vi v6, v5, 4\n\t"
978
+ "th.vor.vv v7, v6, v3\n\t"
979
+ "li %[tmp], 16\n\t"
980
+ "th.vsetvli zero, %[tmp], e8, m1\n\t"
981
+ "th.vsub.vx v0, v7, %[c]\n\t"
982
+ "th.vsb.v v0, (%[scale])"
983
+ : [tmp] "=&r" (tmp)
984
+ : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
985
+ , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
986
+ : "memory"
987
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
988
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
989
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
990
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
991
+ );
992
+
993
+ uint8_t m = 1;
994
+ int isum = 0;
995
+ for (int j = 0; j < QK_K; j += 128) {
996
+ __asm__ __volatile__(
997
+ // fixme: use v0p7 mask layout directly
998
+ "th.vsetvli zero, %[vl32], e8, m2\n\t"
999
+ "th.vlb.v v8, (%[q3])\n\t"
1000
+ "th.vsrl.vi v10, v8, 2\n\t"
1001
+ "th.vsrl.vi v12, v8, 4\n\t"
1002
+ "th.vsrl.vi v14, v8, 6\n\t"
1003
+ "th.vand.vi v8, v8, 3\n\t"
1004
+ "th.vand.vi v10, v10, 3\n\t"
1005
+ "th.vand.vi v12, v12, 3\n\t"
1006
+ "th.vlb.v v2, (%[qh])\n\t"
1007
+ "th.vand.vx v4, v2, %[m]\n\t"
1008
+ "slli %[m], %[m], 1\n\t"
1009
+ "th.vmseq.vx v0, v4, zero\n\t"
1010
+ "th.vadd.vi v8, v8, -4, v0.t\n\t"
1011
+ "th.vand.vx v4, v2, %[m]\n\t"
1012
+ "slli %[m], %[m], 1\n\t"
1013
+ "th.vmseq.vx v0, v4, zero\n\t"
1014
+ "th.vadd.vi v10, v10, -4, v0.t\n\t"
1015
+ "th.vand.vx v4, v2, %[m]\n\t"
1016
+ "slli %[m], %[m], 1\n\t"
1017
+ "th.vmseq.vx v0, v4, zero\n\t"
1018
+ "th.vadd.vi v12, v12, -4, v0.t\n\t"
1019
+ "th.vand.vx v4, v2, %[m]\n\t"
1020
+ "slli %[m], %[m], 1\n\t"
1021
+ "th.vmseq.vx v0, v4, zero\n\t"
1022
+ "th.vadd.vi v14, v14, -4, v0.t\n\t"
1023
+ "th.vsetvli zero, %[vl128], e8, m8\n\t"
1024
+ "th.vlb.v v0, (%[q8])\n\t"
1025
+ "th.vsetvli zero, %[vl64], e8, m4\n\t"
1026
+ "th.vwmul.vv v16, v0, v8\n\t"
1027
+ "th.vwmul.vv v24, v4, v12\n\t"
1028
+ "li %[tmp], 16\n\t"
1029
+ "th.vsetvli zero, %[tmp], e16, m2\n\t"
1030
+ "th.vmv.v.x v0, zero\n\t"
1031
+ "th.vwredsum.vs v10, v16, v0\n\t"
1032
+ "th.vwredsum.vs v9, v18, v0\n\t"
1033
+ "th.vwredsum.vs v8, v20, v0\n\t"
1034
+ "th.vwredsum.vs v7, v22, v0\n\t"
1035
+ "th.vwredsum.vs v11, v24, v0\n\t"
1036
+ "th.vwredsum.vs v12, v26, v0\n\t"
1037
+ "th.vwredsum.vs v13, v28, v0\n\t"
1038
+ "th.vwredsum.vs v14, v30, v0\n\t"
1039
+ "li %[tmp], 4\n\t"
1040
+ "th.vsetvli zero, %[tmp], e32, m1\n\t"
1041
+ "th.vslideup.vi v10, v9, 1\n\t"
1042
+ "th.vslideup.vi v8, v7, 1\n\t"
1043
+ "th.vslideup.vi v11, v12, 1\n\t"
1044
+ "th.vslideup.vi v13, v14, 1\n\t"
1045
+ "th.vslideup.vi v10, v8, 2\n\t"
1046
+ "th.vslideup.vi v11, v13, 2\n\t"
1047
+ "li %[tmp], 8\n\t"
1048
+ "th.vsetvli zero, %[tmp], e32, m2\n\t"
1049
+ "th.vlb.v v12, (%[scale])\n\t"
1050
+ "th.vmul.vv v10, v10, v12\n\t"
1051
+ "th.vredsum.vs v0, v10, v0\n\t"
1052
+ "th.vmv.x.s %[tmp], v0\n\t"
1053
+ "add %[isum], %[isum], %[tmp]"
1054
+ : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
1055
+ : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
1056
+ , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
1057
+ : "memory"
1058
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1059
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
1060
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1061
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1062
+ );
1063
+ q3 += 32; q8 += 128; scale += 8;
1064
+ }
1065
+
1066
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1067
+ sumf += d * isum;
1068
+ }
1069
+
1070
+ *s = sumf;
1071
+
1072
+ #elif defined __riscv_v
1073
+
1074
+ uint32_t utmp[4];
1075
+ float sumf = 0;
1076
+ uint32_t aux[3];
1077
+ const int vector_length = __riscv_vlenb() * 8;
1078
+
1079
+ switch (vector_length) {
1080
+ case 256:
1081
+ for (int i = 0; i < nb; ++i) {
1082
+
1083
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1084
+ const uint8_t * GGML_RESTRICT qh = x[i].hmask;
1085
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1086
+
1087
+ memcpy(aux, x[i].scales, 12);
1088
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
1089
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
1090
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
1091
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
1092
+
1093
+ int8_t * scale = (int8_t *)utmp;
1094
+ for (int j = 0; j < 16; ++j) scale[j] -= 32;
1095
+
1096
+
1097
+ size_t vl = 32;
1098
+ uint8_t m = 1;
1099
+
1100
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
1101
+ vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
1102
+
1103
+ int sum_t = 0;
1104
+
1105
+ for (int j = 0; j < QK_K; j += 128) {
1106
+
1107
+ vl = 32;
1108
+
1109
+ // load Q3
1110
+ vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
1111
+
1112
+ vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
1113
+ vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
1114
+ vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
1115
+ vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
1116
+
1117
+ // compute mask for subtraction
1118
+ vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
1119
+ vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
1120
+ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
1121
+ m <<= 1;
1122
+
1123
+ vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
1124
+ vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
1125
+ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
1126
+ m <<= 1;
1127
+
1128
+ vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
1129
+ vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
1130
+ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
1131
+ m <<= 1;
1132
+
1133
+ vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
1134
+ vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
1135
+ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
1136
+ m <<= 1;
1137
+
1138
+ // load Q8 and take product with Q3
1139
+ vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
1140
+ vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
1141
+ vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
1142
+ vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
1143
+
1144
+ vl = 16;
1145
+
1146
+ // retrieve lane to multiply with scale
1147
+ vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
1148
+ vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
1149
+ vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
1150
+ vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
1151
+ vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
1152
+ vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
1153
+ vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
1154
+ vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
1155
+
1156
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
1157
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
1158
+ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
1159
+ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
1160
+
1161
+ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
1162
+
1163
+ q3 += 32; q8 += 128; scale += 8;
1164
+
1165
+ }
1166
+
1167
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1168
+
1169
+ sumf += d*sum_t;
1170
+
1171
+ }
1172
+ break;
1173
+ case 128:
1174
+ for (int i = 0; i < nb; ++i) {
1175
+ const uint8_t * restrict q3 = x[i].qs;
1176
+ const uint8_t * restrict qh = x[i].hmask;
1177
+ const int8_t * restrict q8 = y[i].qs;
1178
+
1179
+ int8_t * scale = (int8_t *)utmp;
1180
+ int tmp, t1, t2, t3, t4, t5, t6, t7;
1181
+ __asm__ __volatile__(
1182
+ "vsetivli zero, 12, e8, m1\n\t"
1183
+ "vle8.v v0, (%[s6b])\n\t"
1184
+ "vmv1r.v v2, v0\n\t"
1185
+ "vsetivli zero, 2, e64, m1\n\t"
1186
+ "vmv.v.x v9, %[sh]\n\t"\
1187
+ "vslidedown.vi v1, v0, 1\n\t"
1188
+ "vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
1189
+ "vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
1190
+ "vsetivli zero, 4, e32, m1\n\t"
1191
+ "vid.v v9\n\t"
1192
+ "vmv.x.s %[tmp], v1\n\t"
1193
+ "vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
1194
+ "vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
1195
+ "vsrl.vv v4, v1, v9\n\t"
1196
+ "vsrl.vv v2, v0, v8\n\t"
1197
+ "vand.vx v5, v4, %[kmask1]\n\t"
1198
+ "vand.vx v3, v2, %[kmask2]\n\t"
1199
+ "vsll.vi v6, v5, 4\n\t"
1200
+ "vor.vv v7, v6, v3\n\t"
1201
+ "vsetivli zero, 16, e8, m1\n\t"
1202
+ "vsub.vx v0, v7, %[c]\n\t"
1203
+ "vse8.v v0, (%[scale])"
1204
+ : [tmp] "=&r" (tmp)
1205
+ : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
1206
+ , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
1207
+ : "memory"
1208
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1209
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
1210
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1211
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1212
+ );
1213
+
1214
+ uint8_t m = 1;
1215
+ int isum = 0;
1216
+ for (int j = 0; j < QK_K; j += 128) {
1217
+ __asm__ __volatile__(
1218
+ "lb zero, 31(%[q3])\n\t"
1219
+ "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
1220
+ "vle8.v v8, (%[q3])\n\t"
1221
+ "vsrl.vi v10, v8, 2\n\t"
1222
+ "vsrl.vi v12, v8, 4\n\t"
1223
+ "vsrl.vi v14, v8, 6\n\t"
1224
+ "lb zero, 64(%[q8])\n\t"
1225
+ "vand.vi v8, v8, 3\n\t"
1226
+ "vand.vi v10, v10, 3\n\t"
1227
+ "vand.vi v12, v12, 3\n\t"
1228
+ "vle8.v v2, (%[qh])\n\t"
1229
+ "lb zero, 127(%[q8])\n\t"
1230
+ "vand.vx v4, v2, %[m]\n\t"
1231
+ "slli %[m], %[m], 1\n\t"
1232
+ "vmseq.vx v0, v4, zero\n\t"
1233
+ "vadd.vi v8, v8, -4, v0.t\n\t"
1234
+ "lb zero, 0(%[q8])\n\t"
1235
+ "vand.vx v4, v2, %[m]\n\t"
1236
+ "slli %[m], %[m], 1\n\t"
1237
+ "vmseq.vx v0, v4, zero\n\t"
1238
+ "vadd.vi v10, v10, -4, v0.t\n\t"
1239
+ "vand.vx v4, v2, %[m]\n\t"
1240
+ "slli %[m], %[m], 1\n\t"
1241
+ "vmseq.vx v0, v4, zero\n\t"
1242
+ "vadd.vi v12, v12, -4, v0.t\n\t"
1243
+ "vand.vx v4, v2, %[m]\n\t"
1244
+ "slli %[m], %[m], 1\n\t"
1245
+ "vmseq.vx v0, v4, zero\n\t"
1246
+ "vadd.vi v14, v14, -4, v0.t\n\t"
1247
+ "vsetvli zero, %[vl128], e8, m8\n\t"
1248
+ "vle8.v v0, (%[q8])\n\t"
1249
+ "lb %[tmp], 0(%[scale])\n\t"
1250
+ "lb %[t1], 1(%[scale])\n\t"
1251
+ "lb %[t2], 2(%[scale])\n\t"
1252
+ "lb %[t3], 3(%[scale])\n\t"
1253
+ "vsetvli zero, %[vl64], e8, m4\n\t"
1254
+ "vwmul.vv v16, v0, v8\n\t"
1255
+ "vwmul.vv v24, v4, v12\n\t"
1256
+ "vsetivli zero, 16, e16, m2\n\t"
1257
+ "vmv.v.x v0, zero\n\t"
1258
+ "vwredsum.vs v8, v16, v0\n\t"
1259
+ "lb %[t4], 4(%[scale])\n\t"
1260
+ "lb %[t5], 5(%[scale])\n\t"
1261
+ "vwredsum.vs v9, v18, v0\n\t"
1262
+ "vwredsum.vs v10, v20, v0\n\t"
1263
+ "vwredsum.vs v11, v22, v0\n\t"
1264
+ "vwredsum.vs v12, v24, v0\n\t"
1265
+ "lb %[t6], 6(%[scale])\n\t"
1266
+ "lb %[t7], 7(%[scale])\n\t"
1267
+ "vwredsum.vs v13, v26, v0\n\t"
1268
+ "vwredsum.vs v14, v28, v0\n\t"
1269
+ "vwredsum.vs v15, v30, v0\n\t"
1270
+ "vsetivli zero, 4, e32, m1\n\t"
1271
+ "vmul.vx v0, v8, %[tmp]\n\t"
1272
+ "vmul.vx v1, v9, %[t1]\n\t"
1273
+ "vmacc.vx v0, %[t2], v10\n\t"
1274
+ "vmacc.vx v1, %[t3], v11\n\t"
1275
+ "vmacc.vx v0, %[t4], v12\n\t"
1276
+ "vmacc.vx v1, %[t5], v13\n\t"
1277
+ "vmacc.vx v0, %[t6], v14\n\t"
1278
+ "vmacc.vx v1, %[t7], v15\n\t"
1279
+ "vmv.x.s %[tmp], v0\n\t"
1280
+ "vmv.x.s %[t1], v1\n\t"
1281
+ "add %[isum], %[isum], %[tmp]\n\t"
1282
+ "add %[isum], %[isum], %[t1]"
1283
+ : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
1284
+ , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
1285
+ , [m] "+&r" (m), [isum] "+&r" (isum)
1286
+ : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
1287
+ , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
1288
+ : "memory"
1289
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1290
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
1291
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1292
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1293
+ );
1294
+ q3 += 32; q8 += 128; scale += 8;
1295
+ }
1296
+
1297
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1298
+ sumf += d * isum;
1299
+ }
1300
+ break;
1301
+ default:
1302
+ assert(false && "Unsupported vector length");
1303
+ break;
1304
+ }
1305
+
1306
+ *s = sumf;
1307
+
1308
+ #else
1309
+
1310
+ UNUSED(kmask1);
1311
+ UNUSED(kmask2);
1312
+ UNUSED(x);
1313
+ UNUSED(y);
1314
+ UNUSED(nb);
1315
+
1316
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1317
+ #endif
1318
+
1319
+ }
1320
+
1321
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1322
+ assert(n % QK_K == 0);
1323
+ assert(nrc == 1);
1324
+ UNUSED(nrc);
1325
+ UNUSED(bx);
1326
+ UNUSED(by);
1327
+ UNUSED(bs);
1328
+
1329
+ const block_q4_K * GGML_RESTRICT x = vx;
1330
+ const block_q8_K * GGML_RESTRICT y = vy;
1331
+
1332
+ const int nb = n / QK_K;
1333
+
1334
+ static const uint32_t kmask1 = 0x3f3f3f3f;
1335
+ static const uint32_t kmask2 = 0x0f0f0f0f;
1336
+ static const uint32_t kmask3 = 0x03030303;
1337
+
1338
+ uint32_t utmp[4];
1339
+
1340
+ #if defined __riscv_xtheadvector
1341
+
1342
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
1343
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
1344
+
1345
+ float sumf = 0;
1346
+
1347
+ for (int i = 0; i < nb; ++i) {
1348
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1349
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1350
+
1351
+ int tmp, tmp2, sumi;
1352
+ __asm__ __volatile__(
1353
+ "li %[t1], 12\n\t"
1354
+ "th.vsetvli zero, %[t1], e8, m1\n\t"
1355
+ "th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
1356
+ "li %[t1], 4\n\t"
1357
+ "th.vsetvli zero, %[t1], e32, m1\n\t"
1358
+ "th.vslidedown.vi v2, v1, 2\n\t"
1359
+ "th.vmv.v.v v3, v2\n\t"
1360
+ "th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
1361
+ "li %[t1], 2\n\t"
1362
+ "th.vsetvli zero, %[t1], e32, m1\n\t"
1363
+ "th.vmv.v.i v4, 4\n\t"
1364
+ "th.vand.vx v8, v1, %[kmask1]\n\t"
1365
+ "th.vslide1up.vx v5, v4, zero\n\t" // {0, 4}
1366
+ "th.vsrl.vi v6, v1, 6\n\t"
1367
+ "th.vsrl.vv v7, v2, v5\n\t"
1368
+ "th.vand.vx v0, v6, %[kmask3]\n\t"
1369
+ "th.vand.vx v2, v7, %[kmask2]\n\t"
1370
+ "th.vsll.vi v6, v0, 4\n\t"
1371
+ "li %[t2], 8\n\t"
1372
+ "addi %[t1], %[utmp], 4\n\t"
1373
+ "th.vor.vv v1, v6, v2\n\t"
1374
+ "th.vssw.v v8, (%[utmp]), %[t2]\n\t"
1375
+ "th.vssw.v v1, (%[t1]), %[t2]\n\t"
1376
+ "th.vsetvli zero, zero, e32, m2\n\t" // vl == 8
1377
+ "th.vlw.v v2, (%[bsums])\n\t"
1378
+ "th.vsetvli zero, %[t2], e16, m1\n\t"
1379
+ "th.vnsrl.vi v0, v2, 0\n\t"
1380
+ "th.vnsrl.vi v1, v2, 16\n\t"
1381
+ "th.vadd.vv v2, v0, v1\n\t"
1382
+ "th.vlbu.v v4, (%[mins])\n\t"
1383
+ "th.vwmul.vv v6, v4, v2\n\t"
1384
+ "th.vmv.v.x v0, zero\n\t"
1385
+ "th.vsetvli zero, %[t2], e32, m2\n\t"
1386
+ "th.vredsum.vs v0, v6, v0\n\t"
1387
+ "th.vmv.x.s %[sumi], v0"
1388
+ : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
1389
+ : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
1390
+ , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
1391
+ , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
1392
+ : "memory"
1393
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1394
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
1395
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1396
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1397
+ );
1398
+ sumf -= dmin * sumi;
1399
+
1400
+ const uint8_t * restrict q4 = x[i].qs;
1401
+ const int8_t * restrict q8 = y[i].qs;
1402
+
1403
+ sumi = 0;
1404
+ const uint8_t * scale = scales;
1405
+
1406
+ for (int j = 0; j < QK_K/128; ++j) {
1407
+ int vl128 = 128, vl64 = 64, vl32 = 32;
1408
+ __asm__ __volatile__(
1409
+ "th.vsetvli zero, %[vl128], e8, m8\n\t"
1410
+ "th.vlb.v v8, (%[q8])\n\t"
1411
+ "th.vsetvli zero, %[vl64], e8, m4\n\t"
1412
+ "th.vlb.v v0, (%[q4])\n\t"
1413
+ "th.vsrl.vi v4, v0, 4\n\t"
1414
+ "th.vand.vi v0, v0, 0xF\n\t"
1415
+ "th.vsetvli zero, %[vl32], e8, m2\n\t"
1416
+ "th.vwmul.vv v28, v6, v14\n\t"
1417
+ "th.vwmul.vv v20, v4, v10\n\t"
1418
+ "th.vwmul.vv v24, v2, v12\n\t"
1419
+ "th.vwmul.vv v16, v0, v8\n\t"
1420
+ "li %[tmp], 4\n\t"
1421
+ "th.vsetvli zero, %[tmp], e32, m1\n\t"
1422
+ "th.vlbu.v v1, (%[scale])\n\t"
1423
+ "th.vmv.v.x v0, zero\n\t"
1424
+ "th.vsetvli zero, %[vl32], e16, m4\n\t"
1425
+ "th.vwredsum.vs v6, v24, v0\n\t"
1426
+ "th.vwredsum.vs v7, v28, v0\n\t"
1427
+ "th.vwredsum.vs v4, v16, v0\n\t"
1428
+ "th.vwredsum.vs v5, v20, v0\n\t"
1429
+ "th.vsetvli zero, %[tmp], e32, m1\n\t"
1430
+ "th.vslideup.vi v6, v7, 1\n\t"
1431
+ "th.vslideup.vi v4, v5, 1\n\t"
1432
+ "th.vslideup.vi v4, v6, 2\n\t"
1433
+ "th.vmul.vv v8, v4, v1\n\t"
1434
+ "th.vredsum.vs v0, v8, v0\n\t"
1435
+ "th.vmv.x.s %[tmp], v0\n\t"
1436
+ "add %[sumi], %[sumi], %[tmp]"
1437
+ : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
1438
+ : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
1439
+ , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
1440
+ : "memory"
1441
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1442
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
1443
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1444
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1445
+ );
1446
+
1447
+ q4 += 64; q8 += 128; scale += 4;
1448
+ }
1449
+
1450
+ sumf += d * sumi;
1451
+
1452
+ }
1453
+
1454
+ *s = sumf;
1455
+
1456
+ #elif defined __riscv_v
1457
+
1458
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
1459
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
1460
+
1461
+ float sumf = 0;
1462
+ const int vector_length = __riscv_vlenb() * 8;
1463
+
1464
+ switch (vector_length) {
1465
+ case 256:
1466
+ for (int i = 0; i < nb; ++i) {
1467
+
1468
+ size_t vl = 8;
1469
+
1470
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1471
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1472
+
1473
+ vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
1474
+ vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
1475
+ vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
1476
+
1477
+ memcpy(utmp, x[i].scales, 12);
1478
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1479
+ const uint32_t uaux = utmp[1] & kmask1;
1480
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1481
+ utmp[2] = uaux;
1482
+ utmp[0] &= kmask1;
1483
+
1484
+ vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
1485
+ vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
1486
+ vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
1487
+
1488
+ vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
1489
+ sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
1490
+
1491
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1492
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1493
+
1494
+ vl = 32;
1495
+
1496
+ int32_t sum_1 = 0;
1497
+ int32_t sum_2 = 0;
1498
+
1499
+ vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
1500
+
1501
+ for (int j = 0; j < QK_K/64; ++j) {
1502
+ // load Q4
1503
+ vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
1504
+
1505
+ // load Q8 and multiply it with lower Q4 nibble
1506
+ vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
1507
+ vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
1508
+ vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
1509
+ vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
1510
+
1511
+ sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
1512
+
1513
+ // load Q8 and multiply it with upper Q4 nibble
1514
+ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
1515
+ vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
1516
+ vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
1517
+ vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
1518
+
1519
+ sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
1520
+
1521
+ q4 += 32; q8 += 64;
1522
+
1523
+ }
1524
+
1525
+ sumf += d*(sum_1 + sum_2);
1526
+
1527
+ }
1528
+ break;
1529
+ case 128:
1530
+ for (int i = 0; i < nb; ++i) {
1531
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1532
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1533
+
1534
+ float ftmp, ft2;
1535
+ const uint8_t * restrict q40;
1536
+ const uint8_t * restrict q41;
1537
+ const uint8_t * restrict q42;
1538
+ const uint8_t * restrict q43;
1539
+ const int8_t * restrict q80;
1540
+ const int8_t * restrict q81;
1541
+ const int8_t * restrict q82;
1542
+ const int8_t * restrict q83;
1543
+ int s0, s1, s2, s3;
1544
+
1545
+ __asm__ __volatile__(
1546
+ "li %[s1], 8\n\t"
1547
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
1548
+ "vle32.v v1, (%[s6b])\n\t"
1549
+ "vslide1down.vx v1, v1, zero\n\t"
1550
+ "vmv.v.x v16, zero\n\t"
1551
+ "vslidedown.vi v2, v1, 2\n\t"
1552
+ "vmv1r.v v3, v2\n\t"
1553
+ "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
1554
+ "vsetivli zero, 2, e32, m1, ta, ma\n\t"
1555
+ "vmv.v.i v4, 4\n\t"
1556
+ "vand.vx v8, v1, %[kmask1]\n\t"
1557
+ "vslide1up.vx v5, v4, zero\n\t" // {0, 4}
1558
+ "vsrl.vi v6, v1, 6\n\t"
1559
+ "vsrl.vv v7, v2, v5\n\t"
1560
+ "vsse32.v v8, (%[utmp]), %[s1]\n\t"
1561
+ "vand.vx v0, v6, %[kmask3]\n\t"
1562
+ "vand.vx v2, v7, %[kmask2]\n\t"
1563
+ "vsll.vi v6, v0, 4\n\t"
1564
+ "addi %[s0], %[utmp], 4\n\t"
1565
+ "vor.vv v1, v6, v2\n\t"
1566
+ "vsse32.v v1, (%[s0]), %[s1]\n\t"
1567
+ "vsetivli zero, 8, e16, m1, ta, ma\n\t"
1568
+ "vle32.v v2, (%[bsums])\n\t"
1569
+ "vnsrl.wi v0, v2, 0\n\t"
1570
+ "vnsrl.wi v1, v2, 16\n\t"
1571
+ "vadd.vv v2, v0, v1\n\t"
1572
+ "vle8.v v3, (%[mins])\n\t"
1573
+ "vzext.vf2 v4, v3\n\t"
1574
+ "vwmul.vv v6, v4, v2\n\t"
1575
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
1576
+ "vredsum.vs v0, v6, v16\n\t"
1577
+ "vredsum.vs v0, v7, v0\n\t"
1578
+ "vfcvt.f.x.v v0, v0\n\t"
1579
+ "vfmv.f.s %[ftmp], v0\n\t"
1580
+ "vsetivli zero, 16, e8, m1, ta, ma\n\t"
1581
+ "vle8.v v0, (%[xs])\n\t"
1582
+ "fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
1583
+ "addi %[q40], %[xs], 64\n\t"
1584
+ "addi %[q41], %[xs], 16\n\t"
1585
+ "addi %[q42], %[xs], 32\n\t"
1586
+ "addi %[q43], %[xs], 48\n\t"
1587
+ "addi %[q80], %[ys], 64\n\t"
1588
+ "vle8.v v1, (%[q41])\n\t"
1589
+ "vle8.v v2, (%[q42])\n\t"
1590
+ "addi %[q81], %[ys], 16\n\t"
1591
+ "addi %[q41], %[q41], 64\n\t"
1592
+ "addi %[q82], %[ys], 32\n\t"
1593
+ "vle8.v v3, (%[q43])\n\t"
1594
+ "vle8.v v8, (%[ys])\n\t"
1595
+ "addi %[q42], %[q42], 64\n\t"
1596
+ "addi %[q83], %[ys], 48\n\t"
1597
+ "addi %[q43], %[q43], 64\n\t"
1598
+ "vsrl.vi v4, v0, 4\n\t"
1599
+ "vle8.v v9, (%[q81])\n\t"
1600
+ "vle8.v v10, (%[q82])\n\t"
1601
+ "vand.vi v0, v0, 0xF\n\t"
1602
+ "addi %[q81], %[q81], 64\n\t"
1603
+ "vsrl.vi v5, v1, 4\n\t"
1604
+ "addi %[q82], %[q82], 64\n\t"
1605
+ "vle8.v v11, (%[q83])\n\t"
1606
+ "vle8.v v12, (%[q80])\n\t"
1607
+ "vand.vi v1, v1, 0xF\n\t"
1608
+ "addi %[q83], %[q83], 64\n\t"
1609
+ "vsrl.vi v6, v2, 4\n\t"
1610
+ "addi %[q80], %[q80], 64\n\t"
1611
+ "vle8.v v13, (%[q81])\n\t"
1612
+ "vle8.v v14, (%[q82])\n\t"
1613
+ "vand.vi v2, v2, 0xF\n\t"
1614
+ "addi %[q81], %[q81], 64\n\t"
1615
+ "vsrl.vi v7, v3, 4\n\t"
1616
+ "addi %[q82], %[q82], 64\n\t"
1617
+ "vwmul.vv v16, v0, v8\n\t"
1618
+ "vle8.v v15, (%[q83])\n\t"
1619
+ "vle8.v v0, (%[q40])\n\t"
1620
+ "vand.vi v3, v3, 0xF\n\t"
1621
+ "addi %[q83], %[q83], 64\n\t"
1622
+ "vwmul.vv v24, v2, v12\n\t"
1623
+ "vwmul.vv v20, v4, v10\n\t"
1624
+ "vwmul.vv v28, v6, v14\n\t"
1625
+ "vwmacc.vv v16, v1, v9\n\t"
1626
+ "vle8.v v1, (%[q41])\n\t"
1627
+ "vle8.v v2, (%[q42])\n\t"
1628
+ "vwmacc.vv v24, v3, v13\n\t"
1629
+ "vwmacc.vv v20, v5, v11\n\t"
1630
+ "vwmacc.vv v28, v7, v15\n\t"
1631
+ "addi %[q40], %[q80], 64\n\t"
1632
+ "addi %[q41], %[q81], 64\n\t"
1633
+ "vle8.v v3, (%[q43])\n\t"
1634
+ "vle8.v v8, (%[q80])\n\t"
1635
+ "addi %[q42], %[q82], 64\n\t"
1636
+ "addi %[q43], %[q83], 64\n\t"
1637
+ "vsrl.vi v4, v0, 4\n\t"
1638
+ "vle8.v v9, (%[q81])\n\t"
1639
+ "vle8.v v10, (%[q82])\n\t"
1640
+ "vand.vi v0, v0, 0xF\n\t"
1641
+ "vsrl.vi v5, v1, 4\n\t"
1642
+ "vsrl.vi v7, v3, 4\n\t"
1643
+ "vand.vi v3, v3, 0xF\n\t"
1644
+ "vle8.v v11, (%[q83])\n\t"
1645
+ "vle8.v v12, (%[q40])\n\t"
1646
+ "vand.vi v1, v1, 0xF\n\t"
1647
+ "vsrl.vi v6, v2, 4\n\t"
1648
+ "vand.vi v2, v2, 0xF\n\t"
1649
+ "vwmul.vv v18, v0, v8\n\t"
1650
+ "vle8.v v13, (%[q41])\n\t"
1651
+ "vle8.v v14, (%[q42])\n\t"
1652
+ "vwmul.vv v26, v2, v12\n\t"
1653
+ "vwmul.vv v22, v4, v10\n\t"
1654
+ "vwmul.vv v30, v6, v14\n\t"
1655
+ "vwmacc.vv v18, v1, v9\n\t"
1656
+ "vle8.v v15, (%[q43])\n\t"
1657
+ "vwmacc.vv v26, v3, v13\n\t"
1658
+ "vwmacc.vv v22, v5, v11\n\t"
1659
+ "vwmacc.vv v30, v7, v15\n\t"
1660
+ "vmv.v.x v0, zero\n\t"
1661
+ "vsetivli zero, 16, e16, m2, ta, ma\n\t"
1662
+ "vwredsum.vs v4, v16, v0\n\t"
1663
+ "lbu %[s0], 0(%[scale])\n\t"
1664
+ "vwredsum.vs v5, v20, v0\n\t"
1665
+ "lbu %[s1], 1(%[scale])\n\t"
1666
+ "vwredsum.vs v6, v24, v0\n\t"
1667
+ "lbu %[s2], 2(%[scale])\n\t"
1668
+ "vwredsum.vs v7, v28, v0\n\t"
1669
+ "lbu %[s3], 3(%[scale])\n\t"
1670
+ "vwredsum.vs v8, v18, v0\n\t"
1671
+ "lbu %[q40], 4(%[scale])\n\t"
1672
+ "vwredsum.vs v9, v22, v0\n\t"
1673
+ "lbu %[q41], 5(%[scale])\n\t"
1674
+ "vwredsum.vs v10, v26, v0\n\t"
1675
+ "lbu %[q42], 6(%[scale])\n\t"
1676
+ "vwredsum.vs v11, v30, v0\n\t"
1677
+ "lbu %[q43], 7(%[scale])\n\t"
1678
+ "vsetivli zero, 4, e32, m1, ta, ma\n\t"
1679
+ "vmul.vx v0, v4, %[s0]\n\t"
1680
+ "vmul.vx v1, v8, %[q40]\n\t"
1681
+ "vmacc.vx v0, %[s1], v5\n\t"
1682
+ "vmacc.vx v1, %[q41], v9\n\t"
1683
+ "vmacc.vx v0, %[s2], v6\n\t"
1684
+ "vmacc.vx v1, %[q42], v10\n\t"
1685
+ "vmacc.vx v0, %[s3], v7\n\t"
1686
+ "vmacc.vx v1, %[q43], v11\n\t"
1687
+ "vfcvt.f.x.v v0, v0\n\t"
1688
+ "vfcvt.f.x.v v1, v1\n\t"
1689
+ "vfmv.f.s %[ft2], v0\n\t"
1690
+ "vfmv.f.s %[ftmp], v1\n\t"
1691
+ "fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
1692
+ "fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
1693
+ : [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
1694
+ , [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
1695
+ , [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
1696
+ , [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
1697
+ : [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
1698
+ , [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
1699
+ , [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
1700
+ , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
1701
+ : "memory"
1702
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1703
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
1704
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1705
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1706
+ );
1707
+ }
1708
+ break;
1709
+ default:
1710
+ assert(false && "Unsupported vector length");
1711
+ break;
1712
+ }
1713
+
1714
+ *s = sumf;
1715
+
1716
+ #else
1717
+
1718
+ UNUSED(x);
1719
+ UNUSED(y);
1720
+ UNUSED(kmask1);
1721
+ UNUSED(kmask2);
1722
+ UNUSED(kmask3);
1723
+ UNUSED(nb);
1724
+ UNUSED(utmp);
1725
+
1726
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1727
+ #endif
1728
+ }
1729
+
1730
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // Writes to *s the dot product of n elements stored as q5_K blocks (vx) and q8_K blocks (vy).
1731
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-element super-blocks
1732
+ assert(nrc == 1); // this implementation only handles nrc == 1
1733
+ UNUSED(nrc);
1734
+ UNUSED(bx);
1735
+ UNUSED(by);
1736
+ UNUSED(bs);
1737
+ // bs, bx, by and nrc are not read by the vector path; they are only forwarded to the generic fallback at the bottom.
1738
+ const block_q5_K * GGML_RESTRICT x = vx;
1739
+ const block_q8_K * GGML_RESTRICT y = vy;
1740
+
1741
+ const int nb = n / QK_K; // number of super-blocks to process
1742
+ // Byte-wise masks used to unpack the 12 packed scale bytes: 6-bit, 4-bit and 2-bit fields respectively.
1743
+ static const uint32_t kmask1 = 0x3f3f3f3f;
1744
+ static const uint32_t kmask2 = 0x0f0f0f0f;
1745
+ static const uint32_t kmask3 = 0x03030303;
1746
+
1747
+ uint32_t utmp[4]; // scratch for the unpacked scale bytes; read below through the scales/mins byte pointers
1748
+
1749
+ #if defined __riscv_v
1750
+
1751
+ const uint8_t * scales = (const uint8_t*)&utmp[0]; // byte view starting at utmp[0]
1752
+ const uint8_t * mins = (const uint8_t*)&utmp[2]; // byte view starting at utmp[2]
1753
+
1754
+ float sumf = 0; // accumulates the (negative) dmin * sum(mins * bsums) correction
1755
+ float sums = 0.0; // accumulates d * (scaled integer dot products)
1756
+
1757
+ size_t vl;
1758
+
1759
+ for (int i = 0; i < nb; ++i) {
1760
+
1761
+ vl = 8; // 8 lanes for the bsums/mins part
1762
+
1763
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
1764
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
1765
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1766
+
1767
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1768
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1769
+ // Strided loads with a 4-byte stride pick every other 16-bit bsums entry (starting at index 0 and at index 1); adding them sums adjacent pairs.
1770
+ vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
1771
+ vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
1772
+ vint16m1_t q8sums = __riscv_vadd_vv_i16m1(q8sums_0, q8sums_1, vl);
1773
+ // Unpack the 12 packed bytes of x[i].scales into utmp: utmp[0..1] are then read as scales, utmp[2..3] as mins.
1774
+ memcpy(utmp, x[i].scales, 12);
1775
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1776
+ const uint32_t uaux = utmp[1] & kmask1;
1777
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1778
+ utmp[2] = uaux;
1779
+ utmp[0] &= kmask1;
1780
+ // Zero-extend the 8 min bytes to 16 bits, multiply with the paired bsums (widening to 32 bits) and reduce to one integer.
1781
+ vuint8mf2_t mins8 = __riscv_vle8_v_u8mf2(mins, vl);
1782
+ vint16m1_t v_mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
1783
+ vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, v_mins, vl);
1784
+
1785
+ vint32m1_t sumi = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
1786
+ sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
1787
+
1788
+ vl = 32; // 32 byte lanes per load in the main loop
1789
+ int32_t aux32 = 0; // integer accumulator for this super-block
1790
+ int is = 0; // running index into scales, advanced twice per iteration
1791
+
1792
+ uint8_t m = 1; // bit selector into the qh bytes; shifted left after each use below
1793
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
1794
+ vuint8m2_t vqh = __riscv_vle8_v_u8m2(hm, vl); // qh bytes are loaded once per super-block and reused with a different bit each time
1795
+
1796
+ for (int j = 0; j < QK_K/64; ++j) {
1797
+ // load Q5 and Q8
1798
+ vuint8m2_t q5_x = __riscv_vle8_v_u8m2(q5, vl);
1799
+ vint8m2_t q8_y1 = __riscv_vle8_v_i8m2(q8, vl);
1800
+ vint8m2_t q8_y2 = __riscv_vle8_v_i8m2(q8+32, vl);
1801
+
1802
+ // compute mask for addition
1803
+ vint8m2_t q5_a = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vx_u8m2(q5_x, 0x0F, vl)); // low nibbles
1804
+ vuint8m2_t qh_m1 = __riscv_vand_vx_u8m2(vqh, m, vl);
1805
+ vbool4_t vmask_1 = __riscv_vmsne_vx_u8m2_b4(qh_m1, 0, vl);
1806
+ vint8m2_t q5_m1 = __riscv_vadd_vx_i8m2_mu(vmask_1, q5_a, q5_a, 16, vl); // add 16 where the selected qh bit is set; other lanes keep q5_a
1807
+ m <<= 1;
1808
+
1809
+ vint8m2_t q5_l = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vsrl_vx_u8m2(q5_x, 0x04, vl)); // high nibbles
1810
+ vuint8m2_t qh_m2 = __riscv_vand_vx_u8m2(vqh, m, vl);
1811
+ vbool4_t vmask_2 = __riscv_vmsne_vx_u8m2_b4(qh_m2, 0, vl);
1812
+ vint8m2_t q5_m2 = __riscv_vadd_vx_i8m2_mu(vmask_2, q5_l, q5_l, 16, vl); // same adjustment for the high-nibble values, using the next qh bit
1813
+ m <<= 1;
1814
+ // Widening multiplies: 8-bit quants times 8-bit activations give 16-bit products.
1815
+ vint16m4_t v0 = __riscv_vwmul_vv_i16m4(q5_m1, q8_y1, vl);
1816
+ vint16m4_t v1 = __riscv_vwmul_vv_i16m4(q5_m2, q8_y2, vl);
1817
+ // Scale each group of products by its own scale byte, widening to 32 bits.
1818
+ vint32m8_t vs1 = __riscv_vwmul_vx_i32m8(v0, scales[is++], vl);
1819
+ vint32m8_t vs2 = __riscv_vwmul_vx_i32m8(v1, scales[is++], vl);
1820
+ // The second reduction is seeded with the first, so vacc2 holds the sum of both groups.
1821
+ vint32m1_t vacc1 = __riscv_vredsum_vs_i32m8_i32m1(vs1, vzero, vl);
1822
+ vint32m1_t vacc2 = __riscv_vredsum_vs_i32m8_i32m1(vs2, vacc1, vl);
1823
+
1824
+ aux32 += __riscv_vmv_x_s_i32m1_i32(vacc2);
1825
+ q5 += 32; q8 += 64; // 32 packed bytes of q5 cover 64 values, matching 64 q8 bytes
1826
+
1827
+ }
1828
+
1829
+ sums += aux32 * d;
1830
+
1831
+ }
1832
+
1833
+ *s = sumf+sums; // combine the scaled dot products with the mins correction
1834
+
1835
+ #else
1836
+ // No RISC-V vector support at compile time: silence unused-variable warnings and defer to the generic implementation.
1837
+ UNUSED(x);
1838
+ UNUSED(y);
1839
+ UNUSED(kmask1);
1840
+ UNUSED(kmask2);
1841
+ UNUSED(kmask3);
1842
+ UNUSED(nb);
1843
+ UNUSED(utmp);
1844
+
1845
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
1846
+ #endif
1847
+ }
1848
+
1849
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // Writes to *s the dot product of n elements stored as q6_K blocks (vx) and q8_K blocks (vy).
1850
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-element super-blocks
1851
+ assert(nrc == 1); // this implementation only handles nrc == 1
1852
+ UNUSED(nrc);
1853
+ UNUSED(bx);
1854
+ UNUSED(by);
1855
+ UNUSED(bs);
1856
+ // Three compile-time paths follow: XTheadVector inline asm, standard RVV (switching on the runtime vector length), and a generic fallback.
1857
+ const block_q6_K * GGML_RESTRICT x = vx;
1858
+ const block_q8_K * GGML_RESTRICT y = vy;
1859
+
1860
+ const int nb = n / QK_K; // number of super-blocks to process
1861
+
1862
+ #if defined __riscv_xtheadvector
1863
+
1864
+ float sumf = 0;
1865
+
1866
+ for (int i = 0; i < nb; ++i) {
1867
+
1868
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of both blocks
1869
+
1870
+ const uint8_t * restrict q6 = x[i].ql;
1871
+ const uint8_t * restrict qh = x[i].qh;
1872
+ const int8_t * restrict q8 = y[i].qs;
1873
+
1874
+ const int8_t * restrict scale = x[i].scales;
1875
+
1876
+ int sum_t = 0; // integer accumulator for this super-block
1877
+ int t0; // scratch register for the asm below
1878
+ // Each iteration consumes 64 bytes of ql, 32 bytes of qh, 128 bytes of q8 and 8 scale bytes (see the pointer increments after the asm).
1879
+ for (int j = 0; j < QK_K/128; ++j) {
1880
+ __asm__ __volatile__(
1881
+ "th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32
1882
+ "th.vlb.v v4, (%[qh])\n\t" // v4 = 32 qh bytes
1883
+ "th.vsll.vi v0, v4, 4\n\t" // v0 = qh << 4
1884
+ "th.vsll.vi v2, v4, 2\n\t" // v2 = qh << 2
1885
+ "th.vsrl.vi v6, v4, 2\n\t" // v6 = qh >> 2
1886
+ "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
1887
+ "th.vlb.v v8, (%[q6])\n\t" // v8 = 64 ql bytes
1888
+ "th.vsrl.vi v12, v8, 4\n\t" // v12 = high nibbles
1889
+ "th.vand.vi v8, v8, 0xF\n\t" // v8 = low nibbles
1890
+ "th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128
1891
+ "th.vand.vx v0, v0, %[mask]\n\t" // keep bits 4-5 (mask == 0x30) of every shifted qh copy in the v0 group
1892
+ "th.vor.vv v8, v8, v0\n\t" // OR those bits into the nibbles held in the v8 group
1893
+ "th.vlb.v v0, (%[q8])\n\t" // load 128 q8 bytes, reusing the v0 group
1894
+ "th.vsub.vx v8, v8, %[vl32]\n\t" // subtract 32; the vl32 operand doubles as the constant
1895
+ "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
1896
+ "th.vwmul.vv v16, v0, v8\n\t" // widening multiply, results in the v16 group
1897
+ "th.vwmul.vv v24, v4, v12\n\t" // widening multiply, results in the v24 group
1898
+ "li %[t0], 16\n\t"
1899
+ "th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16
1900
+ "th.vmv.v.x v0, zero\n\t" // zero seed for the reductions
1901
+ "th.vwredsum.vs v10, v16, v0\n\t" // eight widening reductions, one per group of 16 products
1902
+ "th.vwredsum.vs v9, v18, v0\n\t"
1903
+ "th.vwredsum.vs v8, v20, v0\n\t"
1904
+ "th.vwredsum.vs v7, v22, v0\n\t"
1905
+ "th.vwredsum.vs v11, v24, v0\n\t"
1906
+ "th.vwredsum.vs v12, v26, v0\n\t"
1907
+ "th.vwredsum.vs v13, v28, v0\n\t"
1908
+ "th.vwredsum.vs v14, v30, v0\n\t"
1909
+ "li %[t0], 4\n\t"
1910
+ "th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4
1911
+ "th.vslideup.vi v10, v9, 1\n\t" // pack the eight partial sums into v10 (first four) and v11 (last four)
1912
+ "th.vslideup.vi v8, v7, 1\n\t"
1913
+ "th.vslideup.vi v11, v12, 1\n\t"
1914
+ "th.vslideup.vi v13, v14, 1\n\t"
1915
+ "th.vslideup.vi v10, v8, 2\n\t"
1916
+ "th.vslideup.vi v11, v13, 2\n\t"
1917
+ "li %[t0], 8\n\t"
1918
+ "th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8
1919
+ "th.vlb.v v4, (%[scale])\n\t" // load 8 scale bytes into 32-bit lanes
1920
+ "th.vmul.vv v2, v4, v10\n\t" // scale[k] * partial_sum[k]
1921
+ "th.vredsum.vs v0, v2, v0\n\t" // sum the eight scaled partial sums
1922
+ "th.vmv.x.s %[t0], v0\n\t"
1923
+ "add %[sumi], %[sumi], %[t0]" // accumulate into sum_t
1924
+ : [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
1925
+ : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
1926
+ , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
1927
+ , [mask] "r" (0x30)
1928
+ : "memory"
1929
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1930
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
1931
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
1932
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
1933
+ );
1934
+ q6 += 64; qh += 32; q8 += 128; scale += 8;
1935
+ }
1936
+
1937
+ sumf += d * sum_t;
1938
+
1939
+ }
1940
+
1941
+ *s = sumf;
1942
+
1943
+ #elif defined __riscv_v
1944
+
1945
+ float sumf = 0;
1946
+ const int vector_length = __riscv_vlenb() * 8; // vector register width in bits
1947
+ // Only 256-bit and 128-bit vector lengths are handled; anything else hits the assert in the default case.
1948
+ switch (vector_length) {
1949
+ case 256:
1950
+ for (int i = 0; i < nb; ++i) {
1951
+
1952
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1953
+
1954
+ const uint8_t * GGML_RESTRICT q6 = x[i].ql;
1955
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1956
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1957
+
1958
+ const int8_t * GGML_RESTRICT scale = x[i].scales;
1959
+
1960
+ size_t vl;
1961
+
1962
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
1963
+
1964
+ int sum_t = 0; // integer accumulator for this super-block
1965
+ int is = 0; // base index into scale[] for the current iteration
1966
+
1967
+ for (int j = 0; j < QK_K/128; ++j) {
1968
+
1969
+ vl = 32;
1970
+
1971
+ // load qh
1972
+ vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
1973
+
1974
+ // load Q6
1975
+ vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
1976
+ vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
1977
+ // Split each ql byte into its low nibble (q6a_*) and high nibble (q6s_*).
1978
+ vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
1979
+ vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
1980
+ vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
1981
+ vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
1982
+ // Extract the four 2-bit fields of each qh byte.
1983
+ vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
1984
+ vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
1985
+ vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
1986
+ vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
1987
+ // Combine: nibble | (2-bit field << 4) gives the 6-bit value.
1988
+ vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
1989
+ vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
1990
+ vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
1991
+ vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
1992
+ // Subtract 32 to turn the unsigned 6-bit values into signed ones.
1993
+ vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
1994
+ vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
1995
+ vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
1996
+ vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
1997
+
1998
+ // load Q8 and take product
1999
+ vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
2000
+ vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
2001
+ vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
2002
+ vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
2003
+
2004
+ vl = 16;
2005
+ // Each m1 half of a product vector is multiplied by its own scale byte, widening to 32 bits.
2006
+ vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
2007
+ vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
2008
+ vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
2009
+ vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
2010
+ vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
2011
+ vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
2012
+ vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
2013
+ vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
2014
+ // Chained reductions: each one is seeded with the previous result, so isum3 holds the total of all eight groups.
2015
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
2016
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
2017
+ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
2018
+ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
2019
+
2020
+ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
2021
+
2022
+ q6 += 64; qh += 32; q8 += 128; is=8; // NOTE(review): is is assigned, not incremented; equivalent to is += 8 only if this loop runs at most twice (QK_K/128 <= 2) - confirm
2023
+
2024
+ }
2025
+
2026
+ sumf += d * sum_t;
2027
+
2028
+ }
2029
+ break;
2030
+ case 128:
2031
+ for (int i = 0; i < nb; ++i) {
2032
+
2033
+ __builtin_prefetch(&x[i + 1].d, 0, 1); // read-prefetch hint for the following block
2034
+
2035
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2036
+
2037
+ const uint8_t * restrict q6 = x[i].ql;
2038
+ const uint8_t * restrict qh = x[i].qh;
2039
+ const int8_t * restrict q8 = y[i].qs;
2040
+
2041
+ const int8_t * restrict scale = x[i].scales;
2042
+
2043
+ int q6h; // NOTE(review): used by the asm as an address register (q6 + 32) although declared int
2044
+ float ftmp; // scratch FP register for the asm below
2045
+ // The asm advances q6 and scale itself; qh and q8 are advanced in C after it. sumf is updated inside the asm via fmadd.s.
2046
+ for (int j = 0; j < QK_K/128; ++j) {
2047
+ __asm__ __volatile__(
2048
+ "addi %[q6h], %[q6], 32\n\t" // q6h = q6 + 32 (second half of this ql chunk)
2049
+ "ld t0, 0(%[scale])\n\t" // load 8 scale bytes as one 64-bit word
2050
+ "addi %[scale], %[scale], 8\n\t"
2051
+ "slli t6, t0, 1 * 8\n\t" // the slli/srai pairs below sign-extend the individual bytes of t0
2052
+ "lb zero, 0(%[q6])\n\t" // loads into the zero register discard the value; presumably used to touch the cache line early
2053
+ "slli t5, t0, 2 * 8\n\t"
2054
+ "slli t4, t0, 3 * 8\n\t"
2055
+ "lb zero, 0(%[q6h])\n\t"
2056
+ "slli t3, t0, 4 * 8\n\t"
2057
+ "slli t2, t0, 5 * 8\n\t"
2058
+ "lb zero, 0(%[qh])\n\t"
2059
+ "lb zero, 31(%[q6h])\n\t"
2060
+ "slli t1, t0, 6 * 8\n\t"
2061
+ "srai a7, t0, 56\n\t" // a7 = bits 56..63 of the scale word, sign-extended
2062
+ "vsetvli zero, %[vl32], e8, m2\n\t"
2063
+ "vle8.v v8, (%[q6])\n\t" // v8 = first 32 ql bytes
2064
+ "srai t6, t6, 56\n\t" // t6 = bits 48..55, sign-extended
2065
+ "srai t5, t5, 56\n\t" // t5 = bits 40..47
2066
+ "srai t4, t4, 56\n\t" // t4 = bits 32..39
2067
+ "srai t3, t3, 56\n\t" // t3 = bits 24..31
2068
+ "vle8.v v10, (%[q6h])\n\t" // v10 = next 32 ql bytes
2069
+ "addi %[q6], %[q6], 64\n\t"
2070
+ "slli t0, t0, 7 * 8\n\t"
2071
+ "srai t2, t2, 56\n\t" // t2 = bits 16..23
2072
+ "srai t1, t1, 56\n\t" // t1 = bits 8..15
2073
+ "srai t0, t0, 56\n\t" // t0 = bits 0..7
2074
+ "vle8.v v4, (%[qh])\n\t" // v4 = 32 qh bytes
2075
+ "vsrl.vi v12, v8, 4\n\t" // high nibbles of the two ql halves
2076
+ "vsrl.vi v14, v10, 4\n\t"
2077
+ "lb zero, 0(%[q8])\n\t"
2078
+ "vand.vi v8, v8, 0xF\n\t" // low nibbles of the two ql halves
2079
+ "vand.vi v10, v10, 0xF\n\t"
2080
+ "lb zero, 32(%[q8])\n\t"
2081
+ "vsll.vi v0, v4, 4\n\t" // shifted copies of qh so each 2-bit field lands in bits 4-5
2082
+ "vsll.vi v2, v4, 2\n\t"
2083
+ "lb zero, 64(%[q8])\n\t"
2084
+ "vsrl.vi v6, v4, 2\n\t"
2085
+ "vand.vx v0, v0, %[mask]\n\t" // keep only bits 4-5 (mask == 0x30)
2086
+ "lb zero, 96(%[q8])\n\t"
2087
+ "vand.vx v2, v2, %[mask]\n\t"
2088
+ "vand.vx v4, v4, %[mask]\n\t"
2089
+ "vand.vx v6, v6, %[mask]\n\t"
2090
+ "vor.vv v8, v8, v0\n\t" // nibble | high bits gives the 6-bit value
2091
+ "lb zero, 127(%[q8])\n\t"
2092
+ "vor.vv v10, v10, v2\n\t"
2093
+ "vor.vv v12, v12, v4\n\t"
2094
+ "vor.vv v14, v14, v6\n\t"
2095
+ "vsetvli zero, %[vl128], e8, m8\n\t"
2096
+ "vle8.v v0, (%[q8])\n\t" // load 128 q8 bytes into the v0 group
2097
+ "vsub.vx v8, v8, %[vl32]\n\t" // subtract 32 from all values in the v8 group; vl32 doubles as the constant
2098
+ "vsetvli zero, %[vl64], e8, m4\n\t"
2099
+ "vwmul.vv v16, v0, v8\n\t" // widening multiplies into the v16 and v24 groups
2100
+ "vwmul.vv v24, v4, v12\n\t"
2101
+ "vsetivli zero, 16, e16, m2\n\t"
2102
+ "vmv.v.x v0, zero\n\t" // zero seed for the reductions
2103
+ "vwredsum.vs v10, v16, v0\n\t" // eight widening reductions, one per group of 16 products
2104
+ "vwredsum.vs v9, v18, v0\n\t"
2105
+ "vwredsum.vs v8, v20, v0\n\t"
2106
+ "vwredsum.vs v7, v22, v0\n\t"
2107
+ "vwredsum.vs v11, v24, v0\n\t"
2108
+ "vwredsum.vs v12, v26, v0\n\t"
2109
+ "vwredsum.vs v13, v28, v0\n\t"
2110
+ "vwredsum.vs v14, v30, v0\n\t"
2111
+ "vsetivli zero, 4, e32, m1\n\t"
2112
+ "vmul.vx v0, v10, t0\n\t" // multiply each partial sum by its scale byte, accumulating in v0 and v1
2113
+ "vmul.vx v1, v9, t1\n\t"
2114
+ "vmacc.vx v0, t2, v8\n\t"
2115
+ "vmacc.vx v1, t3, v7\n\t"
2116
+ "vmacc.vx v0, t4, v11\n\t"
2117
+ "vmacc.vx v1, t5, v12\n\t"
2118
+ "vmacc.vx v0, t6, v13\n\t"
2119
+ "vmacc.vx v1, a7, v14\n\t"
2120
+ "vadd.vv v0, v0, v1\n\t" // merge the two accumulators
2121
+ "vfcvt.f.x.v v0, v0\n\t" // integer -> float
2122
+ "vfmv.f.s %[ftmp], v0\n\t"
2123
+ "fmadd.s %[sumf], %[d], %[ftmp], %[sumf]" // sumf += d * ftmp
2124
+ : [q6] "+&r" (q6), [q6h] "=&r" (q6h)
2125
+ , [scale] "+&r" (scale)
2126
+ , [sumf] "+&f" (sumf), [ftmp] "=&f" (ftmp)
2127
+ : [qh] "r" (qh), [q8] "r" (q8)
2128
+ , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
2129
+ , [mask] "r" (0x30), [d] "f" (d)
2130
+ : "memory"
2131
+ , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2132
+ , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
2133
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
2134
+ , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
2135
+ , "t0", "t1", "t2", "t3", "t4", "t5", "t6", "a7"
2136
+ , "a6", "a5", "a4", "a3" // NOTE(review): a3-a6 are declared clobbered but are not referenced in the asm text above
2137
+ );
2138
+ qh += 32; q8 += 128;
2139
+ }
2140
+ }
2141
+ break;
2142
+ default:
2143
+ assert(false && "Unsupported vector length");
2144
+ break;
2145
+ }
2146
+
2147
+ *s = sumf;
2148
+
2149
+ #else
2150
+ // No RISC-V vector support at compile time: silence unused-variable warnings and defer to the generic implementation.
2151
+ UNUSED(x);
2152
+ UNUSED(y);
2153
+ UNUSED(nb);
2154
+
2155
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2156
+ #endif
2157
+ }
2158
+
2159
+ #if defined __riscv_v_intrinsic
2160
+ static NOINLINE void ggml_vec_dot_iq1_s_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2161
+ assert(n % QK_K == 0);
2162
+ assert(nrc == 1);
2163
+ UNUSED(nrc);
2164
+ UNUSED(bx);
2165
+ UNUSED(by);
2166
+ UNUSED(bs);
2167
+
2168
+ const block_iq1_s * GGML_RESTRICT x = vx;
2169
+ const block_q8_K * GGML_RESTRICT y = vy;
2170
+
2171
+ const int nb = n / QK_K;
2172
+
2173
+ float sumf = 0;
2174
+ for (int i = 0; i < nb; ++i) {
2175
+ // Load qh once for the entire superblock.
2176
+ vuint16m1_t qh = __riscv_vle16_v_u16m1(x[i].qh, 8);
2177
+
2178
+ // Calculate ls.
2179
+ vuint16m1_t temp = __riscv_vsrl_vx_u16m1(qh, 12, 8);
2180
+ temp = __riscv_vand_vx_u16m1(temp, 7, 8);
2181
+ vint32m2_t ls = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vwmulu_vx_u32m2(temp, 2, 8));
2182
+ ls = __riscv_vadd_vx_i32m2(ls, 1, 8);
2183
+
2184
+ // Calculate delta.
2185
+ vbool16_t mask = __riscv_vmseq_vx_u16m1_b16(__riscv_vand_vx_u16m1(qh, 0x8000, 8), 0, 8);
2186
+ vint32m2_t delta_neg = __riscv_vmv_v_x_i32m2(-1, 8);
2187
+ vint32m2_t delta_pos = __riscv_vmv_v_x_i32m2(1, 8);
2188
+ vint32m2_t delta = __riscv_vmerge_vvm_i32m2(delta_neg, delta_pos, mask, 8);
2189
+
2190
+ // Load qs.
2191
+ vuint8m2_t qs = __riscv_vle8_v_u8m2(x[i].qs, 32);
2192
+
2193
+ // Prepare the indices.
2194
+ const uint64_t shift = 0x0009000600030000;
2195
+ vuint16m4_t qh_shift = __riscv_vreinterpret_v_u64m4_u16m4(__riscv_vmv_v_x_u64m4(shift, 8));
2196
+ vuint16m4_t qh_gather_index = __riscv_vreinterpret_v_i16m4_u16m4(
2197
+ __riscv_vdiv_vx_i16m4(__riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(32)), 4, 32));
2198
+ vuint16m4_t qh_ext = __riscv_vlmul_ext_v_u16m2_u16m4(__riscv_vlmul_ext_v_u16m1_u16m2(qh));
2199
+ vuint16m4_t qh_index = __riscv_vrgather_vv_u16m4(qh_ext, qh_gather_index, 32);
2200
+ qh_index = __riscv_vsrl_vv_u16m4(qh_index, qh_shift, 32);
2201
+ qh_index = __riscv_vand_vx_u16m4(qh_index, 7, 32);
2202
+ qh_index = __riscv_vsll_vx_u16m4(qh_index, 8, 32);
2203
+ qh_index = __riscv_vor_vv_u16m4(qh_index, __riscv_vzext_vf2_u16m4(qs, 32), 32);
2204
+ vuint16m4_t index = __riscv_vsll_vx_u16m4(qh_index, 3, 32);
2205
+
2206
+ // Final lsums.
2207
+ int32_t lsums_s[8];
2208
+ vint32m1_t one_scalar = __riscv_vmv_v_x_i32m1(0, 1);
2209
+
2210
+ // Sub-blocks 1-2
2211
+ {
2212
+ vuint16m1_t grid_index0 = __riscv_vget_v_u16m4_u16m1(index, 0);
2213
+ vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 8));
2214
+ vint8m4_t q80 = __riscv_vle8_v_i8m4(&y[i].qs[0], 64);
2215
+ vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
2216
+ lsums_s[0] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(lsum0, 0), one_scalar, 32));
2217
+ lsums_s[1] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(lsum0, 1), one_scalar, 32));
2218
+ }
2219
+ __asm__ __volatile__("" ::: "memory");
2220
+ // Sub-blocks 3-4
2221
+ {
2222
+ vuint16m1_t grid_index0 = __riscv_vget_v_u16m4_u16m1(index, 1);
2223
+ vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 8));
2224
+ vint8m4_t q80 = __riscv_vle8_v_i8m4(&y[i].qs[64], 64);
2225
+ vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
2226
+ lsums_s[2] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(lsum0, 0), one_scalar, 32));
2227
+ lsums_s[3] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(lsum0, 1), one_scalar, 32));
2228
+ }
2229
+ __asm__ __volatile__("" ::: "memory");
2230
+ // Sub-blocks 5-6
2231
+ {
2232
+ vuint16m1_t grid_index0 = __riscv_vget_v_u16m4_u16m1(index, 2);
2233
+ vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 8));
2234
+ vint8m4_t q80 = __riscv_vle8_v_i8m4(&y[i].qs[128], 64);
2235
+ vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
2236
+ lsums_s[4] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(lsum0, 0), one_scalar, 32));
2237
+ lsums_s[5] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(lsum0, 1), one_scalar, 32));
2238
+ }
2239
+ __asm__ __volatile__("" ::: "memory");
2240
+ // Sub-blocks 7-8
2241
+ {
2242
+ vuint16m1_t grid_index0 = __riscv_vget_v_u16m4_u16m1(index, 3);
2243
+ vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 8));
2244
+ vint8m4_t q80 = __riscv_vle8_v_i8m4(&y[i].qs[192], 64);
2245
+ vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
2246
+ lsums_s[6] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(lsum0, 0), one_scalar, 32));
2247
+ lsums_s[7] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(lsum0, 1), one_scalar, 32));
2248
+ }
2249
+ __asm__ __volatile__("" ::: "memory");
2250
+ vint32m2_t lsums = __riscv_vle32_v_i32m2(&lsums_s[0], 8);
2251
+
2252
+ // Calculate the bsums.
2253
+ vint16m2_t bsums_0 = __riscv_vle16_v_i16m2(y[i].bsums, 16);
2254
+ const vuint32m2_t bsums_i32 = __riscv_vreinterpret_v_u16m2_u32m2(__riscv_vreinterpret_v_i16m2_u16m2(bsums_0));
2255
+ const vint16m1_t bsums_i32_0 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vnsrl_wx_u16m1(bsums_i32, 0, 8));
2256
+ const vint16m1_t bsums_i32_1 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vnsrl_wx_u16m1(bsums_i32, 16, 8));
2257
+ const vint32m2_t bsums = __riscv_vwadd_vv_i32m2(bsums_i32_0, bsums_i32_1, 8);
2258
+
2259
+ // Accumulation.
2260
+ vint32m2_t sumi_v = __riscv_vmul_vv_i32m2(ls, lsums, 8);
2261
+ vint32m2_t sumi1_v = __riscv_vmul_vv_i32m2(__riscv_vmul_vv_i32m2(ls, delta, 8), bsums, 8);
2262
+
2263
+ // Update sumf.
2264
+ int sumi = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(sumi_v, __riscv_vmv_v_x_i32m1(0.0f, 1), 8));
2265
+ int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(sumi1_v, __riscv_vmv_v_x_i32m1(0.0f, 1), 8));
2266
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
2267
+ }
2268
+
2269
+ *s = sumf;
2270
+ }
2271
+
2272
+ static NOINLINE void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2273
+ assert(n % QK_K == 0);
2274
+ assert(nrc == 1);
2275
+ UNUSED(nrc);
2276
+ UNUSED(bx);
2277
+ UNUSED(by);
2278
+ UNUSED(bs);
2279
+ // RVV kernel for the block_iq1_s (vx) x block_q8_K (vy) dot product; the public dispatcher calls it when __riscv_vlenb() * 8 == 256. Processes n / QK_K blocks and stores the float result in *s.
2280
+ const block_iq1_s * GGML_RESTRICT x = vx;
2281
+ const block_q8_K * GGML_RESTRICT y = vy;
2282
+
2283
+ const int nb = n / QK_K;
2284
+
2285
+ float sumf = 0;
2286
+ for (int i = 0; i < nb; ++i) {
2287
+ // Load qh once for the entire superblock (8 16-bit words).
2288
+ vuint16mf2_t qh = __riscv_vle16_v_u16mf2(x[i].qh, 8);
2289
+
2290
+ // Calculate ls = 2 * ((qh >> 12) & 7) + 1 for each of the 8 qh words.
2291
+ vuint16mf2_t temp = __riscv_vsrl_vx_u16mf2(qh, 12, 8);
2292
+ temp = __riscv_vand_vx_u16mf2(temp, 7, 8);
2293
+ vint32m1_t ls = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwmulu_vx_u32m1(temp, 2, 8));
2294
+ ls = __riscv_vadd_vx_i32m1(ls, 1, 8);
2295
+
2296
+ // Calculate delta: +1 where bit 0x8000 of qh is clear, -1 where it is set.
2297
+ vbool32_t mask = __riscv_vmseq_vx_u16mf2_b32(__riscv_vand_vx_u16mf2(qh, 0x8000, 8), 0, 8);
2298
+ vint32m1_t delta_neg = __riscv_vmv_v_x_i32m1(-1, 8);
2299
+ vint32m1_t delta_pos = __riscv_vmv_v_x_i32m1(1, 8);
2300
+ vint32m1_t delta = __riscv_vmerge_vvm_i32m1(delta_neg, delta_pos, mask, 8);
2301
+
2302
+ // Load qs (32 bytes).
2303
+ vuint8m1_t qs = __riscv_vle8_v_u8m1(x[i].qs, 32);
2304
+
2305
+ // Prepare the indices: each qh word is replicated to 4 lanes (lane / 4) and shifted by 0/3/6/9 to pick a 3-bit field, which goes above the matching qs byte; the final << 3 scales the index by 8 for the 64-bit grid gather.
2306
+ const uint64_t shift = 0x0009000600030000;
2307
+ vuint16m2_t qh_shift = __riscv_vreinterpret_v_u64m2_u16m2(__riscv_vmv_v_x_u64m2(shift, 8));
2308
+ vuint16m2_t qh_gather_index = __riscv_vreinterpret_v_i16m2_u16m2(
2309
+ __riscv_vdiv_vx_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(32)), 4, 32));
2310
+ vuint16m2_t qh_ext = __riscv_vlmul_ext_v_u16m1_u16m2(__riscv_vlmul_ext_v_u16mf2_u16m1(qh));
2311
+ vuint16m2_t qh_index = __riscv_vrgather_vv_u16m2(qh_ext, qh_gather_index, 32);
2312
+ qh_index = __riscv_vsrl_vv_u16m2(qh_index, qh_shift, 32);
2313
+ qh_index = __riscv_vand_vx_u16m2(qh_index, 7, 32);
2314
+ qh_index = __riscv_vsll_vx_u16m2(qh_index, 8, 32);
2315
+ qh_index = __riscv_vor_vv_u16m2(qh_index, __riscv_vzext_vf2_u16m2(qs, 32), 32);
2316
+ vuint16m2_t index = __riscv_vsll_vx_u16m2(qh_index, 3, 32);
2317
+
2318
+ // Final lsums: one int32 sum per group of 32 products, staged in lsums_s and reloaded as a vector below.
2319
+ int32_t lsums_s[8];
2320
+ vint32m1_t one_scalar = __riscv_vmv_v_x_i32m1(0, 1);
2321
+ // NOTE(review): despite its name, `one_scalar` holds 0; it is the seed of every widening reduction below.
2322
+ // Sub-blocks 1-4: gather 16 grid entries (64 bits each), multiply with 128 q8 values, reduce each group of 32 products.
2323
+ {
2324
+ vuint16m1_t grid_index0 = __riscv_vget_v_u16m2_u16m1(index, 0);
2325
+ vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 16));
2326
+ vint8m4_t q80 = __riscv_vle8_v_i8m4(y[i].qs, 128);
2327
+ vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
2328
+ lsums_s[0] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 0), one_scalar, 32));
2329
+ lsums_s[1] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 1), one_scalar, 32));
2330
+ lsums_s[2] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 2), one_scalar, 32));
2331
+ lsums_s[3] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 3), one_scalar, 32));
2332
+ }
2333
+ __asm__ __volatile__("" ::: "memory");
2334
+ // Sub-blocks 5-8: same as above for the second 128 q8 values. The empty asm above is a compiler-only memory barrier; presumably it keeps the two halves from being interleaved (TODO confirm intent).
2335
+ {
2336
+ vuint16m1_t grid_index1 = __riscv_vget_v_u16m2_u16m1(index, 1);
2337
+ vint8m4_t grid1 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index1, 16));
2338
+ vint8m4_t q81 = __riscv_vle8_v_i8m4(&y[i].qs[128], 128);
2339
+ vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(grid1, q81, 128);
2340
+ lsums_s[4] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 0), one_scalar, 32));
2341
+ lsums_s[5] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 1), one_scalar, 32));
2342
+ lsums_s[6] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 2), one_scalar, 32));
2343
+ lsums_s[7] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 3), one_scalar, 32));
2344
+ }
2345
+ __asm__ __volatile__("" ::: "memory");
2346
+ vint32m1_t lsums = __riscv_vle32_v_i32m1(&lsums_s[0], 8);
2347
+
2348
+ // Calculate the bsums: the 16 int16 bsums are viewed as 8 32-bit lanes; the low and high halves of each lane are split out and added into 8 int32 values.
2349
+ vint16m1_t bsums_0 = __riscv_vle16_v_i16m1(y[i].bsums, 16);
2350
+ const vuint32m1_t bsums_i32 = __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(bsums_0));
2351
+ const vint16mf2_t bsums_i32_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 0, 8));
2352
+ const vint16mf2_t bsums_i32_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 16, 8));
2353
+ const vint32m1_t bsums = __riscv_vwadd_vv_i32m1(bsums_i32_0, bsums_i32_1, 8);
2354
+
2355
+ // Accumulation: ls * lsums and ls * delta * bsums, lane by lane.
2356
+ vint32m1_t sumi_v = __riscv_vmul_vv_i32m1(ls, lsums, 8);
2357
+ vint32m1_t sumi1_v = __riscv_vmul_vv_i32m1(__riscv_vmul_vv_i32m1(ls, delta, 8), bsums, 8);
2358
+
2359
+ // Update sumf. NOTE(review): the reduction seed is written as 0.0f but is passed to an int32 splat, so it is converted to integer 0.
2360
+ int sumi = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi_v, __riscv_vmv_v_x_i32m1(0.0f, 1), 8));
2361
+ int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi1_v, __riscv_vmv_v_x_i32m1(0.0f, 1), 8));
2362
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
2363
+ }
2364
+
2365
+ *s = sumf;
2366
+ }
2367
+ #endif
2368
+ // Public IQ1_S x Q8_K dot product. With RVV intrinsics it selects the vl128 or vl256 kernel from __riscv_vlenb() * 8; any other value, or a build without RVV intrinsics, uses the generic implementation. All arguments are forwarded unchanged.
2369
+ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2370
+ #if defined __riscv_v_intrinsic
2371
+ switch (__riscv_vlenb() * 8) {
2372
+ case 128:
2373
+ ggml_vec_dot_iq1_s_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
2374
+ break;
2375
+ case 256:
2376
+ ggml_vec_dot_iq1_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2377
+ break;
2378
+ default:
2379
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2380
+ break;
2381
+ }
2382
+ #else
2383
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2384
+ #endif
2385
+ }
2386
+
2387
+ #if defined __riscv_v_intrinsic
2388
+ static NOINLINE void ggml_vec_dot_iq1_m_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2389
+ assert(n % QK_K == 0);
2390
+ assert(nrc == 1);
2391
+ UNUSED(nrc);
2392
+ UNUSED(bx);
2393
+ UNUSED(by);
2394
+ UNUSED(bs);
2395
+ // RVV kernel for the block_iq1_m (vx) x block_q8_K (vy) dot product; the public dispatcher calls it when __riscv_vlenb() * 8 == 128. Processes n / QK_K blocks and stores the float result in *s.
2396
+ const block_iq1_m * GGML_RESTRICT x = vx;
2397
+ const block_q8_K * GGML_RESTRICT y = vy;
2398
+
2399
+ const int nb = n / QK_K;
2400
+
2401
+ iq1m_scale_t scale;
2402
+ float sumf = 0.0f;
2403
+ for (int i = 0; i < nb; ++i) {
2404
+ const int8_t * q8 = y[i].qs;
2405
+ const uint8_t * qs = x[i].qs;
2406
+ const uint8_t * qh = x[i].qh;
2407
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
2408
+ // Reassemble the 16-bit block scale from the top 4 bits of each of the four 16-bit scale words; it is read back as fp16 via scale.f16 below.
2409
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
2410
+
2411
+ // Accumulators: acc1 collects grid * q8 products, acc2 collects delta * q8 products (16 int32 lanes each).
2412
+ vint32m4_t acc1 = __riscv_vmv_v_x_i32m4(0, 16);
2413
+ vint32m4_t acc2 = __riscv_vmv_v_x_i32m4(0, 16);
2414
+
2415
+ // We process 8 16-element sub-blocks together (128 values per iteration, QK_K/128 iterations).
2416
+ #pragma GCC unroll 1
2417
+ for (int ib = 0; ib < QK_K/128; ib++) {
2418
+ // Load qh for 8 sub-blocks and duplicate every byte, so that each of the 16 qs bytes gets a 16-bit copy of its qh byte.
2419
+ const vuint8mf2_t qh_8 = __riscv_vle8_v_u8mf2(qh, 8);
2420
+ const vuint16m1_t qh_16_lo = __riscv_vzext_vf2_u16m1(qh_8, 8);
2421
+ const vuint16m1_t qh_16_hi = __riscv_vsll_vx_u16m1(qh_16_lo, 8, 8);
2422
+ const vuint16m2_t qhb = __riscv_vzext_vf2_u16m2(
2423
+ __riscv_vreinterpret_v_u16m1_u8m1(__riscv_vor_vv_u16m1(qh_16_lo, qh_16_hi, 8)), 16);
2424
+ qh += 8;
2425
+
2426
+ // Prepare grid indices: qs byte | ((qh << 8 on even lanes, << 4 on odd lanes) & 0x700), then << 3 to scale the index by 8 for the 64-bit grid gather.
2427
+ const vuint16m2_t qsb = __riscv_vzext_vf2_u16m2(__riscv_vle8_v_u8m1(&qs[0], 16), 16);
2428
+ const vuint16m2_t shift = __riscv_vreinterpret_v_u32m2_u16m2(__riscv_vmv_v_x_u32m2(0x00040008, 8));
2429
+ vuint16m2_t index = __riscv_vor_vv_u16m2(qsb, __riscv_vand_vx_u16m2(__riscv_vsll_vv_u16m2(qhb, shift, 16), 0x700, 16), 16);
2430
+ index = __riscv_vsll_vx_u16m2(index, 3, 16);
2431
+ qs += 16;
2432
+
2433
+ // Prepare the deltas: 8 bytes of -1 where the qh bit (0x08 on even lanes, 0x80 on odd lanes) is set, 8 bytes of +1 otherwise.
2434
+ const vbool8_t mask = __riscv_vmsgtu_vx_u16m2_b8(
2435
+ __riscv_vand_vv_u16m2(qhb, __riscv_vreinterpret_v_u32m2_u16m2(__riscv_vmv_v_x_u32m2(0x00800008, 8)), 16), 0, 16);
2436
+ const vint64m8_t delta_pos = __riscv_vmv_v_x_i64m8(0x0101010101010101, 16);
2437
+ const vint8m8_t delta = __riscv_vreinterpret_v_i64m8_i8m8(
2438
+ __riscv_vmerge_vxm_i64m8(delta_pos, 0xffffffffffffffff, mask, 16));
2439
+
2440
+ // Sub-blocks 0-3
2441
+ {
2442
+ // Load the grid: 8 gathered 64-bit entries (64 int8 values).
2443
+ const vint8m4_t iq1b = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vreinterpret_v_u64m4_i64m4(
2444
+ __riscv_vluxei16_v_u64m4(iq1s_grid, __riscv_vget_v_u16m2_u16m1(index, 0), 8)));
2445
+
2446
+ // Calculate the lsums.
2447
+ //
2448
+ // Sub-block 0, 1
2449
+ {
2450
+ // Load q8 for each sub-block.
2451
+ const vint8m2_t q8b = __riscv_vle8_v_i8m2(q8, 32);
2452
+ q8 += 32;
2453
+
2454
+ // Calculate the lsums.
2455
+ const vint16m4_t lsum1 = __riscv_vwmul_vv_i16m4(__riscv_vget_v_i8m4_i8m2(iq1b, 0), q8b, 32);
2456
+ const vint16m4_t lsum2 = __riscv_vwmul_vv_i16m4(__riscv_vget_v_i8m8_i8m2(delta, 0), q8b, 32);
2457
+
2458
+ // Prepare the scales: 3-bit fields of sc[0] at bits 0 and 3, mapped to 2*x + 1.
2459
+ const int16_t ls_0 = 2*((sc[0] >> 0) & 0x7) + 1;
2460
+ const int16_t ls_1 = 2*((sc[0] >> 3) & 0x7) + 1;
2461
+
2462
+ // Accumulate in acc1 and acc2 for each sub-block.
2463
+ acc1 = __riscv_vwmacc_vx_i32m4(acc1, ls_0, __riscv_vget_v_i16m4_i16m2(lsum1, 0), 16);
2464
+ acc1 = __riscv_vwmacc_vx_i32m4(acc1, ls_1, __riscv_vget_v_i16m4_i16m2(lsum1, 1), 16);
2465
+ acc2 = __riscv_vwmacc_vx_i32m4(acc2, ls_0, __riscv_vget_v_i16m4_i16m2(lsum2, 0), 16);
2466
+ acc2 = __riscv_vwmacc_vx_i32m4(acc2, ls_1, __riscv_vget_v_i16m4_i16m2(lsum2, 1), 16);
2467
+ }
2468
+ __asm__ __volatile__("" ::: "memory");
2469
+ // Sub-block 2, 3
2470
+ {
2471
+ // Load q8 for each sub-block.
2472
+ const vint8m2_t q8b = __riscv_vle8_v_i8m2(q8, 32);
2473
+ q8 += 32;
2474
+
2475
+ // Calculate the lsums.
2476
+ const vint16m4_t lsum1 = __riscv_vwmul_vv_i16m4(__riscv_vget_v_i8m4_i8m2(iq1b, 1), q8b, 32);
2477
+ const vint16m4_t lsum2 = __riscv_vwmul_vv_i16m4(__riscv_vget_v_i8m8_i8m2(delta, 1), q8b, 32);
2478
+
2479
+ // Prepare the scales: 3-bit fields of sc[0] at bits 6 and 9, mapped to 2*x + 1.
2480
+ const int16_t ls_0 = 2*((sc[0] >> 6) & 0x7) + 1;
2481
+ const int16_t ls_1 = 2*((sc[0] >> 9) & 0x7) + 1;
2482
+
2483
+ // Accumulate in acc1 and acc2 for each sub-block.
2484
+ acc1 = __riscv_vwmacc_vx_i32m4(acc1, ls_0, __riscv_vget_v_i16m4_i16m2(lsum1, 0), 16);
2485
+ acc1 = __riscv_vwmacc_vx_i32m4(acc1, ls_1, __riscv_vget_v_i16m4_i16m2(lsum1, 1), 16);
2486
+ acc2 = __riscv_vwmacc_vx_i32m4(acc2, ls_0, __riscv_vget_v_i16m4_i16m2(lsum2, 0), 16);
2487
+ acc2 = __riscv_vwmacc_vx_i32m4(acc2, ls_1, __riscv_vget_v_i16m4_i16m2(lsum2, 1), 16);
2488
+ }
2489
+ sc += 1;
2490
+ }
2491
+ __asm__ __volatile__("" ::: "memory");
2492
+ // Sub-blocks 4-7
2493
+ {
2494
+ // Load the grid: 8 gathered 64-bit entries (64 int8 values) from the second half of the indices.
2495
+ const vint8m4_t iq1b = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vreinterpret_v_u64m4_i64m4(
2496
+ __riscv_vluxei16_v_u64m4(iq1s_grid, __riscv_vget_v_u16m2_u16m1(index, 1), 8)));
2497
+
2498
+ // Calculate the lsums.
2499
+ //
2500
+ // Sub-block 4, 5
2501
+ {
2502
+ // Load q8 for each sub-block.
2503
+ const vint8m2_t q8b = __riscv_vle8_v_i8m2(q8, 32);
2504
+ q8 += 32;
2505
+
2506
+ // Calculate the lsums.
2507
+ const vint16m4_t lsum1 = __riscv_vwmul_vv_i16m4(__riscv_vget_v_i8m4_i8m2(iq1b, 0), q8b, 32);
2508
+ const vint16m4_t lsum2 = __riscv_vwmul_vv_i16m4(__riscv_vget_v_i8m8_i8m2(delta, 2), q8b, 32);
2509
+
2510
+ // Prepare the scales: 3-bit fields of sc[0] (already advanced by one word) at bits 0 and 3, mapped to 2*x + 1.
2511
+ const int16_t ls_0 = 2*((sc[0] >> 0) & 0x7) + 1;
2512
+ const int16_t ls_1 = 2*((sc[0] >> 3) & 0x7) + 1;
2513
+
2514
+ // Accumulate in acc1 and acc2 for each sub-block.
2515
+ acc1 = __riscv_vwmacc_vx_i32m4(acc1, ls_0, __riscv_vget_v_i16m4_i16m2(lsum1, 0), 16);
2516
+ acc1 = __riscv_vwmacc_vx_i32m4(acc1, ls_1, __riscv_vget_v_i16m4_i16m2(lsum1, 1), 16);
2517
+ acc2 = __riscv_vwmacc_vx_i32m4(acc2, ls_0, __riscv_vget_v_i16m4_i16m2(lsum2, 0), 16);
2518
+ acc2 = __riscv_vwmacc_vx_i32m4(acc2, ls_1, __riscv_vget_v_i16m4_i16m2(lsum2, 1), 16);
2519
+ }
2520
+ __asm__ __volatile__("" ::: "memory");
2521
+ // Sub-block 6, 7
2522
+ {
2523
+ // Load q8 for each sub-block.
2524
+ const vint8m2_t q8b = __riscv_vle8_v_i8m2(q8, 32);
2525
+ q8 += 32;
2526
+
2527
+ // Calculate the lsums.
2528
+ const vint16m4_t lsum1 = __riscv_vwmul_vv_i16m4(__riscv_vget_v_i8m4_i8m2(iq1b, 1), q8b, 32);
2529
+ const vint16m4_t lsum2 = __riscv_vwmul_vv_i16m4(__riscv_vget_v_i8m8_i8m2(delta, 3), q8b, 32);
2530
+
2531
+ // Prepare the scales: 3-bit fields of sc[0] at bits 6 and 9, mapped to 2*x + 1.
2532
+ const int16_t ls_0 = 2*((sc[0] >> 6) & 0x7) + 1;
2533
+ const int16_t ls_1 = 2*((sc[0] >> 9) & 0x7) + 1;
2534
+
2535
+ // Accumulate in acc1 and acc2 for each sub-block.
2536
+ acc1 = __riscv_vwmacc_vx_i32m4(acc1, ls_0, __riscv_vget_v_i16m4_i16m2(lsum1, 0), 16);
2537
+ acc1 = __riscv_vwmacc_vx_i32m4(acc1, ls_1, __riscv_vget_v_i16m4_i16m2(lsum1, 1), 16);
2538
+ acc2 = __riscv_vwmacc_vx_i32m4(acc2, ls_0, __riscv_vget_v_i16m4_i16m2(lsum2, 0), 16);
2539
+ acc2 = __riscv_vwmacc_vx_i32m4(acc2, ls_1, __riscv_vget_v_i16m4_i16m2(lsum2, 1), 16);
2540
+ }
2541
+ sc += 1;
2542
+ }
2543
+ }
2544
+
2545
+ // Reduce and accumulate in `sumf`. NOTE(review): despite its name, `one` holds 0; it is the reduction seed.
2546
+ vint32m1_t one = __riscv_vmv_v_x_i32m1(0, 1);
2547
+ int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m4_i32m1(acc1, one, 16));
2548
+ int sumi2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m4_i32m1(acc2, one, 16));
2549
+ sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (sumi1 + IQ1M_DELTA * sumi2);
2550
+ }
2551
+
2552
+ *s = sumf;
2553
+ }
2554
+
2555
+ static NOINLINE void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2556
+ assert(n % QK_K == 0);
2557
+ assert(nrc == 1);
2558
+ UNUSED(nrc);
2559
+ UNUSED(bx);
2560
+ UNUSED(by);
2561
+ UNUSED(bs);
2562
+ // RVV kernel for the block_iq1_m (vx) x block_q8_K (vy) dot product; the public dispatcher calls it when __riscv_vlenb() * 8 == 256. Processes n / QK_K blocks and stores the float result in *s.
2563
+ const block_iq1_m * GGML_RESTRICT x = vx;
2564
+ const block_q8_K * GGML_RESTRICT y = vy;
2565
+
2566
+ const int nb = n / QK_K;
2567
+
2568
+ iq1m_scale_t scale;
2569
+ float sumf = 0.0f;
2570
+ for (int i = 0; i < nb; ++i) {
2571
+ const int8_t * q8 = y[i].qs;
2572
+ const uint8_t * qs = x[i].qs;
2573
+ const uint8_t * qh = x[i].qh;
2574
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
2575
+ // Reassemble the 16-bit block scale from the top 4 bits of each of the four 16-bit scale words; it is read back as fp16 via scale.f16 below.
2576
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
2577
+
2578
+ // Accumulators: acc1 collects grid * q8 products, acc2 collects delta * q8 products (16 int32 lanes each).
2579
+ vint32m2_t acc1 = __riscv_vmv_v_x_i32m2(0, 16);
2580
+ vint32m2_t acc2 = __riscv_vmv_v_x_i32m2(0, 16);
2581
+
2582
+ // We process 8 16-element sub-blocks together (128 values per iteration, QK_K/128 iterations).
2583
+ #pragma GCC unroll 1
2584
+ for (int ib = 0; ib < QK_K/128; ib++) {
2585
+ // Load qh for 8 sub-blocks and duplicate every byte, so that each of the 16 qs bytes gets a 16-bit copy of its qh byte.
2586
+ const vuint8mf4_t qh_8 = __riscv_vle8_v_u8mf4(qh, 8);
2587
+ const vuint16mf2_t qh_16_lo = __riscv_vzext_vf2_u16mf2(qh_8, 8);
2588
+ const vuint16mf2_t qh_16_hi = __riscv_vsll_vx_u16mf2(qh_16_lo, 8, 8);
2589
+ const vuint16m1_t qhb = __riscv_vzext_vf2_u16m1(
2590
+ __riscv_vreinterpret_v_u16mf2_u8mf2(__riscv_vor_vv_u16mf2(qh_16_lo, qh_16_hi, 8)), 16);
2591
+ qh += 8;
2592
+
2593
+ __asm__ __volatile__("" ::: "memory");
2594
+
2595
+ // Prepare grid indices: qs byte | ((qh << 8 on even lanes, << 4 on odd lanes) & 0x700), then << 3 to scale the index by 8 for the 64-bit grid gather.
2596
+ const vuint16m1_t qsb = __riscv_vzext_vf2_u16m1(__riscv_vle8_v_u8mf2(&qs[0], 16), 16);
2597
+ const vuint16m1_t shift = __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00040008, 8));
2598
+ vuint16m1_t index = __riscv_vor_vv_u16m1(qsb, __riscv_vand_vx_u16m1(__riscv_vsll_vv_u16m1(qhb, shift, 16), 0x700, 16), 16);
2599
+ index = __riscv_vsll_vx_u16m1(index, 3, 16);
2600
+ qs += 16;
2601
+
2602
+ __asm__ __volatile__("" ::: "memory");
2603
+
2604
+ // Load the grid: 16 gathered 64-bit entries (128 int8 values).
2605
+ const vint8m4_t iq1b = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vreinterpret_v_u64m4_i64m4(
2606
+ __riscv_vluxei16_v_u64m4(iq1s_grid, index, 16)));
2607
+
2608
+ // Prepare the deltas: 8 bytes of -1 where the qh bit (0x08 on even lanes, 0x80 on odd lanes) is set, 8 bytes of +1 otherwise.
2609
+ const vbool16_t mask = __riscv_vmsgtu_vx_u16m1_b16(
2610
+ __riscv_vand_vv_u16m1(qhb, __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00800008, 8)), 16), 0, 16);
2611
+ const vint64m4_t delta_pos = __riscv_vmv_v_x_i64m4(0x0101010101010101, 16);
2612
+ const vint8m4_t delta = __riscv_vreinterpret_v_i64m4_i8m4(
2613
+ __riscv_vmerge_vxm_i64m4(delta_pos, 0xffffffffffffffff, mask, 16));
2614
+
2615
+ // Load q8 for sub-blocks (128 values).
2616
+ const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 128);
2617
+ q8 += 128;
2618
+
2619
+ // Calculate the lsums: widening products of q8 with the grid values (lsum1) and with the deltas (lsum2).
2620
+ const vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(iq1b, q8b, 128);
2621
+ const vint16m8_t lsum2 = __riscv_vwmul_vv_i16m8(delta, q8b, 128);
2622
+
2623
+ // Prepare the scales: four 3-bit fields (bits 0, 3, 6, 9) from each of sc[0] and sc[1], mapped to 2*x + 1.
2624
+ const int16_t ls_0_0 = 2*((sc[0] >> 0) & 0x7) + 1;
2625
+ const int16_t ls_0_1 = 2*((sc[0] >> 3) & 0x7) + 1;
2626
+ const int16_t ls_1_0 = 2*((sc[0] >> 6) & 0x7) + 1;
2627
+ const int16_t ls_1_1 = 2*((sc[0] >> 9) & 0x7) + 1;
2628
+ const int16_t ls_2_0 = 2*((sc[1] >> 0) & 0x7) + 1;
2629
+ const int16_t ls_2_1 = 2*((sc[1] >> 3) & 0x7) + 1;
2630
+ const int16_t ls_3_0 = 2*((sc[1] >> 6) & 0x7) + 1;
2631
+ const int16_t ls_3_1 = 2*((sc[1] >> 9) & 0x7) + 1;
2632
+ sc += 2;
2633
+
2634
+ // Accumulate in acc1 and acc2 for each sub-block (one 16-lane slice of lsum1 / lsum2 per scale).
2635
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum1, 0), 16);
2636
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum1, 1), 16);
2637
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum2, 0), 16);
2638
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum2, 1), 16);
2639
+ //
2640
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum1, 2), 16);
2641
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum1, 3), 16);
2642
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum2, 2), 16);
2643
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum2, 3), 16);
2644
+ //
2645
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum1, 4), 16);
2646
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum1, 5), 16);
2647
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum2, 4), 16);
2648
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum2, 5), 16);
2649
+ //
2650
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum1, 6), 16);
2651
+ acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum1, 7), 16);
2652
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum2, 6), 16);
2653
+ acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum2, 7), 16);
2654
+
2655
+ __asm__ __volatile__("" ::: "memory");
2656
+ }
2657
+
2658
+ // Reduce and accumulate in `sumf`. NOTE(review): despite its name, `one` holds 0; it is the reduction seed.
2659
+ vint32m1_t one = __riscv_vmv_v_x_i32m1(0, 1);
2660
+ int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc1, one, 16));
2661
+ int sumi2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc2, one, 16));
2662
+ sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (sumi1 + IQ1M_DELTA * sumi2);
2663
+ }
2664
+
2665
+ *s = sumf;
2666
+ }
2667
+ #endif
2668
+ // Public IQ1_M x Q8_K dot product. With RVV intrinsics it selects the vl128 or vl256 kernel from __riscv_vlenb() * 8; any other value, or a build without RVV intrinsics, uses the generic implementation. All arguments are forwarded unchanged.
2669
+ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2670
+ #if defined __riscv_v_intrinsic
2671
+ switch (__riscv_vlenb() * 8) {
2672
+ case 128:
2673
+ ggml_vec_dot_iq1_m_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
2674
+ break;
2675
+ case 256:
2676
+ ggml_vec_dot_iq1_m_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2677
+ break;
2678
+ default:
2679
+ ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2680
+ break;
2681
+ }
2682
+ #else
2683
+ ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2684
+ #endif
2685
+ }
2686
+
2687
+ #if defined __riscv_v_intrinsic
2688
+ static const uint8_t sign_gather_indices_arr[64] = { // entry i is i / 8: gather indices that replicate each of 8 source bytes to 8 consecutive lanes (loaded by ggml_vec_dot_iq2_s_q8_K_vl256)
2689
+ 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,
2690
+ 4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7
2691
+ };
2692
+ // Per-lane bit masks: entry i is 1 << (i % 8), selecting one bit of a replicated sign byte (loaded by ggml_vec_dot_iq2_s_q8_K_vl256).
2693
+ static const uint8_t sign_bit_masks_arr[64] = {
2694
+ 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128,
2695
+ 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128
2696
+ };
2697
+
2698
+ static NOINLINE void ggml_vec_dot_iq2_s_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2699
+ assert(n % QK_K == 0);
2700
+ UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
2701
+ // RVV kernel for the block_iq2_s (vx) x block_q8_K (vy) dot product; the public dispatcher calls it when __riscv_vlenb() * 8 == 128. Stores 0.125f * the accumulated sum in *s. NOTE(review): nrc is marked unused without the assert(nrc == 1) that the iq1 kernels in this file have.
2702
+ const block_iq2_s * GGML_RESTRICT x = vx;
2703
+ const block_q8_K * GGML_RESTRICT y = vy;
2704
+
2705
+ const int nb = n / QK_K;
2706
+ const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
2707
+
2708
+ // Pre-load Constants: for lane i the sign gather index is i >> 3 and the sign mask is 1 << (i & 7); the qh shifts are 11, 9, 7, 5.
2709
+ vuint8m2_t v_ids = __riscv_vid_v_u8m2(32);
2710
+ vuint8m2_t v_sign_gather_indices = __riscv_vsrl_vx_u8m2(v_ids, 3, 32);
2711
+ vuint8m2_t v_ones = __riscv_vmv_v_x_u8m2(1, 32);
2712
+ vuint8m2_t v_shift_amts = __riscv_vand_vx_u8m2(v_ids, 7, 32);
2713
+ vuint8m2_t v_sign_masks = __riscv_vsll_vv_u8m2(v_ones, v_shift_amts, 32);
2714
+ uint16_t shift_qh_arr[4] = {11, 9, 7, 5};
2715
+ vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 4);
2716
+
2717
+ float sumf = 0.0f;
2718
+
2719
+ for (int i = 0; i < nb; ++i) {
2720
+ const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2721
+
2722
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
2723
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
2724
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
2725
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2726
+ // The sign bytes are read starting 32 bytes into qs.
2727
+ const uint8_t * signs_ptr = qs + 32;
2728
+ float sum_block = 0.0f;
2729
+
2730
+ for (int ib = 0; ib < 8; ++ib) {
2731
+ // Each iteration consumes 4 qs bytes, 1 qh byte, 4 sign bytes, 1 scale byte and 32 q8 values.
2732
+ // Load Low Bits [4 bytes]
2733
+ vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 4);
2734
+ qs += 4;
2735
+
2736
+ // Load 1 byte. It contains bits for 4 mini-blocks.
2737
+ uint8_t qh_val = *qh++;
2738
+
2739
+ // Combine Low + High bits of 10bit indices, already scaled by 8: (qs << 3) | ((qh << {11, 9, 7, 5}) & 0x1800)
2740
+ vuint8mf4_t v_qh_raw = __riscv_vmv_v_x_u8mf4(qh_val, 4);
2741
+ vuint16mf2_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qh_raw, 4);
2742
+ vuint16mf2_t v_qh_mf2 = __riscv_vsll_vv_u16mf2(v_qh_u16, v_shift_qh, 4);
2743
+ v_qh_mf2 = __riscv_vand_vx_u16mf2(v_qh_mf2, 0x1800, 4);
2744
+ vuint16mf2_t v_qs_u16_mf2 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 4);
2745
+ vuint16mf2_t v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16_mf2, 3, 4);
2746
+ vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_mf2, 4);
2747
+
2748
+ // Lookup Grid (4 gathered 64-bit entries = 32 int8 values)
2749
+ vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(__riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 4)));
2750
+
2751
+ vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 4);
2752
+ signs_ptr += 4;
2753
+ vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
2754
+ vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 32);
2755
+
2756
+ // generating sign mask: lane i is negative when bit (i & 7) of sign byte (i >> 3) is set
2757
+ vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 32);
2758
+ vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 32);
2759
+
2760
+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 32);
2761
+ q8 += 32;
2762
+
2763
+ // apply signs: replace q8 by 0 - q8 in the lanes flagged negative, keep q8 elsewhere
2764
+ vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative,v_q8, v_q8, 0, 32);
2765
+ vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 32);
2766
+
2767
+ // Reduction
2768
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
2769
+
2770
+ // Reduce 0-15 (First Half)
2771
+ int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
2772
+ __riscv_vget_v_i16m4_i16m2(v_dot, 0), v_zero, 16));
2773
+
2774
+ // Reduce 16-31 (Second Half)
2775
+ int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
2776
+ __riscv_vget_v_i16m4_i16m2(v_dot, 1), v_zero, 16));
2777
+
2778
+ // Apply sub Scales: low nibble weights the first half, high nibble the second, each mapped to 2*x + 1
2779
+ uint8_t sc = *scales++;
2780
+
2781
+ sum_block += s0 * (2 * (sc & 0xF) + 1);
2782
+ sum_block += s1 * (2 * (sc >> 4) + 1);
2783
+ }
2784
+ sumf += sum_block * combined_scale;
2785
+ }
2786
+ *s = 0.125f * sumf;
2787
+ }
2788
+
2789
+ static NOINLINE void ggml_vec_dot_iq2_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2790
+ assert(n % QK_K == 0);
2791
+ UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
2792
+ // RVV kernel for the block_iq2_s (vx) x block_q8_K (vy) dot product; the public dispatcher calls it when __riscv_vlenb() * 8 == 256. Stores 0.125f * the accumulated sum in *s. NOTE(review): nrc is marked unused without the assert(nrc == 1) that the iq1 kernels in this file have.
2793
+ const block_iq2_s * GGML_RESTRICT x = vx;
2794
+ const block_q8_K * GGML_RESTRICT y = vy;
2795
+
2796
+ const int nb = n / QK_K;
2797
+ const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
2798
+
2799
+ // --- Pre-load Constants --- (gather pattern replicating each of the 2 qh bytes to 4 lanes, and the per-lane qh shifts 11, 9, 7, 5)
2800
+ uint16_t gather_qh_arr[8] = {0, 0, 0, 0, 1, 1, 1, 1};
2801
+ vuint16mf2_t v_gather_qh = __riscv_vle16_v_u16mf2(gather_qh_arr, 8);
2802
+ uint16_t shift_qh_arr[8] = {11, 9, 7, 5, 11, 9, 7, 5};
2803
+ vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 8);
2804
+
2805
+ // Constants for sign extraction (64 lanes, loaded from the file-level tables)
2806
+ vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
2807
+ vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
2808
+
2809
+ float sumf = 0.0f;
2810
+
2811
+ for (int i = 0; i < nb; ++i) {
2812
+ const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2813
+
2814
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
2815
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
2816
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
2817
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2818
+ // The sign bytes are read starting 32 bytes into qs.
2819
+ const uint8_t * signs_ptr = qs + 32;
2820
+
2821
+ float sum_block = 0.0f;
2822
+ // Each iteration below consumes 8 qs bytes, 2 qh bytes, 8 sign bytes, 2 scale bytes and 64 q8 values.
2823
+ for (int ib = 0; ib < 4; ++ib) {
2824
+ // Combine low + high bits
2825
+ vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 8);
2826
+ qs += 8;
2827
+ uint16_t qh_val;
2828
+ memcpy(&qh_val, qh, 2);
2829
+ qh += 2;
2830
+ vuint8mf8_t v_qh_raw = __riscv_vle8_v_u8mf8((const uint8_t*)&qh_val, 2);
2831
+ vuint16mf4_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf4(v_qh_raw, 2);
2832
+ vuint16mf2_t v_qh_u16_ext = __riscv_vlmul_ext_v_u16mf4_u16mf2(v_qh_u16);
2833
+ vuint16mf2_t v_qh_expanded = __riscv_vrgather_vv_u16mf2(v_qh_u16_ext, v_gather_qh, 8);
2834
+ v_qh_expanded = __riscv_vsll_vv_u16mf2(v_qh_expanded, v_shift_qh, 8);
2835
+
2836
+ // Mask: We want bits 11-12. 0x1800 = 0001 1000 0000 0000
2837
+ v_qh_expanded = __riscv_vand_vx_u16mf2(v_qh_expanded, 0x1800, 8);
2838
+ vuint16mf2_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 8);
2839
+
2840
+ // Multiply by 8 to get byte offset, instead of element offset
2841
+ v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16, 3, 8);
2842
+ vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_expanded, 8);
2843
+
2844
+ // Lookup Grid using Byte Offsets (8 gathered 64-bit entries = 64 int8 values)
2845
+ vuint64m2_t v_grid_vals = __riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 8);
2846
+
2847
+ vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u64m2_u8m2(v_grid_vals);
2848
+ vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(v_grid_u8);
2849
+
2850
+ // Load signs and generate sign mask: lane i is negative when bit (i % 8) of sign byte (i / 8) is set
2851
+ vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 8);
2852
+ signs_ptr += 8;
2853
+
2854
+ vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
2855
+ vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
2856
+
2857
+ vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
2858
+ vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
2859
+
2860
+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
2861
+ q8 += 64;
2862
+ // Apply signs (0 - q8 in the lanes flagged negative), then form the widening products with the grid values.
2863
+ vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
2864
+ vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 64);
2865
+
2866
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
2867
+ // Reduce the 64 products in four groups of 16 (s0..s3).
2868
+ int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
2869
+ __riscv_vget_v_i16m4_i16m1(v_dot, 0), v_zero, 16));
2870
+ int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
2871
+ __riscv_vget_v_i16m4_i16m1(v_dot, 1), v_zero, 16));
2872
+ int32_t s2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
2873
+ __riscv_vget_v_i16m4_i16m1(v_dot, 2), v_zero, 16));
2874
+ int32_t s3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
2875
+ __riscv_vget_v_i16m4_i16m1(v_dot, 3), v_zero, 16));
2876
+ // Apply the sub-scales: low and high nibbles of two scale bytes, each mapped to 2*x + 1.
2877
+ uint8_t sc0 = scales[0];
2878
+ uint8_t sc1 = scales[1];
2879
+ scales += 2;
2880
+
2881
+ sum_block += s0 * (2 * (sc0 & 0xF) + 1);
2882
+ sum_block += s1 * (2 * (sc0 >> 4) + 1);
2883
+ sum_block += s2 * (2 * (sc1 & 0xF) + 1);
2884
+ sum_block += s3 * (2 * (sc1 >> 4) + 1);
2885
+ }
2886
+ sumf += sum_block * combined_scale;
2887
+ }
2888
+ *s = 0.125f * sumf;
2889
+ }
2890
+ #endif
2891
+ // Public IQ2_S x Q8_K dot product. With RVV intrinsics it selects the vl128 or vl256 kernel from __riscv_vlenb() * 8; any other value, or a build without RVV intrinsics, uses the generic implementation. All arguments are forwarded unchanged.
2892
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2893
+ #if defined __riscv_v_intrinsic
2894
+ switch (__riscv_vlenb() * 8) {
2895
+ case 128:
2896
+ ggml_vec_dot_iq2_s_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
2897
+ break;
2898
+ case 256:
2899
+ ggml_vec_dot_iq2_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
2900
+ break;
2901
+ default:
2902
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2903
+ break;
2904
+ }
2905
+ #else
2906
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
2907
+ #endif
2908
+ }
2909
+
2910
+ #if defined __riscv_v_intrinsic
2911
// Sign lookup table: 128 patterns of 8 signs each (+1 / -1), one pattern per
// 8 consecutive bytes. In pattern k, sign j (j = 0..6) is -1 when bit j of k
// is set, and the 8th sign is chosen so that every pattern contains an even
// number of -1 entries.
//
// The kernels in this file view the table as `const uint64_t *` and fetch whole
// patterns with 64-bit indexed vector loads, so the storage is explicitly
// aligned to 8 bytes: an int8_t array on its own only guarantees 1-byte
// alignment, and a misaligned 64-bit element access may raise an
// address-misaligned exception on RVV implementations.
static const int8_t keven_signs_q2xs[1024] __attribute__((aligned(8))) = {
     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
};
2945
+
2946
+ static NOINLINE void ggml_vec_dot_iq2_xs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2947
+ assert(n % QK_K == 0);
2948
+ assert(nrc == 1);
2949
+ UNUSED(nrc);
2950
+ UNUSED(bx);
2951
+ UNUSED(by);
2952
+ UNUSED(bs);
2953
+
2954
+ const block_iq2_xs * GGML_RESTRICT x = vx;
2955
+ const block_q8_K * GGML_RESTRICT y = vy;
2956
+
2957
+ const int nb = n / QK_K;
2958
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
2959
+ const uint64_t * grid64 = (const uint64_t *)iq2xs_grid;
2960
+
2961
+ float sumf = 0.0f;
2962
+ #pragma GCC unroll 1
2963
+ for (int i = 0; i < nb; ++i) {
2964
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2965
+ const uint16_t * GGML_RESTRICT qs = x[i].qs;
2966
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2967
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
2968
+
2969
+ int32_t sum_int = 0;
2970
+
2971
+ // Loop over 4 subblocks of 64 elements
2972
+ for (int ib64 = 0; ib64 < QK_K / 64; ++ib64) {
2973
+
2974
+ // Load indices.
2975
+ vuint16m1_t v_qs = __riscv_vle16_v_u16m1(qs, 8);
2976
+ qs += 8;
2977
+
2978
+ // Prepare offsets
2979
+ vuint16m1_t vidx_grid = __riscv_vsll_vx_u16m1(__riscv_vand_vx_u16m1(v_qs, 511, 8), 3, 8);
2980
+ vuint16m1_t vidx_sign = __riscv_vsll_vx_u16m1(__riscv_vsrl_vx_u16m1(v_qs, 9, 8), 3, 8);
2981
+
2982
+ // load values and signs from the lookup tables
2983
+ vuint64m4_t vq2_64 = __riscv_vluxei16_v_u64m4(grid64, vidx_grid, 8);
2984
+ vuint64m4_t vs2_64 = __riscv_vluxei16_v_u64m4(signs64, vidx_sign, 8);
2985
+ vint8m4_t q2u = __riscv_vreinterpret_v_u8m4_i8m4(__riscv_vreinterpret_v_u64m4_u8m4(vq2_64));
2986
+ vint8m4_t q2s = __riscv_vreinterpret_v_u8m4_i8m4(__riscv_vreinterpret_v_u64m4_u8m4(vs2_64));
2987
+ vint8m4_t q2_final = __riscv_vmul_vv_i8m4(q2u, q2s, 64);
2988
+ asm volatile("" ::: "memory");
2989
+ vint8m4_t q8v = __riscv_vle8_v_i8m4(q8, 64);
2990
+ q8 += 64;
2991
+
2992
+ vint16m8_t prod = __riscv_vwmul_vv_i16m8(q2_final, q8v, 64);
2993
+ asm volatile("" ::: "memory");
2994
+ vint32m1_t zero_vec = __riscv_vmv_v_x_i32m1(0, 1);
2995
+
2996
+ int32_t sum0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
2997
+ __riscv_vget_v_i16m8_i16m2(prod, 0), zero_vec, 16));
2998
+
2999
+ int32_t sum1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
3000
+ __riscv_vget_v_i16m8_i16m2(prod, 1), zero_vec, 16));
3001
+
3002
+ int32_t sum2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
3003
+ __riscv_vget_v_i16m8_i16m2(prod, 2), zero_vec, 16));
3004
+
3005
+ int32_t sum3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
3006
+ __riscv_vget_v_i16m8_i16m2(prod, 3), zero_vec, 16));
3007
+
3008
+ const uint8_t scale_byte_1 = scales[0];
3009
+ const uint8_t scale_byte_2 = scales[1];
3010
+ scales += 2;
3011
+
3012
+ sum_int += sum0 * ((scale_byte_1 & 0x0F) * 2 + 1);
3013
+ sum_int += sum1 * ((scale_byte_1 >> 4) * 2 + 1);
3014
+ sum_int += sum2 * ((scale_byte_2 & 0x0F) * 2 + 1);
3015
+ sum_int += sum3 * ((scale_byte_2 >> 4) * 2 + 1);
3016
+ }
3017
+
3018
+ sumf += d * sum_int;
3019
+ }
3020
+ *s = 0.125f * sumf;
3021
+ }
3022
+
3023
+ static NOINLINE void ggml_vec_dot_iq2_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3024
+ assert(n % QK_K == 0);
3025
+ assert(nrc == 1);
3026
+ UNUSED(nrc);
3027
+ UNUSED(bx);
3028
+ UNUSED(by);
3029
+ UNUSED(bs);
3030
+
3031
+ const block_iq2_xs * GGML_RESTRICT x = vx;
3032
+ const block_q8_K * GGML_RESTRICT y = vy;
3033
+
3034
+ const int nb = n / QK_K;
3035
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
3036
+ const uint64_t * grid64 = (const uint64_t *)iq2xs_grid;
3037
+
3038
+ float sumf = 0.0f;
3039
+
3040
+ for (int i = 0; i < nb; ++i) {
3041
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3042
+ const uint16_t * GGML_RESTRICT qs = x[i].qs;
3043
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3044
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
3045
+
3046
+ int32_t sum_int = 0;
3047
+
3048
+ // Loop over 4 subblocks of 64 elements (QK_K = 256)
3049
+ for (int ib64 = 0; ib64 < QK_K / 64; ++ib64) {
3050
+ // Load 8 uint16 indices (controls 64 values)
3051
+ vuint16mf2_t v_qs = __riscv_vle16_v_u16mf2(qs, 8);
3052
+ qs += 8;
3053
+
3054
+ // Extract indices for grid (low 9 bits) and signs (high 7 bits)
3055
+ // Multiply by 8 (<< 3) for byte offsets into the uint64 tables
3056
+ vuint16mf2_t vidx_grid = __riscv_vsll_vx_u16mf2(__riscv_vand_vx_u16mf2(v_qs, 511, 8), 3, 8);
3057
+ vuint16mf2_t vidx_sign = __riscv_vsll_vx_u16mf2(__riscv_vsrl_vx_u16mf2(v_qs, 9, 8), 3, 8);
3058
+
3059
+ vuint64m2_t vq2_64 = __riscv_vluxei16_v_u64m2(grid64, vidx_grid, 8);
3060
+ vuint64m2_t vs2_64 = __riscv_vluxei16_v_u64m2(signs64, vidx_sign, 8);
3061
+
3062
+ vint8m2_t q2u = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(vq2_64));
3063
+ vint8m2_t q2s = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(vs2_64));
3064
+
3065
+ vint8m2_t q2_final = __riscv_vmul_vv_i8m2(q2u, q2s, 64);
3066
+
3067
+ vint8m2_t q8v = __riscv_vle8_v_i8m2(q8, 64);
3068
+ q8 += 64;
3069
+
3070
+ vint16m4_t prod = __riscv_vwmul_vv_i16m4(q2_final, q8v, 64);
3071
+
3072
+ vint32m1_t zero_vec = __riscv_vmv_v_x_i32m1(0, 1);
3073
+
3074
+ int32_t sum0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
3075
+ __riscv_vget_v_i16m4_i16m1(prod, 0), zero_vec, 16));
3076
+ int32_t sum1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
3077
+ __riscv_vget_v_i16m4_i16m1(prod, 1), zero_vec, 16));
3078
+ int32_t sum2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
3079
+ __riscv_vget_v_i16m4_i16m1(prod, 2), zero_vec, 16));
3080
+ int32_t sum3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
3081
+ __riscv_vget_v_i16m4_i16m1(prod, 3), zero_vec, 16));
3082
+
3083
+ const uint8_t scale_byte_1 = scales[0];
3084
+ const uint8_t scale_byte_2 = scales[1];
3085
+ scales += 2;
3086
+
3087
+ sum_int += sum0 * ((scale_byte_1 & 0x0F) * 2 + 1);
3088
+ sum_int += sum1 * ((scale_byte_1 >> 4) * 2 + 1);
3089
+ sum_int += sum2 * ((scale_byte_2 & 0x0F) * 2 + 1);
3090
+ sum_int += sum3 * ((scale_byte_2 >> 4) * 2 + 1);
3091
+ }
3092
+
3093
+ sumf += d * sum_int;
3094
+ }
3095
+ *s = 0.125f * sumf;
3096
+ }
3097
+ #endif
3098
+
3099
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3100
+ #if defined __riscv_v_intrinsic
3101
+ switch (__riscv_vlenb() * 8) {
3102
+ case 128:
3103
+ ggml_vec_dot_iq2_xs_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
3104
+ break;
3105
+ case 256:
3106
+ ggml_vec_dot_iq2_xs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
3107
+ break;
3108
+ default:
3109
+ ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3110
+ break;
3111
+ }
3112
+ #else
3113
+ ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3114
+ #endif
3115
+ }
3116
+
3117
+ #if defined __riscv_v_intrinsic
3118
+ static NOINLINE void ggml_vec_dot_iq2_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3119
+ assert(n % QK_K == 0);
3120
+ assert(nrc == 1);
3121
+ UNUSED(nrc);
3122
+ UNUSED(bx);
3123
+ UNUSED(by);
3124
+ UNUSED(bs);
3125
+
3126
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
3127
+ const block_q8_K * GGML_RESTRICT y = vy;
3128
+
3129
+ const int nb = n / QK_K;
3130
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
3131
+ const uint64_t * grid64 = (const uint64_t *)iq2xxs_grid;
3132
+
3133
+ uint32_t shift_constants[4] = {0, 7, 14, 21};
3134
+ vuint32m1_t v_shifts = __riscv_vle32_v_u32m1(shift_constants, 4);
3135
+
3136
+ float sumf = 0.0f;
3137
+ for (int i = 0; i < nb; ++i) {
3138
+ const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3139
+
3140
+ const uint8_t * GGML_RESTRICT q2_ptr = (const uint8_t *) x[i].qs;
3141
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3142
+
3143
+ float sum = 0.0f;
3144
+
3145
+ #pragma GCC unroll 1
3146
+ for (int ib32 = 0; ib32 < QK_K / 32; ib32 += 2) {
3147
+ vint8m2_t q8_1 = __riscv_vle8_v_i8m2(q8, 32); q8 += 32;
3148
+ vint8m2_t q8_2 = __riscv_vle8_v_i8m2(q8, 32); q8 += 32;
3149
+
3150
+ vuint8mf4_t v_raw_q2_1 = __riscv_vle8_v_u8mf4(q2_ptr, 4);
3151
+ vuint8mf4_t v_raw_q2_2 = __riscv_vle8_v_u8mf4(q2_ptr + 8, 4);
3152
+
3153
+ vuint16mf2_t vidx_q2_1 = __riscv_vwcvtu_x_x_v_u16mf2(v_raw_q2_1, 4);
3154
+ vuint16mf2_t vidx_q2_2 = __riscv_vwcvtu_x_x_v_u16mf2(v_raw_q2_2, 4);
3155
+
3156
+ vidx_q2_1 = __riscv_vsll_vx_u16mf2(vidx_q2_1, 3, 4);
3157
+ vidx_q2_2 = __riscv_vsll_vx_u16mf2(vidx_q2_2, 3, 4);
3158
+
3159
+ uint32_t s_packed_1, s_packed_2;
3160
+ memcpy(&s_packed_1, q2_ptr + 4, 4);
3161
+ memcpy(&s_packed_2, q2_ptr + 12, 4);
3162
+
3163
+ vuint32m1_t v_s_1 = __riscv_vmv_v_x_u32m1(s_packed_1, 4);
3164
+ vuint32m1_t v_s_2 = __riscv_vmv_v_x_u32m1(s_packed_2, 4);
3165
+ v_s_1 = __riscv_vsrl_vv_u32m1(v_s_1, v_shifts, 4);
3166
+ v_s_2 = __riscv_vsrl_vv_u32m1(v_s_2, v_shifts, 4);
3167
+
3168
+ v_s_1 = __riscv_vand_vx_u32m1(v_s_1, 127, 4);
3169
+ v_s_2 = __riscv_vand_vx_u32m1(v_s_2, 127, 4);
3170
+
3171
+ vuint16mf2_t vidx_s2_1 = __riscv_vsll_vx_u16mf2(__riscv_vncvt_x_x_w_u16mf2(v_s_1, 4), 3, 4);
3172
+ vuint16mf2_t vidx_s2_2 = __riscv_vsll_vx_u16mf2(__riscv_vncvt_x_x_w_u16mf2(v_s_2, 4), 3, 4);
3173
+
3174
+ vuint64m2_t vq2_64_1 = __riscv_vluxei16_v_u64m2(grid64, vidx_q2_1, 4);
3175
+ vuint64m2_t vq2_64_2 = __riscv_vluxei16_v_u64m2(grid64, vidx_q2_2, 4);
3176
+
3177
+ vint8m2_t q2_1 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(vq2_64_1));
3178
+ vint8m2_t q2_2 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(vq2_64_2));
3179
+
3180
+ vuint64m2_t vs2_64_1 = __riscv_vluxei16_v_u64m2(signs64, vidx_s2_1, 4);
3181
+ vuint64m2_t vs2_64_2 = __riscv_vluxei16_v_u64m2(signs64, vidx_s2_2, 4);
3182
+ vint8m2_t s2_1 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(vs2_64_1));
3183
+ vint8m2_t s2_2 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(vs2_64_2));
3184
+
3185
+ vint8m2_t q8s_1 = __riscv_vmul_vv_i8m2(q8_1, s2_1, 32);
3186
+ vint8m2_t q8s_2 = __riscv_vmul_vv_i8m2(q8_2, s2_2, 32);
3187
+
3188
+ vint16m4_t dot1 = __riscv_vwmul_vv_i16m4(q8s_1, q2_1, 32);
3189
+ vint16m4_t dot2 = __riscv_vwmul_vv_i16m4(q8s_2, q2_2, 32);
3190
+
3191
+ vint32m1_t zero_vec = __riscv_vmv_v_x_i32m1(0, 1);
3192
+ vint32m1_t sumv1 = __riscv_vwredsum_vs_i16m4_i32m1(dot1, zero_vec, 32);
3193
+ vint32m1_t sumv2 = __riscv_vwredsum_vs_i16m4_i32m1(dot2, zero_vec, 32);
3194
+
3195
+ int32_t scalar_sum1 = __riscv_vmv_x_s_i32m1_i32(sumv1);
3196
+ int32_t scalar_sum2 = __riscv_vmv_x_s_i32m1_i32(sumv2);
3197
+
3198
+ int16_t scale1 = 2 * ((s_packed_1 >> 28) & 0xF) + 1;
3199
+ int16_t scale2 = 2 * ((s_packed_2 >> 28) & 0xF) + 1;
3200
+
3201
+ sum += scalar_sum1 * scale1 + scalar_sum2 * scale2;
3202
+ q2_ptr += 16;
3203
+ }
3204
+ sumf += sum * combined_scale;
3205
+ }
3206
+ *s = 0.125f * sumf;
3207
+ }
3208
+
3209
+ static NOINLINE void ggml_vec_dot_iq2_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3210
+ assert(n % QK_K == 0);
3211
+ assert(nrc == 1);
3212
+ UNUSED(nrc);
3213
+ UNUSED(bx);
3214
+ UNUSED(by);
3215
+ UNUSED(bs);
3216
+
3217
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
3218
+ const block_q8_K * GGML_RESTRICT y = vy;
3219
+
3220
+ const int nb = n / QK_K;
3221
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
3222
+ const uint64_t * grid64 = (const uint64_t *)iq2xxs_grid;
3223
+
3224
+ uint32_t shift_constants[4] = {0, 7, 14, 21};
3225
+ vuint32mf2_t v_shifts = __riscv_vle32_v_u32mf2(shift_constants, 4);
3226
+
3227
+ float sumf = 0.0f;
3228
+
3229
+ for (int i = 0; i < nb; ++i) {
3230
+ const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3231
+
3232
+ const uint8_t * GGML_RESTRICT q2_ptr = (const uint8_t *) x[i].qs;
3233
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3234
+
3235
+ float sum = 0.0f;
3236
+
3237
+ for (int ib32 = 0; ib32 < QK_K / 32; ib32 += 2) {
3238
+ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8, 32); q8 += 32;
3239
+ vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8, 32); q8 += 32;
3240
+
3241
+ vuint8mf8_t v_raw_q2_1 = __riscv_vle8_v_u8mf8(q2_ptr, 4);
3242
+ vuint8mf8_t v_raw_q2_2 = __riscv_vle8_v_u8mf8(q2_ptr + 8, 4);
3243
+
3244
+ vuint16mf4_t vidx_q2_1 = __riscv_vwcvtu_x_x_v_u16mf4(v_raw_q2_1, 4);
3245
+ vuint16mf4_t vidx_q2_2 = __riscv_vwcvtu_x_x_v_u16mf4(v_raw_q2_2, 4);
3246
+
3247
+ vidx_q2_1 = __riscv_vsll_vx_u16mf4(vidx_q2_1, 3, 4);
3248
+ vidx_q2_2 = __riscv_vsll_vx_u16mf4(vidx_q2_2, 3, 4);
3249
+
3250
+ uint32_t s_packed_1, s_packed_2;
3251
+ memcpy(&s_packed_1, q2_ptr + 4, 4);
3252
+ memcpy(&s_packed_2, q2_ptr + 12, 4);
3253
+
3254
+ vuint32mf2_t v_s_1 = __riscv_vmv_v_x_u32mf2(s_packed_1, 4);
3255
+ vuint32mf2_t v_s_2 = __riscv_vmv_v_x_u32mf2(s_packed_2, 4);
3256
+
3257
+ v_s_1 = __riscv_vsrl_vv_u32mf2(v_s_1, v_shifts, 4);
3258
+ v_s_2 = __riscv_vsrl_vv_u32mf2(v_s_2, v_shifts, 4);
3259
+
3260
+ v_s_1 = __riscv_vand_vx_u32mf2(v_s_1, 127, 4);
3261
+ v_s_2 = __riscv_vand_vx_u32mf2(v_s_2, 127, 4);
3262
+
3263
+ // Narrow u32 -> u16 (vncvt) and Scale by 8 to get byte offsets
3264
+ vuint16mf4_t vidx_s2_1 = __riscv_vsll_vx_u16mf4(__riscv_vncvt_x_x_w_u16mf4(v_s_1, 4), 3, 4);
3265
+ vuint16mf4_t vidx_s2_2 = __riscv_vsll_vx_u16mf4(__riscv_vncvt_x_x_w_u16mf4(v_s_2, 4), 3, 4);
3266
+
3267
+ // Load q2 values from lookup grid
3268
+ vuint64m1_t vq2_64_1 = __riscv_vluxei16_v_u64m1(grid64, vidx_q2_1, 4);
3269
+ vuint64m1_t vq2_64_2 = __riscv_vluxei16_v_u64m1(grid64, vidx_q2_2, 4);
3270
+ vint8m1_t q2_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vreinterpret_v_u64m1_u8m1(vq2_64_1));
3271
+ vint8m1_t q2_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vreinterpret_v_u64m1_u8m1(vq2_64_2));
3272
+
3273
+ // Load sign values
3274
+ vuint64m1_t vs2_64_1 = __riscv_vluxei16_v_u64m1(signs64, vidx_s2_1, 4);
3275
+ vuint64m1_t vs2_64_2 = __riscv_vluxei16_v_u64m1(signs64, vidx_s2_2, 4);
3276
+ vint8m1_t s2_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vreinterpret_v_u64m1_u8m1(vs2_64_1));
3277
+ vint8m1_t s2_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vreinterpret_v_u64m1_u8m1(vs2_64_2));
3278
+
3279
+ // Apply signs to q8
3280
+ vint8m1_t q8s_1 = __riscv_vmul_vv_i8m1(q8_1, s2_1, 32);
3281
+ vint8m1_t q8s_2 = __riscv_vmul_vv_i8m1(q8_2, s2_2, 32);
3282
+
3283
+ // multiplying q2 with q8
3284
+ vint16m2_t dot1 = __riscv_vwmul_vv_i16m2(q8s_1, q2_1, 32);
3285
+ vint16m2_t dot2 = __riscv_vwmul_vv_i16m2(q8s_2, q2_2, 32);
3286
+
3287
+ vint32m1_t zero_vec = __riscv_vmv_v_x_i32m1(0, 1);
3288
+ vint32m1_t sumv1 = __riscv_vwredsum_vs_i16m2_i32m1(dot1, zero_vec, 32);
3289
+ vint32m1_t sumv2 = __riscv_vwredsum_vs_i16m2_i32m1(dot2, zero_vec, 32);
3290
+ int32_t scalar_sum1 = __riscv_vmv_x_s_i32m1_i32(sumv1);
3291
+ int32_t scalar_sum2 = __riscv_vmv_x_s_i32m1_i32(sumv2);
3292
+ int16_t scale1 = 2 * ((s_packed_1 >> 28) & 0xF) + 1;
3293
+ int16_t scale2 = 2 * ((s_packed_2 >> 28) & 0xF) + 1;
3294
+
3295
+ sum += scalar_sum1 * scale1 + scalar_sum2 * scale2;
3296
+ q2_ptr += 16;
3297
+ }
3298
+ sumf += sum * combined_scale;
3299
+ }
3300
+ *s = 0.125f * sumf;
3301
+ }
3302
+ #endif
3303
+
3304
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3305
+ #if defined __riscv_v_intrinsic
3306
+ switch (__riscv_vlenb() * 8) {
3307
+ case 128:
3308
+ ggml_vec_dot_iq2_xxs_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
3309
+ break;
3310
+ default: // 256 and above
3311
+ ggml_vec_dot_iq2_xxs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
3312
+ break;
3313
+ }
3314
+ #else
3315
+ ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3316
+ #endif
3317
+ }
3318
+
3319
+ #if defined __riscv_v_intrinsic
3320
+ static NOINLINE void ggml_vec_dot_iq3_s_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3321
+ assert(n % QK_K == 0);
3322
+ UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
3323
+ const block_iq3_s * GGML_RESTRICT x = vx;
3324
+ const block_q8_K * GGML_RESTRICT y = vy;
3325
+
3326
+ const int nb = n / QK_K;
3327
+ const uint32_t * grid32 = (const uint32_t *)iq3s_grid;
3328
+
3329
+ vuint8mf2_t v_id_8 = __riscv_vid_v_u8mf2(8);
3330
+ vuint8m2_t v_id_32 = __riscv_vid_v_u8m2(32);
3331
+
3332
+ // Keeping these in a tight scope to hint they're only needed for the mask computation.
3333
+ vuint8m2_t v_sign_gather_indices, v_sign_masks;
3334
+ {
3335
+ vuint8m2_t v_shifts = __riscv_vand_vx_u8m2(v_id_32, 7, 32);
3336
+ vuint8m2_t v_one_32 = __riscv_vmv_v_x_u8m2(1, 32);
3337
+ v_sign_gather_indices = __riscv_vsrl_vx_u8m2(v_id_32, 3, 32);
3338
+ v_sign_masks = __riscv_vsll_vv_u8m2(v_one_32, v_shifts, 32);
3339
+ }
3340
+
3341
+ float sumf = 0.0f;
3342
+
3343
+ for (int i = 0; i < nb; ++i) {
3344
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d);
3345
+ const float combined_scale = d * y[i].d;
3346
+
3347
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
3348
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
3349
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
3350
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
3351
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3352
+
3353
+ float sum_block = 0.0f;
3354
+
3355
+ for (int ib = 0; ib < 8; ++ib) {
3356
+
3357
+ // Grid lookup
3358
+ vuint8m2_t v_grid_u8;
3359
+ {
3360
+ vuint8mf2_t v_qs_u8 = __riscv_vle8_v_u8mf2(qs, 8);
3361
+ qs += 8;
3362
+
3363
+ uint8_t qh_val = *qh++;
3364
+ vuint8mf2_t v_qh_val = __riscv_vmv_v_x_u8mf2(qh_val, 8);
3365
+ v_qh_val = __riscv_vsrl_vv_u8mf2(v_qh_val, v_id_8, 8);
3366
+ v_qh_val = __riscv_vand_vx_u8mf2(v_qh_val, 1, 8);
3367
+
3368
+ vuint16m1_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16m1(v_qs_u8, 8);
3369
+ v_qs_u16 = __riscv_vsll_vx_u16m1(v_qs_u16, 2, 8);
3370
+
3371
+ vuint16m1_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16m1(v_qh_val, 8);
3372
+ v_qh_u16 = __riscv_vsll_vx_u16m1(v_qh_u16, 10, 8);
3373
+
3374
+ vuint16m1_t v_grid_offsets = __riscv_vor_vv_u16m1(v_qs_u16, v_qh_u16, 8);
3375
+
3376
+ vuint32m2_t v_grid_packed = __riscv_vluxei16_v_u32m2(grid32, v_grid_offsets, 8);
3377
+ v_grid_u8 = __riscv_vreinterpret_v_u32m2_u8m2(v_grid_packed);
3378
+ }
3379
+ __asm__ volatile ("" ::: "memory");
3380
+
3381
+ //Sign application and dot product
3382
+ int32_t s_val;
3383
+ {
3384
+ vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs, 4);
3385
+ signs += 4;
3386
+
3387
+ vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
3388
+ vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 32);
3389
+ vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 32);
3390
+ vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 32);
3391
+
3392
+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 32);
3393
+ q8 += 32;
3394
+
3395
+ vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 32);
3396
+ vint16m4_t v_dot = __riscv_vwmulsu_vv_i16m4(v_q8_signed, v_grid_u8, 32);
3397
+
3398
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
3399
+ s_val = __riscv_vmv_x_s_i32m1_i32(
3400
+ __riscv_vwredsum_vs_i16m4_i32m1(v_dot, v_zero, 32));
3401
+ }
3402
+ __asm__ volatile ("" ::: "memory");
3403
+ {
3404
+ uint8_t sc_byte = scales[ib >> 1];
3405
+ int sc_val = (ib & 1) ? (sc_byte >> 4) : (sc_byte & 0xF);
3406
+ sc_val = sc_val * 2 + 1;
3407
+ sum_block += (float)(s_val * sc_val);
3408
+ }
3409
+ }
3410
+ sumf += sum_block * combined_scale;
3411
+ }
3412
+ *s = sumf;
3413
+ }
3414
+
3415
+ static NOINLINE void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3416
+ assert(n % QK_K == 0);
3417
+ UNUSED(nrc);
3418
+ UNUSED(bx);
3419
+ UNUSED(by);
3420
+ UNUSED(bs);
3421
+
3422
+ const block_iq3_s * GGML_RESTRICT x = vx;
3423
+ const block_q8_K * GGML_RESTRICT y = vy;
3424
+
3425
+ const int nb = n / QK_K;
3426
+
3427
+ const uint64_t * grid64 = (const uint64_t *)iq3s_grid;
3428
+
3429
+ // --- Pre-load Constants ---
3430
+ const uint16_t qh_bit_shifts_arr[16] = {
3431
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
3432
+ };
3433
+ vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
3434
+ vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
3435
+ vuint16m1_t v_qh_shifts = __riscv_vle16_v_u16m1(qh_bit_shifts_arr, 16);
3436
+
3437
+ float sumf = 0.0f;
3438
+
3439
+ for (int i = 0; i < nb; ++i) {
3440
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d);
3441
+ const float combined_scale = d * y[i].d;
3442
+
3443
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
3444
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
3445
+ const uint8_t * GGML_RESTRICT scales = x[i].scales;
3446
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
3447
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3448
+
3449
+ float sum_block = 0.0f;
3450
+
3451
+ // Loop: Process 64 weights (16 mini-blocks of 4) per iteration
3452
+ for (int ib = 0; ib < 4; ++ib) {
3453
+
3454
+ vuint8mf2_t v_qs_u8 = __riscv_vle8_v_u8mf2(qs, 16);
3455
+ qs += 16;
3456
+
3457
+ uint16_t qh_val;
3458
+ memcpy(&qh_val, qh, 2);
3459
+ qh += 2;
3460
+
3461
+ vuint16m1_t v_qh_val = __riscv_vmv_v_x_u16m1(qh_val, 16);
3462
+ // Extract bits: (qh >> i) & 1
3463
+ v_qh_val = __riscv_vsrl_vv_u16m1(v_qh_val, v_qh_shifts, 16);
3464
+ v_qh_val = __riscv_vand_vx_u16m1(v_qh_val, 1, 16);
3465
+
3466
+ vuint16m1_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16m1(v_qs_u8, 16);
3467
+ v_qs_u16 = __riscv_vsll_vx_u16m1(v_qs_u16, 2, 16);
3468
+ v_qh_val = __riscv_vsll_vx_u16m1(v_qh_val, 10, 16);
3469
+ vuint16m1_t v_grid_offsets = __riscv_vor_vv_u16m1(v_qs_u16, v_qh_val, 16);
3470
+
3471
+ // Grid value is 4xuint8
3472
+ vuint32m2_t v_grid_packed = __riscv_vluxei16_v_u32m2((const uint32_t *)grid64, v_grid_offsets, 16);
3473
+ vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u32m2_u8m2(v_grid_packed);
3474
+ vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs, 8);
3475
+ signs += 8;
3476
+
3477
+ // Generate sign mask
3478
+ vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
3479
+ vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
3480
+ vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
3481
+ vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
3482
+
3483
+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
3484
+ q8 += 64;
3485
+
3486
+ // Apply Signs
3487
+ vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
3488
+ vint16m4_t v_dot = __riscv_vwmulsu_vv_i16m4(v_q8_signed, v_grid_u8, 64);
3489
+
3490
+ // Reduction
3491
+ vint16m2_t v_dot_lo = __riscv_vget_v_i16m4_i16m2(v_dot, 0);
3492
+ vint16m2_t v_dot_hi = __riscv_vget_v_i16m4_i16m2(v_dot, 1);
3493
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
3494
+
3495
+ int32_t s_lo = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_lo, v_zero, 32));
3496
+ int32_t s_hi = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_hi, v_zero, 32));
3497
+
3498
+ // Apply sub-scales
3499
+ uint8_t sc_byte = *scales++;
3500
+ int sc_lo = (sc_byte & 0xF) * 2 + 1;
3501
+ int sc_hi = (sc_byte >> 4) * 2 + 1;
3502
+
3503
+ sum_block += s_lo * sc_lo + s_hi * sc_hi;
3504
+ }
3505
+ sumf += sum_block * combined_scale;
3506
+ }
3507
+ *s = sumf;
3508
+ }
3509
+ #endif
3510
+
3511
+ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3512
+ #if defined __riscv_v_intrinsic
3513
+ switch (__riscv_vlenb() * 8) {
3514
+ case 128:
3515
+ ggml_vec_dot_iq3_s_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
3516
+ break;
3517
+ case 256:
3518
+ ggml_vec_dot_iq3_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
3519
+ break;
3520
+ default:
3521
+ ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3522
+ break;
3523
+ }
3524
+ #else
3525
+ ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3526
+ #endif
3527
+ }
3528
+
3529
+ #if defined __riscv_v_intrinsic
3530
+ static NOINLINE void ggml_vec_dot_iq3_xxs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3531
+ assert(n % QK_K == 0);
3532
+ UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
3533
+
3534
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
3535
+ const block_q8_K * GGML_RESTRICT y = vy;
3536
+ const int nb = n / QK_K;
3537
+
3538
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
3539
+ const uint32_t * grid32 = (const uint32_t *)iq3xxs_grid;
3540
+
3541
+ // constants for unpacking logic
3542
+ const uint32_t shifts_val[8] = {0, 7, 14, 21, 0, 7, 14, 21};
3543
+ vuint32m2_t v_shifts = __riscv_vle32_v_u32m2(shifts_val, 8);
3544
+
3545
+ const uint32_t gather_idx_val[8] = {0, 0, 0, 0, 1, 1, 1, 1};
3546
+ vuint32m2_t v_gather_idx = __riscv_vle32_v_u32m2(gather_idx_val, 8);
3547
+
3548
+ uint32_t aux32[2];
3549
+ float sumf = 0.0f;
3550
+
3551
+ for (int i = 0; i < nb; ++i) {
3552
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3553
+
3554
+ const uint8_t * GGML_RESTRICT q3_indices = x[i].qs;
3555
+ const uint8_t * GGML_RESTRICT metadata = x[i].qs + QK_K/4;
3556
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3557
+
3558
+ float block_sum = 0.0f;
3559
+
3560
+ // Process 64 weights per loop
3561
+ for (int ib = 0; ib < QK_K / 64; ++ib) {
3562
+
3563
+ // load of metadata via memcpy
3564
+ memcpy(aux32, metadata, 2 * sizeof(uint32_t));
3565
+ metadata += 2 * sizeof(uint32_t);
3566
+
3567
+ vuint8m1_t v_q3_idx_u8 = __riscv_vle8_v_u8m1(q3_indices, 16);
3568
+ q3_indices += 16;
3569
+
3570
+ vuint16m2_t v_q3_idx_u16 = __riscv_vwmulu_vx_u16m2(v_q3_idx_u8, 4, 16);
3571
+
3572
+ vuint32m4_t v_q3_magnitudes_u32 = __riscv_vluxei16_v_u32m4(grid32, v_q3_idx_u16, 16);
3573
+
3574
+ vint8m4_t v_q3_magnitudes = __riscv_vreinterpret_v_u8m4_i8m4(
3575
+ __riscv_vreinterpret_v_u32m4_u8m4(v_q3_magnitudes_u32));
3576
+
3577
+ vuint32m2_t v_aux = __riscv_vle32_v_u32m2(aux32, 2);
3578
+
3579
+ vuint32m2_t v_aux_expanded = __riscv_vrgather_vv_u32m2(v_aux, v_gather_idx, 8);
3580
+
3581
+ vuint32m2_t v_s_vals_raw = __riscv_vand_vx_u32m2(
3582
+ __riscv_vsrl_vv_u32m2(v_aux_expanded, v_shifts, 8), 127, 8);
3583
+
3584
+ vuint16m1_t sign_indices_byte_offset = __riscv_vsll_vx_u16m1(
3585
+ __riscv_vncvt_x_x_w_u16m1(v_s_vals_raw, 8), 3, 8);
3586
+
3587
+ vuint64m4_t v_s_vals_u64 = __riscv_vluxei16_v_u64m4(signs64, sign_indices_byte_offset, 8);
3588
+
3589
+ vint8m4_t v_s_vals = __riscv_vreinterpret_v_u8m4_i8m4(
3590
+ __riscv_vreinterpret_v_u64m4_u8m4(v_s_vals_u64));
3591
+
3592
+ vint8m4_t v_q3_signed = __riscv_vmul_vv_i8m4(v_q3_magnitudes, v_s_vals, 64);
3593
+ asm volatile("" ::: "memory");
3594
+ vint8m4_t v_q8 = __riscv_vle8_v_i8m4(q8, 64);
3595
+ q8 += 64;
3596
+
3597
+ vint16m8_t v_dot = __riscv_vwmul_vv_i16m8(v_q8, v_q3_signed, 64);
3598
+
3599
+ asm volatile("" ::: "memory");
3600
+
3601
+ vint16m4_t v_dot_1 = __riscv_vget_v_i16m8_i16m4(v_dot, 0);
3602
+ vint16m4_t v_dot_2 = __riscv_vget_v_i16m8_i16m4(v_dot, 1);
3603
+
3604
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
3605
+
3606
+ vint32m1_t v_sum_1 = __riscv_vwredsum_vs_i16m4_i32m1(v_dot_1, v_zero, 32);
3607
+ vint32m1_t v_sum_2 = __riscv_vwredsum_vs_i16m4_i32m1(v_dot_2, v_zero, 32);
3608
+
3609
+ int32_t sum1_i = __riscv_vmv_x_s_i32m1_i32(v_sum_1);
3610
+ int32_t sum2_i = __riscv_vmv_x_s_i32m1_i32(v_sum_2);
3611
+
3612
+ const float scale1_f = (float)(2 * (aux32[0] >> 28) + 1);
3613
+ const float scale2_f = (float)(2 * (aux32[1] >> 28) + 1);
3614
+
3615
+ block_sum += sum1_i * scale1_f + sum2_i * scale2_f;
3616
+ }
3617
+
3618
+ sumf += d * block_sum;
3619
+ }
3620
+ *s = 0.25f * sumf;
3621
+ }
3622
+
3623
+ static NOINLINE void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3624
+ assert(n % QK_K == 0);
3625
+ assert(nrc == 1);
3626
+ UNUSED(nrc);
3627
+ UNUSED(bx);
3628
+ UNUSED(by);
3629
+ UNUSED(bs);
3630
+
3631
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
3632
+ const block_q8_K * GGML_RESTRICT y = vy;
3633
+ const int nb = n / QK_K;
3634
+
3635
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
3636
+ const uint32_t * grid32 = (const uint32_t *)iq3xxs_grid;
3637
+
3638
+ // constants for unpacking logic
3639
+ const uint32_t shifts_val[8] = {0, 7, 14, 21, 0, 7, 14, 21};
3640
+ vuint32m1_t v_shifts = __riscv_vle32_v_u32m1(shifts_val, 8);
3641
+
3642
+ const uint32_t gather_idx_val[8] = {0, 0, 0, 0, 1, 1, 1, 1};
3643
+ vuint32m1_t v_gather_idx = __riscv_vle32_v_u32m1(gather_idx_val, 8);
3644
+
3645
+ uint32_t aux32[2];
3646
+ float sumf = 0.0f;
3647
+
3648
+ for (int i = 0; i < nb; ++i) {
3649
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3650
+
3651
+ const uint8_t * GGML_RESTRICT q3_indices = x[i].qs;
3652
+ const uint8_t * GGML_RESTRICT metadata = x[i].qs + QK_K/4;
3653
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
3654
+
3655
+ float block_sum = 0.0f;
3656
+
3657
+ for (int ib = 0; ib < QK_K / 64; ++ib) {
3658
+ // Load q8 (64 bytes)
3659
+ vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
3660
+ q8 += 64;
3661
+
3662
+ // load of metadata via memcpy
3663
+ memcpy(aux32, metadata, 2 * sizeof(uint32_t));
3664
+ metadata += 2 * sizeof(uint32_t);
3665
+
3666
+ // Load q3 indices and gather magnitudes
3667
+ vuint8mf2_t v_q3_idx_u8 = __riscv_vle8_v_u8mf2(q3_indices, 16);
3668
+ q3_indices += 16;
3669
+
3670
+ vuint16m1_t v_q3_idx_u16 = __riscv_vwmulu_vx_u16m1(v_q3_idx_u8, 4, 16);
3671
+ vuint32m2_t v_q3_magnitudes_u32 = __riscv_vluxei16_v_u32m2(grid32, v_q3_idx_u16, 16);
3672
+ vint8m2_t v_q3_magnitudes = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u32m2_u8m2(v_q3_magnitudes_u32));
3673
+
3674
+ // --- Unpacking of Sign Indices ---
3675
+
3676
+ // 1. Load the 2 auxiliary 32-bit integers into a vector
3677
+ vuint32m1_t v_aux = __riscv_vle32_v_u32m1(aux32, 2);
3678
+
3679
+ // 2. Broadcast/Gather: replicate aux[0] to first 4 lanes, aux[1] to next 4 lanes
3680
+ vuint32m1_t v_aux_expanded = __riscv_vrgather_vv_u32m1(v_aux, v_gather_idx, 8);
3681
+
3682
+ // 3. Apply Shifts and Mask: ((val >> shift) & 127)
3683
+ vuint32m1_t v_s_vals_raw = __riscv_vand_vx_u32m1(__riscv_vsrl_vv_u32m1(v_aux_expanded, v_shifts, 8), 127, 8);
3684
+
3685
+ // 4. Narrow to u16 (required for vluxei index) and multiply by 8 (byte offset for u64 table)
3686
+ vuint16mf2_t sign_indices_byte_offset = __riscv_vsll_vx_u16mf2(__riscv_vncvt_x_x_w_u16mf2(v_s_vals_raw, 8), 3, 8);
3687
+
3688
+ // 5. Gather Signs
3689
+ vuint64m2_t v_s_vals_u64 = __riscv_vluxei16_v_u64m2(signs64, sign_indices_byte_offset, 8);
3690
+ vint8m2_t v_s_vals = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(v_s_vals_u64));
3691
+
3692
+ vint8m2_t v_q3_signed = __riscv_vmul_vv_i8m2(v_q3_magnitudes, v_s_vals, 64);
3693
+ vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_q8, v_q3_signed, 64);
3694
+
3695
+ vint16m2_t v_dot_1 = __riscv_vget_v_i16m4_i16m2(v_dot, 0);
3696
+ vint16m2_t v_dot_2 = __riscv_vget_v_i16m4_i16m2(v_dot, 1);
3697
+
3698
+ vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
3699
+ vint32m1_t v_sum_1 = __riscv_vwredsum_vs_i16m2_i32m1(v_dot_1, v_zero, 32);
3700
+ vint32m1_t v_sum_2 = __riscv_vwredsum_vs_i16m2_i32m1(v_dot_2, v_zero, 32);
3701
+
3702
+ int32_t sum1_i = __riscv_vmv_x_s_i32m1_i32(v_sum_1);
3703
+ int32_t sum2_i = __riscv_vmv_x_s_i32m1_i32(v_sum_2);
3704
+
3705
+ const float scale1_f = (float)(2 * (aux32[0] >> 28) + 1);
3706
+ const float scale2_f = (float)(2 * (aux32[1] >> 28) + 1);
3707
+
3708
+ block_sum += sum1_i * scale1_f + sum2_i * scale2_f;
3709
+ }
3710
+
3711
+ sumf += d * block_sum;
3712
+ }
3713
+ *s = 0.25f * sumf;
3714
+ }
3715
+ #endif
3716
+
3717
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3718
+ #if defined __riscv_v_intrinsic
3719
+ switch (__riscv_vlenb() * 8) {
3720
+ case 128:
3721
+ ggml_vec_dot_iq3_xxs_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
3722
+ break;
3723
+ case 256:
3724
+ ggml_vec_dot_iq3_xxs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
3725
+ break;
3726
+ default:
3727
+ ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3728
+ break;
3729
+ }
3730
+ #else
3731
+ ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
3732
+ #endif
3733
+ }
3734
+
3735
+ #if defined __riscv_v_intrinsic
3736
+ static NOINLINE void ggml_vec_dot_iq4_nl_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3737
+ assert(nrc == 1);
3738
+ UNUSED(nrc);
3739
+ UNUSED(bx);
3740
+ UNUSED(by);
3741
+ UNUSED(bs);
3742
+ assert(n % QK4_NL == 0);
3743
+ static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
3744
+
3745
+ const block_iq4_nl * GGML_RESTRICT x = vx;
3746
+ const block_q8_0 * GGML_RESTRICT y = vy;
3747
+
3748
+ const int nb = n / QK4_NL;
3749
+
3750
+ int ib = 0;
3751
+ float sumf = 0;
3752
+
3753
+ // Load the lookup table once.
3754
+ const vint8m2_t values = __riscv_vle8_v_i8m2(kvalues_iq4nl, 16);
3755
+ int acc1, acc2;
3756
+
3757
+ // We process 2 blocks at once.
3758
+ for (; ib + 1 < nb; ib += 2) {
3759
+ // Weights and activations.
3760
+ vuint8m1_t iq4_packed1 = __riscv_vle8_v_u8m1(x[ib + 0].qs, 16);
3761
+ vint8m2_t q8b1 = __riscv_vle8_v_i8m2(y[ib + 0].qs, 32);
3762
+ vuint8m1_t iq4_packed2 = __riscv_vle8_v_u8m1(x[ib + 1].qs, 16);
3763
+ vint8m2_t q8b2 = __riscv_vle8_v_i8m2(y[ib + 1].qs, 32);
3764
+
3765
+ // Unpack the weight blocks.
3766
+ vuint8m2_t iq4bits1 = __riscv_vcreate_v_u8m1_u8m2(
3767
+ __riscv_vand_vx_u8m1(iq4_packed1, 0xf, 16),
3768
+ __riscv_vsrl_vx_u8m1(iq4_packed1, 4, 16)
3769
+ );
3770
+ vuint8m2_t iq4bits2 = __riscv_vcreate_v_u8m1_u8m2(
3771
+ __riscv_vand_vx_u8m1(iq4_packed2, 0xf, 16),
3772
+ __riscv_vsrl_vx_u8m1(iq4_packed2, 4, 16)
3773
+ );
3774
+
3775
+ // Gather values from the lookup table.
3776
+ vint8m2_t iq4b1 = __riscv_vrgather_vv_i8m2(values, iq4bits1, 32);
3777
+ vint8m2_t iq4b2 = __riscv_vrgather_vv_i8m2(values, iq4bits2, 32);
3778
+
3779
+ // Accumulation.
3780
+ vint16m4_t sum1 = __riscv_vwmul_vv_i16m4(q8b1, iq4b1, 32);
3781
+ vint16m4_t sum2 = __riscv_vwmul_vv_i16m4(q8b2, iq4b2, 32);
3782
+ __riscv_vse32_v_i32m1(&acc1,__riscv_vwredsum_vs_i16m4_i32m1(sum1, __riscv_vmv_v_x_i32m1(0, 1), 32), 1);
3783
+ __riscv_vse32_v_i32m1(&acc2,__riscv_vwredsum_vs_i16m4_i32m1(sum2, __riscv_vmv_v_x_i32m1(0, 1), 32), 1);
3784
+ sumf += ((GGML_CPU_FP16_TO_FP32(x[ib + 0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * acc1));
3785
+ sumf += ((GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * acc2));
3786
+ }
3787
+
3788
+ *s = sumf;
3789
+ }
3790
+
3791
+ static NOINLINE void ggml_vec_dot_iq4_nl_q8_0_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3792
+ assert(nrc == 1);
3793
+ UNUSED(nrc);
3794
+ UNUSED(bx);
3795
+ UNUSED(by);
3796
+ UNUSED(bs);
3797
+ assert(n % QK4_NL == 0);
3798
+ static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
3799
+
3800
+ const block_iq4_nl * GGML_RESTRICT x = vx;
3801
+ const block_q8_0 * GGML_RESTRICT y = vy;
3802
+
3803
+ const int nb = n / QK4_NL;
3804
+
3805
+ int ib = 0;
3806
+ float sumf = 0;
3807
+
3808
+ // Load the lookup table once.
3809
+ const vint8mf2_t values = __riscv_vle8_v_i8mf2(kvalues_iq4nl, 16);
3810
+ int acc1, acc2;
3811
+
3812
+ // We process 2 blocks at once.
3813
+ for (; ib + 1 < nb; ib += 2) {
3814
+ // Weights and activations.
3815
+ vuint8mf2_t iq4_packed1 = __riscv_vle8_v_u8mf2(x[ib + 0].qs, 16);
3816
+ vint8mf2_t q8b_lo1 = __riscv_vle8_v_i8mf2(y[ib + 0].qs, 16);
3817
+ vint8mf2_t q8b_hi1 = __riscv_vle8_v_i8mf2(y[ib + 0].qs + 16, 16);
3818
+ vuint8mf2_t iq4_packed2 = __riscv_vle8_v_u8mf2(x[ib + 1].qs, 16);
3819
+ vint8mf2_t q8b_lo2 = __riscv_vle8_v_i8mf2(y[ib + 1].qs, 16);
3820
+ vint8mf2_t q8b_hi2 = __riscv_vle8_v_i8mf2(y[ib + 1].qs + 16, 16);
3821
+
3822
+ // Unpack the weight blocks.
3823
+ vuint8mf2_t iq4bits_lo1 = __riscv_vand_vx_u8mf2(iq4_packed1, 0xf, 16);
3824
+ vuint8mf2_t iq4bits_hi1 = __riscv_vsrl_vx_u8mf2(iq4_packed1, 4, 16);
3825
+ vuint8mf2_t iq4bits_lo2 = __riscv_vand_vx_u8mf2(iq4_packed2, 0xf, 16);
3826
+ vuint8mf2_t iq4bits_hi2 = __riscv_vsrl_vx_u8mf2(iq4_packed2, 4, 16);
3827
+
3828
+ // Gather values from the lookup table.
3829
+ vint8mf2_t iq4b_lo1 = __riscv_vrgather_vv_i8mf2(values, iq4bits_lo1, 16);
3830
+ vint8mf2_t iq4b_hi1 = __riscv_vrgather_vv_i8mf2(values, iq4bits_hi1, 16);
3831
+ vint8mf2_t iq4b_lo2 = __riscv_vrgather_vv_i8mf2(values, iq4bits_lo2, 16);
3832
+ vint8mf2_t iq4b_hi2 = __riscv_vrgather_vv_i8mf2(values, iq4bits_hi2, 16);
3833
+
3834
+ // Accumulation.
3835
+ vint16m1_t sum1 = __riscv_vwmul_vv_i16m1(q8b_lo1, iq4b_lo1, 16);
3836
+ sum1 = __riscv_vwmacc_vv_i16m1(sum1, q8b_hi1, iq4b_hi1, 16);
3837
+ vint16m1_t sum2 = __riscv_vwmul_vv_i16m1(q8b_lo2, iq4b_lo2, 16);
3838
+ sum2 = __riscv_vwmacc_vv_i16m1(sum2, q8b_hi2, iq4b_hi2, 16);
3839
+ __riscv_vse32_v_i32m1(&acc1,__riscv_vwredsum_vs_i16m1_i32m1(sum1, __riscv_vmv_v_x_i32m1(0, 1), 16), 1);
3840
+ __riscv_vse32_v_i32m1(&acc2,__riscv_vwredsum_vs_i16m1_i32m1(sum2, __riscv_vmv_v_x_i32m1(0, 1), 16), 1);
3841
+ sumf += ((GGML_CPU_FP16_TO_FP32(x[ib + 0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * acc1));
3842
+ sumf += ((GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * acc2));
3843
+ }
3844
+
3845
+ *s = sumf;
3846
+ }
3847
+ #endif
3848
+
3849
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3850
+ #if defined __riscv_v_intrinsic
3851
+ switch (__riscv_vlenb() * 8) {
3852
+ case 128:
3853
+ ggml_vec_dot_iq4_nl_q8_0_vl128(n, s, bs, vx, bx, vy, by, nrc);
3854
+ break;
3855
+ default: // 256 and above
3856
+ ggml_vec_dot_iq4_nl_q8_0_vl256(n, s, bs, vx, bx, vy, by, nrc);
3857
+ break;
3858
+ }
3859
+ #else
3860
+ ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
3861
+ #endif
3862
+ }
3863
+
3864
+ #if defined __riscv_v_intrinsic
3865
+ static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3866
+ assert(nrc == 1);
3867
+ UNUSED(nrc);
3868
+ UNUSED(bx);
3869
+ UNUSED(by);
3870
+ UNUSED(bs);
3871
+ assert(n % QK_K == 0);
3872
+
3873
+ const block_iq4_xs * GGML_RESTRICT x = vx;
3874
+ const block_q8_K * GGML_RESTRICT y = vy;
3875
+
3876
+ const int nb = n / QK_K;
3877
+
3878
+ const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
3879
+ float sumf = 0;
3880
+
3881
+ for (int ibl = 0; ibl < nb; ++ibl) {
3882
+ const int8_t * q8 = y[ibl].qs;
3883
+ const uint8_t * iq4 = x[ibl].qs;
3884
+ uint16_t h = x[ibl].scales_h;
3885
+
3886
+ // We process 2 sub-blocks together.
3887
+ int sumi1 = 0, sumi2 = 0;
3888
+ #pragma GCC unroll 1
3889
+ for (int ib = 0; ib < QK_K / 64; ++ib) {
3890
+ // Load the packed weights.
3891
+ const vuint8m2_t iq4_packed = __riscv_vle8_v_u8m2(iq4, 32);
3892
+ iq4 += 32;
3893
+
3894
+ // Unpack the weight blocks.
3895
+ const vuint8m2_t iq4bits_lo = __riscv_vand_vx_u8m2(iq4_packed, 0xf, 32);
3896
+ const vuint8m2_t iq4bits_hi = __riscv_vsrl_vx_u8m2(iq4_packed, 4, 32);
3897
+ const vuint8m4_t iq4bits = __riscv_vcreate_v_u8m2_u8m4(iq4bits_lo, iq4bits_hi);
3898
+ const vuint8m4_t iq4bits_reorder = __riscv_vcreate_v_u8m1_u8m4(
3899
+ __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 0), 16),
3900
+ __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 2), 16),
3901
+ __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 1), 16),
3902
+ __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 3), 16)
3903
+ );
3904
+ const vint8m4_t iq4b = __riscv_vrgather_vv_i8m4(values, iq4bits_reorder, 64);
3905
+
3906
+ // Multiply with activations.
3907
+ const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 64);
3908
+ q8 += 64;
3909
+ const vint16m8_t prod = __riscv_vwmul_vv_i16m8(iq4b, q8b, 64);
3910
+
3911
+ // Reduce separately.
3912
+ const int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
3913
+ const int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
3914
+
3915
+ const int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
3916
+ const int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
3917
+ h >>= 4;
3918
+
3919
+ sumi1 += acc0 * ls1;
3920
+ sumi2 += acc1 * ls2;
3921
+
3922
+ __asm__ __volatile__("" ::: "memory");
3923
+ }
3924
+
3925
+ sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
3926
+ }
3927
+
3928
+ *s = sumf;
3929
+ }
3930
+
3931
+ static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3932
+ assert(nrc == 1);
3933
+ UNUSED(nrc);
3934
+ UNUSED(bx);
3935
+ UNUSED(by);
3936
+ UNUSED(bs);
3937
+ assert(n % QK_K == 0);
3938
+
3939
+ const block_iq4_xs * GGML_RESTRICT x = vx;
3940
+ const block_q8_K * GGML_RESTRICT y = vy;
3941
+
3942
+ const int nb = n / QK_K;
3943
+
3944
+ const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
3945
+ float sumf = 0;
3946
+
3947
+ // Indices for re-ordering IQ4 data.
3948
+ uint16_t index[16] = {
3949
+ 0, 1, 8, 9,
3950
+ 2, 3, 10, 11,
3951
+ 4, 5,12, 13,
3952
+ 6, 7, 14, 15,
3953
+ };
3954
+ vuint16m1_t i_vec = __riscv_vle16_v_u16m1(index, 16);
3955
+
3956
+ for (int ibl = 0; ibl < nb; ++ibl) {
3957
+ const int8_t * q8 = y[ibl].qs;
3958
+ const uint8_t * iq4 = x[ibl].qs;
3959
+ uint16_t h = x[ibl].scales_h;
3960
+
3961
+ int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
3962
+
3963
+ #pragma GCC unroll 1
3964
+ for (int ib = 0; ib < QK_K / 128; ++ib) {
3965
+ // Weights and activations.
3966
+ vuint8m2_t iq4_packed = __riscv_vle8_v_u8m2(iq4, 64);
3967
+ iq4 += 64;
3968
+
3969
+ // Unpack the weight blocks.
3970
+ vuint8m2_t iq4bits_lo = __riscv_vand_vx_u8m2(iq4_packed, 0xf, 64);
3971
+ vuint8m2_t iq4bits_hi = __riscv_vsrl_vx_u8m2(iq4_packed, 4, 64);
3972
+ vuint8m4_t iq4bits = __riscv_vcreate_v_u8m2_u8m4(iq4bits_lo, iq4bits_hi);
3973
+ vuint8m4_t iq4bits_reorder = __riscv_vreinterpret_v_u64m4_u8m4(__riscv_vrgatherei16_vv_u64m4(__riscv_vreinterpret_v_u8m4_u64m4(iq4bits), i_vec, 16));
3974
+ vint8m4_t iq4b = __riscv_vrgather_vv_i8m4(values, iq4bits_reorder, 128);
3975
+
3976
+ __asm__ __volatile__("" ::: "memory");
3977
+
3978
+ // Multiply with activations.
3979
+ vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 128);
3980
+ vint16m8_t prod = __riscv_vwmul_vv_i16m8(iq4b, q8b, 128);
3981
+ q8 += 128;
3982
+
3983
+ __asm__ __volatile__("" ::: "memory");
3984
+
3985
+ // Reduce separately.
3986
+ int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
3987
+ int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
3988
+ int acc2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 32));
3989
+ int acc3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 32));
3990
+
3991
+ int ls1 = ((x[ibl].scales_l[ib * 2 + 0] & 0xf) | ((h << 4) & 0x30)) - 32;
3992
+ int ls2 = ((x[ibl].scales_l[ib * 2 + 0] >> 4) | ((h << 2) & 0x30)) - 32;
3993
+ int ls3 = ((x[ibl].scales_l[ib * 2 + 1] & 0xf) | ((h << 0) & 0x30)) - 32;
3994
+ int ls4 = ((x[ibl].scales_l[ib * 2 + 1] >> 4) | ((h >> 2) & 0x30)) - 32;
3995
+ h >>= 8;
3996
+
3997
+ sumi1 += acc0 * ls1;
3998
+ sumi2 += acc1 * ls2;
3999
+ sumi3 += acc2 * ls3;
4000
+ sumi4 += acc3 * ls4;
4001
+
4002
+ __asm__ __volatile__("" ::: "memory");
4003
+ }
4004
+
4005
+ sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2 + sumi3 + sumi4);
4006
+ }
4007
+
4008
+ *s = sumf;
4009
+ }
4010
+ #endif
4011
+
4012
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4013
+ #if defined __riscv_v_intrinsic
4014
+ switch (__riscv_vlenb() * 8) {
4015
+ case 128:
4016
+ ggml_vec_dot_iq4_xs_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
4017
+ break;
4018
+ case 256:
4019
+ ggml_vec_dot_iq4_xs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
4020
+ break;
4021
+ default:
4022
+ ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4023
+ break;
4024
+ }
4025
+ #else
4026
+ ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4027
+ #endif
4028
+ }
4029
+
4030
+ #if defined __riscv_v_intrinsic
4031
+ static NOINLINE void ggml_vec_dot_tq1_0_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4032
+ assert(nrc == 1);
4033
+ UNUSED(nrc);
4034
+ UNUSED(bx);
4035
+ UNUSED(by);
4036
+ UNUSED(bs);
4037
+
4038
+ const block_tq1_0 * GGML_RESTRICT x = vx;
4039
+ const block_q8_K * GGML_RESTRICT y = vy;
4040
+
4041
+ const int nb = n / QK_K;
4042
+
4043
+ float sumf = 0.0f;
4044
+ uint8_t pow[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
4045
+
4046
+ for (int i = 0; i < nb; i++) {
4047
+ const uint8_t * GGML_RESTRICT tq = x[i].qs;
4048
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
4049
+
4050
+ // First loop.
4051
+ vint16m4_t suml1;
4052
+ {
4053
+ const int vl = 32;
4054
+ const vuint8m2_t tqb = __riscv_vle8_v_u8m2(tq, vl);
4055
+ tq += 32;
4056
+
4057
+ {
4058
+ const vuint16m4_t tq0 = __riscv_vsrl_vx_u16m4(__riscv_vwmulu_vx_u16m4(tqb, 3, vl), 8, vl);
4059
+ const vint16m4_t q80 = __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(q8, vl), vl);
4060
+ suml1 = __riscv_vmul_vv_i16m4(__riscv_vreinterpret_v_u16m4_i16m4(__riscv_vsub_vx_u16m4(tq0, 1, vl)), q80, vl);
4061
+ q8 += 32;
4062
+ }
4063
+
4064
+ uint8_t pow3 = 3;
4065
+ #pragma GCC unroll 1
4066
+ for (int t = 0; t < 4; t++) {
4067
+ const vuint16m4_t tqn = __riscv_vsrl_vx_u16m4(__riscv_vwmulu_vx_u16m4(__riscv_vmul_vx_u8m2(tqb, pow3, vl), 3, vl), 8, vl);
4068
+ const vint16m4_t q8n = __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(q8, vl), vl);
4069
+ suml1 = __riscv_vmacc_vv_i16m4(suml1, __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vsub_vx_u16m4(tqn, 1, vl)), q8n, vl);
4070
+ pow3 *= 3;
4071
+ q8 += 32;
4072
+ }
4073
+ }
4074
+
4075
+ // Second loop.
4076
+ vint16m2_t suml2;
4077
+ {
4078
+ const int vl = 16;
4079
+ const vuint8m1_t tqb = __riscv_vle8_v_u8m1(tq, vl);
4080
+
4081
+ {
4082
+ const vuint16m2_t tq0 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(tqb, 3, vl), 8, vl);
4083
+ const vint16m2_t q80 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(q8, vl), vl);
4084
+ suml2 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq0, 1, vl)), q80, vl);
4085
+ q8 += 16;
4086
+ }
4087
+
4088
+ uint8_t pow3 = 3;
4089
+ #pragma GCC unroll 1
4090
+ for (int t = 0; t < 4; t++) {
4091
+ const vuint16m2_t tqn = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tqb, pow3, vl), 3, vl), 8, vl);
4092
+ const vint16m2_t q8n = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(q8, vl), vl);
4093
+ suml2 = __riscv_vmacc_vv_i16m2(suml2, __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tqn, 1, vl)), q8n, vl);
4094
+ pow3 *= 3;
4095
+ q8 += 16;
4096
+ }
4097
+ }
4098
+
4099
+ // Third loop.
4100
+ vint16m2_t suml3;
4101
+ {
4102
+ const int vl = 16;
4103
+
4104
+ uint32_t qh;
4105
+ memcpy(&qh, &x[i].qh[0], 4);
4106
+ // Prevent fusion with vmv.
4107
+ __asm__ __volatile__("" : "+r"(qh));
4108
+ const vuint8m1_t tqb = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vmv_v_x_u32m1(qh, vl / 4));
4109
+
4110
+ const vuint8m1_t p = __riscv_vle8_v_u8m1(pow, vl);
4111
+
4112
+ const vuint16m2_t tq0 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vv_u8m1(tqb, p, vl), 3, vl), 8, vl);
4113
+
4114
+ const vint16m2_t q80 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(q8, vl), vl);
4115
+
4116
+ suml3 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq0, 1, vl)), q80, vl);
4117
+ }
4118
+
4119
+ vint16m2_t sumb = __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(suml1, 0), __riscv_vget_v_i16m4_i16m2(suml1, 1), 16);
4120
+ sumb = __riscv_vadd_vv_i16m2(sumb, suml2, 16);
4121
+ sumb = __riscv_vadd_vv_i16m2(sumb, suml3, 16);
4122
+
4123
+ vint32m1_t sum = __riscv_vwredsum_vs_i16m2_i32m1(sumb, __riscv_vmv_v_x_i32m1(0, 1), 16);
4124
+ sumf += __riscv_vmv_x_s_i32m1_i32(sum) * y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
4125
+ }
4126
+
4127
+ *s = sumf;
4128
+ }
4129
+
4130
+ static NOINLINE void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4131
+ assert(nrc == 1);
4132
+ UNUSED(nrc);
4133
+ UNUSED(bx);
4134
+ UNUSED(by);
4135
+ UNUSED(bs);
4136
+
4137
+ const block_tq1_0 * GGML_RESTRICT x = vx;
4138
+ const block_q8_K * GGML_RESTRICT y = vy;
4139
+
4140
+ const int nb = n / QK_K;
4141
+
4142
+ float sumf = 0.0f;
4143
+ uint8_t pow[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
4144
+
4145
+ for (int i = 0; i < nb; i++) {
4146
+ // First loop.
4147
+ vint16m2_t suml1;
4148
+ {
4149
+ const int vl = 32;
4150
+ vuint8m1_t tq = __riscv_vle8_v_u8m1(x[i].qs, vl);
4151
+
4152
+ vuint16m2_t tq0 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(tq, 3, vl), 8, vl);
4153
+ vuint16m2_t tq1 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 3, vl), 3, vl), 8, vl);
4154
+ vuint16m2_t tq2 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 9, vl), 3, vl), 8, vl);
4155
+ vuint16m2_t tq3 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 27, vl), 3, vl), 8, vl);
4156
+ vuint16m2_t tq4 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 81, vl), 3, vl), 8, vl);
4157
+
4158
+ vint16m2_t q80 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 0, vl), vl);
4159
+ vint16m2_t q81 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 32, vl), vl);
4160
+ vint16m2_t q82 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 64, vl), vl);
4161
+ vint16m2_t q83 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 96, vl), vl);
4162
+ vint16m2_t q84 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 128, vl), vl);
4163
+
4164
+ vint16m2_t sum0 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq0, 1, vl)), q80, vl);
4165
+ vint16m2_t sum1 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq1, 1, vl)), q81, vl);
4166
+ vint16m2_t sum2 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq2, 1, vl)), q82, vl);
4167
+ vint16m2_t sum3 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq3, 1, vl)), q83, vl);
4168
+ vint16m2_t sum4 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq4, 1, vl)), q84, vl);
4169
+
4170
+ vint16m2_t sumi0 = __riscv_vadd_vv_i16m2(sum0, sum1, vl);
4171
+ vint16m2_t sumi1 = __riscv_vadd_vv_i16m2(sum2, sum3, vl);
4172
+ suml1 = __riscv_vadd_vv_i16m2(sum4, __riscv_vadd_vv_i16m2(sumi0, sumi1, vl), vl);
4173
+ }
4174
+
4175
+ // Second loop.
4176
+ vint16m1_t suml2;
4177
+ {
4178
+ const int vl = 16;
4179
+ vuint8mf2_t tq = __riscv_vle8_v_u8mf2(x[i].qs + 32, vl);
4180
+
4181
+ vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(tq, 3 * 1, vl), 8, vl);
4182
+ vuint16m1_t tq1 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 3, vl), 3, vl), 8, vl);
4183
+ vuint16m1_t tq2 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 9, vl), 3, vl), 8, vl);
4184
+ vuint16m1_t tq3 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 27, vl), 3, vl), 8, vl);
4185
+ vuint16m1_t tq4 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 81, vl), 3, vl), 8, vl);
4186
+
4187
+ vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 160, vl), vl);
4188
+ vint16m1_t q81 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 176, vl), vl);
4189
+ vint16m1_t q82 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 192, vl), vl);
4190
+ vint16m1_t q83 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 208, vl), vl);
4191
+ vint16m1_t q84 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 224, vl), vl);
4192
+
4193
+ vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
4194
+ vint16m1_t sum1 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq1, 1, vl)), q81, vl);
4195
+ vint16m1_t sum2 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq2, 1, vl)), q82, vl);
4196
+ vint16m1_t sum3 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq3, 1, vl)), q83, vl);
4197
+ vint16m1_t sum4 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq4, 1, vl)), q84, vl);
4198
+
4199
+ vint16m1_t sumi0 = __riscv_vadd_vv_i16m1(sum0, sum1, vl);
4200
+ vint16m1_t sumi1 = __riscv_vadd_vv_i16m1(sum2, sum3, vl);
4201
+ suml2 = __riscv_vadd_vv_i16m1(sum4, __riscv_vadd_vv_i16m1(sumi0, sumi1, vl), vl);
4202
+ }
4203
+
4204
+ // Third loop.
4205
+ vint16m1_t suml3;
4206
+ {
4207
+ const int vl = 16;
4208
+
4209
+ uint32_t qh;
4210
+ memcpy(&qh, &x[i].qh[0], 4);
4211
+ // Prevent fusion with vmv.
4212
+ __asm__ __volatile__("" : "+r"(qh));
4213
+ vuint8mf2_t tq = __riscv_vreinterpret_v_u32mf2_u8mf2(__riscv_vmv_v_x_u32mf2(qh, vl / 4));
4214
+
4215
+ vuint8mf2_t p = __riscv_vle8_v_u8mf2(pow, vl);
4216
+
4217
+ vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vv_u8mf2(tq, p, vl), 3, vl), 8, vl);
4218
+
4219
+ vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 240, vl), vl);
4220
+
4221
+ suml3 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
4222
+ }
4223
+
4224
+ vint16m1_t sumb = __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(suml1, 0), __riscv_vget_v_i16m2_i16m1(suml1, 1), 16);
4225
+ sumb = __riscv_vadd_vv_i16m1(sumb, __riscv_vadd_vv_i16m1(suml2, suml3, 16), 16);
4226
+
4227
+ vint32m1_t sum = __riscv_vwredsum_vs_i16m1_i32m1(sumb, __riscv_vmv_v_x_i32m1(0, 1), 16);
4228
+ sumf += __riscv_vmv_x_s_i32m1_i32(sum) * y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
4229
+ }
4230
+
4231
+ *s = sumf;
4232
+ }
4233
+ #endif
4234
+
4235
+ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4236
+ #if defined __riscv_v_intrinsic
4237
+ switch (__riscv_vlenb() * 8) {
4238
+ case 128:
4239
+ ggml_vec_dot_tq1_0_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
4240
+ break;
4241
+ case 256:
4242
+ ggml_vec_dot_tq1_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
4243
+ break;
4244
+ default:
4245
+ ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4246
+ break;
4247
+ }
4248
+ #else
4249
+ ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4250
+ #endif
4251
+ }
4252
+
4253
+ #if defined __riscv_v_intrinsic
4254
+ static NOINLINE void ggml_vec_dot_tq2_0_q8_K_vl128(const int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4255
+ assert(n % QK_K == 0);
4256
+ assert(nrc == 1);
4257
+ UNUSED(nrc);
4258
+ UNUSED(bx);
4259
+ UNUSED(by);
4260
+ UNUSED(bs);
4261
+
4262
+ const block_tq2_0 * GGML_RESTRICT x = vx;
4263
+ const block_q8_K * GGML_RESTRICT y = vy;
4264
+
4265
+ const int nb = n / QK_K;
4266
+ float sumf = 0.0f;
4267
+ for (int i = 0; i < nb; ++i) {
4268
+ int32_t sumi = 0;
4269
+
4270
+ for (size_t j = 0; j < sizeof(x[0].qs); j += 32) {
4271
+ const int8_t * py0 = &y[i].qs[j * 4 + 0 * 32];
4272
+ const int8_t * py1 = &y[i].qs[j * 4 + 1 * 32];
4273
+ const int8_t * py2 = &y[i].qs[j * 4 + 2 * 32];
4274
+ const int8_t * py3 = &y[i].qs[j * 4 + 3 * 32];
4275
+ const uint8_t* px = &x[i].qs[j];
4276
+
4277
+ size_t vl = __riscv_vsetvl_e16m4(32);
4278
+ vint16m4_t vacc16 = __riscv_vmv_v_x_i16m4(0, vl);
4279
+
4280
+ // Load Raw Packed elements
4281
+ vl = __riscv_vsetvl_e8m2(32);
4282
+ vuint8m2_t vx_u8 = __riscv_vle8_v_u8m2(px, vl);
4283
+
4284
+ // Process bits 1:0
4285
+ {
4286
+ // Unpack
4287
+ vuint8m2_t t0 = __riscv_vand_vx_u8m2(vx_u8, 0x03, vl);
4288
+ vint8m2_t vq = __riscv_vsub_vx_i8m2(__riscv_vreinterpret_v_u8m2_i8m2(t0), 1, vl);
4289
+ vint8m2_t vy = __riscv_vle8_v_i8m2(py0, vl);
4290
+ // Accumulate
4291
+ vacc16 = __riscv_vwmacc_vv_i16m4(vacc16, vq, vy, vl);
4292
+ }
4293
+ __asm__ volatile("" ::: "memory");
4294
+ // Process bits 3:2
4295
+ {
4296
+ vuint8m2_t t1 = __riscv_vsrl_vx_u8m2(vx_u8, 2, vl);
4297
+ t1 = __riscv_vand_vx_u8m2(t1, 0x03, vl);
4298
+ vint8m2_t vq = __riscv_vsub_vx_i8m2(__riscv_vreinterpret_v_u8m2_i8m2(t1), 1, vl);
4299
+
4300
+ vint8m2_t vy = __riscv_vle8_v_i8m2(py1, vl);
4301
+ vacc16 = __riscv_vwmacc_vv_i16m4(vacc16, vq, vy, vl);
4302
+ }
4303
+ __asm__ volatile("" ::: "memory");
4304
+ // Process bits 5:4
4305
+ {
4306
+ vuint8m2_t t2 = __riscv_vsrl_vx_u8m2(vx_u8, 4, vl);
4307
+ t2 = __riscv_vand_vx_u8m2(t2, 0x03, vl);
4308
+ vint8m2_t vq = __riscv_vsub_vx_i8m2(__riscv_vreinterpret_v_u8m2_i8m2(t2), 1, vl);
4309
+
4310
+ vint8m2_t vy = __riscv_vle8_v_i8m2(py2, vl);
4311
+ vacc16 = __riscv_vwmacc_vv_i16m4(vacc16, vq, vy, vl);
4312
+ }
4313
+ __asm__ volatile("" ::: "memory");
4314
+ // Process bits 7:6
4315
+ {
4316
+ vuint8m2_t t3 = __riscv_vsrl_vx_u8m2(vx_u8, 6, vl);
4317
+ vint8m2_t vq = __riscv_vsub_vx_i8m2(__riscv_vreinterpret_v_u8m2_i8m2(t3), 1, vl);
4318
+
4319
+ vint8m2_t vy = __riscv_vle8_v_i8m2(py3, vl);
4320
+ vacc16 = __riscv_vwmacc_vv_i16m4(vacc16, vq, vy, vl);
4321
+ }
4322
+ __asm__ volatile("" ::: "memory");
4323
+ vl = __riscv_vsetvl_e16m4(32);
4324
+ vint32m1_t vzero32 = __riscv_vmv_v_x_i32m1(0, 1);
4325
+ vint32m1_t vred32 = __riscv_vwredsum_vs_i16m4_i32m1(vacc16, vzero32, vl);
4326
+ sumi += __riscv_vmv_x_s_i32m1_i32(vred32);
4327
+ }
4328
+
4329
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
4330
+ sumf += (float)sumi * d;
4331
+ }
4332
+
4333
+ *s = sumf;
4334
+ }
4335
+
4336
+ static NOINLINE void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4337
+ assert(n % QK_K == 0);
4338
+ assert(nrc == 1);
4339
+ UNUSED(nrc);
4340
+ UNUSED(bx);
4341
+ UNUSED(by);
4342
+ UNUSED(bs);
4343
+
4344
+ const block_tq2_0 * GGML_RESTRICT x = vx;
4345
+ const block_q8_K * GGML_RESTRICT y = vy;
4346
+
4347
+ const int nb = n / QK_K;
4348
+
4349
+ float sumf = 0.0f;
4350
+ for (int i = 0; i < nb; ++i) {
4351
+ int32_t sumi = 0;
4352
+
4353
+ for (size_t j = 0; j < sizeof(x[0].qs); j += 32) {
4354
+ const int8_t * py0 = &y[i].qs[j * 4 + 0 * 32];
4355
+ const int8_t * py1 = &y[i].qs[j * 4 + 1 * 32];
4356
+ const int8_t * py2 = &y[i].qs[j * 4 + 2 * 32];
4357
+ const int8_t * py3 = &y[i].qs[j * 4 + 3 * 32];
4358
+ const uint8_t* px = &x[i].qs[j];
4359
+
4360
+ size_t vlmax_16m2 = __riscv_vsetvl_e16m2(32);
4361
+ vint16m2_t vacc16 = __riscv_vmv_v_x_i16m2(0, vlmax_16m2);
4362
+
4363
+ size_t vl = __riscv_vsetvl_e8m1(32);
4364
+
4365
+ vuint8m1_t vx_u8 = __riscv_vle8_v_u8m1(px, vl);
4366
+
4367
+ vint8m1_t vy0 = __riscv_vle8_v_i8m1(py0 , vl);
4368
+ vint8m1_t vy1 = __riscv_vle8_v_i8m1(py1, vl);
4369
+ vint8m1_t vy2 = __riscv_vle8_v_i8m1(py2, vl);
4370
+ vint8m1_t vy3 = __riscv_vle8_v_i8m1(py3, vl);
4371
+
4372
+ // l=0 (bits 1:0)
4373
+ vuint8m1_t t0 = __riscv_vand_vx_u8m1(vx_u8, 0x03, vl);
4374
+ vint8m1_t vq0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t0), 1, vl);
4375
+
4376
+ // l=1 (bits 3:2)
4377
+ vuint8m1_t t1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 2, vl), 0x03, vl);
4378
+ vint8m1_t vq1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t1), 1, vl);
4379
+
4380
+ // l=2 (bits 5:4)
4381
+ vuint8m1_t t2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 4, vl), 0x03, vl);
4382
+ vint8m1_t vq2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t2), 1, vl);
4383
+
4384
+ // l=3 (bits 7:6)
4385
+ vuint8m1_t t3 = __riscv_vsrl_vx_u8m1(vx_u8, 6, vl); // No final AND needed as vsrl shifts in zeros
4386
+ vint8m1_t vq3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t3), 1, vl);
4387
+
4388
+ // 4. Multiply and accumulate
4389
+ vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq0, vy0, vl);
4390
+ vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq1, vy1, vl);
4391
+ vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq2, vy2, vl);
4392
+ vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq3, vy3, vl);
4393
+
4394
+ vlmax_16m2 = __riscv_vsetvl_e16m2(32);
4395
+ vint32m1_t vzero32 = __riscv_vmv_v_x_i32m1(0, 1);
4396
+ vint32m1_t vred32 = __riscv_vwredsum_vs_i16m2_i32m1(vacc16, vzero32, vlmax_16m2);
4397
+
4398
+ sumi += __riscv_vmv_x_s_i32m1_i32(vred32);
4399
+ }
4400
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
4401
+ sumf += (float)sumi * d;
4402
+ }
4403
+
4404
+ *s = sumf;
4405
+ }
4406
+ #endif
4407
+
4408
+ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4409
+ #if defined __riscv_v_intrinsic
4410
+ switch (__riscv_vlenb() * 8) {
4411
+ case 128:
4412
+ ggml_vec_dot_tq2_0_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
4413
+ break;
4414
+ case 256:
4415
+ ggml_vec_dot_tq2_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
4416
+ break;
4417
+ default:
4418
+ ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4419
+ break;
4420
+ }
4421
+ #else
4422
+ ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
4423
+ #endif
4424
+ }
4425
+
4426
+ #if defined __riscv_v_intrinsic
4427
+ static NOINLINE void ggml_vec_dot_mxfp4_q8_0_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4428
+ assert(nrc == 1);
4429
+ UNUSED(nrc);
4430
+ UNUSED(bx);
4431
+ UNUSED(by);
4432
+ UNUSED(bs);
4433
+ assert(n % QK_MXFP4 == 0);
4434
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
4435
+
4436
+ const block_mxfp4 * GGML_RESTRICT x = vx;
4437
+ const block_q8_0 * GGML_RESTRICT y = vy;
4438
+
4439
+ const int nb = n / QK_MXFP4;
4440
+
4441
+ int ib = 0;
4442
+ float sumf = 0;
4443
+
4444
+ // Load the lookup table once.
4445
+ const vint8m2_t values = __riscv_vle8_v_i8m2(kvalues_mxfp4, 16);
4446
+ int acc1, acc2;
4447
+
4448
+ // We process 2 blocks at once.
4449
+ for (; ib + 1 < nb; ib += 2) {
4450
+ // Weights and activations.
4451
+ vuint8m1_t mx_packed1 = __riscv_vle8_v_u8m1(x[ib + 0].qs, 16);
4452
+ vint8m2_t q8b1 = __riscv_vle8_v_i8m2(y[ib + 0].qs, 32);
4453
+ vuint8m1_t mx_packed2 = __riscv_vle8_v_u8m1(x[ib + 1].qs, 16);
4454
+ vint8m2_t q8b2 = __riscv_vle8_v_i8m2(y[ib + 1].qs, 32);
4455
+
4456
+ // Unpack the weight blocks.
4457
+ vuint8m2_t mxbits1 = __riscv_vcreate_v_u8m1_u8m2(
4458
+ __riscv_vand_vx_u8m1(mx_packed1, 0xf, 16),
4459
+ __riscv_vsrl_vx_u8m1(mx_packed1, 4, 16)
4460
+ );
4461
+ vuint8m2_t mxbits2 = __riscv_vcreate_v_u8m1_u8m2(
4462
+ __riscv_vand_vx_u8m1(mx_packed2, 0xf, 16),
4463
+ __riscv_vsrl_vx_u8m1(mx_packed2, 4, 16)
4464
+ );
4465
+
4466
+ // Gather values from the lookup table.
4467
+ vint8m2_t mxb1 = __riscv_vrgather_vv_i8m2(values, mxbits1, 32);
4468
+ vint8m2_t mxb2 = __riscv_vrgather_vv_i8m2(values, mxbits2, 32);
4469
+
4470
+ // Accumulation.
4471
+ vint16m4_t sum1 = __riscv_vwmul_vv_i16m4(q8b1, mxb1, 32);
4472
+ vint16m4_t sum2 = __riscv_vwmul_vv_i16m4(q8b2, mxb2, 32);
4473
+ __riscv_vse32_v_i32m1(&acc1,__riscv_vwredsum_vs_i16m4_i32m1(sum1, __riscv_vmv_v_x_i32m1(0, 1), 32), 1);
4474
+ __riscv_vse32_v_i32m1(&acc2,__riscv_vwredsum_vs_i16m4_i32m1(sum2, __riscv_vmv_v_x_i32m1(0, 1), 32), 1);
4475
+ sumf += ((GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * acc1));
4476
+ sumf += ((GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * acc2));
4477
+ }
4478
+
4479
+ *s = sumf;
4480
+ }
4481
+
4482
+ static NOINLINE void ggml_vec_dot_mxfp4_q8_0_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4483
+ assert(nrc == 1);
4484
+ UNUSED(nrc);
4485
+ UNUSED(bx);
4486
+ UNUSED(by);
4487
+ UNUSED(bs);
4488
+ assert(n % QK_MXFP4 == 0);
4489
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
4490
+
4491
+ const block_mxfp4 * GGML_RESTRICT x = vx;
4492
+ const block_q8_0 * GGML_RESTRICT y = vy;
4493
+
4494
+ const int nb = n / QK_MXFP4;
4495
+
4496
+ int ib = 0;
4497
+ float sumf = 0;
4498
+
4499
+ // Load the lookup table once.
4500
+ const vint8mf2_t values = __riscv_vle8_v_i8mf2(kvalues_mxfp4, 16);
4501
+ int acc1, acc2;
4502
+
4503
+ // We process 2 blocks at once.
4504
+ for (; ib + 1 < nb; ib+=2) {
4505
+ // Weights and activations.
4506
+ vuint8mf2_t mx_packed1 = __riscv_vle8_v_u8mf2(x[ib + 0].qs, 16);
4507
+ vint8mf2_t q8b_lo1 = __riscv_vle8_v_i8mf2(y[ib + 0].qs, 16);
4508
+ vint8mf2_t q8b_hi1 = __riscv_vle8_v_i8mf2(y[ib + 0].qs + 16, 16);
4509
+ vuint8mf2_t mx_packed2 = __riscv_vle8_v_u8mf2(x[ib + 1].qs, 16);
4510
+ vint8mf2_t q8b_lo2 = __riscv_vle8_v_i8mf2(y[ib + 1].qs, 16);
4511
+ vint8mf2_t q8b_hi2 = __riscv_vle8_v_i8mf2(y[ib + 1].qs + 16, 16);
4512
+
4513
+ // Unpack the weight blocks.
4514
+ vuint8mf2_t mxbits_lo1 = __riscv_vand_vx_u8mf2(mx_packed1, 0xf, 16);
4515
+ vuint8mf2_t mxbits_hi1 = __riscv_vsrl_vx_u8mf2(mx_packed1, 4, 16);
4516
+ vuint8mf2_t mxbits_lo2 = __riscv_vand_vx_u8mf2(mx_packed2, 0xf, 16);
4517
+ vuint8mf2_t mxbits_hi2 = __riscv_vsrl_vx_u8mf2(mx_packed2, 4, 16);
4518
+
4519
+ // Gather values from the lookup table.
4520
+ vint8mf2_t mxb_lo1 = __riscv_vrgather_vv_i8mf2(values, mxbits_lo1, 16);
4521
+ vint8mf2_t mxb_hi1 = __riscv_vrgather_vv_i8mf2(values, mxbits_hi1, 16);
4522
+ vint8mf2_t mxb_lo2 = __riscv_vrgather_vv_i8mf2(values, mxbits_lo2, 16);
4523
+ vint8mf2_t mxb_hi2 = __riscv_vrgather_vv_i8mf2(values, mxbits_hi2, 16);
4524
+
4525
+ // Accumulation.
4526
+ vint16m1_t sum1 = __riscv_vwmul_vv_i16m1(q8b_lo1, mxb_lo1, 16);
4527
+ sum1 = __riscv_vwmacc_vv_i16m1(sum1, q8b_hi1, mxb_hi1, 16);
4528
+ vint16m1_t sum2 = __riscv_vwmul_vv_i16m1(q8b_lo2, mxb_lo2, 16);
4529
+ sum2 = __riscv_vwmacc_vv_i16m1(sum2, q8b_hi2, mxb_hi2, 16);
4530
+ __riscv_vse32_v_i32m1(&acc1,__riscv_vwredsum_vs_i16m1_i32m1(sum1, __riscv_vmv_v_x_i32m1(0, 1), 16), 1);
4531
+ __riscv_vse32_v_i32m1(&acc2,__riscv_vwredsum_vs_i16m1_i32m1(sum2, __riscv_vmv_v_x_i32m1(0, 1), 16), 1);
4532
+ sumf += ((GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * acc1));
4533
+ sumf += ((GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * acc2));
4534
+ }
4535
+
4536
+ *s = sumf;
4537
+ }
4538
+ #endif
4539
+
4540
+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
4541
+ #if defined __riscv_v_intrinsic
4542
+ switch (__riscv_vlenb() * 8) {
4543
+ case 128:
4544
+ ggml_vec_dot_mxfp4_q8_0_vl128(n, s, bs, vx, bx, vy, by, nrc);
4545
+ break;
4546
+ default: // 256 and above
4547
+ ggml_vec_dot_mxfp4_q8_0_vl256(n, s, bs, vx, bx, vy, by, nrc);
4548
+ break;
4549
+ }
4550
+ #else
4551
+ ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
4552
+ #endif
4553
+ }