toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,3173 @@
1
+ #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
2
+ #pragma clang diagnostic ignored "-Wunused-function"
3
+ #pragma clang diagnostic ignored "-Wunused-variable"
4
+ #pragma clang diagnostic ignored "-Wunused-but-set-variable"
5
+
6
+ #include <HAP_farf.h>
7
+ #include <HAP_perf.h>
8
+
9
+ #include <math.h>
10
+ #include <string.h>
11
+
12
+ #include "hex-dma.h"
13
+ #include "hvx-utils.h"
14
+ #include "hvx-dump.h"
15
+
16
+ #define GGML_COMMON_DECL_C
17
+ #include "ggml-common.h"
18
+ #include "htp-ctx.h"
19
+ #include "htp-ops.h"
20
+ #include "htp-ops.h"
21
+ #include "hmx-ops.h"
22
+
23
+ #define MM_SPAD_SRC0_NROWS 16
24
+ #define MM_SPAD_SRC1_NROWS 16
25
+ #define MM_SPAD_DST_NROWS 2
26
+
27
+ struct htp_matmul_context {
28
+ const char * type;
29
+ struct htp_ops_context * octx;
30
+
31
+ void (*vec_dot_1x1)(const int n, float * restrict s0,
32
+ const void * restrict vx0,
33
+ const void * restrict vy0);
34
+
35
+ void (*vec_dot_2x1)(const int n, float * restrict s0,
36
+ const void * restrict vx0, const void * restrict vx1,
37
+ const void * restrict vy0);
38
+
39
+ void (*vec_dot_2x2)(const int n, float * restrict s0, float * restrict s1,
40
+ const void * restrict vx0, const void * restrict vx1,
41
+ const void * restrict vy0, const void * restrict vy1);
42
+
43
+ // Precomputed values
44
+ uint32_t src0_nrows_per_thread;
45
+ uint32_t src1_nrows_per_thread;
46
+
47
+ struct fastdiv_values mm_div_ne12_ne1;
48
+ struct fastdiv_values mm_div_ne1;
49
+ struct fastdiv_values mm_div_r2;
50
+ struct fastdiv_values mm_div_r3;
51
+ };
52
+
53
// vdelta control vector that expands the first 32 e8m0 values into 32 uint32
// elements. 128 control bytes, listed 16 per row.
static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = {
    /*   0 */ 0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x01, 0x02, 0x00, 0x04,
    /*  16 */ 0x04, 0x00, 0x00, 0x00, 0x11, 0x10, 0x10, 0x10, 0x02, 0x00, 0x04, 0x00, 0x01, 0x02, 0x08, 0x08,
    /*  32 */ 0x08, 0x08, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x22, 0x20, 0x20, 0x20, 0x21, 0x22, 0x20, 0x24,
    /*  48 */ 0x04, 0x00, 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x02, 0x00, 0x04, 0x00, 0x11, 0x12, 0x10, 0x10,
    /*  64 */ 0x10, 0x10, 0x10, 0x10, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x01, 0x02, 0x00, 0x04,
    /*  80 */ 0x44, 0x40, 0x40, 0x40, 0x41, 0x40, 0x40, 0x40, 0x42, 0x40, 0x44, 0x40, 0x41, 0x42, 0x48, 0x48,
    /*  96 */ 0x08, 0x08, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x12, 0x10, 0x10, 0x10, 0x01, 0x02, 0x00, 0x04,
    /* 112 */ 0x04, 0x00, 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x22, 0x20, 0x24, 0x20, 0x21, 0x22, 0x20, 0x20,
};
63
+
64
+ // IQ4_NL dequantization LUT: maps 4-bit index (0-15) to int8 kvalue
65
+ // kvalues: -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
66
+ static const uint8_t __attribute__((aligned(VLEN))) kvalues_iq4nl_lut[] = {
67
+ 0x81, 0, 0x98, 0, 0xAD, 0, 0xBF, 0, 0xCF, 0, 0xDD, 0, 0xEA, 0, 0xF6, 0, 0x01, 0, 0x0D, 0, 0x19, 0, 0x26, 0,
68
+ 0x35, 0, 0x45, 0, 0x59, 0, 0x71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
72
+ };
73
+
74
+ static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = {
75
+ 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 6, 0, 8, 0, 12, 0, 0, 0, 0xff, 0, 0xfe, 0, 0xfd, 0, 0xfc, 0,
76
+ 0xfa, 0, 0xf8, 0, 0xf4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
77
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
80
+ };
81
+
82
+ static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_full(const uint8_t * restrict ptr) {
83
+ const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
84
+
85
+ HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
86
+ HVX_Vector v2_3 = vptr[1]; // ...
87
+ HVX_Vector v4_5 = vptr[2]; // ...
88
+ HVX_Vector v6_7 = vptr[3]; // ...
89
+
90
+ const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
91
+ const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut;
92
+
93
+ HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F
94
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4
95
+ HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F
96
+ HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
97
+ HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
98
+ HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
99
+ HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F
100
+ HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4
101
+
102
+ v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
103
+ v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
104
+ v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0);
105
+ v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0);
106
+ v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0);
107
+ v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0);
108
+ v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0);
109
+ v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0);
110
+
111
+ HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
112
+ return r;
113
+ }
114
+
115
+ static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
116
+ const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
117
+
118
+ const uint32_t qk = QK_Q4_0x4x2; // 256
119
+ const uint32_t nb = n / qk;
120
+ const uint32_t nloe = n % qk;
121
+
122
+ const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
123
+ const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut;
124
+
125
+ HVX_Vector_x8 r;
126
+ uint32_t i = 0;
127
+
128
+ #pragma unroll(2)
129
+ for (i = 0; i < nb; i++) {
130
+ HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
131
+ HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
132
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
133
+ r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
134
+ r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
135
+ }
136
+
137
+ if (nloe) {
138
+ HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
139
+ HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
140
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
141
+ HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
142
+ r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0);
143
+ r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0);
144
+ }
145
+
146
+ return r;
147
+ }
148
+
149
+ // q4x4x2 and q8x4x2 are the flat q4/8_0 formats where all quants are stored first followed by all scales
150
+
151
+ static inline size_t q8x4x2_row_size(uint32_t ne) {
152
+ // ensures perfect alignment of quants and full row
153
+ const uint32_t qk = QK_Q8_0x4x2;
154
+ const uint32_t nb = (ne + qk - 1) / qk;
155
+ return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128);
156
+ }
157
+
158
+ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8_full(const uint8_t * restrict ptr) {
159
+ const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
160
+
161
+ HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
162
+ HVX_Vector v2_3 = vptr[1]; // ...
163
+ HVX_Vector v4_5 = vptr[2]; // ...
164
+ HVX_Vector v6_7 = vptr[3]; // ...
165
+
166
+ const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
167
+ const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
168
+
169
+ HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F : first 128 elements
170
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 : second 128 elements
171
+ HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F ...
172
+ HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
173
+ HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
174
+ HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
175
+ HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F
176
+ HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4
177
+
178
+ // Convert uint4 to int4 (i.e. x - 8)
179
+ v0 = Q6_Vb_vsub_VbVb(v0, i8);
180
+ v1 = Q6_Vb_vsub_VbVb(v1, i8);
181
+ v2 = Q6_Vb_vsub_VbVb(v2, i8);
182
+ v3 = Q6_Vb_vsub_VbVb(v3, i8);
183
+ v4 = Q6_Vb_vsub_VbVb(v4, i8);
184
+ v5 = Q6_Vb_vsub_VbVb(v5, i8);
185
+ v6 = Q6_Vb_vsub_VbVb(v6, i8);
186
+ v7 = Q6_Vb_vsub_VbVb(v7, i8);
187
+
188
+ HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
189
+ return r;
190
+ }
191
+
192
+ static HVX_Vector_x8 hvx_vec_load_q4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
193
+ const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
194
+
195
+ const uint32_t qk = QK_Q4_0x4x2; // 256
196
+ const uint32_t nb = n / qk;
197
+ const uint32_t nloe = n % qk;
198
+
199
+ const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
200
+ const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
201
+
202
+ HVX_Vector_x8 r;
203
+ uint32_t i = 0;
204
+
205
+ #pragma unroll(2)
206
+ for (i=0; i < nb; i++) {
207
+ HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
208
+ HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
209
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
210
+ r.v[i*2+0] = Q6_Vb_vsub_VbVb(v0, i8);
211
+ r.v[i*2+1] = Q6_Vb_vsub_VbVb(v1, i8);
212
+ }
213
+
214
+ if (nloe) {
215
+ HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
216
+ HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
217
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
218
+ HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
219
+ r.v[i*2+0] = Q6_Vb_vsub_VbVb(Q6_V_lo_W(v0_1_p), i8);
220
+ r.v[i*2+1] = Q6_Vb_vsub_VbVb(Q6_V_hi_W(v0_1_p), i8);
221
+ }
222
+
223
+ return r;
224
+ }
225
+
226
+ static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_full(const uint8_t * restrict ptr) {
227
+ const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
228
+
229
+ HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
230
+ HVX_Vector v2_3 = vptr[1]; // ...
231
+ HVX_Vector v4_5 = vptr[2]; // ...
232
+ HVX_Vector v6_7 = vptr[3]; // ...
233
+
234
+ const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
235
+ const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut;
236
+
237
+ HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F
238
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4
239
+ HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F
240
+ HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
241
+ HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
242
+ HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
243
+ HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F
244
+ HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4
245
+
246
+ v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
247
+ v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
248
+ v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0);
249
+ v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0);
250
+ v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0);
251
+ v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0);
252
+ v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0);
253
+ v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0);
254
+
255
+ HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
256
+ return r;
257
+ }
258
+
259
+ static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
260
+ const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
261
+
262
+ const uint32_t qk = QK_Q4_0x4x2; // 256
263
+ const uint32_t nb = n / qk;
264
+ const uint32_t nloe = n % qk;
265
+
266
+ const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
267
+ const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut;
268
+
269
+ HVX_Vector_x8 r;
270
+ uint32_t i = 0;
271
+
272
+ #pragma unroll(2)
273
+ for (i=0; i < nb; i++) {
274
+ HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
275
+ HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
276
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
277
+ r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
278
+ r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
279
+ }
280
+
281
+ if (nloe) {
282
+ HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
283
+ HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
284
+ HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
285
+ HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
286
+ r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0);
287
+ r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0);
288
+ }
289
+
290
+ return r;
291
+ }
292
+
293
+ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_full(const uint8_t * restrict ptr) {
294
+ const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
295
+
296
+ HVX_Vector v0 = vptr[0]; // first 128 vals
297
+ HVX_Vector v1 = vptr[1]; // ...
298
+ HVX_Vector v2 = vptr[2]; // ...
299
+ HVX_Vector v3 = vptr[3]; // ...
300
+ HVX_Vector v4 = vptr[4]; // ...
301
+ HVX_Vector v5 = vptr[5]; // ...
302
+ HVX_Vector v6 = vptr[6]; // ...
303
+ HVX_Vector v7 = vptr[7]; // ...
304
+
305
+ HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 };
306
+ return r;
307
+ }
308
+
309
+ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_partial(const uint8_t * restrict ptr, uint32_t nloe) {
310
+ return hvx_vec_load_q8x4x8_full(ptr);
311
+ }
312
+
313
+ // Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors).
314
+ // Accumulate each block into a single int32 value.
315
+ // Return a single HVX vector with 32x int32 accumulators.
316
+ // This version is parameterized to support less than 1024 elements.
317
+ // if() checks are optimized out at compile time -- make sure to pass N as a constexpr.
318
+
319
+ static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
320
+ HVX_Vector r0 = Q6_V_vzero();
321
+ HVX_Vector r1 = Q6_V_vzero();
322
+ HVX_Vector r2 = Q6_V_vzero();
323
+ HVX_Vector r3 = Q6_V_vzero();
324
+ HVX_Vector r4 = Q6_V_vzero();
325
+ HVX_Vector r5 = Q6_V_vzero();
326
+ HVX_Vector r6 = Q6_V_vzero();
327
+ HVX_Vector r7 = Q6_V_vzero();
328
+
329
+ HVX_VectorPair p3;
330
+ HVX_VectorPair p2;
331
+ HVX_VectorPair p1;
332
+ HVX_VectorPair p0;
333
+
334
+ if (n >= 128) { r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]); }
335
+ if (n >= 256) { r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]); }
336
+ if (n >= 384) { r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]); }
337
+ if (n >= 512) { r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]); }
338
+ if (n >= 640) { r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]); }
339
+ if (n >= 768) { r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]); }
340
+ if (n >= 896) { r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]); }
341
+ if (n >= 1024) { r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]); }
342
+
343
+ if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); }
344
+ if (n >= 384) { p1 = Q6_W_vdeal_VVR(r3, r2, -4); }
345
+ if (n >= 640) { p2 = Q6_W_vdeal_VVR(r5, r4, -4); }
346
+ if (n >= 896) { p3 = Q6_W_vdeal_VVR(r7, r6, -4); }
347
+
348
+ if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); }
349
+ if (n >= 384) { r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); }
350
+ if (n >= 640) { r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2)); }
351
+ if (n >= 896) { r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3)); }
352
+
353
+ if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); }
354
+ if (n >= 640) { p1 = Q6_W_vdeal_VVR(r3, r2, -4); }
355
+
356
+ if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); }
357
+ if (n >= 640) { r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); }
358
+
359
+ if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); }
360
+ if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); }
361
+
362
+ return r0;
363
+ }
364
+
365
+ static inline HVX_Vector hvx_vec_rmpy_x8_full(HVX_Vector_x8 x, HVX_Vector_x8 y) {
366
+ HVX_Vector r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]);
367
+ HVX_Vector r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]);
368
+ HVX_Vector r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]);
369
+ HVX_Vector r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]);
370
+ HVX_Vector r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]);
371
+ HVX_Vector r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]);
372
+ HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]);
373
+ HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]);
374
+
375
+ HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4);
376
+ HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4);
377
+ HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4);
378
+ HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4);
379
+
380
+ r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
381
+ r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
382
+ r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2));
383
+ r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3));
384
+
385
+ p0 = Q6_W_vdeal_VVR(r1, r0, -4);
386
+ p1 = Q6_W_vdeal_VVR(r3, r2, -4);
387
+
388
+ r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
389
+ r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
390
+
391
+ p0 = Q6_W_vdeal_VVR(r1, r0, -4);
392
+ r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
393
+
394
+ return r0;
395
+ }
396
+
397
+ static inline HVX_Vector hvx_vec_rmpy_x8_partial(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
398
+ if (n >= 512)
399
+ return hvx_vec_rmpy_x8_full(x, y);
400
+
401
+ return hvx_vec_rmpy_x8_partial(x, y, 512);
402
+ }
403
+
404
+ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
405
+ assert(n % 32 == 0); // min sub-block size
406
+ assert((unsigned long) vx0 % 128 == 0);
407
+ assert((unsigned long) vy0 % 128 == 0);
408
+
409
+ const uint32_t qk = QK_Q4_0x4x2 * 4;
410
+
411
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
412
+ const uint32_t x_qblk_size = qk / 2; // int4
413
+ const uint32_t x_qrow_size = n / 2; // int4 (not padded)
414
+
415
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
416
+ const uint32_t y_qblk_size = qk; // int8
417
+ const uint32_t y_qrow_size = n; // int8 (not padded)
418
+
419
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first
420
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
421
+
422
+ const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
423
+ const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
424
+
425
+ // Row sum (sf)
426
+ HVX_Vector r0_sum = Q6_V_vzero();
427
+
428
+ // Multiply and accumulate into int32.
429
+ // Compute combined scale (fp32).
430
+ // Apply scale to acc and accumulate into the row sum (qf32).
431
+
432
+ const uint32_t nb = n / qk; // num full blocks
433
+ const uint32_t nloe = n % qk; // num leftover elemements
434
+
435
+ uint32_t i = 0;
436
+ for (; i < nb; i++) {
437
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
438
+ HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
439
+
440
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
441
+
442
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
443
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
444
+
445
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
446
+
447
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
448
+
449
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
450
+ }
451
+
452
+ // Process leftovers
453
+ if (nloe) {
454
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
455
+ HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
456
+
457
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
458
+
459
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
460
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
461
+
462
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
463
+
464
+ // Zero out unused elements
465
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
466
+ r0_dd = Q6_V_vand_QV(bmask, r0_dd);
467
+ r0_ia = Q6_V_vand_QV(bmask, r0_ia);
468
+
469
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
470
+
471
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
472
+ }
473
+
474
+ r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
475
+
476
+ hvx_vec_store_u(s0, 4, r0_sum);
477
+ }
478
+
479
+ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
480
+ const void * restrict vx0, const void * restrict vx1,
481
+ const void * restrict vy0) {
482
+ assert(n % 32 == 0); // min sub-block size
483
+ assert((unsigned long) vx0 % 128 == 0);
484
+ assert((unsigned long) vx1 % 128 == 0);
485
+ assert((unsigned long) vy0 % 128 == 0);
486
+
487
+ const uint32_t qk = QK_Q4_0x4x2 * 4;
488
+
489
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
490
+ const uint32_t x_qblk_size = qk / 2; // int4
491
+ const uint32_t x_qrow_size = n / 2; // int4 (not padded)
492
+
493
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
494
+ const uint32_t y_qblk_size = qk; // int8
495
+ const uint32_t y_qrow_size = n; // int8 (not padded)
496
+
497
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
498
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
499
+ const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
500
+ const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
501
+
502
+ const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
503
+ const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
504
+
505
+ // Row sum (sf)
506
+ HVX_Vector r0_sum = Q6_V_vzero();
507
+ HVX_Vector r1_sum = Q6_V_vzero();
508
+
509
+ // Multiply and accumulate into int32.
510
+ // Compute combined scale (fp32).
511
+ // Apply scale to acc and accumulate into the row sum (qf32).
512
+
513
+ const uint32_t nb = n / qk; // num full blocks
514
+ const uint32_t nloe = n % qk; // num leftover elemements
515
+
516
+ uint32_t i = 0;
517
+ for (; i < nb; i++) {
518
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
519
+ HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
520
+ HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
521
+
522
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
523
+ HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
524
+
525
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
526
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
527
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
528
+
529
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
530
+ HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
531
+
532
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
533
+ HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
534
+
535
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
536
+ r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
537
+ }
538
+
539
+ // Process leftovers
540
+ if (nloe) {
541
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
542
+ HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
543
+ HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
544
+
545
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
546
+ HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
547
+
548
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
549
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
550
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
551
+
552
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
553
+ HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
554
+
555
+ // Zero out unused elements
556
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
557
+ r0_dd = Q6_V_vand_QV(bmask, r0_dd);
558
+ r1_dd = Q6_V_vand_QV(bmask, r1_dd);
559
+ r0_ia = Q6_V_vand_QV(bmask, r0_ia);
560
+ r1_ia = Q6_V_vand_QV(bmask, r1_ia);
561
+
562
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
563
+ HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
564
+
565
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
566
+ r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
567
+ }
568
+
569
+ HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
570
+ hvx_vec_store_u(s0, 8, rsum);
571
+ }
572
+
573
+ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
574
+ const void * restrict vx0, const void * restrict vx1,
575
+ const void * restrict vy0, const void * restrict vy1) {
576
+ assert(n % 32 == 0);
577
+ assert((unsigned long) vx0 % 128 == 0);
578
+ assert((unsigned long) vx1 % 128 == 0);
579
+ assert((unsigned long) vy0 % 128 == 0);
580
+ assert((unsigned long) vy1 % 128 == 0);
581
+
582
+ const uint32_t qk = QK_Q4_0x4x2 * 4;
583
+
584
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
585
+ const uint32_t x_qblk_size = qk / 2; // int4
586
+ const uint32_t x_qrow_size = n / 2; // int4 (not padded)
587
+
588
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
589
+ const uint32_t y_qblk_size = qk; // int8
590
+ const uint32_t y_qrow_size = n; // int8 (not padded)
591
+
592
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
593
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
594
+ const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
595
+ const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
596
+
597
+ const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first
598
+ const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales
599
+ const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first
600
+ const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
601
+
602
+ // Row sums (sf) - 4 accumulators for 2×2 tile
603
+ HVX_Vector r0_c0_sum = Q6_V_vzero();
604
+ HVX_Vector r0_c1_sum = Q6_V_vzero();
605
+ HVX_Vector r1_c0_sum = Q6_V_vzero();
606
+ HVX_Vector r1_c1_sum = Q6_V_vzero();
607
+
608
+ const uint32_t nb = n / qk; // num full blocks
609
+ const uint32_t nloe = n % qk; // num leftover elements
610
+
611
+ uint32_t i = 0;
612
+ for (; i < nb; i++) {
613
+ // Load src1 columns (reused across both src0 rows)
614
+ HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
615
+ HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
616
+
617
+ // Load src0 rows (reused across both src1 columns)
618
+ HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
619
+ HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
620
+
621
+ // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
622
+ HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
623
+ HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
624
+ HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
625
+ HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
626
+
627
+ // Load scales
628
+ HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
629
+ HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
630
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
631
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
632
+
633
+ // Compute combined scales
634
+ HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
635
+ HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
636
+ HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
637
+ HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
638
+
639
+ // Apply scales and accumulate
640
+ HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
641
+ HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
642
+ HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
643
+ HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
644
+
645
+ r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
646
+ r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
647
+ r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
648
+ r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
649
+ }
650
+
651
+ // Process leftovers
652
+ if (nloe) {
653
+ HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
654
+ HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
655
+ HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
656
+ HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
657
+
658
+ HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
659
+ HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
660
+ HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
661
+ HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
662
+
663
+ HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
664
+ HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
665
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
666
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
667
+
668
+ HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
669
+ HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
670
+ HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
671
+ HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
672
+
673
+ // Zero out unused scales
674
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
675
+ r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
676
+ r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
677
+ r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
678
+ r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
679
+ r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
680
+ r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
681
+ r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
682
+ r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
683
+
684
+ HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
685
+ HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
686
+ HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
687
+ HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
688
+
689
+ r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
690
+ r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
691
+ r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
692
+ r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
693
+ }
694
+
695
+ // Reduce and store results
696
+ HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
697
+ HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
698
+
699
+ hvx_vec_store_u(s0, 8, r0_r1_c0_sum); // row0,col0 row1,col0
700
+ hvx_vec_store_u(s1, 8, r0_r1_c1_sum); // row0,col1 row1,col1
701
+ }
702
+
703
+ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
704
+ assert(n % 32 == 0); // min sub-block size
705
+ assert((unsigned long) vx0 % 128 == 0);
706
+ assert((unsigned long) vy0 % 128 == 0);
707
+
708
+ const uint32_t qk = QK_Q4_0x4x2 * 4;
709
+
710
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
711
+ const uint32_t x_qblk_size = qk; // int8
712
+ const uint32_t x_qrow_size = n; // int8 (not padded)
713
+
714
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
715
+ const uint32_t y_qblk_size = qk; // int8
716
+ const uint32_t y_qrow_size = n; // int8 (not padded)
717
+
718
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first
719
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
720
+
721
+ const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
722
+ const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
723
+
724
+ // Row sum (sf)
725
+ HVX_Vector r0_sum = Q6_V_vzero();
726
+
727
+ // Multiply and accumulate into int32.
728
+ // Compute combined scale (fp32).
729
+ // Apply scale to acc and accumulate into the row sum (qf32).
730
+
731
+ const uint32_t nb = n / qk; // num full blocks
732
+ int32_t nloe = n % qk; // num leftover elemements (must be signed)
733
+
734
+ uint32_t i = 0;
735
+ for (; i < nb; i++) {
736
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
737
+ HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
738
+
739
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
740
+
741
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
742
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
743
+
744
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
745
+
746
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
747
+
748
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
749
+ }
750
+
751
+ // Process leftovers
752
+ if (nloe) {
753
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
754
+ HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
755
+
756
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
757
+
758
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
759
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
760
+
761
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
762
+
763
+ // Zero out unused elements
764
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
765
+ r0_dd = Q6_V_vand_QV(bmask, r0_dd);
766
+ r0_ia = Q6_V_vand_QV(bmask, r0_ia);
767
+
768
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
769
+
770
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
771
+ }
772
+
773
+ r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
774
+
775
+ hvx_vec_store_u(s0, 4, r0_sum);
776
+ }
777
+
778
+ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
779
+ const void * restrict vx0, const void * restrict vx1,
780
+ const void * restrict vy0) {
781
+ assert(n % 32 == 0); // min sub-block size
782
+ assert((unsigned long) vx0 % 128 == 0);
783
+ assert((unsigned long) vx1 % 128 == 0);
784
+ assert((unsigned long) vy0 % 128 == 0);
785
+
786
+ const uint32_t qk = QK_Q4_0x4x2 * 4;
787
+
788
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
789
+ const uint32_t x_qblk_size = qk; // int8
790
+ const uint32_t x_qrow_size = n; // int8 (not padded)
791
+
792
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
793
+ const uint32_t y_qblk_size = qk; // int8
794
+ const uint32_t y_qrow_size = n; // int8 (not padded)
795
+
796
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
797
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
798
+ const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
799
+ const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
800
+
801
+ const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
802
+ const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
803
+
804
+ // Row sum (qf32)
805
+ HVX_Vector r0_sum = Q6_V_vzero();
806
+ HVX_Vector r1_sum = Q6_V_vzero();
807
+
808
+ // Multiply and accumulate into int32.
809
+ // Compute combined scale (fp32).
810
+ // Apply scale to acc and accumulate into the row sum (qf32).
811
+
812
+ const uint32_t nb = n / qk; // num full blocks
813
+ int32_t nloe = n % qk; // num leftover elemements (must be signed)
814
+
815
+ uint32_t i = 0;
816
+ for (; i < nb; i++) {
817
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
818
+ HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
819
+ HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
820
+
821
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
822
+ HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
823
+
824
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
825
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
826
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
827
+
828
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
829
+ HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
830
+
831
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
832
+ HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
833
+
834
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
835
+ r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
836
+ }
837
+
838
+ // Process leftovers
839
+ if (nloe) {
840
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
841
+ HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
842
+ HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
843
+
844
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
845
+ HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
846
+
847
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
848
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
849
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
850
+
851
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
852
+ HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
853
+
854
+ // Zero out unused elements
855
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
856
+ r0_dd = Q6_V_vand_QV(bmask, r0_dd);
857
+ r1_dd = Q6_V_vand_QV(bmask, r1_dd);
858
+ r0_ia = Q6_V_vand_QV(bmask, r0_ia);
859
+ r1_ia = Q6_V_vand_QV(bmask, r1_ia);
860
+
861
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
862
+ HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
863
+
864
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
865
+ r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
866
+ }
867
+
868
+ HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
869
+ hvx_vec_store_u(s0, 8, rsum);
870
+ }
871
+
872
+ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
873
+ const void * restrict vx0, const void * restrict vx1,
874
+ const void * restrict vy0, const void * restrict vy1) {
875
+ assert(n % 32 == 0);
876
+ assert((unsigned long) vx0 % 128 == 0);
877
+ assert((unsigned long) vx1 % 128 == 0);
878
+ assert((unsigned long) vy0 % 128 == 0);
879
+ assert((unsigned long) vy1 % 128 == 0);
880
+
881
+ const uint32_t qk = QK_Q8_0x4x2 * 4;
882
+
883
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
884
+ const uint32_t x_qblk_size = qk; // int8
885
+ const uint32_t x_qrow_size = n; // int8 (not padded)
886
+
887
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
888
+ const uint32_t y_qblk_size = qk; // int8
889
+ const uint32_t y_qrow_size = n; // int8 (not padded)
890
+
891
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
892
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
893
+ const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
894
+ const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
895
+
896
+ const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first
897
+ const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales
898
+ const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first
899
+ const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
900
+
901
+ // Row sums (sf) - 4 accumulators for 2×2 tile
902
+ HVX_Vector r0_c0_sum = Q6_V_vzero();
903
+ HVX_Vector r0_c1_sum = Q6_V_vzero();
904
+ HVX_Vector r1_c0_sum = Q6_V_vzero();
905
+ HVX_Vector r1_c1_sum = Q6_V_vzero();
906
+
907
+ const uint32_t nb = n / qk; // num full blocks
908
+ const uint32_t nloe = n % qk; // num leftover elements
909
+
910
+ uint32_t i = 0;
911
+ for (; i < nb; i++) {
912
+ // Load src1 columns (reused across both src0 rows)
913
+ HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
914
+ HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
915
+
916
+ // Load src0 rows (reused across both src1 columns)
917
+ HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
918
+ HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
919
+
920
+ // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
921
+ HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
922
+ HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
923
+ HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
924
+ HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
925
+
926
+ // Load scales
927
+ HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
928
+ HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
929
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
930
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
931
+
932
+ // Compute combined scales
933
+ HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
934
+ HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
935
+ HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
936
+ HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
937
+
938
+ // Apply scales and accumulate
939
+ HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
940
+ HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
941
+ HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
942
+ HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
943
+
944
+ r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
945
+ r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
946
+ r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
947
+ r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
948
+ }
949
+
950
+ // Process leftovers
951
+ if (nloe) {
952
+ HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
953
+ HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
954
+ HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
955
+ HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
956
+
957
+ HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
958
+ HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
959
+ HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
960
+ HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
961
+
962
+ HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
963
+ HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
964
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
965
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
966
+
967
+ HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
968
+ HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
969
+ HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
970
+ HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
971
+
972
+ // Zero out unused elements
973
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
974
+ r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
975
+ r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
976
+ r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
977
+ r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
978
+ r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
979
+ r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
980
+ r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
981
+ r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
982
+
983
+ HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
984
+ HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
985
+ HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
986
+ HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
987
+
988
+ r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
989
+ r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
990
+ r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
991
+ r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
992
+ }
993
+
994
+ // Reduce and store results
995
+ HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
996
+ HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
997
+
998
+ hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum); // row0,col0 row1,col0
999
+ hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1
1000
+ }
1001
+
1002
+ // ======== IQ4_NL x Q8_0 vec_dot kernels ========
1003
+ // Same structure as Q4_0 vec_dot but uses IQ4_NL LUT-based load (4-bit index -> int8 kvalue).
1004
+ // Scale format is identical to Q4_0 (fp16 scales).
1005
+
1006
+ static void vec_dot_iq4nlx4x2_q8x4x2_1x1(const int n,
1007
+ float * restrict s0,
1008
+ const void * restrict vx0,
1009
+ const void * restrict vy0) {
1010
+ assert(n % 32 == 0);
1011
+ assert((unsigned long) vx0 % 128 == 0);
1012
+ assert((unsigned long) vy0 % 128 == 0);
1013
+
1014
+ const uint32_t qk = QK_Q4_0x4x2 * 4;
1015
+
1016
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
1017
+ const uint32_t x_qblk_size = qk / 2; // int4
1018
+ const uint32_t x_qrow_size = n / 2; // int4 (not padded)
1019
+
1020
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
1021
+ const uint32_t y_qblk_size = qk; // int8
1022
+ const uint32_t y_qrow_size = n; // int8 (not padded)
1023
+
1024
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first
1025
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
1026
+
1027
+ const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
1028
+ const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
1029
+
1030
+ HVX_Vector r0_sum = Q6_V_vzero();
1031
+
1032
+ const uint32_t nb = n / qk;
1033
+ const uint32_t nloe = n % qk;
1034
+
1035
+ uint32_t i = 0;
1036
+ for (; i < nb; i++) {
1037
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
1038
+ HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
1039
+
1040
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
1041
+
1042
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
1043
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
1044
+
1045
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
1046
+
1047
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
1048
+
1049
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
1050
+ }
1051
+
1052
+ if (nloe) {
1053
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
1054
+ HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
1055
+
1056
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
1057
+
1058
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
1059
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
1060
+
1061
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
1062
+
1063
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
1064
+ r0_dd = Q6_V_vand_QV(bmask, r0_dd);
1065
+ r0_ia = Q6_V_vand_QV(bmask, r0_ia);
1066
+
1067
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
1068
+
1069
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
1070
+ }
1071
+
1072
+ r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
1073
+
1074
+ hvx_vec_store_u(s0, 4, r0_sum);
1075
+ }
1076
+
1077
+ static void vec_dot_iq4nlx4x2_q8x4x2_2x1(const int n,
1078
+ float * restrict s0,
1079
+ const void * restrict vx0,
1080
+ const void * restrict vx1,
1081
+ const void * restrict vy0) {
1082
+ assert(n % 32 == 0);
1083
+ assert((unsigned long) vx0 % 128 == 0);
1084
+ assert((unsigned long) vx1 % 128 == 0);
1085
+ assert((unsigned long) vy0 % 128 == 0);
1086
+
1087
+ const uint32_t qk = QK_Q4_0x4x2 * 4;
1088
+
1089
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
1090
+ const uint32_t x_qblk_size = qk / 2; // int4
1091
+ const uint32_t x_qrow_size = n / 2; // int4 (not padded)
1092
+
1093
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
1094
+ const uint32_t y_qblk_size = qk; // int8
1095
+ const uint32_t y_qrow_size = n; // int8 (not padded)
1096
+
1097
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
1098
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
1099
+ const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
1100
+ const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
1101
+
1102
+ const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
1103
+ const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
1104
+
1105
+ HVX_Vector r0_sum = Q6_V_vzero();
1106
+ HVX_Vector r1_sum = Q6_V_vzero();
1107
+
1108
+ const uint32_t nb = n / qk;
1109
+ const uint32_t nloe = n % qk;
1110
+
1111
+ uint32_t i = 0;
1112
+ for (; i < nb; i++) {
1113
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
1114
+ HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
1115
+ HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
1116
+
1117
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
1118
+ HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
1119
+
1120
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
1121
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
1122
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
1123
+
1124
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
1125
+ HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
1126
+
1127
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
1128
+ HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
1129
+
1130
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
1131
+ r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
1132
+ }
1133
+
1134
+ if (nloe) {
1135
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
1136
+ HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
1137
+ HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
1138
+
1139
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
1140
+ HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
1141
+
1142
+ HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
1143
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
1144
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
1145
+
1146
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
1147
+ HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
1148
+
1149
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
1150
+ r0_dd = Q6_V_vand_QV(bmask, r0_dd);
1151
+ r1_dd = Q6_V_vand_QV(bmask, r1_dd);
1152
+ r0_ia = Q6_V_vand_QV(bmask, r0_ia);
1153
+ r1_ia = Q6_V_vand_QV(bmask, r1_ia);
1154
+
1155
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
1156
+ HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
1157
+
1158
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
1159
+ r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
1160
+ }
1161
+
1162
+ HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
1163
+ hvx_vec_store_u(s0, 8, rsum);
1164
+ }
1165
+
1166
+ static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n,
1167
+ float * restrict s0,
1168
+ float * restrict s1,
1169
+ const void * restrict vx0,
1170
+ const void * restrict vx1,
1171
+ const void * restrict vy0,
1172
+ const void * restrict vy1) {
1173
+ assert(n % 32 == 0);
1174
+ assert((unsigned long) vx0 % 128 == 0);
1175
+ assert((unsigned long) vx1 % 128 == 0);
1176
+ assert((unsigned long) vy0 % 128 == 0);
1177
+ assert((unsigned long) vy1 % 128 == 0);
1178
+
1179
+ const uint32_t qk = QK_Q4_0x4x2 * 4;
1180
+
1181
+ const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16
1182
+ const uint32_t x_qblk_size = qk / 2; // int4
1183
+ const uint32_t x_qrow_size = n / 2; // int4 (not padded)
1184
+
1185
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
1186
+ const uint32_t y_qblk_size = qk; // int8
1187
+ const uint32_t y_qrow_size = n; // int8 (not padded)
1188
+
1189
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0;
1190
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size;
1191
+ const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0;
1192
+ const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size;
1193
+
1194
+ const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0;
1195
+ const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size;
1196
+ const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0;
1197
+ const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size;
1198
+
1199
+ HVX_Vector r0_c0_sum = Q6_V_vzero();
1200
+ HVX_Vector r0_c1_sum = Q6_V_vzero();
1201
+ HVX_Vector r1_c0_sum = Q6_V_vzero();
1202
+ HVX_Vector r1_c1_sum = Q6_V_vzero();
1203
+
1204
+ const uint32_t nb = n / qk;
1205
+ const uint32_t nloe = n % qk;
1206
+
1207
+ uint32_t i = 0;
1208
+ for (; i < nb; i++) {
1209
+ HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
1210
+ HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
1211
+ HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size);
1212
+ HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size);
1213
+
1214
+ HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
1215
+ HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
1216
+ HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
1217
+ HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
1218
+
1219
+ HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
1220
+ HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
1221
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
1222
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
1223
+
1224
+ HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
1225
+ HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
1226
+ HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
1227
+ HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
1228
+
1229
+ HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
1230
+ HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
1231
+ HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
1232
+ HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
1233
+
1234
+ r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
1235
+ r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
1236
+ r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
1237
+ r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
1238
+ }
1239
+
1240
+ if (nloe) {
1241
+ HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
1242
+ HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
1243
+ HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe);
1244
+ HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe);
1245
+
1246
+ HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
1247
+ HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
1248
+ HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
1249
+ HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
1250
+
1251
+ HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
1252
+ HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
1253
+ HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
1254
+ HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
1255
+
1256
+ HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d)));
1257
+ HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d)));
1258
+ HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
1259
+ HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
1260
+
1261
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
1262
+ r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
1263
+ r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
1264
+ r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
1265
+ r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
1266
+ r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
1267
+ r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
1268
+ r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
1269
+ r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
1270
+
1271
+ HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
1272
+ HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
1273
+ HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
1274
+ HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
1275
+
1276
+ r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
1277
+ r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
1278
+ r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
1279
+ r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
1280
+ }
1281
+
1282
+ HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
1283
+ HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
1284
+
1285
+ hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum);
1286
+ hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum);
1287
+ }
1288
+
1289
+ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
1290
+ assert(n % 32 == 0); // min sub-block size
1291
+ assert((unsigned long) vx0 % 128 == 0);
1292
+ assert((unsigned long) vy0 % 128 == 0);
1293
+
1294
+ const uint32_t qk = QK_MXFP4x4x2 * 4;
1295
+
1296
+ const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0
1297
+ const uint32_t x_qblk_size = qk / 2; // fp4
1298
+ const uint32_t x_qrow_size = n / 2; // fp4 (not padded)
1299
+
1300
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
1301
+ const uint32_t y_qblk_size = qk; // int8
1302
+ const uint32_t y_qrow_size = n; // int8 (not padded)
1303
+
1304
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first
1305
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales
1306
+
1307
+ const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first
1308
+ const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
1309
+
1310
+ // Row sum (sf)
1311
+ HVX_Vector r0_sum = Q6_V_vzero();
1312
+
1313
+ // Multiply and accumulate into int32.
1314
+ // Compute combined scale (fp32).
1315
+ // Apply scale to acc and accumulate into the row sum (qf32).
1316
+
1317
+ const uint32_t nb = n / qk; // num full blocks
1318
+ int32_t nloe = n % qk; // num leftover elemements (must be signed)
1319
+
1320
+ uint32_t i = 0;
1321
+ for (; i < nb; i++) {
1322
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size);
1323
+ HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
1324
+
1325
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
1326
+
1327
+ HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
1328
+ HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
1329
+
1330
+ // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
1331
+ HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16
1332
+ vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half));
1333
+ vy_d = Q6_Vsf_equals_Vqf32(vy_d);
1334
+
1335
+ // Convert rX_d scales from e8m0 to fp32
1336
+ // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
1337
+ // Left shift with zero fill to create FP32
1338
+ // FIXME: might need to handle zero as a special case (see ggml-cpu code)
1339
+ HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0;
1340
+ HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
1341
+ r0_d = Q6_V_vdelta_VV(r0_d, expand);
1342
+ r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
1343
+ r0_d = Q6_Vw_vasl_VwR(r0_d, 23);
1344
+
1345
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
1346
+
1347
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
1348
+
1349
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
1350
+ }
1351
+
1352
+ // Process leftovers
1353
+ if (nloe) {
1354
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe);
1355
+ HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
1356
+
1357
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
1358
+
1359
+ HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
1360
+ HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
1361
+
1362
+ // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
1363
+ HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16
1364
+ vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half));
1365
+ vy_d = Q6_Vsf_equals_Vqf32(vy_d);
1366
+
1367
+ // Convert rX_d scales from e8m0 to fp32
1368
+ // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
1369
+ // Left shift with zero fill to create FP32
1370
+ // FIXME: might need to handle zero as a special case (see ggml-cpu code)
1371
+ HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0;
1372
+ HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
1373
+ r0_d = Q6_V_vdelta_VV(r0_d, expand);
1374
+ r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
1375
+ r0_d = Q6_Vw_vasl_VwR(r0_d, 23);
1376
+
1377
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
1378
+
1379
+ // Zero-out unused scales
1380
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
1381
+ r0_dd = Q6_V_vand_QV(bmask, r0_dd);
1382
+ r0_ia = Q6_V_vand_QV(bmask, r0_ia);
1383
+
1384
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
1385
+
1386
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
1387
+ }
1388
+
1389
+ r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
1390
+
1391
+ hvx_vec_store_u(s0, 4, r0_sum);
1392
+ }
1393
+
1394
+ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
1395
+ const void * restrict vx0, const void * restrict vx1,
1396
+ const void * restrict vy0) {
1397
+ assert(n % 32 == 0); // min sub-block size
1398
+ assert((unsigned long) vx0 % 128 == 0);
1399
+ assert((unsigned long) vx1 % 128 == 0);
1400
+ assert((unsigned long) vy0 % 128 == 0);
1401
+
1402
+ const uint32_t qk = QK_MXFP4x4x2 * 4;
1403
+
1404
+ const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0
1405
+ const uint32_t x_qblk_size = qk / 2; // fp4
1406
+ const uint32_t x_qrow_size = n / 2; // fp4 (not padded)
1407
+
1408
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
1409
+ const uint32_t y_qblk_size = qk; // int8
1410
+ const uint32_t y_qrow_size = n; // int8 (not padded)
1411
+
1412
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
1413
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
1414
+ const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
1415
+ const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
1416
+
1417
+ const uint8_t * restrict y_q = ((const uint8_t *) vy0) + 0; // quants first
1418
+ const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales
1419
+
1420
+ // Row sum (sf)
1421
+ HVX_Vector r0_sum = Q6_V_vzero();
1422
+ HVX_Vector r1_sum = Q6_V_vzero();
1423
+
1424
+ // Multiply and accumulate into int32.
1425
+ // Compute combined scale (fp32).
1426
+ // Apply scale to acc and accumulate into the row sum (f32).
1427
+
1428
+ const uint32_t nb = n / qk; // num full blocks
1429
+ int32_t nloe = n % qk; // num leftover elemements (must be signed)
1430
+
1431
+ uint32_t i = 0;
1432
+ for (; i < nb; i++) {
1433
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size);
1434
+ HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
1435
+ HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
1436
+
1437
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
1438
+ HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
1439
+
1440
+ HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
1441
+ HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
1442
+ HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
1443
+
1444
+ // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
1445
+ HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16
1446
+ vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half));
1447
+ vy_d = Q6_Vsf_equals_Vqf32(vy_d);
1448
+
1449
+ // Convert rX_d scales from e8m0 to fp32
1450
+ // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
1451
+ // Left shift with zero fill to create FP32
1452
+ // FIXME: might need to handle zero as a special case (see ggml-cpu code)
1453
+ HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0;
1454
+ HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
1455
+ r0_d = Q6_V_vdelta_VV(r0_d, expand);
1456
+ r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
1457
+ r0_d = Q6_Vw_vasl_VwR(r0_d, 23);
1458
+ r1_d = Q6_V_vdelta_VV(r1_d, expand);
1459
+ r1_d = Q6_V_vand_VV(r1_d, e8m0_mask);
1460
+ r1_d = Q6_Vw_vasl_VwR(r1_d, 23);
1461
+
1462
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
1463
+ HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
1464
+
1465
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
1466
+ HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
1467
+
1468
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
1469
+ r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
1470
+ }
1471
+
1472
+ // Process leftovers
1473
+ if (nloe) {
1474
+ HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe);
1475
+ HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
1476
+ HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
1477
+
1478
+ HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
1479
+ HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
1480
+
1481
+ HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
1482
+ HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
1483
+ HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
1484
+
1485
+ // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
1486
+ HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16
1487
+ vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half));
1488
+ vy_d = Q6_Vsf_equals_Vqf32(vy_d);
1489
+
1490
+ // Convert rX_d scales from e8m0 to fp32
1491
+ // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
1492
+ // Left shift with zero fill to create FP32
1493
+ // FIXME: might need to handle zero as a special case (see ggml-cpu code)
1494
+ HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0;
1495
+ HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
1496
+ r0_d = Q6_V_vdelta_VV(r0_d, expand);
1497
+ r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
1498
+ r0_d = Q6_Vw_vasl_VwR(r0_d, 23);
1499
+ r1_d = Q6_V_vdelta_VV(r1_d, expand);
1500
+ r1_d = Q6_V_vand_VV(r1_d, e8m0_mask);
1501
+ r1_d = Q6_Vw_vasl_VwR(r1_d, 23);
1502
+
1503
+ HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
1504
+ HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
1505
+
1506
+ // Zero-out unused values
1507
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
1508
+ r0_dd = Q6_V_vand_QV(bmask, r0_dd);
1509
+ r1_dd = Q6_V_vand_QV(bmask, r1_dd);
1510
+ r0_ia = Q6_V_vand_QV(bmask, r0_ia);
1511
+ r1_ia = Q6_V_vand_QV(bmask, r1_ia);
1512
+
1513
+ HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
1514
+ HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
1515
+
1516
+ r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
1517
+ r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
1518
+ }
1519
+
1520
+ HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
1521
+ hvx_vec_store_u(s0, 8, rsum);
1522
+ }
1523
+
1524
+ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1,
1525
+ const void * restrict vx0, const void * restrict vx1,
1526
+ const void * restrict vy0, const void * restrict vy1) {
1527
+ assert(n % 32 == 0);
1528
+ assert((unsigned long) vx0 % 128 == 0);
1529
+ assert((unsigned long) vx1 % 128 == 0);
1530
+ assert((unsigned long) vy0 % 128 == 0);
1531
+ assert((unsigned long) vy1 % 128 == 0);
1532
+
1533
+ const uint32_t qk = QK_MXFP4x4x2 * 4;
1534
+
1535
+ const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0
1536
+ const uint32_t x_qblk_size = qk / 2; // fp4
1537
+ const uint32_t x_qrow_size = n / 2; // fp4 (not padded)
1538
+
1539
+ const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16
1540
+ const uint32_t y_qblk_size = qk; // int8
1541
+ const uint32_t y_qrow_size = n; // int8 (not padded)
1542
+
1543
+ const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first
1544
+ const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales
1545
+ const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first
1546
+ const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales
1547
+
1548
+ const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first
1549
+ const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales
1550
+ const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first
1551
+ const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
1552
+
1553
+ // Row sums (sf) - 4 accumulators for 2×2 tile
1554
+ HVX_Vector r0_c0_sum = Q6_V_vzero();
1555
+ HVX_Vector r0_c1_sum = Q6_V_vzero();
1556
+ HVX_Vector r1_c0_sum = Q6_V_vzero();
1557
+ HVX_Vector r1_c1_sum = Q6_V_vzero();
1558
+
1559
+ const uint32_t nb = n / qk; // num full blocks
1560
+ const uint32_t nloe = n % qk; // num leftover elements
1561
+
1562
+ uint32_t i = 0;
1563
+ for (; i < nb; i++) {
1564
+ // Load src1 columns (reused across both src0 rows)
1565
+ HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
1566
+ HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
1567
+
1568
+ // Load src0 rows (reused across both src1 columns)
1569
+ HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
1570
+ HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
1571
+
1572
+ // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
1573
+ HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
1574
+ HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q));
1575
+ HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q));
1576
+ HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
1577
+
1578
+ // Load scales
1579
+ HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size);
1580
+ HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size);
1581
+ HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
1582
+ HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
1583
+
1584
+ // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
1585
+ HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16
1586
+ vy0_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy0_d), half));
1587
+ vy0_d = Q6_Vsf_equals_Vqf32(vy0_d);
1588
+ vy1_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy1_d), half));
1589
+ vy1_d = Q6_Vsf_equals_Vqf32(vy1_d);
1590
+
1591
+ // Convert rX_d scales from e8m0 to fp32
1592
+ // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
1593
+ // Left shift with zero fill to create FP32
1594
+ // FIXME: might need to handle zero as a special case (see ggml-cpu code)
1595
+ HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0;
1596
+ HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
1597
+ r0_d = Q6_V_vdelta_VV(r0_d, expand);
1598
+ r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
1599
+ r0_d = Q6_Vw_vasl_VwR(r0_d, 23);
1600
+ r1_d = Q6_V_vdelta_VV(r1_d, expand);
1601
+ r1_d = Q6_V_vand_VV(r1_d, e8m0_mask);
1602
+ r1_d = Q6_Vw_vasl_VwR(r1_d, 23);
1603
+
1604
+ // Compute combined scales
1605
+ HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy0_d));
1606
+ HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy1_d));
1607
+ HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy0_d));
1608
+ HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy1_d));
1609
+
1610
+ // Apply scales and accumulate
1611
+ HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
1612
+ HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
1613
+ HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
1614
+ HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
1615
+
1616
+ r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
1617
+ r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
1618
+ r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
1619
+ r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
1620
+ }
1621
+
1622
+ // Process leftovers
1623
+ if (nloe) {
1624
+ HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial( y0_q + i * y_qblk_size, nloe);
1625
+ HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial( y1_q + i * y_qblk_size, nloe);
1626
+ HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
1627
+ HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
1628
+
1629
+ HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
1630
+ HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
1631
+ HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
1632
+ HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
1633
+
1634
+ HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size);
1635
+ HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size);
1636
+ HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
1637
+ HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
1638
+
1639
+ // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
1640
+ HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16
1641
+ vy0_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy0_d), half));
1642
+ vy0_d = Q6_Vsf_equals_Vqf32(vy0_d);
1643
+ vy1_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy1_d), half));
1644
+ vy1_d = Q6_Vsf_equals_Vqf32(vy1_d);
1645
+
1646
+ // Convert rX_d scales from e8m0 to fp32
1647
+ // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ...
1648
+ // Left shift with zero fill to create FP32
1649
+ // FIXME: might need to handle zero as a special case (see ggml-cpu code)
1650
+ HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0;
1651
+ HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
1652
+ r0_d = Q6_V_vdelta_VV(r0_d, expand);
1653
+ r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
1654
+ r0_d = Q6_Vw_vasl_VwR(r0_d, 23);
1655
+ r1_d = Q6_V_vdelta_VV(r1_d, expand);
1656
+ r1_d = Q6_V_vand_VV(r1_d, e8m0_mask);
1657
+ r1_d = Q6_Vw_vasl_VwR(r1_d, 23);
1658
+
1659
+ HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy0_d));
1660
+ HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy1_d));
1661
+ HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy0_d));
1662
+ HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy1_d));
1663
+
1664
+ // Zero out unused scales
1665
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
1666
+ r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
1667
+ r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
1668
+ r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd);
1669
+ r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd);
1670
+ r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia);
1671
+ r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia);
1672
+ r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia);
1673
+ r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia);
1674
+
1675
+ HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd);
1676
+ HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd);
1677
+ HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd);
1678
+ HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd);
1679
+
1680
+ r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum));
1681
+ r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum));
1682
+ r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum));
1683
+ r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum));
1684
+ }
1685
+
1686
+ // Reduce and store results
1687
+ HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
1688
+ HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
1689
+
1690
+ hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum); // row0,col0 row1,col0
1691
+ hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1
1692
+ }
1693
+
1694
+ static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1695
+ const HVX_Vector * restrict x = (const HVX_Vector *) vx;
1696
+ const HVX_Vector * restrict y = (const HVX_Vector *) vy;
1697
+
1698
+ uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
1699
+ uint32_t nloe = n % VLEN_FP16; // leftover elements
1700
+
1701
+ HVX_VectorPair rsum_p = Q6_W_vzero();
1702
+
1703
+ uint32_t i = 0;
1704
+
1705
+ #pragma unroll(4)
1706
+ for (i = 0; i < nvec; i++) {
1707
+ rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x[i], y[i]);
1708
+ }
1709
+
1710
+ if (nloe) {
1711
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
1712
+ HVX_Vector x_hf = Q6_V_vand_QV(bmask, x[i]);
1713
+ HVX_Vector y_hf = Q6_V_vand_QV(bmask, y[i]);
1714
+ rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x_hf, y_hf);
1715
+ }
1716
+
1717
+ HVX_Vector rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p)));
1718
+ hvx_vec_store_u(s, 4, hvx_vec_reduce_sum_f32(rsum));
1719
+ }
1720
+
1721
+ static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0,
1722
+ const void * restrict vx0, const void * restrict vx1,
1723
+ const void * restrict vy0) {
1724
+ const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0;
1725
+ const HVX_Vector * restrict x1 = (const HVX_Vector *) vx1;
1726
+ const HVX_Vector * restrict y = (const HVX_Vector *) vy0;
1727
+
1728
+ uint32_t nvec = n / VLEN_FP16;
1729
+ uint32_t nloe = n % VLEN_FP16;
1730
+
1731
+ HVX_VectorPair rsum0_p = Q6_W_vzero();
1732
+ HVX_VectorPair rsum1_p = Q6_W_vzero();
1733
+
1734
+ uint32_t i = 0;
1735
+
1736
+ #pragma unroll(2)
1737
+ for (i = 0; i < nvec; i++) {
1738
+ HVX_Vector y_hf = y[i];
1739
+ rsum0_p = hvx_vec_mpyacc_f32_f16(rsum0_p, x0[i], y_hf);
1740
+ rsum1_p = hvx_vec_mpyacc_f32_f16(rsum1_p, x1[i], y_hf);
1741
+ }
1742
+
1743
+ if (nloe) {
1744
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
1745
+ HVX_Vector y_hf = Q6_V_vand_QV(bmask, y[i]);
1746
+ HVX_Vector x0_hf = Q6_V_vand_QV(bmask, x0[i]);
1747
+ HVX_Vector x1_hf = Q6_V_vand_QV(bmask, x1[i]);
1748
+ rsum0_p = hvx_vec_mpyacc_f32_f16(rsum0_p, x0_hf, y_hf);
1749
+ rsum1_p = hvx_vec_mpyacc_f32_f16(rsum1_p, x1_hf, y_hf);
1750
+ }
1751
+
1752
+ HVX_Vector rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p)));
1753
+ HVX_Vector rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p)));
1754
+ HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(rsum0, rsum1);
1755
+ hvx_vec_store_u(s0, 8, rsum);
1756
+ }
1757
+
1758
+ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * restrict s1,
1759
+ const void * restrict vx0, const void * restrict vx1,
1760
+ const void * restrict vy0, const void * restrict vy1) {
1761
+ const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0;
1762
+ const HVX_Vector * restrict x1 = (const HVX_Vector *) vx1;
1763
+ const HVX_Vector * restrict y0 = (const HVX_Vector *) vy0;
1764
+ const HVX_Vector * restrict y1 = (const HVX_Vector *) vy1;
1765
+
1766
+ uint32_t nvec = n / VLEN_FP16;
1767
+ uint32_t nloe = n % VLEN_FP16;
1768
+
1769
+ // Row sums (sf) - 4 accumulators for 2×2 tile
1770
+ HVX_VectorPair r0_c0_sum_p = Q6_W_vzero();
1771
+ HVX_VectorPair r0_c1_sum_p = Q6_W_vzero();
1772
+ HVX_VectorPair r1_c0_sum_p = Q6_W_vzero();
1773
+ HVX_VectorPair r1_c1_sum_p = Q6_W_vzero();
1774
+
1775
+ uint32_t i = 0;
1776
+
1777
+ #pragma unroll(2)
1778
+ for (i = 0; i < nvec; i++) {
1779
+ HVX_Vector r0_hf = x0[i];
1780
+ HVX_Vector r1_hf = x1[i];
1781
+ HVX_Vector c0_hf = y0[i];
1782
+ HVX_Vector c1_hf = y1[i];
1783
+
1784
+ // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
1785
+ r0_c0_sum_p = hvx_vec_mpyacc_f32_f16(r0_c0_sum_p, r0_hf, c0_hf);
1786
+ r0_c1_sum_p = hvx_vec_mpyacc_f32_f16(r0_c1_sum_p, r0_hf, c1_hf);
1787
+ r1_c0_sum_p = hvx_vec_mpyacc_f32_f16(r1_c0_sum_p, r1_hf, c0_hf);
1788
+ r1_c1_sum_p = hvx_vec_mpyacc_f32_f16(r1_c1_sum_p, r1_hf, c1_hf);
1789
+ }
1790
+
1791
+ if (nloe) {
1792
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
1793
+
1794
+ HVX_Vector r0_hf = Q6_V_vand_QV(bmask, x0[i]);
1795
+ HVX_Vector r1_hf = Q6_V_vand_QV(bmask, x1[i]);
1796
+ HVX_Vector c0_hf = Q6_V_vand_QV(bmask, y0[i]);
1797
+ HVX_Vector c1_hf = Q6_V_vand_QV(bmask, y1[i]);
1798
+
1799
+ r0_c0_sum_p = hvx_vec_mpyacc_f32_f16(r0_c0_sum_p, r0_hf, c0_hf);
1800
+ r0_c1_sum_p = hvx_vec_mpyacc_f32_f16(r0_c1_sum_p, r0_hf, c1_hf);
1801
+ r1_c0_sum_p = hvx_vec_mpyacc_f32_f16(r1_c0_sum_p, r1_hf, c0_hf);
1802
+ r1_c1_sum_p = hvx_vec_mpyacc_f32_f16(r1_c1_sum_p, r1_hf, c1_hf);
1803
+ }
1804
+
1805
+ HVX_Vector r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(r0_c0_sum_p), Q6_V_hi_W(r0_c0_sum_p)));
1806
+ HVX_Vector r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(r0_c1_sum_p), Q6_V_hi_W(r0_c1_sum_p)));
1807
+ HVX_Vector r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(r1_c0_sum_p), Q6_V_hi_W(r1_c0_sum_p)));
1808
+ HVX_Vector r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(r1_c1_sum_p), Q6_V_hi_W(r1_c1_sum_p)));
1809
+
1810
+ // Reduce and store results
1811
+ HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum);
1812
+ HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum);
1813
+
1814
+ hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum); // row0,col0 row1,col0
1815
+ hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1
1816
+ }
1817
+
1818
+ static void vec_dot_f16_f16_uu_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1819
+ const HVX_UVector * restrict x = (const HVX_UVector *) vx;
1820
+ const HVX_UVector * restrict y = (const HVX_UVector *) vy;
1821
+
1822
+ uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
1823
+ uint32_t nloe = n % VLEN_FP16; // leftover elements
1824
+
1825
+ HVX_Vector rsum = Q6_V_vzero();
1826
+
1827
+ uint32_t i = 0;
1828
+
1829
+ #pragma unroll(4)
1830
+ for (i = 0; i < nvec; i++) {
1831
+ HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x[i], y[i]);
1832
+ rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
1833
+ }
1834
+
1835
+ if (nloe) {
1836
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
1837
+ HVX_Vector x_hf = Q6_V_vand_QV(bmask, x[i]);
1838
+ HVX_Vector y_hf = Q6_V_vand_QV(bmask, y[i]);
1839
+
1840
+ HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
1841
+ rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
1842
+ }
1843
+
1844
+ rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
1845
+ hvx_vec_store_u(&s[0], 4, rsum);
1846
+ }
1847
+
1848
+ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
1849
+ const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x;
1850
+ const HVX_UVector * restrict vy = (const HVX_UVector * restrict) y;
1851
+
1852
+ uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
1853
+ uint32_t nloe = n % VLEN_FP16; // leftover elements
1854
+
1855
+ const HVX_Vector zero = Q6_V_vzero();
1856
+
1857
+ HVX_Vector rsum = Q6_V_vzero();
1858
+
1859
+ uint32_t i = 0;
1860
+
1861
+ #pragma unroll(2)
1862
+ for (i = 0; i < nvec; i++) {
1863
+ // Load y (fp32) and convert into fp16
1864
+ HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
1865
+ HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
1866
+ HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
1867
+
1868
+ // Load x (fp16)
1869
+ HVX_Vector x_hf = vx[i];
1870
+
1871
+ HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
1872
+
1873
+ rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
1874
+ }
1875
+
1876
+ if (nloe) {
1877
+ // Load y (fp32) and convert into fp16
1878
+ HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
1879
+ HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
1880
+ HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
1881
+
1882
+ // Load x (fp16)
1883
+ HVX_Vector x_hf = vx[i];
1884
+
1885
+ // Zero-out unused elements
1886
+ // Note that we need to clear both x and y because they may contain NANs
1887
+ HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
1888
+ x_hf = Q6_V_vand_QV(bmask, x_hf);
1889
+ y_hf = Q6_V_vand_QV(bmask, y_hf);
1890
+
1891
+ HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
1892
+
1893
+ rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
1894
+ }
1895
+
1896
+ // Convert into fp32 and reduce
1897
+ rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
1898
+ hvx_vec_store_u(&s[0], 4, rsum);
1899
+ }
1900
+
1901
+ #define htp_matmul_tensors_preamble /* declares tensor, scratchpad and dimension/stride locals; expects 'octx' in the expanding scope */ \
1902
+ const struct htp_tensor * restrict src0 = octx->src[0]; \
1903
+ const struct htp_tensor * restrict src1 = octx->src[1]; \
1904
+ const struct htp_tensor * restrict src2 = octx->src[2]; /* NOTE(review): dereferenced below without a NULL check - confirm src[2] is always set */ \
1905
+ const struct htp_tensor * restrict dst = octx->dst; \
1906
+ struct htp_spad * restrict src0_spad = &octx->src0_spad; \
1907
+ struct htp_spad * restrict src1_spad = &octx->src1_spad; \
1908
+ struct htp_spad * restrict dst_spad = &octx->dst_spad; \
1909
+ \
1910
+ const uint32_t ne00 = src0->ne[0]; /* neXY = ne[Y] of source tensor X */ \
1911
+ const uint32_t ne01 = src0->ne[1]; \
1912
+ const uint32_t ne02 = src0->ne[2]; \
1913
+ const uint32_t ne03 = src0->ne[3]; \
1914
+ \
1915
+ const uint32_t ne10 = src1->ne[0]; \
1916
+ const uint32_t ne11 = src1->ne[1]; \
1917
+ const uint32_t ne12 = src1->ne[2]; \
1918
+ const uint32_t ne13 = src1->ne[3]; \
1919
+ \
1920
+ const uint32_t ne20 = src2->ne[0]; \
1921
+ const uint32_t ne21 = src2->ne[1]; \
1922
+ const uint32_t ne22 = src2->ne[2]; \
1923
+ const uint32_t ne23 = src2->ne[3]; \
1924
+ \
1925
+ const uint32_t ne0 = dst->ne[0]; /* single-digit neY / nbY refer to dst */ \
1926
+ const uint32_t ne1 = dst->ne[1]; \
1927
+ const uint32_t ne2 = dst->ne[2]; \
1928
+ const uint32_t ne3 = dst->ne[3]; \
1929
+ \
1930
+ const uint32_t nb00 = src0->nb[0]; /* nbXY = nb[Y] of source tensor X; no nb2Y is declared for src2 */ \
1931
+ const uint32_t nb01 = src0->nb[1]; \
1932
+ const uint32_t nb02 = src0->nb[2]; \
1933
+ const uint32_t nb03 = src0->nb[3]; \
1934
+ \
1935
+ const uint32_t nb10 = src1->nb[0]; \
1936
+ const uint32_t nb11 = src1->nb[1]; \
1937
+ const uint32_t nb12 = src1->nb[2]; \
1938
+ const uint32_t nb13 = src1->nb[3]; \
1939
+ \
1940
+ const uint32_t nb0 = dst->nb[0]; \
1941
+ const uint32_t nb1 = dst->nb[1]; \
1942
+ const uint32_t nb2 = dst->nb[2]; \
1943
+ const uint32_t nb3 = dst->nb[3];
1944
+
1945
+ #define htp_matmul_preamble /* per-worker prologue; expects 'data' and 'ith' in the expanding scope */ \
1946
+ struct htp_matmul_context * mmctx = data; /* the worker's opaque argument is the matmul context */ \
1947
+ struct htp_ops_context * octx = mmctx->octx; \
1948
+ htp_matmul_tensors_preamble; /* tensor, scratchpad and neXY / nbXY locals */ \
1949
+ dma_queue *dma_queue = octx->ctx->dma[ith]; /* DMA queue selected by this worker's index */ \
1950
+ uint32_t src0_nrows_per_thread = mmctx->src0_nrows_per_thread; /* row share used by workers to derive their src0 row range */
1951
+
1952
+ // *** matmul with support for 4d tensors and full broadcasting
1953
+
1954
+ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) {
1955
+ htp_matmul_preamble;
1956
+
1957
+ uint64_t t1, t2;
1958
+ t1 = HAP_perf_get_qtimer_count();
1959
+
1960
+ assert(ne12 % ne02 == 0);
1961
+ assert(ne13 % ne03 == 0);
1962
+
1963
+ // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
1964
+ const uint32_t nr0 = ne0;
1965
+
1966
+ // This is the size of the rest of the dimensions of the result
1967
+ const uint32_t nr1 = ne1 * ne2 * ne3;
1968
+
1969
+ // distribute the thread work across the inner or outer loop based on which one is larger
1970
+ uint32_t nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
1971
+ uint32_t nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
1972
+
1973
+ // The number of elements in each chunk
1974
+ const uint32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
1975
+ const uint32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
1976
+
1977
+ uint32_t current_chunk = ith;
1978
+
1979
+ const uint32_t ith0 = current_chunk % nchunk0;
1980
+ const uint32_t ith1 = current_chunk / nchunk0;
1981
+
1982
+ const uint32_t ir0_start = dr0 * ith0;
1983
+ const uint32_t ir0_end = MIN(ir0_start + dr0, nr0);
1984
+
1985
+ const uint32_t ir1_start = dr1 * ith1;
1986
+ const uint32_t ir1_end = MIN(ir1_start + dr1, nr1);
1987
+
1988
+ // no work for this thread
1989
+ if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
1990
+ return;
1991
+ }
1992
+
1993
+ // block-tiling attempt
1994
+ const uint32_t blck_0 = 64;
1995
+ const uint32_t blck_1 = 64;
1996
+
1997
+ for (uint32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
1998
+ for (uint32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
1999
+ for (uint32_t ir1 = iir1; ir1 < MIN(iir1 + blck_1, ir1_end); ir1++) {
2000
+ const uint32_t i13 = fastdiv(ir1, &mmctx->mm_div_ne12_ne1);
2001
+ const uint32_t i12 = fastdiv(ir1 - i13 * ne12 * ne1, &mmctx->mm_div_ne1);
2002
+ const uint32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
2003
+
2004
+ // broadcast src0 into src1
2005
+ const uint32_t i03 = fastdiv(i13, &mmctx->mm_div_r3);
2006
+ const uint32_t i02 = fastdiv(i12, &mmctx->mm_div_r2);
2007
+
2008
+ const uint32_t i1 = i11;
2009
+ const uint32_t i2 = i12;
2010
+ const uint32_t i3 = i13;
2011
+
2012
+ const uint8_t * restrict src0_base = (const uint8_t *) src0->data + (0 + i02 * nb02 + i03 * nb03);
2013
+ const uint8_t * restrict src1_col = (const uint8_t *) src1->data + (i11 * nb11 + i12 * nb12 + i13 * nb13);
2014
+ float * dst_col = (float *) ((uint8_t * restrict) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
2015
+
2016
+ const uint32_t ir0_block_end = MIN(iir0 + blck_0, ir0_end);
2017
+ for (uint32_t ir0 = iir0; ir0 < ir0_block_end; ir0++) {
2018
+ const uint8_t * restrict src0_row = src0_base + ir0 * nb01;
2019
+ mmctx->vec_dot_1x1(ne00, &dst_col[ir0], src0_row, src1_col);
2020
+ }
2021
+ }
2022
+ }
2023
+ }
2024
+
2025
+ t2 = HAP_perf_get_qtimer_count();
2026
+
2027
+ FARF(HIGH, "matmul-4d %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
2028
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0],
2029
+ src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
2030
+ (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
2031
+ }
2032
+
2033
+ // src1 tensor is already in VTCM spad
2034
+ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) {
2035
+ htp_matmul_preamble;
2036
+
2037
+ const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
2038
+ const uint32_t src1_nrows = ne11 * ne12 * ne13; // src1 rows
2039
+
2040
+ const uint32_t src0_start_row = src0_nrows_per_thread * ith;
2041
+ const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
2042
+ const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U);
2043
+
2044
+ // no work for this thread
2045
+ if (src0_start_row >= src0_end_row) {
2046
+ return;
2047
+ }
2048
+
2049
+ const size_t dst_row_size = nb1;
2050
+ const size_t src0_row_size = nb01;
2051
+ const size_t src1_row_size = nb11;
2052
+
2053
+ const size_t src0_stride = src0_spad->stride;
2054
+ const size_t src1_stride = src1_spad->stride;
2055
+
2056
+ // Per-thread VTCM scratchpads for all tensors
2057
+ // Note that the entire src1 tensor is already in VTCM
2058
+ // For other tensors we allocate N rows per thread, padded to HVX vector size
2059
+ uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith;
2060
+ uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
2061
+ uint8_t * restrict src1_data = src1_spad->data;
2062
+
2063
+ volatile uint64_t t1, t2;
2064
+ t1 = HAP_perf_get_qtimer_count();
2065
+
2066
+ const uint8_t * restrict src0_row = (const uint8_t *) src0->data;
2067
+
2068
+ // Prefill spad with src0 rows
2069
+ #pragma unroll(4)
2070
+ for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
2071
+ const int is0 = (ir0 - src0_start_row);
2072
+ if (is0 >= MM_SPAD_SRC0_NROWS) {
2073
+ break;
2074
+ }
2075
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
2076
+ src0_stride, src0_row_size, 2);
2077
+ }
2078
+
2079
+ // Process src0 rows
2080
+ for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
2081
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
2082
+
2083
+ // Process src1 columns in pairs (2×2 tiling)
2084
+ uint32_t ir1 = 0;
2085
+ for (; ir1 + 1 < src1_nrows; ir1 += 2) {
2086
+ const uint8_t * restrict src1_col0 = (const uint8_t *) (src1_data + (ir1+0) * src1_stride);
2087
+ const uint8_t * restrict src1_col1 = (const uint8_t *) (src1_data + (ir1+1) * src1_stride);
2088
+ float * restrict dst_row0 = (float *) (dst->data + ((ir1+0) * dst_row_size));
2089
+ float * restrict dst_row1 = (float *) (dst->data + ((ir1+1) * dst_row_size));
2090
+ mmctx->vec_dot_2x2(ne00, &dst_row0[ir0], &dst_row1[ir0], ss0, ss0 + src0_stride, src1_col0, src1_col1);
2091
+ }
2092
+
2093
+ // Handle remaining src1 rows (fallback to 2×1)
2094
+ for (; ir1 < src1_nrows; ++ir1) {
2095
+ const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride);
2096
+ float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size));
2097
+ mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_stride, src1_col);
2098
+ }
2099
+
2100
+ // Prefetch next (n + spad_nrows) row
2101
+ const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
2102
+ const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
2103
+ if (pr0 < src0_end_row_x2) {
2104
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size),
2105
+ src0_stride, src0_row_size, 2);
2106
+ }
2107
+ }
2108
+
2109
+ // Process the last row (if any)
2110
+ if (src0_end_row != src0_end_row_x2) {
2111
+ uint32_t ir0 = src0_end_row_x2;
2112
+ const int is0 = (ir0 - src0_start_row);
2113
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
2114
+ src0_stride, src0_row_size, 1);
2115
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
2116
+
2117
+ #pragma unroll(2)
2118
+ for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) {
2119
+ const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride);
2120
+ float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size));
2121
+ mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
2122
+ }
2123
+ }
2124
+
2125
+ t2 = HAP_perf_get_qtimer_count();
2126
+
2127
+ FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mmctx->type, ith, nth,
2128
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
2129
+ src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
2130
+ (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
2131
+ }
2132
+
2133
+ // q8x4x2 src1 tensor is already in VTCM spad
2134
+ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) {
2135
+ htp_matmul_preamble;
2136
+
2137
+ const uint32_t src0_nrows = ne01;
2138
+
2139
+ const uint32_t src0_start_row = src0_nrows_per_thread * ith;
2140
+ const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
2141
+ const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U);
2142
+
2143
+ // no work for this thread
2144
+ if (src0_start_row >= src0_end_row) {
2145
+ return;
2146
+ }
2147
+
2148
+ const size_t dst_row_size = nb1;
2149
+ const size_t src0_row_size = nb01;
2150
+ const size_t src1_row_size = nb11;
2151
+
2152
+ const size_t src0_stride = src0_spad->stride;
2153
+ const size_t src1_stride = src1_spad->stride;
2154
+
2155
+ // Per-thread VTCM scratchpads for all tensors
2156
+ // Note that the entire src1 tensor is already in VTCM
2157
+ // For other tensors we allocate N rows per thread, padded to HVX vector size
2158
+ uint8_t * spad_dst = dst_spad->data + dst_spad->size_per_thread * ith;
2159
+ uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
2160
+ uint8_t * src1_data = src1_spad->data;
2161
+
2162
+ uint64_t t1, t2;
2163
+ t1 = HAP_perf_get_qtimer_count();
2164
+
2165
+ float * tmp = (float *) spad_dst;
2166
+
2167
+ const uint8_t * restrict src0_row = (const uint8_t *) src0->data;
2168
+ const uint8_t * restrict src1_col = (const uint8_t *) src1_data;
2169
+ float * restrict dst_col = (float *) dst->data;
2170
+
2171
+ // Prefill spad with 2x src0 rows
2172
+ #pragma unroll(2)
2173
+ for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
2174
+ const uint32_t is0 = (ir0 - src0_start_row);
2175
+ if (is0 >= MM_SPAD_SRC0_NROWS) {
2176
+ break;
2177
+ }
2178
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
2179
+ src0_stride, src0_row_size, 2);
2180
+ }
2181
+
2182
+ // Process src0 rows
2183
+ for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
2184
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
2185
+ mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col);
2186
+
2187
+ // Prefetch next (n + spad_nrows) row
2188
+ const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
2189
+ const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
2190
+ if (pr0 < src0_end_row_x2) {
2191
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size),
2192
+ src0_stride, src0_row_size, 2);
2193
+ }
2194
+ }
2195
+
2196
+ // Process the last row (if any)
2197
+ if (src0_end_row != src0_end_row_x2) {
2198
+ const uint32_t ir0 = src0_end_row_x2;
2199
+ const uint32_t is0 = (ir0 - src0_start_row);
2200
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size),
2201
+ src0_stride, src0_row_size, 1);
2202
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
2203
+ mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
2204
+ }
2205
+
2206
+ hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row);
2207
+
2208
+ t2 = HAP_perf_get_qtimer_count();
2209
+
2210
+ FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mmctx->type, ith, nth,
2211
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
2212
+ src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
2213
+ (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
2214
+ }
2215
+
2216
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)] // entry i1 of row_id's mapping list (ids->ne[0]*ids->ne[1] entries per row_id); relies on 'matrix_rows' and 'ids' being in the caller's scope
2217
+
2218
+ struct mmid_row_mapping { // one entry of the table read through MMID_MATRIX_ROW
2219
+ uint32_t i1; // labelled "expert idx" where matmul_id reads it; scales nb1 in the dst offset
2220
+ uint32_t i2; // labelled "token idx" where matmul_id reads it; scales nb2 in the dst offset
2221
+ };
2222
+
2223
+ // src1 tensor is already in VTCM spad
2224
+ static void matmul_id(unsigned int nth, unsigned int ith, void * data) {
2225
+ htp_matmul_preamble;
2226
+
2227
+ const struct htp_tensor * restrict ids = octx->src[2];
2228
+ struct htp_spad * restrict src2_spad = &octx->src2_spad;
2229
+
2230
+ uint64_t t1, t2;
2231
+ t1 = HAP_perf_get_qtimer_count();
2232
+
2233
+ const uint32_t src0_nrows = ne01; // src0 rows per expert
2234
+ const uint32_t src1_nrows = ne11;
2235
+
2236
+ const uint32_t src0_start_row = src0_nrows_per_thread * ith;
2237
+ const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
2238
+ const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U);
2239
+
2240
+ // no work for this thread
2241
+ if (src0_start_row >= src0_end_row) {
2242
+ return;
2243
+ }
2244
+
2245
+ const uint32_t n_ids = ids->ne[0]; // n_expert_used
2246
+ const uint32_t n_as = ne02; // n_expert
2247
+
2248
+ const size_t matrix_row_counts_size = n_as * sizeof(uint32_t);
2249
+ const size_t matrix_row_map_size = n_as * ids->ne[0] * ids->ne[1] * sizeof(struct mmid_row_mapping);
2250
+
2251
+ const uint32_t * matrix_row_counts = (const uint32_t *) src2_spad->data + 0;
2252
+ const struct mmid_row_mapping * matrix_rows = (const void *) src2_spad->data + matrix_row_counts_size;
2253
+
2254
+ const size_t dst_row_size = nb1;
2255
+ const size_t src0_row_size = nb01;
2256
+ const size_t src1_row_size = q8x4x2_row_size(ne10);
2257
+
2258
+ const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128);
2259
+
2260
+ // Per-thread VTCM scratchpads for all tensors
2261
+ // Note that the entire src1 tensor is already in VTCM
2262
+ // For other tensors we allocate N rows per thread, padded to HVX vector size
2263
+ uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith;
2264
+ uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
2265
+ uint8_t * restrict src1_data = src1_spad->data;
2266
+
2267
+ for (uint32_t cur_a = 0; cur_a < n_as; ++cur_a) {
2268
+ const int32_t cne1 = matrix_row_counts[cur_a];
2269
+
2270
+ if (cne1 == 0) {
2271
+ continue;
2272
+ }
2273
+
2274
+ const uint8_t * src0_row = (const uint8_t *) src0->data + (0 + cur_a * nb02 + 0);
2275
+
2276
+ // Prefill spad with src0 rows
2277
+ #pragma unroll(4)
2278
+ for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
2279
+ const int is0 = (ir0 - src0_start_row);
2280
+ if (is0 >= MM_SPAD_SRC0_NROWS) {
2281
+ break;
2282
+ }
2283
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
2284
+ src0_row_size_padded, src0_row_size, 2);
2285
+ }
2286
+
2287
+ // Process src0 rows
2288
+ for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
2289
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
2290
+
2291
+ for (uint32_t cid = 0; cid < cne1; ++cid) {
2292
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
2293
+ const int rm1 = row_mapping.i1; // expert idx
2294
+ const int rm2 = row_mapping.i2; // token idx
2295
+
2296
+ const uint32_t ir1 = src1_nrows == 1 ? 0 : rm1; // src1 row idx
2297
+ const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size);
2298
+ float * dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0));
2299
+
2300
+ mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col);
2301
+ }
2302
+
2303
+ // Prefetch next (n + spad_nrows) row
2304
+ const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
2305
+ const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
2306
+ if (pr0 < src0_end_row_x2) {
2307
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
2308
+ src0_row_size_padded, src0_row_size, 2);
2309
+ }
2310
+ }
2311
+
2312
+ // Process the last row (if any)
2313
+ if (src0_end_row != src0_end_row_x2) {
2314
+ uint32_t ir0 = src0_end_row_x2;
2315
+ const uint32_t is0 = (ir0 - src0_start_row);
2316
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
2317
+ src0_row_size_padded, src0_row_size, 1);
2318
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
2319
+
2320
+ for (uint32_t cid = 0; cid < cne1; ++cid) {
2321
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid);
2322
+ const int rm1 = row_mapping.i1; // expert idx
2323
+ const int rm2 = row_mapping.i2; // token idx
2324
+
2325
+ const uint32_t ir1 = src1_nrows == 1 ? 0 : rm1; // src1 row idx
2326
+ const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size);
2327
+ float * dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0));
2328
+
2329
+ mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
2330
+ }
2331
+ }
2332
+ }
2333
+
2334
+ t2 = HAP_perf_get_qtimer_count();
2335
+
2336
+ FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mmctx->type,
2337
+ ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
2338
+ src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1],
2339
+ dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
2340
+ }
2341
+
2342
+ // src1 tensor is already in VTCM spad
2343
+ static void matvec_id(unsigned int nth, unsigned int ith, void * data) {
2344
+ htp_matmul_preamble;
2345
+
2346
+ const struct htp_tensor * restrict ids = octx->src[2];
2347
+ struct htp_spad * restrict src2_spad = &octx->src2_spad;
2348
+
2349
+ uint64_t t1, t2;
2350
+ t1 = HAP_perf_get_qtimer_count();
2351
+
2352
+ const uint32_t src0_nrows = ne01; // src0 rows per expert
2353
+
2354
+ const uint32_t src0_start_row = src0_nrows_per_thread * ith;
2355
+ const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
2356
+ const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U);
2357
+
2358
+ // no work for this thread
2359
+ if (src0_start_row >= src0_end_row) {
2360
+ return;
2361
+ }
2362
+
2363
+ assert(ne13 % ne03 == 0);
2364
+
2365
+ const size_t dst_row_size = nb1;
2366
+ const size_t src0_row_size = nb01;
2367
+ const size_t src1_row_size = q8x4x2_row_size(ne10);
2368
+
2369
+ const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128);
2370
+
2371
+ const uint32_t n_aids = src2->ne[0]; // num activated experts
2372
+ const uint32_t n_ids = ne02; // num experts
2373
+
2374
+ // Per-thread VTCM scratchpads for all tensors
2375
+ // Note that the entire src1 tensor is already in VTCM
2376
+ // For other tensors we allocate N rows per thread, padded to HVX vector size
2377
+ uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith;
2378
+ uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
2379
+ uint8_t * restrict src1_data = src1_spad->data;
2380
+
2381
+ for (uint32_t ie1 = 0; ie1 < n_aids; ++ie1) { // for each expert
2382
+ const uint32_t eid = *(const int32_t *) ((const uint8_t *) src2->data + ie1 * src2->nb[0]);
2383
+ assert(eid < n_ids);
2384
+
2385
+ const uint8_t * restrict src0_row = (const uint8_t *) src0->data + eid * nb02;
2386
+ const uint8_t * restrict src1_col = (const uint8_t *) src1_data;
2387
+ float * restrict dst_row = (float *) (dst->data + ie1 * nb1);
2388
+
2389
+ // Prefill spad with src0 rows
2390
+ #pragma unroll(4)
2391
+ for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
2392
+ const int is0 = (ir0 - src0_start_row);
2393
+ if (is0 >= MM_SPAD_SRC0_NROWS) {
2394
+ break;
2395
+ }
2396
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
2397
+ src0_row_size_padded, src0_row_size, 2);
2398
+ }
2399
+
2400
+ // Process src0 rows
2401
+ for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) {
2402
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
2403
+ mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col);
2404
+
2405
+ // Prefetch next (n + spad_nrows) row
2406
+ const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS);
2407
+ const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS;
2408
+ if (pr0 < src0_end_row_x2) {
2409
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size),
2410
+ src0_row_size_padded, src0_row_size, 2);
2411
+ }
2412
+ }
2413
+
2414
+ // Process the last row (if any)
2415
+ if (src0_end_row != src0_end_row_x2) {
2416
+ uint32_t ir0 = src0_end_row_x2;
2417
+ const uint32_t is0 = (ir0 - src0_start_row);
2418
+ dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size),
2419
+ src0_row_size_padded, src0_row_size, 1);
2420
+ const uint8_t * ss0 = dma_queue_pop(dma_queue).dst;
2421
+ mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col);
2422
+ }
2423
+ }
2424
+
2425
+ t2 = HAP_perf_get_qtimer_count();
2426
+
2427
+ FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mmctx->type,
2428
+ ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
2429
+ src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0],
2430
+ dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
2431
+ }
2432
+
2433
+ // *** dynamic quant
2434
+
2435
+ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) {
2436
+ assert((unsigned long) x % 128 == 0);
2437
+ assert((unsigned long) y_q % 128 == 0);
2438
+
2439
+ HVX_Vector * vx = (HVX_Vector *) x;
2440
+ HVX_Vector zero = Q6_V_vzero();
2441
+
2442
+ // Use reduce max fp32 to find max(abs(e)) first
2443
+ HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0]));
2444
+ HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1]));
2445
+ HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2]));
2446
+ HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3]));
2447
+ // Load and convert into QF32
2448
+ HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
2449
+ HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
2450
+ HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
2451
+ HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements
2452
+
2453
+ // Convert to QF32
2454
+ HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); // replicated over all lanes
2455
+ HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); // replicated over all lanes
2456
+ HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); // replicated over all lanes
2457
+ HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); // replicated over all lanes
2458
+
2459
+ // Combine and convert to fp16
2460
+ HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf)));
2461
+ HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf)));
2462
+
2463
+ // Convert into fp16
2464
+ HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf)));
2465
+ HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf)));
2466
+
2467
+ HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
2468
+ HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
2469
+ HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16);
2470
+ HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16);
2471
+
2472
+ hvx_vec_store_u(y_d + 0, 2, vd01_hf);
2473
+ HVX_Vector rotated_vd_hf = Q6_V_vror_VR(vd01_hf, 64);
2474
+ hvx_vec_store_u(y_d + 2, 2, rotated_vd_hf);
2475
+
2476
+ hvx_vec_store_u(y_d + 4, 2, vd23_hf);
2477
+ rotated_vd_hf = Q6_V_vror_VR(vd23_hf, 64);
2478
+ hvx_vec_store_u(y_d + 6, 2, rotated_vd_hf);
2479
+
2480
+ // Divide input by the scale
2481
+ HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf);
2482
+ HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf);
2483
+ vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf));
2484
+ vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf));
2485
+
2486
+ // Convert to int8
2487
+ HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf);
2488
+ HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf);
2489
+ HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16);
2490
+
2491
+ *(HVX_Vector *) y_q = vx_i8;
2492
+ }
2493
+
2494
+ static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) {
2495
+ assert((unsigned long) x % 128 == 0);
2496
+ assert((unsigned long) y_q % 128 == 0);
2497
+
2498
+ HVX_Vector * vx = (HVX_Vector *) x;
2499
+
2500
+ // Load and convert into QF32
2501
+ HVX_Vector zero = Q6_V_vzero();
2502
+ HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
2503
+ HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
2504
+ HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
2505
+ HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements
2506
+
2507
+ // Convert into fp16
2508
+ HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf)));
2509
+ HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf)));
2510
+
2511
+ // Compute max and scale
2512
+ HVX_Vector vmax01_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); // replicated over all lanes
2513
+ HVX_Vector vmax23_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx23_hf)); // replicated over all lanes
2514
+
2515
+ HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
2516
+ HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
2517
+ HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16);
2518
+ HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16);
2519
+
2520
+ hvx_vec_store_u(y_d + 0, 4, vd01_hf);
2521
+ hvx_vec_store_u(y_d + 4, 4, vd23_hf);
2522
+
2523
+ // Divide input by the scale
2524
+ HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf);
2525
+ HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf);
2526
+ vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf));
2527
+ vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf));
2528
+
2529
+ // Convert to int8
2530
+ HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf);
2531
+ HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf);
2532
+ HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16);
2533
+
2534
+ *(HVX_Vector *) y_q = vx_i8;
2535
+ }
2536
+
2537
+ static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) {
2538
+ assert((unsigned long) x % 128 == 0);
2539
+ assert((unsigned long) y_q % 128 == 0);
2540
+
2541
+ HVX_Vector * vx = (HVX_Vector *) x;
2542
+
2543
+ // Load and convert into QF32
2544
+ HVX_Vector zero = Q6_V_vzero();
2545
+ HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
2546
+ HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
2547
+ HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
2548
+ HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements
2549
+
2550
+ // Convert into fp16
2551
+ HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf)));
2552
+ HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf)));
2553
+
2554
+ // Compute max and scale
2555
+ HVX_Vector vmax_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf));
2556
+ vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf); // replicated over all lanes
2557
+
2558
+ HVX_Vector vd_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0
2559
+ HVX_Vector vd_hf = Q6_Vhf_equals_Vqf16(vd_qf16);
2560
+
2561
+ *(HVX_UVector *) y_d = vd_hf;
2562
+
2563
+ // Divide input by the scale
2564
+ HVX_Vector vd_inv_hf = hvx_vec_inverse_f16(vd_hf);
2565
+ vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd_inv_hf));
2566
+ vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd_inv_hf));
2567
+
2568
+ // Convert to int8
2569
+ HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf);
2570
+ HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf);
2571
+ HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16);
2572
+
2573
+ *(HVX_Vector *) y_q = vx_i8;
2574
+ }
2575
+
2576
+ // Overrides input x
2577
+ static void quantize_row_f32_q8x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) {
2578
+ assert(k % 32 == 0);
2579
+ const uint32_t qk = QK_Q8_0x4x2;
2580
+ const uint32_t nb = (k + qk - 1) / qk;
2581
+
2582
+ const uint32_t qrow_size = k; // int8
2583
+
2584
+ const uint32_t dblk_size = 8 * 2; // 8x __fp16
2585
+ const uint32_t qblk_size = QK_Q8_0x4x2; // int8
2586
+
2587
+ uint8_t * restrict y_q = (y + 0); // quants first
2588
+ uint8_t * restrict y_d = (y + qrow_size); // then scales
2589
+
2590
+ // Temp scales override input since we're working off of the aligned temp buffer in VTCM
2591
+ uint8_t * restrict t_d = (uint8_t *) x;
2592
+
2593
+ for (uint32_t i = 0; i < nb; i++) {
2594
+ #if FP32_QUANTIZE_GROUP_SIZE == 32
2595
+ quantize_block_f32_q8x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2);
2596
+ quantize_block_f32_q8x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2);
2597
+ #elif FP32_QUANTIZE_GROUP_SIZE == 64
2598
+ quantize_block_f32_q8x2(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2);
2599
+ quantize_block_f32_q8x2(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2);
2600
+ #elif FP32_QUANTIZE_GROUP_SIZE == 128
2601
+ quantize_block_f32_q8x4(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2);
2602
+ quantize_block_f32_q8x4(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2);
2603
+ #else
2604
+ #error "FP32_QUANTIZE_GROUP_SIZE must be 32, 64, or 128"
2605
+ #endif
2606
+ }
2607
+
2608
+ // now copy the scales into final location
2609
+ hvx_copy_f16_ua(y_d, t_d, nb * 8);
2610
+ }
2611
+
2612
+ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) {
2613
+ struct htp_matmul_context * mmctx = data;
2614
+ struct htp_ops_context * octx = mmctx->octx;
2615
+
2616
+ const struct htp_tensor * src = octx->src[1];
2617
+ uint8_t * restrict dst = octx->src1_spad.data;
2618
+ struct htp_spad * spad = &octx->src0_spad;
2619
+ uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
2620
+
2621
+ uint64_t t1 = HAP_perf_get_qtimer_count();
2622
+
2623
+ const uint32_t ne0 = src->ne[0];
2624
+ const uint32_t ne1 = src->ne[1];
2625
+ const uint32_t ne2 = src->ne[2];
2626
+ const uint32_t ne3 = src->ne[3];
2627
+
2628
+ const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows
2629
+
2630
+ const uint32_t ir_first = nrows_per_thread * ith; // first row
2631
+ const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row
2632
+
2633
+ const size_t src_row_size = src->nb[1];
2634
+ const size_t dst_row_size = q8x4x2_row_size(ne0);
2635
+
2636
+ uint8_t * restrict src_data = (uint8_t *) src->data + (src_row_size * ir_first);
2637
+ uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first);
2638
+ uint8_t * restrict tmp_data = (uint8_t *) spad->data + (spad->size_per_thread * ith);
2639
+
2640
+ const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float));
2641
+ memset(tmp_data, 0, src_row_size_padded); // zero-out temp row data for padding
2642
+
2643
+ for (uint32_t i = ir_first; i < ir_last; ++i) {
2644
+ hex_l2fetch(src_data, src_row_size, src_row_size, 2);
2645
+ hvx_copy_f32_aa(tmp_data, src_data, ne0);
2646
+
2647
+ // FARF(HIGH, "quantize-q8x4-row: %u\n", i);
2648
+ quantize_row_f32_q8x4x2((float *) tmp_data, dst_data, ne0);
2649
+ dst_data += dst_row_size;
2650
+ src_data += src_row_size;
2651
+ }
2652
+
2653
+ uint64_t t2 = HAP_perf_get_qtimer_count();
2654
+
2655
+ FARF(HIGH, "quantize-f32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
2656
+ ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
2657
+ }
2658
+
2659
+ static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) {
2660
+ struct htp_matmul_context * mmctx = data;
2661
+ struct htp_ops_context * octx = mmctx->octx;
2662
+
2663
+ const struct htp_tensor * src = octx->src[1];
2664
+ uint8_t * restrict dst = octx->src1_spad.data;
2665
+ uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
2666
+ uint32_t dst_stride = octx->src1_spad.stride;
2667
+
2668
+ uint64_t t1 = HAP_perf_get_qtimer_count();
2669
+
2670
+ const uint32_t ne0 = src->ne[0];
2671
+ const uint32_t ne1 = src->ne[1];
2672
+ const uint32_t ne2 = src->ne[2];
2673
+ const uint32_t ne3 = src->ne[3];
2674
+
2675
+ const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows
2676
+
2677
+ const uint32_t ir_first = nrows_per_thread * ith; // first row
2678
+ const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row
2679
+
2680
+ const size_t src_row_size = ne0 * sizeof(float);
2681
+ const size_t src_stride = src->nb[1];
2682
+
2683
+ uint8_t * restrict src_data = (uint8_t *) src->data + (src_stride * ir_first);
2684
+ uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first);
2685
+
2686
+ for (uint32_t i = ir_first; i < ir_last; ++i) {
2687
+ hex_l2fetch(src_data, src_row_size, src_stride, 2);
2688
+ hvx_copy_f16_f32_au(dst_data, src_data, ne0);
2689
+
2690
+ dst_data += dst_stride;
2691
+ src_data += src_stride;
2692
+ }
2693
+
2694
+ uint64_t t2 = HAP_perf_get_qtimer_count();
2695
+
2696
+ FARF(HIGH, "quantize-f32-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
2697
+ ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
2698
+ }
2699
+
2700
+ // TODO just a plain copy that should be done via the DMA during the Op setup
2701
+ static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) {
2702
+ struct htp_matmul_context * mmctx = data;
2703
+ struct htp_ops_context * octx = mmctx->octx;
2704
+
2705
+ const struct htp_tensor * src = octx->src[1];
2706
+ uint8_t * restrict dst = octx->src1_spad.data;
2707
+ uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread;
2708
+ uint32_t dst_stride = octx->src1_spad.stride;
2709
+
2710
+ uint64_t t1 = HAP_perf_get_qtimer_count();
2711
+
2712
+ const uint32_t ne0 = src->ne[0];
2713
+ const uint32_t ne1 = src->ne[1];
2714
+ const uint32_t ne2 = src->ne[2];
2715
+ const uint32_t ne3 = src->ne[3];
2716
+
2717
+ const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows
2718
+
2719
+ const uint32_t ir_first = nrows_per_thread * ith; // first row
2720
+ const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row
2721
+
2722
+ const size_t src_row_size = ne0 * sizeof(float);
2723
+ const size_t src_stride = src->nb[1];
2724
+
2725
+ uint8_t * restrict src_data = (uint8_t *) src->data + (src_stride * ir_first);
2726
+ uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first);
2727
+
2728
+ for (uint32_t i = ir_first; i < ir_last; ++i) {
2729
+ hex_l2fetch(src_data, src_row_size, src_stride, 2);
2730
+ hvx_copy_f16_au(dst_data, src_data, ne0);
2731
+
2732
+ dst_data += dst_stride;
2733
+ src_data += src_stride;
2734
+ }
2735
+
2736
+ uint64_t t2 = HAP_perf_get_qtimer_count();
2737
+
2738
+ FARF(HIGH, "quantize-f16-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first,
2739
+ ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
2740
+ }
2741
+
2742
+
2743
+ static inline bool htp_is_permuted(const struct htp_tensor * t) {
2744
+ return t->nb[0] > t->nb[1] || t->nb[1] > t->nb[2] || t->nb[2] > t->nb[3];
2745
+ }
2746
+
2747
+ static int htp_mminit_vec_dot(struct htp_matmul_context * mmctx, enum htp_data_type type) {
2748
+ switch (type) {
2749
+ case HTP_TYPE_Q4_0:
2750
+ mmctx->type = "q4x4x2-f32";
2751
+ mmctx->vec_dot_1x1 = vec_dot_q4x4x2_q8x4x2_1x1;
2752
+ mmctx->vec_dot_2x1 = vec_dot_q4x4x2_q8x4x2_2x1;
2753
+ mmctx->vec_dot_2x2 = vec_dot_q4x4x2_q8x4x2_2x2;
2754
+ return 0;
2755
+ case HTP_TYPE_Q8_0:
2756
+ mmctx->type = "q8x4x2-f32";
2757
+ mmctx->vec_dot_1x1 = vec_dot_q8x4x2_q8x4x2_1x1;
2758
+ mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1;
2759
+ mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2;
2760
+ return 0;
2761
+ case HTP_TYPE_IQ4_NL:
2762
+ mmctx->type = "iq4nlx4x2-f32";
2763
+ mmctx->vec_dot_1x1 = vec_dot_iq4nlx4x2_q8x4x2_1x1;
2764
+ mmctx->vec_dot_2x1 = vec_dot_iq4nlx4x2_q8x4x2_2x1;
2765
+ mmctx->vec_dot_2x2 = vec_dot_iq4nlx4x2_q8x4x2_2x2;
2766
+ return 0;
2767
+ case HTP_TYPE_MXFP4:
2768
+ mmctx->type = "mxfp4x4x2-f32";
2769
+ mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1;
2770
+ mmctx->vec_dot_2x1 = vec_dot_mxfp4x4x2_q8x4x2_2x1;
2771
+ mmctx->vec_dot_2x2 = vec_dot_mxfp4x4x2_q8x4x2_2x2;
2772
+ return 0;
2773
+ default:
2774
+ return -1;
2775
+ }
2776
+ }
2777
+
2778
+ static void htp_mminit_spad(struct htp_ops_context * octx,
2779
+ size_t dst_row_size,
2780
+ size_t src0_row_size_padded,
2781
+ size_t src1_row_size,
2782
+ uint32_t src1_nrows,
2783
+ size_t src2_spad_size_per_thread) {
2784
+ octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
2785
+ octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
2786
+ octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
2787
+
2788
+ if (src2_spad_size_per_thread > 0) {
2789
+ octx->src2_spad.size_per_thread = src2_spad_size_per_thread;
2790
+ octx->src2_spad.size = octx->src2_spad.size_per_thread;
2791
+ }
2792
+
2793
+ // src0 spad is also used in dynamic quantizer to store padded src1 rows
2794
+ size_t src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float));
2795
+ if (octx->src0_spad.size_per_thread < src1_row_size_padded) {
2796
+ octx->src0_spad.size_per_thread = src1_row_size_padded;
2797
+ }
2798
+
2799
+ octx->src1_spad.size = octx->src1_spad.size_per_thread;
2800
+ octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
2801
+ octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads;
2802
+ }
2803
+
2804
+ static int op_matmul_hvx(struct htp_ops_context * octx) {
2805
+ htp_matmul_tensors_preamble;
2806
+
2807
+ struct htp_matmul_context mmctx_struct = {0};
2808
+ struct htp_matmul_context * mmctx = &mmctx_struct;
2809
+ mmctx->octx = octx;
2810
+
2811
+ const uint32_t src0_nrows = ne01 * ne02 * ne03;
2812
+ const uint32_t src1_nrows = ne11 * ne12 * ne13;
2813
+
2814
+ // Compute src0_nrows_per_thread
2815
+ mmctx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads;
2816
+ mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even
2817
+
2818
+ const size_t src0_row_size = nb01;
2819
+ const size_t dst_row_size = nb1;
2820
+ size_t src1_row_size = nb11;
2821
+
2822
+ const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128);
2823
+ size_t src1_row_size_padded;
2824
+
2825
+ worker_callback_t quant_job_func;
2826
+ worker_callback_t matmul_job_func = src1_nrows > 1 ? matmul_2d : matvec_2d;
2827
+
2828
+ bool need_quant = true;
2829
+
2830
+ if (src0->type == HTP_TYPE_F16) {
2831
+ // Try optimized f16-f16 path first (src1 in VTCM)
2832
+ const size_t f16_src1_row_size = hex_round_up(ne10 * 2, 128);
2833
+ const size_t f16_src1_spad_size = hex_round_up(f16_src1_row_size * src1_nrows, 256);
2834
+ const size_t f16_src0_spad_size = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads;
2835
+ const size_t f16_dst_spad_size = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads;
2836
+
2837
+ const size_t f16_total_size = f16_src1_spad_size + f16_src0_spad_size + f16_dst_spad_size;
2838
+
2839
+ // Default matmul implementation does not support multi-batch src0 (N-vs-N broadcasting).
2840
+ // It only supports 1-vs-N broadcasting (src0 is 2D) or standard 2D matmul.
2841
+ const bool is_batched = (ne02 > 1) || (ne03 > 1);
2842
+ const bool is_permuted = htp_is_permuted(octx->src[0]) || htp_is_permuted(octx->src[1]);
2843
+
2844
+ if (!is_batched && !is_permuted && f16_total_size <= octx->ctx->vtcm_size) {
2845
+ // Optimized path
2846
+ quant_job_func = (src1->type == HTP_TYPE_F32) ? quantize_f32_f16 : quantize_f16_f16;
2847
+ mmctx->type = "f16-f16";
2848
+ mmctx->vec_dot_1x1 = vec_dot_f16_f16_aa_1x1;
2849
+ mmctx->vec_dot_2x1 = vec_dot_f16_f16_aa_2x1;
2850
+ mmctx->vec_dot_2x2 = vec_dot_f16_f16_aa_2x2;
2851
+
2852
+ src1_row_size = f16_src1_row_size; // row size post quantization
2853
+
2854
+ octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
2855
+ octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256);
2856
+ octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256);
2857
+
2858
+ octx->src1_spad.size = octx->src1_spad.size_per_thread;
2859
+ octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
2860
+ octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads;
2861
+ } else {
2862
+ // Fallback to f16/f32 (DDR) if src1 doesn't fit in VTCM or broadcasting is required
2863
+ quant_job_func = NULL;
2864
+ if (src1->type == HTP_TYPE_F32) {
2865
+ mmctx->type = "f16-f32";
2866
+ mmctx->vec_dot_1x1 = vec_dot_f16_f32_uu_1x1;
2867
+ matmul_job_func = matmul_4d;
2868
+ } else {
2869
+ mmctx->type = "f16-f16";
2870
+ mmctx->vec_dot_1x1 = vec_dot_f16_f16_uu_1x1;
2871
+ matmul_job_func = matmul_4d;
2872
+ }
2873
+
2874
+ src1_row_size = nb11; // original row size in DDR
2875
+
2876
+ octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256);
2877
+ octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256);
2878
+ octx->src1_spad.size_per_thread = hex_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256);
2879
+
2880
+ octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
2881
+ octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads;
2882
+ octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads;
2883
+
2884
+ // Init fastdiv for matmul_4d (supports broadcasting)
2885
+ mmctx->mm_div_ne12_ne1 = init_fastdiv_values(src1->ne[2] * dst->ne[1]);
2886
+ mmctx->mm_div_ne1 = init_fastdiv_values(dst->ne[1]);
2887
+ mmctx->mm_div_r2 = init_fastdiv_values(src1->ne[2] / src0->ne[2]);
2888
+ mmctx->mm_div_r3 = init_fastdiv_values(src1->ne[3] / src0->ne[3]);
2889
+
2890
+ need_quant = false;
2891
+ }
2892
+ } else {
2893
+ if (htp_mminit_vec_dot(mmctx, src0->type) != 0) {
2894
+ return HTP_STATUS_NO_SUPPORT;
2895
+ }
2896
+
2897
+ quant_job_func = quantize_f32_q8x4x2;
2898
+ src1_row_size = q8x4x2_row_size(ne10);
2899
+ htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, 0);
2900
+ }
2901
+
2902
+ // VTCM scratchpads for all tensors
2903
+ size_t spad_size = octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size;
2904
+
2905
+ FARF(HIGH, "matmul-%s : src0-spad-size %u src1-spad-size %u dst-spad-size %u (%zu)\n", mmctx->type,
2906
+ octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size, spad_size);
2907
+
2908
+ FARF(HIGH, "matmul-%s : %ux%ux%ux%u * %ux%ux%ux%u-> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", mmctx->type, src0->ne[0],
2909
+ src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0],
2910
+ dst->ne[1], dst->ne[2], dst->ne[3], src0->data, src1->data, dst->data);
2911
+
2912
+ // Make sure the reserved vtcm size is sufficient
2913
+ if (octx->ctx->vtcm_size < spad_size) {
2914
+ FARF(ERROR, "matmul-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type,
2915
+ octx->ctx->vtcm_size, spad_size);
2916
+ return HTP_STATUS_VTCM_TOO_SMALL;
2917
+ }
2918
+
2919
+ // Place src1 spad first. We use it for dyn.quant and may reuse between ops
2920
+ octx->src1_spad.data = octx->ctx->vtcm_base;
2921
+ octx->src0_spad.data = octx->src1_spad.data + octx->src1_spad.size;
2922
+ octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
2923
+
2924
+ octx->src1_spad.src = (src1 == octx->src1_spad.src) ? src1 : NULL;
2925
+ octx->src0_spad.src = NULL;
2926
+ octx->dst_spad.src = NULL;
2927
+
2928
+ octx->src0_spad.stride = src0_row_size_padded;
2929
+ octx->src1_spad.stride = src1_row_size;
2930
+
2931
+ if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)
2932
+ return HTP_STATUS_OK;
2933
+
2934
+ if (need_quant && !octx->src1_spad.src) {
2935
+ const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
2936
+ mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
2937
+ worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
2938
+ octx->src1_spad.src = src1;
2939
+ }
2940
+
2941
+ const uint32_t n_matmul_jobs = octx->n_threads;
2942
+ worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs);
2943
+
2944
+ return HTP_STATUS_OK;
2945
+ }
2946
+
2947
+ int op_matmul(struct htp_ops_context * octx) {
2948
+ htp_matmul_tensors_preamble;
2949
+
2950
+ #ifndef HTP_HAS_HMX
2951
+ return op_matmul_hvx(octx);
2952
+ #else
2953
+ if (!octx->ctx->hmx_enabled) {
2954
+ return op_matmul_hvx(octx);
2955
+ }
2956
+
2957
+ // HMX weight tile requires N to be 32-aligned.
2958
+ if (src0->ne[1] % 32 != 0) {
2959
+ return op_matmul_hvx(octx);
2960
+ }
2961
+
2962
+ // HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights.
2963
+ // Other types fall back to HVX.
2964
+ uint32_t wtype = src0->type;
2965
+ if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) {
2966
+ return op_matmul_hvx(octx);
2967
+ }
2968
+
2969
+ // Quantised HMX path requires K aligned to 256 (x4x2 super-block).
2970
+ // F16 HMX path requires K aligned to 32 (tile width).
2971
+ if (wtype != HTP_TYPE_F16 && src0->ne[0] % 256 != 0) {
2972
+ return op_matmul_hvx(octx);
2973
+ }
2974
+
2975
+ if (wtype == HTP_TYPE_F16 && src0->ne[0] % 32 != 0) {
2976
+ return op_matmul_hvx(octx);
2977
+ }
2978
+
2979
+ const bool is_batched = (src0->ne[2] * src0->ne[3] > 1 || src1->ne[2] * src1->ne[3] > 1);
2980
+
2981
+ // Quantised HMX kernels only handle flat 2D matmul (host already rejects
2982
+ // batched quantised, but guard here too). F16 batched matmul is handled
2983
+ // by the dedicated wrapper in hmx-matmul-ops.c.
2984
+ if (is_batched && src0->type != HTP_TYPE_F16) {
2985
+ return op_matmul_hvx(octx);
2986
+ }
2987
+
2988
+ // HMX assumes contiguous row-major layout. Fall back for permuted
2989
+ // tensors where strides are non-monotonic (e.g. transposed KV cache).
2990
+ if (src0->nb[0] > src0->nb[1] || src1->nb[0] > src1->nb[1]) {
2991
+ return op_matmul_hvx(octx);
2992
+ }
2993
+
2994
+ // M alignment: Use HMX when M >= 32, the last partial tile (m_total % 32 rows)
2995
+ // is handled by HMX itself; when M < 32 fall back to HVX.
2996
+ const int m_total = (int) src1->ne[1];
2997
+ const int m_hmx = m_total & ~31; // 0 when M < 32
2998
+
2999
+ if (m_hmx == 0) {
3000
+ return op_matmul_hvx(octx);
3001
+ }
3002
+
3003
+ // Always re-quantize src1 since HMX kernel overwrites vtcm/spad,
3004
+ // so any previously cached quantized data is invalid.
3005
+ octx->src1_spad.src = NULL;
3006
+
3007
+ int k = (int) src0->ne[0]; // inner dimension
3008
+ int n = (int) src0->ne[1]; // weight columns
3009
+
3010
+ int ret = -1;
3011
+
3012
+ // Row strides in elements. For compact tensors these equal k; for
3013
+ // permuted attention views they can be larger, so pass the real stride.
3014
+ const int act_stride = (int)(src1->nb[1] / sizeof(float));
3015
+ const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));
3016
+
3017
+ if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
3018
+ return HTP_STATUS_OK;
3019
+ }
3020
+
3021
+ if (src0->type == HTP_TYPE_F16) {
3022
+ if (is_batched) {
3023
+ hmx_matmul_w16a32_batched_params_t batch_params = {
3024
+ .dst = (float *) dst->data,
3025
+ .activation = (float *) src1->data,
3026
+ .permuted_weight = (const __fp16 *) src0->data,
3027
+ .m = m_total,
3028
+ .k = k,
3029
+ .n = n,
3030
+ .act_stride = act_stride,
3031
+ .weight_stride = wgt_stride,
3032
+ .dst_stride = (int) (dst->nb[1] / sizeof(float)),
3033
+ .ne02 = ne02,
3034
+ .ne03 = ne03,
3035
+ .ne12 = ne12,
3036
+ .ne13 = ne13,
3037
+ .src0_nb2 = src0->nb[2],
3038
+ .src0_nb3 = src0->nb[3],
3039
+ .src1_nb2 = src1->nb[2],
3040
+ .src1_nb3 = src1->nb[3],
3041
+ .dst_nb2 = dst->nb[2],
3042
+ .dst_nb3 = dst->nb[3],
3043
+ };
3044
+ ret = hmx_mat_mul_permuted_w16a32_batched(octx->ctx, &batch_params);
3045
+ } else {
3046
+ ret = hmx_mat_mul_permuted_w16a32(octx->ctx,
3047
+ (float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data,
3048
+ m_total, k, n, act_stride, wgt_stride);
3049
+ }
3050
+ } else {
3051
+ ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx,
3052
+ (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data,
3053
+ m_total, k, n, (int) src0->type);
3054
+ }
3055
+
3056
+ if (ret != 0) {
3057
+ FARF(HIGH, "HMX matmul failed (ret=%d), falling back to HVX", ret);
3058
+ return op_matmul(octx);
3059
+ }
3060
+
3061
+ return 0;
3062
+ #endif // HTP_HAS_HMX
3063
+ }
3064
+
3065
+ int op_matmul_id(struct htp_ops_context * octx) {
3066
+ htp_matmul_tensors_preamble;
3067
+
3068
+ struct htp_matmul_context mmctx_struct = {0};
3069
+ struct htp_matmul_context * mmctx = &mmctx_struct;
3070
+ mmctx->octx = octx;
3071
+
3072
+ const struct htp_tensor * restrict ids = octx->src[2];
3073
+
3074
+ const size_t src0_row_size = nb01;
3075
+ const size_t dst_row_size = nb1;
3076
+
3077
+ const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128);
3078
+
3079
+ const uint32_t src0_nrows = ne01; // per expert
3080
+ const uint32_t src1_nrows = ne11 * ne12 * ne13;
3081
+
3082
+ worker_callback_t quant_job_func;
3083
+ worker_callback_t matmul_id_job_func = src1_nrows > 1 ? matmul_id : matvec_id;
3084
+
3085
+ // Compute src0_nrows_per_thread
3086
+ mmctx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads;
3087
+ mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even
3088
+
3089
+ size_t src1_row_size;
3090
+ size_t src1_row_size_padded;
3091
+
3092
+ // row groups
3093
+ const int n_ids = ids->ne[0]; // n_expert_used
3094
+ const int n_as = ne02; // n_expert
3095
+
3096
+ size_t matrix_row_counts_size = n_as * sizeof(uint32_t);
3097
+ size_t matrix_row_map_size = n_as * ids->ne[0] * ids->ne[1] * sizeof(struct mmid_row_mapping);
3098
+
3099
+ if (htp_mminit_vec_dot(mmctx, src0->type) != 0) {
3100
+ return HTP_STATUS_NO_SUPPORT;
3101
+ }
3102
+
3103
+ quant_job_func = quantize_f32_q8x4x2;
3104
+ src1_row_size = q8x4x2_row_size(ne10);
3105
+
3106
+ const size_t src2_spad_size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256);
3107
+ htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, src2_spad_size_per_thread);
3108
+
3109
+ size_t spad_size = octx->src2_spad.size + octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size;
3110
+
3111
+ FARF(HIGH, "matmul-id-%s : src0-spad-size %u src1-spad-size %u src2-spad-size %u dst-spad-size %u (%zu)\n", mmctx->type,
3112
+ octx->src0_spad.size, octx->src1_spad.size, octx->src2_spad.size, octx->dst_spad.size, spad_size);
3113
+
3114
+ FARF(HIGH, "matmul-id-%s : %ux%ux%ux%u * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", mmctx->type,
3115
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
3116
+ ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], src0->data,
3117
+ src1->data, dst->data);
3118
+
3119
+ // Make sure the reserved vtcm size is sufficient
3120
+ if (octx->ctx->vtcm_size < spad_size) {
3121
+ FARF(ERROR, "matmul-id-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type, octx->ctx->vtcm_size, spad_size);
3122
+ return HTP_STATUS_VTCM_TOO_SMALL;
3123
+ }
3124
+
3125
+ // Place src1 spad first. We use it for dyn.quant and may reuse in subseq ops.
3126
+ octx->src1_spad.data = octx->ctx->vtcm_base;
3127
+ octx->src0_spad.data = octx->src1_spad.data + octx->src1_spad.size;
3128
+ octx->src2_spad.data = octx->src0_spad.data + octx->src0_spad.size;
3129
+ octx->dst_spad.data = octx->src2_spad.data + octx->src2_spad.size;
3130
+
3131
+ octx->src1_spad.src = (src1 == octx->src1_spad.src) ? src1 : NULL;
3132
+ octx->src0_spad.src = NULL;
3133
+ octx->src2_spad.src = NULL;
3134
+ octx->dst_spad.src = NULL;
3135
+
3136
+ octx->src0_spad.stride = src0_row_size_padded;
3137
+ octx->src1_spad.stride = src1_row_size;
3138
+
3139
+ if (src1_nrows > 1) {
3140
+ // initialize matrix_row_counts and map
3141
+ uint32_t * matrix_row_counts = (uint32_t *) octx->src2_spad.data + 0;
3142
+ struct mmid_row_mapping * matrix_rows = (void *) octx->src2_spad.data + matrix_row_counts_size;
3143
+
3144
+ memset(matrix_row_counts, 0, n_as * sizeof(uint32_t));
3145
+
3146
+ // group rows by src0 matrix
3147
+ for (uint32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { // token idx
3148
+ for (uint32_t id = 0; id < n_ids; ++id) { // expert idx
3149
+ const uint32_t i02 = *(const uint32_t *) ((const uint8_t *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
3150
+
3151
+ assert(i02 >= 0 && i02 < n_as);
3152
+
3153
+ MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) { id, iid1 };
3154
+ matrix_row_counts[i02] += 1;
3155
+ }
3156
+ }
3157
+ }
3158
+
3159
+ if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)
3160
+ return HTP_STATUS_OK;
3161
+
3162
+ if (octx->src1_spad.src != src1) {
3163
+ const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads);
3164
+ mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs;
3165
+ worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs);
3166
+ octx->src1_spad.src = src1;
3167
+ }
3168
+
3169
+ const uint32_t n_matmul_jobs = octx->n_threads;
3170
+ worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs);
3171
+
3172
+ return HTP_STATUS_OK;
3173
+ }