toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,3178 @@
1
+ #include "rvv_kernels.h"
2
+
3
+ #include "common.h"
4
+ #include "ggml.h"
5
+ #include "ops.h"
6
+ #include "string.h"
7
+
8
+ #include <algorithm>
9
+ #include <cmath>
10
+ #include <cstdint>
11
+ #include <stdexcept>
12
+
13
+ #if !defined(__riscv_v) || !defined(__riscv_v_intrinsic)
14
+ # error "riscv v extension or v_intrinsic not enabled"
15
+ #else
16
+ # include <riscv_vector.h>
17
+ #endif
18
+
19
+ #if !defined(__riscv_zfh)
20
+ # error "riscv zfh extension not enabled"
21
+ #endif
22
+
23
+ #if defined(__GNUC__)
24
+ # pragma GCC diagnostic ignored "-Woverlength-strings"
25
+ # pragma GCC diagnostic ignored "-Wcast-qual"
26
+ # pragma GCC diagnostic ignored "-Wunused-parameter"
27
+ #endif
28
+
29
+ namespace spacemit_kernels::rvv {
30
+
31
+ namespace {
32
+
33
// Rounds `value` up to the next multiple of `alignment`.
// `alignment` must be non-zero, since a remainder by it is taken.
auto align_up(size_t value, size_t alignment) {
    const size_t bumped = value + alignment - 1;
    return bumped - bumped % alignment;
}
36
+
37
// Returns true when a single head dimension `d` lies in the range (0, 128]
// accepted by this flash-attention path.
static inline bool flash_attn_ext_supported_d_vlen1024_vf16(int64_t d) {
    if (d <= 0) {
        return false;
    }
    return d <= 128;
}
40
+
41
+ static inline bool flash_attn_ext_supported_shape_vlen1024_vf16(int64_t DK, int64_t DV) {
42
+ return flash_attn_ext_supported_d_vlen1024_vf16(DK) && flash_attn_ext_supported_d_vlen1024_vf16(DV);
43
+ }
44
+
45
+ static inline float reduce_sum_f32m4_vlen1024(vfloat32m4_t v, size_t vl) {
46
+ vfloat32m1_t s_v = __riscv_vfmv_v_f_f32m1(0.0f, 1);
47
+ s_v = __riscv_vfredusum_vs_f32m4_f32m1(v, s_v, vl);
48
+ return __riscv_vfmv_f_s_f32m1_f32(s_v);
49
+ }
50
+
51
+ static inline float reduce_sum_f32m2_vlen1024(vfloat32m2_t v, size_t vl) {
52
+ vfloat32m1_t s_v = __riscv_vfmv_v_f_f32m1(0.0f, 1);
53
+ s_v = __riscv_vfredusum_vs_f32m2_f32m1(v, s_v, vl);
54
+ return __riscv_vfmv_f_s_f32m1_f32(s_v);
55
+ }
56
+
57
+ // Adapted from ggml_v_expf_m2 in vec.h. This is accurate enough for softmax.
58
+ static inline vfloat32m2_t rvv_expf_approx_f32m2(vfloat32m2_t x, size_t vl) {
59
+ const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
60
+ const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
61
+ const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
62
+ const vfloat32m2_t b =
63
+ __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl), 0x1.7f7d1cp-20f, n, vl);
64
+ const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
65
+ const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl));
66
+ const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
67
+ const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
68
+ const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
69
+ __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
70
+ __riscv_vfmacc_vv_f32m2(
71
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
72
+ __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl), u, vl),
73
+ u, vl);
74
+
75
+ if (!__riscv_vcpop_m_b16(c, vl)) {
76
+ return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
77
+ }
78
+
79
+ const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
80
+ const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
81
+ const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
82
+ const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
83
+ const vfloat32m2_t r1 =
84
+ __riscv_vmerge_vvm_f32m2(__riscv_vfmacc_vv_f32m2(k, k, j, vl),
85
+ __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl), c, vl);
86
+ return __riscv_vmerge_vvm_f32m2(r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
87
+ __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl), vl);
88
+ }
89
+
90
+ static inline vfloat32m2_t rvv_tanh_approx_f32m2(vfloat32m2_t x, size_t vl) {
91
+ const vfloat32m2_t abs_x = __riscv_vfabs_v_f32m2(x, vl);
92
+ const vfloat32m2_t neg_2_abs = __riscv_vfmul_vf_f32m2(abs_x, -2.0f, vl);
93
+ const vfloat32m2_t exp_term = rvv_expf_approx_f32m2(neg_2_abs, vl);
94
+ const vfloat32m2_t numerator = __riscv_vfsub_vf_f32m2(exp_term, 1.0f, vl);
95
+ const vfloat32m2_t denominator = __riscv_vfadd_vf_f32m2(exp_term, 1.0f, vl);
96
+ const vfloat32m2_t tanh_abs = __riscv_vfneg_v_f32m2(__riscv_vfdiv_vv_f32m2(numerator, denominator, vl), vl);
97
+ const vbool16_t neg_mask = __riscv_vmflt_vf_f32m2_b16(x, 0.0f, vl);
98
+ const vfloat32m2_t tanh_neg = __riscv_vfneg_v_f32m2(tanh_abs, vl);
99
+ return __riscv_vmerge_vvm_f32m2(tanh_abs, tanh_neg, neg_mask, vl);
100
+ }
101
+
102
+ static void rvv_softcap_tanh_inplace_f32(float * dst, int64_t dst_stride, int64_t tile_rows, int64_t n, float softcap) {
103
+ for (int tq = 0; tq < tile_rows; ++tq, dst += dst_stride) {
104
+ float * dst_row = dst;
105
+ int64_t remaining = n;
106
+ while (remaining > 0) {
107
+ const size_t vl = __riscv_vsetvl_e32m2(remaining);
108
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(dst_row, vl);
109
+ v = rvv_tanh_approx_f32m2(v, vl);
110
+ v = __riscv_vfmul_vf_f32m2(v, softcap, vl);
111
+ __riscv_vse32_v_f32m2(dst_row, v, vl);
112
+ dst_row += vl;
113
+ remaining -= vl;
114
+ }
115
+ }
116
+ }
117
+
118
+ static inline float rvv_softmax_exp_inplace_f32(float * dst, int64_t n, float max_value) {
119
+ float row_sum = 0.0f;
120
+ while (n > 0) {
121
+ const size_t vl = __riscv_vsetvl_e32m2(n);
122
+ vfloat32m2_t v = __riscv_vle32_v_f32m2(dst, vl);
123
+ v = __riscv_vfsub_vf_f32m2(v, max_value, vl);
124
+ v = rvv_expf_approx_f32m2(v, vl);
125
+ __riscv_vse32_v_f32m2(dst, v, vl);
126
+ row_sum += reduce_sum_f32m2_vlen1024(v, vl);
127
+ dst += vl;
128
+ n -= vl;
129
+ }
130
+ return row_sum;
131
+ }
132
+
133
+ static inline float rvv_add_max_inplace_f32(float * dst, const float * src, int64_t n) {
134
+ float max_val = -INFINITY;
135
+ while (n > 0) {
136
+ const size_t vl = __riscv_vsetvl_e32m4(n);
137
+ vfloat32m4_t vdst = __riscv_vle32_v_f32m4(dst, vl);
138
+ vfloat32m4_t vsrc = __riscv_vle32_v_f32m4(src, vl);
139
+ vdst = __riscv_vfadd_vv_f32m4(vdst, vsrc, vl);
140
+ __riscv_vse32_v_f32m4(dst, vdst, vl);
141
+
142
+ vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(max_val, 1);
143
+ seed = __riscv_vfredmax_vs_f32m4_f32m1(vdst, seed, vl);
144
+ max_val = __riscv_vfmv_f_s_f32m1_f32(seed);
145
+
146
+ dst += vl;
147
+ src += vl;
148
+ n -= vl;
149
+ }
150
+ return max_val;
151
+ }
152
+
153
+ static inline float rvv_softcap_add_max_inplace_f32(float * dst, const float * src, int64_t n, float softcap) {
154
+ if (softcap == 0.0f) {
155
+ return rvv_add_max_inplace_f32(dst, src, n);
156
+ }
157
+
158
+ float max_val = -INFINITY;
159
+ while (n > 0) {
160
+ const size_t vl = __riscv_vsetvl_e32m2(n);
161
+ vfloat32m2_t vdst = __riscv_vle32_v_f32m2(dst, vl);
162
+ vfloat32m2_t vsrc = __riscv_vle32_v_f32m2(src, vl);
163
+ vdst = rvv_tanh_approx_f32m2(vdst, vl);
164
+ vdst = __riscv_vfmul_vf_f32m2(vdst, softcap, vl);
165
+ vdst = __riscv_vfadd_vv_f32m2(vdst, vsrc, vl);
166
+ __riscv_vse32_v_f32m2(dst, vdst, vl);
167
+
168
+ vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(max_val, 1);
169
+ seed = __riscv_vfredmax_vs_f32m2_f32m1(vdst, seed, vl);
170
+ max_val = __riscv_vfmv_f_s_f32m1_f32(seed);
171
+
172
+ dst += vl;
173
+ src += vl;
174
+ n -= vl;
175
+ }
176
+ return max_val;
177
+ }
178
+
179
+ static inline void rvv_zero_f32(float * dst, int64_t n) {
180
+ while (n > 0) {
181
+ const size_t vl = __riscv_vsetvl_e32m4(n);
182
+ const vfloat32m4_t z = __riscv_vfmv_v_f_f32m4(0.0f, vl);
183
+ __riscv_vse32_v_f32m4(dst, z, vl);
184
+ dst += vl;
185
+ n -= vl;
186
+ }
187
+ }
188
+
189
+ static inline void rvv_scale_f32(float * dst, float scale, int64_t n) {
190
+ while (n > 0) {
191
+ const size_t vl = __riscv_vsetvl_e32m4(n);
192
+ vfloat32m4_t v = __riscv_vle32_v_f32m4(dst, vl);
193
+ v = __riscv_vfmul_vf_f32m4(v, scale, vl);
194
+ __riscv_vse32_v_f32m4(dst, v, vl);
195
+ dst += vl;
196
+ n -= vl;
197
+ }
198
+ }
199
+
200
+ static inline void rvv_add_inplace_f32(float * dst,
201
+ int64_t dst_stride,
202
+ const float * src,
203
+ int64_t src_stride,
204
+ int64_t tile_rows,
205
+ int64_t n) {
206
+ for (int tq = 0; tq < tile_rows; ++tq, dst += dst_stride, src += src_stride) {
207
+ int64_t remaining = n;
208
+ float * dst_row = dst;
209
+ const float * src_row = src;
210
+ while (remaining > 0) {
211
+ const size_t vl = __riscv_vsetvl_e32m4(remaining);
212
+ vfloat32m4_t vdst = __riscv_vle32_v_f32m4(dst_row, vl);
213
+ vfloat32m4_t vsrc = __riscv_vle32_v_f32m4(src_row, vl);
214
+ vdst = __riscv_vfadd_vv_f32m4(vdst, vsrc, vl);
215
+ __riscv_vse32_v_f32m4(dst_row, vdst, vl);
216
+ dst_row += vl;
217
+ src_row += vl;
218
+ remaining -= vl;
219
+ }
220
+ }
221
+ }
222
+
223
+ static inline float rvv_max_f32(const float * src, int64_t n) {
224
+ float max_val = -INFINITY;
225
+ while (n > 0) {
226
+ const size_t vl = __riscv_vsetvl_e32m4(n);
227
+ const vfloat32m4_t v = __riscv_vle32_v_f32m4(src, vl);
228
+ vfloat32m1_t seed = __riscv_vfmv_v_f_f32m1(max_val, 1);
229
+ seed = __riscv_vfredmax_vs_f32m4_f32m1(v, seed, vl);
230
+ max_val = __riscv_vfmv_f_s_f32m1_f32(seed);
231
+ src += vl;
232
+ n -= vl;
233
+ }
234
+ return max_val;
235
+ }
236
+
237
+ static void rvv_pack_f32_as_scaled_f16(void * dst,
238
+ int64_t dst_row_stride,
239
+ const void * src,
240
+ int64_t src_row_stride,
241
+ int64_t tile_rows,
242
+ int64_t n,
243
+ float scale) {
244
+ for (int tq = 0; tq < tile_rows; ++tq) {
245
+ const float * row_ptr = (const float *) ((const char *) src + tq * src_row_stride);
246
+ _Float16 * dst_row_ptr = (_Float16 *) ((char *) dst + tq * dst_row_stride);
247
+ int64_t remaining = n;
248
+ while (remaining > 0) {
249
+ const size_t vl = __riscv_vsetvl_e32m4(remaining);
250
+ vfloat32m4_t v32 = __riscv_vle32_v_f32m4(row_ptr, vl);
251
+ v32 = __riscv_vfmul_vf_f32m4(v32, scale, vl);
252
+ const vfloat16m2_t v16 = __riscv_vfncvt_f_f_w_f16m2(v32, vl);
253
+ __riscv_vse16_v_f16m2(dst_row_ptr, v16, vl);
254
+ dst_row_ptr += vl;
255
+ row_ptr += vl;
256
+ remaining -= vl;
257
+ }
258
+ }
259
+ }
260
+
261
+ static void rvv_pack_scaled_f16_as_f32(void * dst,
262
+ int64_t dst_row_stride,
263
+ const void * src,
264
+ int64_t src_row_stride,
265
+ int64_t tile_rows,
266
+ int64_t n,
267
+ float scale) {
268
+ for (int tq = 0; tq < tile_rows; ++tq) {
269
+ const _Float16 * row_ptr = (const _Float16 *) ((const char *) src + tq * src_row_stride);
270
+ float * dst_row_ptr = (float *) ((char *) dst + tq * dst_row_stride);
271
+ int64_t remaining = n;
272
+ while (remaining > 0) {
273
+ const size_t vl = __riscv_vsetvl_e16m2(remaining);
274
+ const vfloat16m2_t v16 = __riscv_vle16_v_f16m2(row_ptr, vl);
275
+ vfloat32m4_t v32 = __riscv_vfwcvt_f_f_v_f32m4(v16, vl);
276
+ v32 = __riscv_vfmul_vf_f32m4(v32, scale, vl);
277
+ __riscv_vse32_v_f32m4(dst_row_ptr, v32, vl);
278
+ dst_row_ptr += vl;
279
+ row_ptr += vl;
280
+ remaining -= vl;
281
+ }
282
+ }
283
+ }
284
+
285
+ static void rvv_pack_scaled_f32_as_f32(void * dst,
286
+ int64_t dst_row_stride,
287
+ const void * src,
288
+ int64_t src_row_stride,
289
+ int64_t tile_rows,
290
+ int64_t n,
291
+ float * scale) {
292
+ for (int tq = 0; tq < tile_rows; ++tq) {
293
+ const float * row_ptr = (const float *) ((const char *) src + tq * src_row_stride);
294
+ float * dst_row_ptr = (float *) ((char *) dst + tq * dst_row_stride);
295
+ int64_t remaining = n;
296
+ while (remaining > 0) {
297
+ const size_t vl = __riscv_vsetvl_e32m4(remaining);
298
+ vfloat32m4_t v32 = __riscv_vle32_v_f32m4(row_ptr, vl);
299
+ v32 = __riscv_vfmul_vf_f32m4(v32, scale[tq], vl);
300
+ __riscv_vse32_v_f32m4(dst_row_ptr, v32, vl);
301
+ dst_row_ptr += vl;
302
+ row_ptr += vl;
303
+ remaining -= vl;
304
+ }
305
+ }
306
+ }
307
+
308
// Transposes an m x n matrix of 32-bit elements into an n x m matrix.
//
//   dst          - destination base; output row j starts at dst + j * n_dst_stride
//   n_dst_stride - byte distance between consecutive destination rows
//   src          - source base; input row i starts at src + i * m_src_stride
//   m_src_stride - byte distance between consecutive source rows
//   m, n         - source row count and column count
//
// Source rows are handled eight at a time: eight unit-stride loads fill v0..v7
// and one strided segment store (vssseg8e32) writes, for every column, the
// eight row values contiguously into the destination. The remaining m % 8 rows
// are handled one at a time with a strided store (vsse32).
//
// NOTE(review): the destination pointer advances by t3 = VLMAX * n_dst_stride
// per inner iteration, which assumes vsetvli grants VLMAX whenever more than
// VLMAX elements remain. The RVV specification permits a smaller vl when
// VLMAX < AVL < 2 * VLMAX; confirm the target core never does that.
static inline void rvv_transposed_s32_mn_to_nm(int8_t * dst,
                                               int64_t  n_dst_stride,
                                               int8_t * src,
                                               int64_t  m_src_stride,
                                               int64_t  m,
                                               int64_t  n) {
    // Local copies: the asm advances both pointers as it walks the rows.
    int8_t * in  = src;
    int8_t * out = dst;

    __asm__ volatile(
        "vsetvli t0, zero, e32, m1, tu, mu \n\t"
        "mul t3, t0, %[os0] \n\t"
        "srli t2, %[isz0], 3 \n\t"
        "blez t2, M1%= \n\t"

        "LOOP_M8%=: \n\t"
        "addi a1, %[dst], 0 \n\t"
        "addi s1, %[src], 0 \n\t"
        "add s2, %[src], %[is0] \n\t"
        "add s3, s2, %[is0] \n\t"
        "add s4, s3, %[is0] \n\t"
        "add s5, s4, %[is0] \n\t"
        "add s6, s5, %[is0] \n\t"
        "add s7, s6, %[is0] \n\t"
        "add s8, s7, %[is0] \n\t"
        "addi t1, %[isz1], 0 \n\t"

        "LOOP_M8N%=: \n\t"
        "vsetvli t0, t1, e32, m1, tu, mu \n\t"
        "sub t1, t1, t0 \n\t"
        "vle32.v v0, (s1) \n\t"
        "sh2add s1, t0, s1 \n\t"
        "vle32.v v1, (s2) \n\t"
        "sh2add s2, t0, s2 \n\t"
        "vle32.v v2, (s3) \n\t"
        "sh2add s3, t0, s3 \n\t"
        "vle32.v v3, (s4) \n\t"
        "sh2add s4, t0, s4 \n\t"
        "vle32.v v4, (s5) \n\t"
        "sh2add s5, t0, s5 \n\t"
        "vle32.v v5, (s6) \n\t"
        "sh2add s6, t0, s6 \n\t"
        "vle32.v v6, (s7) \n\t"
        "sh2add s7, t0, s7 \n\t"
        "vle32.v v7, (s8) \n\t"
        "sh2add s8, t0, s8 \n\t"
        "vssseg8e32.v v0, (a1), %[os0] \n\t"
        "add a1, a1, t3 \n\t"
        "bnez t1, LOOP_M8N%= \n\t"
        "sh3add %[src], %[is0], %[src] \n\t"
        "addi %[dst], %[dst], 32 \n\t"
        "addi t2, t2, -1 \n\t"
        "bnez t2, LOOP_M8%= \n\t"

        "M1%=: \n\t"
        "andi t2, %[isz0], 7 \n\t"
        "blez t2, END%= \n\t"

        "LOOP_M1%=: \n\t"
        "addi a1, %[dst], 0 \n\t"
        "addi s1, %[src], 0 \n\t"
        "addi t1, %[isz1], 0 \n\t"

        "LOOP_M1N%=: \n\t"
        "vsetvli t0, t1, e32, m1, tu, mu \n\t"
        "sub t1, t1, t0 \n\t"
        "vle32.v v0, (s1) \n\t"
        "sh2add s1, t0, s1 \n\t"
        "vsse32.v v0, (a1), %[os0] \n\t"
        "add a1, a1, t3 \n\t"
        "bnez t1, LOOP_M1N%= \n\t"
        "add %[src], %[is0], %[src] \n\t"
        "addi %[dst], %[dst], 4 \n\t"
        "addi t2, t2, -1 \n\t"
        "bnez t2, LOOP_M1%= \n\t"
        "END%=: \n\t"

        : [src] "+r"(in), [dst] "+r"(out), [isz0] "+r"(m)
        : [isz1] "r"(n), [is0] "r"(m_src_stride), [os0] "r"(n_dst_stride)
        // "memory": the asm loads from src and stores to dst through pointers
        // the compiler cannot see, so it must not cache or reorder memory
        // accesses across this statement.
        // v0..v7: written by the vector loads, so the compiler must not keep
        // live vector values in them.
        : "cc", "memory", "t0", "t1", "t2", "t3", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "a1",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
389
+
390
// Transposes an m x n matrix of 16-bit elements into an n x m matrix.
//
//   dst          - destination base; output row j starts at dst + j * n_dst_stride
//   n_dst_stride - byte distance between consecutive destination rows
//   src          - source base; input row i starts at src + i * m_src_stride
//   m_src_stride - byte distance between consecutive source rows
//   m, n         - source row count and column count
//
// Source rows are handled eight at a time: eight unit-stride loads fill v0..v7
// and one strided segment store (vssseg8e16) writes, for every column, the
// eight row values contiguously into the destination. The remaining m % 8 rows
// are handled one at a time with a strided store (vsse16).
//
// NOTE(review): the destination pointer advances by t3 = VLMAX * n_dst_stride
// per inner iteration, which assumes vsetvli grants VLMAX whenever more than
// VLMAX elements remain. The RVV specification permits a smaller vl when
// VLMAX < AVL < 2 * VLMAX; confirm the target core never does that.
static inline void rvv_transposed_s16_mn_to_nm(int8_t * dst,
                                               int64_t  n_dst_stride,
                                               int8_t * src,
                                               int64_t  m_src_stride,
                                               int64_t  m,
                                               int64_t  n) {
    // Local copies: the asm advances both pointers as it walks the rows.
    int8_t * in  = src;
    int8_t * out = dst;

    __asm__ volatile(
        "vsetvli t0, zero, e16, m1, tu, mu \n\t"
        "mul t3, t0, %[os0] \n\t"
        "srli t2, %[isz0], 3 \n\t"
        "blez t2, M1%= \n\t"

        "LOOP_M8%=: \n\t"
        "addi a1, %[dst], 0 \n\t"
        "addi s1, %[src], 0 \n\t"
        "add s2, %[src], %[is0] \n\t"
        "add s3, s2, %[is0] \n\t"
        "add s4, s3, %[is0] \n\t"
        "add s5, s4, %[is0] \n\t"
        "add s6, s5, %[is0] \n\t"
        "add s7, s6, %[is0] \n\t"
        "add s8, s7, %[is0] \n\t"
        "addi t1, %[isz1], 0 \n\t"

        "LOOP_M8N%=: \n\t"
        "vsetvli t0, t1, e16, m1, tu, mu \n\t"
        "sub t1, t1, t0 \n\t"
        "vle16.v v0, (s1) \n\t"
        "sh1add s1, t0, s1 \n\t"
        "vle16.v v1, (s2) \n\t"
        "sh1add s2, t0, s2 \n\t"
        "vle16.v v2, (s3) \n\t"
        "sh1add s3, t0, s3 \n\t"
        "vle16.v v3, (s4) \n\t"
        "sh1add s4, t0, s4 \n\t"
        "vle16.v v4, (s5) \n\t"
        "sh1add s5, t0, s5 \n\t"
        "vle16.v v5, (s6) \n\t"
        "sh1add s6, t0, s6 \n\t"
        "vle16.v v6, (s7) \n\t"
        "sh1add s7, t0, s7 \n\t"
        "vle16.v v7, (s8) \n\t"
        "sh1add s8, t0, s8 \n\t"
        "vssseg8e16.v v0, (a1), %[os0] \n\t"
        "add a1, a1, t3 \n\t"
        "bnez t1, LOOP_M8N%= \n\t"
        "sh3add %[src], %[is0], %[src] \n\t"
        "addi %[dst], %[dst], 16 \n\t"
        "addi t2, t2, -1 \n\t"
        "bnez t2, LOOP_M8%= \n\t"

        "M1%=: \n\t"
        "andi t2, %[isz0], 7 \n\t"
        "blez t2, END%= \n\t"

        "LOOP_M1%=: \n\t"
        "addi a1, %[dst], 0 \n\t"
        "addi s1, %[src], 0 \n\t"
        "addi t1, %[isz1], 0 \n\t"

        "LOOP_M1N%=: \n\t"
        "vsetvli t0, t1, e16, m1, tu, mu \n\t"
        "sub t1, t1, t0 \n\t"
        "vle16.v v0, (s1) \n\t"
        "sh1add s1, t0, s1 \n\t"
        "vsse16.v v0, (a1), %[os0] \n\t"
        "add a1, a1, t3 \n\t"
        "bnez t1, LOOP_M1N%= \n\t"
        "add %[src], %[is0], %[src] \n\t"
        "addi %[dst], %[dst], 2 \n\t"
        "addi t2, t2, -1 \n\t"
        "bnez t2, LOOP_M1%= \n\t"
        "END%=: \n\t"

        : [src] "+r"(in), [dst] "+r"(out), [isz0] "+r"(m)
        : [isz1] "r"(n), [is0] "r"(m_src_stride), [os0] "r"(n_dst_stride)
        // "memory": the asm loads from src and stores to dst through pointers
        // the compiler cannot see, so it must not cache or reorder memory
        // accesses across this statement.
        // v0..v7: written by the vector loads, so the compiler must not keep
        // live vector values in them.
        : "cc", "memory", "t0", "t1", "t2", "t3", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "a1",
          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
471
+
472
+ static inline void rvv_qk_dot_tile_f16_x1(float * dst,
473
+ const _Float16 * q_row,
474
+ const _Float16 * k_pack,
475
+ int64_t dk,
476
+ int64_t kv_tile) {
477
+ const size_t vl = __riscv_vsetvl_e16m1(kv_tile);
478
+ vfloat32m2_t acc = __riscv_vfmv_v_f_f32m2(0.0f, vl);
479
+
480
+ for (int64_t d = 0; d < dk; ++d) {
481
+ const vfloat16m1_t k_vec = __riscv_vle16_v_f16m1(k_pack + d * ggml_fa_tile_config::KV, vl);
482
+ acc = __riscv_vfwmacc_vf_f32m2(acc, q_row[d], k_vec, vl);
483
+ }
484
+
485
+ __riscv_vse32_v_f32m2(dst, acc, vl);
486
+ }
487
+
488
+ static inline void rvv_qk_dot_tile_f16_x4(float * dst0,
489
+ float * dst1,
490
+ float * dst2,
491
+ float * dst3,
492
+ const _Float16 * q0,
493
+ const _Float16 * q1,
494
+ const _Float16 * q2,
495
+ const _Float16 * q3,
496
+ const _Float16 * k_pack,
497
+ int64_t dk,
498
+ int64_t kv_tile) {
499
+ const size_t vl = __riscv_vsetvl_e16m1(kv_tile);
500
+ vfloat32m2_t acc0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
501
+ vfloat32m2_t acc1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
502
+ vfloat32m2_t acc2 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
503
+ vfloat32m2_t acc3 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
504
+
505
+ for (int64_t d = 0; d < dk; ++d) {
506
+ const vfloat16m1_t k_vec = __riscv_vle16_v_f16m1(k_pack + d * ggml_fa_tile_config::KV, vl);
507
+ acc0 = __riscv_vfwmacc_vf_f32m2(acc0, q0[d], k_vec, vl);
508
+ acc1 = __riscv_vfwmacc_vf_f32m2(acc1, q1[d], k_vec, vl);
509
+ acc2 = __riscv_vfwmacc_vf_f32m2(acc2, q2[d], k_vec, vl);
510
+ acc3 = __riscv_vfwmacc_vf_f32m2(acc3, q3[d], k_vec, vl);
511
+ }
512
+
513
+ __riscv_vse32_v_f32m2(dst0, acc0, vl);
514
+ __riscv_vse32_v_f32m2(dst1, acc1, vl);
515
+ __riscv_vse32_v_f32m2(dst2, acc2, vl);
516
+ __riscv_vse32_v_f32m2(dst3, acc3, vl);
517
+ }
518
+
519
+ static inline void rvv_pv_accumulate_f16_x1(float * dst,
520
+ const float * prob,
521
+ const _Float16 * v_pack,
522
+ int64_t kv_tile,
523
+ int64_t dv) {
524
+ int64_t d_left = dv;
525
+ int64_t d_off = 0;
526
+
527
+ while (d_left > 0) {
528
+ const size_t vl = __riscv_vsetvl_e16m2(d_left);
529
+ vfloat32m4_t acc = __riscv_vle32_v_f32m4(dst + d_off, vl);
530
+
531
+ for (int64_t tk = 0; tk < kv_tile; ++tk) {
532
+ const vfloat16m2_t v16 = __riscv_vle16_v_f16m2(v_pack + tk * dv + d_off, vl);
533
+ const vfloat32m4_t v32 = __riscv_vfwcvt_f_f_v_f32m4(v16, vl);
534
+ acc = __riscv_vfmacc_vf_f32m4(acc, prob[tk], v32, vl);
535
+ }
536
+
537
+ __riscv_vse32_v_f32m4(dst + d_off, acc, vl);
538
+ d_left -= vl;
539
+ d_off += vl;
540
+ }
541
+ }
542
+
543
+ static inline void rvv_pv_accumulate_f16_x4(float * dst0,
544
+ float * dst1,
545
+ float * dst2,
546
+ float * dst3,
547
+ const float * prob0,
548
+ const float * prob1,
549
+ const float * prob2,
550
+ const float * prob3,
551
+ const _Float16 * v_pack,
552
+ int64_t kv_tile,
553
+ int64_t dv) {
554
+ int64_t d_left = dv;
555
+ int64_t d_off = 0;
556
+
557
+ while (d_left > 0) {
558
+ const size_t vl = __riscv_vsetvl_e16m2(d_left);
559
+ vfloat32m4_t acc0 = __riscv_vle32_v_f32m4(dst0 + d_off, vl);
560
+ vfloat32m4_t acc1 = __riscv_vle32_v_f32m4(dst1 + d_off, vl);
561
+ vfloat32m4_t acc2 = __riscv_vle32_v_f32m4(dst2 + d_off, vl);
562
+ vfloat32m4_t acc3 = __riscv_vle32_v_f32m4(dst3 + d_off, vl);
563
+
564
+ for (int64_t tk = 0; tk < kv_tile; ++tk) {
565
+ const vfloat16m2_t v16 = __riscv_vle16_v_f16m2(v_pack + tk * dv + d_off, vl);
566
+ const vfloat32m4_t v32 = __riscv_vfwcvt_f_f_v_f32m4(v16, vl);
567
+ acc0 = __riscv_vfmacc_vf_f32m4(acc0, prob0[tk], v32, vl);
568
+ acc1 = __riscv_vfmacc_vf_f32m4(acc1, prob1[tk], v32, vl);
569
+ acc2 = __riscv_vfmacc_vf_f32m4(acc2, prob2[tk], v32, vl);
570
+ acc3 = __riscv_vfmacc_vf_f32m4(acc3, prob3[tk], v32, vl);
571
+ }
572
+
573
+ __riscv_vse32_v_f32m4(dst0 + d_off, acc0, vl);
574
+ __riscv_vse32_v_f32m4(dst1 + d_off, acc1, vl);
575
+ __riscv_vse32_v_f32m4(dst2 + d_off, acc2, vl);
576
+ __riscv_vse32_v_f32m4(dst3 + d_off, acc3, vl);
577
+ d_left -= vl;
578
+ d_off += vl;
579
+ }
580
+ }
581
+
582
+ static inline void rvv_qk_dot_tile(float * dst,
583
+ const float * q_row,
584
+ const float * k_pack,
585
+ int64_t dk,
586
+ int64_t kv_tile,
587
+ float scale) {
588
+ const size_t vl = __riscv_vsetvl_e32m4(kv_tile);
589
+ vfloat32m4_t acc = __riscv_vfmv_v_f_f32m4(0.0f, vl);
590
+
591
+ for (int64_t d = 0; d < dk; ++d) {
592
+ const vfloat32m4_t k_vec = __riscv_vle32_v_f32m4(k_pack + d * kv_tile, vl);
593
+ acc = __riscv_vfmacc_vf_f32m4(acc, q_row[d] * scale, k_vec, vl);
594
+ }
595
+
596
+ __riscv_vse32_v_f32m4(dst, acc, vl);
597
+ }
598
+
599
+ static inline void rvv_pv_accumulate(float * dst,
600
+ const float * prob,
601
+ const float * v_pack,
602
+ int64_t kv_tile,
603
+ int64_t dv) {
604
+ int64_t d_left = dv;
605
+ int64_t d_off = 0;
606
+
607
+ while (d_left > 0) {
608
+ const size_t vl = __riscv_vsetvl_e32m4(d_left);
609
+ vfloat32m4_t acc = __riscv_vle32_v_f32m4(dst + d_off, vl);
610
+
611
+ for (int64_t tk = 0; tk < kv_tile; ++tk) {
612
+ const vfloat32m4_t v_vec = __riscv_vle32_v_f32m4(v_pack + tk * dv + d_off, vl);
613
+ acc = __riscv_vfmacc_vf_f32m4(acc, prob[tk], v_vec, vl);
614
+ }
615
+
616
+ __riscv_vse32_v_f32m4(dst + d_off, acc, vl);
617
+ d_left -= vl;
618
+ d_off += vl;
619
+ }
620
+ }
621
+
622
+ static void permute_transpose_impl(const ggml_tensor * src0,
623
+ ggml_tensor * dst,
624
+ int64_t batch,
625
+ int64_t m,
626
+ int64_t n,
627
+ int64_t batch_stride,
628
+ int64_t m_src_stride,
629
+ int64_t n_src_stride,
630
+ int64_t n_dst_stride,
631
+ int ith,
632
+ int nth) {
633
+ GGML_ASSERT(n_src_stride == sizeof(int32_t) || n_src_stride == sizeof(int16_t));
634
+
635
+ if (n_src_stride == sizeof(int32_t)) {
636
+ for (int64_t bi = ith; bi < batch; bi += nth) {
637
+ rvv_transposed_s32_mn_to_nm((int8_t *) ((char *) dst->data + bi * batch_stride), n_dst_stride,
638
+ (int8_t *) ((char *) src0->data + bi * batch_stride), m_src_stride, m, n);
639
+ }
640
+ } else if (n_src_stride == sizeof(int16_t)) {
641
+ for (int64_t bi = ith; bi < batch; bi += nth) {
642
+ rvv_transposed_s32_mn_to_nm((int8_t *) ((char *) dst->data + bi * batch_stride), n_dst_stride,
643
+ (int8_t *) ((char *) src0->data + bi * batch_stride), m_src_stride, m, n);
644
+ }
645
+ } else {
646
+ GGML_ABORT("not implemented");
647
+ }
648
+ }
649
+
650
+ template <size_t QLEN>
651
+ static void flash_attn_ext_f16_one_chunk_inner_vlen1024_vf16_mrow(float ** pq,
652
+ const char * k_data_row,
653
+ const char * v_data_row,
654
+ const ggml_fp16_t * mp,
655
+ float ** sinks,
656
+ float ** dst,
657
+ float scale,
658
+ float logit_softcap,
659
+ float slope,
660
+ int64_t nek1,
661
+ int64_t nbk1,
662
+ int64_t nbv1,
663
+ int64_t DV,
664
+ int64_t DK,
665
+ void * tcm_buffer,
666
+ size_t tcm_buffer_size) {
667
+ GGML_ASSERT(flash_attn_ext_supported_shape_vlen1024_vf16(DK, DV));
668
+ float S[QLEN] = { 0.0f }; // sum
669
+ float M[QLEN] = { -INFINITY }; // maximum KQ value
670
+
671
+ _Float16 * kq16_buffer = (_Float16 *) tcm_buffer;
672
+ _Float16 * qv_buffer = kq16_buffer + QLEN * DV;
673
+ const size_t qkv_temp_buffer_size = (QLEN * DV + QLEN * DK) * sizeof(_Float16);
674
+ char * kv_tile_buffer = (char *) (qv_buffer + QLEN * DK);
675
+
676
+ {
677
+ vfloat16m2_t VKQ16_v = __riscv_vfmv_v_f_f16m2(0.0f, DV);
678
+ for (int64_t i = 0; i < QLEN; ++i) {
679
+ __riscv_vse16_v_f16m2(kq16_buffer + i * DV, VKQ16_v, DV);
680
+ vfloat16m2_t Q_q_v = __riscv_vfncvt_f_f_w_f16m2(__riscv_vle32_v_f32m4(pq[i], DK), DK);
681
+ __riscv_vse16_v_f16m2(qv_buffer + i * DK, Q_q_v, DK);
682
+ }
683
+ }
684
+
685
+ const uintptr_t scratch_addr = reinterpret_cast<uintptr_t>(kv_tile_buffer);
686
+ const size_t scratch_size = tcm_buffer_size > qkv_temp_buffer_size ? tcm_buffer_size - qkv_temp_buffer_size : 0;
687
+ const uintptr_t kq_tile_addr = align_up(scratch_addr, alignof(float));
688
+ const size_t scratch_prefix = kq_tile_addr - scratch_addr;
689
+ const size_t packed_tile_size =
690
+ QLEN * sizeof(float) + DK * sizeof(_Float16) + DV * sizeof(_Float16) + sizeof(float);
691
+ const int64_t max_ic_tile_step = ((int64_t) __riscv_vsetvlmax_e16m1()) & ~((int64_t) 7);
692
+ const int64_t max_fit_by_tcm =
693
+ scratch_size > scratch_prefix ? (int64_t) ((scratch_size - scratch_prefix) / packed_tile_size) : 0;
694
+ const int64_t ic_tile_step = std::min(max_ic_tile_step, max_fit_by_tcm) & ~((int64_t) 7);
695
+
696
+ const uintptr_t k_tile_addr = kq_tile_addr + QLEN * ic_tile_step * sizeof(float);
697
+ const uintptr_t v_tile_addr = k_tile_addr + DK * ic_tile_step * sizeof(_Float16);
698
+ const uintptr_t mv_tile_addr = v_tile_addr + ic_tile_step * DV * sizeof(_Float16);
699
+
700
+ if (ic_tile_step >= 8) {
701
+ float * kq_tile_buffer = reinterpret_cast<float *>(kq_tile_addr);
702
+ _Float16 * k_tile_pack = reinterpret_cast<_Float16 *>(k_tile_addr);
703
+ _Float16 * v_tile_pack = reinterpret_cast<_Float16 *>(v_tile_addr);
704
+ float * mv_tile_pack = reinterpret_cast<float *>(mv_tile_addr);
705
+
706
+ const int64_t k_tile_byte_stride = ic_tile_step * (int64_t) sizeof(_Float16);
707
+
708
+ int64_t ic_step = 0;
709
+ for (int64_t ic = 0; ic < nek1; ++ic) {
710
+ const float mv = mp ? slope * ((_Float16 *) mp)[ic] : 0.0f;
711
+
712
+ if (mv != -INFINITY) {
713
+ const _Float16 * k_data = (const _Float16 *) (k_data_row + ic * nbk1);
714
+ const _Float16 * v_data = (const _Float16 *) (v_data_row + ic * nbv1);
715
+
716
+ const vfloat16m2_t k_data_v = __riscv_vle16_v_f16m2(k_data, DK);
717
+ const vfloat16m2_t v_data_v = __riscv_vle16_v_f16m2(v_data, DV);
718
+ __riscv_vsse16_v_f16m2(k_tile_pack + ic_step, k_tile_byte_stride, k_data_v, DK);
719
+ __riscv_vse16_v_f16m2(v_tile_pack + ic_step * DV, v_data_v, DV);
720
+ mv_tile_pack[ic_step] = mv;
721
+ ic_step++;
722
+ }
723
+
724
+ if (ic_step > 0 && (ic_step == ic_tile_step || ic == (nek1 - 1))) {
725
+ if constexpr (QLEN == 4) {
726
+ const size_t qk_vl = __riscv_vsetvl_e16m1(ic_step);
727
+ vfloat32m2_t qk_acc0 = __riscv_vfmv_v_f_f32m2(0.0f, qk_vl);
728
+ vfloat32m2_t qk_acc1 = __riscv_vfmv_v_f_f32m2(0.0f, qk_vl);
729
+ vfloat32m2_t qk_acc2 = __riscv_vfmv_v_f_f32m2(0.0f, qk_vl);
730
+ vfloat32m2_t qk_acc3 = __riscv_vfmv_v_f_f32m2(0.0f, qk_vl);
731
+
732
+ for (int64_t d = 0; d < DK; ++d) {
733
+ const vfloat16m1_t k_vec = __riscv_vle16_v_f16m1(k_tile_pack + d * ic_tile_step, qk_vl);
734
+ qk_acc0 = __riscv_vfwmacc_vf_f32m2(qk_acc0, qv_buffer[0 * DK + d], k_vec, qk_vl);
735
+ qk_acc1 = __riscv_vfwmacc_vf_f32m2(qk_acc1, qv_buffer[1 * DK + d], k_vec, qk_vl);
736
+ qk_acc2 = __riscv_vfwmacc_vf_f32m2(qk_acc2, qv_buffer[2 * DK + d], k_vec, qk_vl);
737
+ qk_acc3 = __riscv_vfwmacc_vf_f32m2(qk_acc3, qv_buffer[3 * DK + d], k_vec, qk_vl);
738
+ }
739
+
740
+ qk_acc0 = __riscv_vfmul_vf_f32m2(qk_acc0, scale, qk_vl);
741
+ qk_acc1 = __riscv_vfmul_vf_f32m2(qk_acc1, scale, qk_vl);
742
+ qk_acc2 = __riscv_vfmul_vf_f32m2(qk_acc2, scale, qk_vl);
743
+ qk_acc3 = __riscv_vfmul_vf_f32m2(qk_acc3, scale, qk_vl);
744
+
745
+ __riscv_vse32_v_f32m2(kq_tile_buffer + 0 * ic_tile_step, qk_acc0, qk_vl);
746
+ __riscv_vse32_v_f32m2(kq_tile_buffer + 1 * ic_tile_step, qk_acc1, qk_vl);
747
+ __riscv_vse32_v_f32m2(kq_tile_buffer + 2 * ic_tile_step, qk_acc2, qk_vl);
748
+ __riscv_vse32_v_f32m2(kq_tile_buffer + 3 * ic_tile_step, qk_acc3, qk_vl);
749
+ } else {
750
+ static_assert(QLEN == 2, "unsupported QLEN");
751
+
752
+ const size_t qk_vl = __riscv_vsetvl_e16m1(ic_step);
753
+ vfloat32m2_t qk_acc0 = __riscv_vfmv_v_f_f32m2(0.0f, qk_vl);
754
+ vfloat32m2_t qk_acc1 = __riscv_vfmv_v_f_f32m2(0.0f, qk_vl);
755
+
756
+ for (int64_t d = 0; d < DK; ++d) {
757
+ const vfloat16m1_t k_vec = __riscv_vle16_v_f16m1(k_tile_pack + d * ic_tile_step, qk_vl);
758
+ qk_acc0 = __riscv_vfwmacc_vf_f32m2(qk_acc0, qv_buffer[0 * DK + d], k_vec, qk_vl);
759
+ qk_acc1 = __riscv_vfwmacc_vf_f32m2(qk_acc1, qv_buffer[1 * DK + d], k_vec, qk_vl);
760
+ }
761
+
762
+ qk_acc0 = __riscv_vfmul_vf_f32m2(qk_acc0, scale, qk_vl);
763
+ qk_acc1 = __riscv_vfmul_vf_f32m2(qk_acc1, scale, qk_vl);
764
+
765
+ __riscv_vse32_v_f32m2(kq_tile_buffer + 0 * ic_tile_step, qk_acc0, qk_vl);
766
+ __riscv_vse32_v_f32m2(kq_tile_buffer + 1 * ic_tile_step, qk_acc1, qk_vl);
767
+ }
768
+
769
+ for (int i = 0; i < QLEN; ++i) {
770
+ float * row_ptr = kq_tile_buffer + i * ic_tile_step;
771
+ const float tile_max =
772
+ rvv_softcap_add_max_inplace_f32(row_ptr, mv_tile_pack, ic_step, logit_softcap);
773
+
774
+ const float Mold = M[i];
775
+
776
+ if (tile_max > Mold) {
777
+ const float ms = expf(Mold - tile_max);
778
+ M[i] = tile_max;
779
+ S[i] *= ms;
780
+
781
+ vfloat16m2_t VKQ16_v = __riscv_vle16_v_f16m2(kq16_buffer + i * DV, DV);
782
+ VKQ16_v = __riscv_vfmul_vf_f16m2(VKQ16_v, (_Float16) ms, DV);
783
+ __riscv_vse16_v_f16m2(kq16_buffer + i * DV, VKQ16_v, DV);
784
+ }
785
+
786
+ S[i] += rvv_softmax_exp_inplace_f32(row_ptr, ic_step, M[i]);
787
+ }
788
+
789
+ if constexpr (QLEN == 4) {
790
+ vfloat16m2_t pv_acc0 = __riscv_vle16_v_f16m2(kq16_buffer + 0 * DV, DV);
791
+ vfloat16m2_t pv_acc1 = __riscv_vle16_v_f16m2(kq16_buffer + 1 * DV, DV);
792
+ vfloat16m2_t pv_acc2 = __riscv_vle16_v_f16m2(kq16_buffer + 2 * DV, DV);
793
+ vfloat16m2_t pv_acc3 = __riscv_vle16_v_f16m2(kq16_buffer + 3 * DV, DV);
794
+
795
+ for (int64_t tk = 0; tk < ic_step; ++tk) {
796
+ const vfloat16m2_t v16 = __riscv_vle16_v_f16m2(v_tile_pack + tk * DV, DV);
797
+ pv_acc0 =
798
+ __riscv_vfmacc_vf_f16m2(pv_acc0, (_Float16) kq_tile_buffer[0 * ic_tile_step + tk], v16, DV);
799
+ pv_acc1 =
800
+ __riscv_vfmacc_vf_f16m2(pv_acc1, (_Float16) kq_tile_buffer[1 * ic_tile_step + tk], v16, DV);
801
+ pv_acc2 =
802
+ __riscv_vfmacc_vf_f16m2(pv_acc2, (_Float16) kq_tile_buffer[2 * ic_tile_step + tk], v16, DV);
803
+ pv_acc3 =
804
+ __riscv_vfmacc_vf_f16m2(pv_acc3, (_Float16) kq_tile_buffer[3 * ic_tile_step + tk], v16, DV);
805
+ }
806
+
807
+ __riscv_vse16_v_f16m2(kq16_buffer + 0 * DV, pv_acc0, DV);
808
+ __riscv_vse16_v_f16m2(kq16_buffer + 1 * DV, pv_acc1, DV);
809
+ __riscv_vse16_v_f16m2(kq16_buffer + 2 * DV, pv_acc2, DV);
810
+ __riscv_vse16_v_f16m2(kq16_buffer + 3 * DV, pv_acc3, DV);
811
+ } else {
812
+ static_assert(QLEN == 2, "unsupported QLEN");
813
+ vfloat16m2_t pv_acc0 = __riscv_vle16_v_f16m2(kq16_buffer + 0 * DV, DV);
814
+ vfloat16m2_t pv_acc1 = __riscv_vle16_v_f16m2(kq16_buffer + 1 * DV, DV);
815
+
816
+ for (int64_t tk = 0; tk < ic_step; ++tk) {
817
+ const vfloat16m2_t v16 = __riscv_vle16_v_f16m2(v_tile_pack + tk * DV, DV);
818
+ pv_acc0 =
819
+ __riscv_vfmacc_vf_f16m2(pv_acc0, (_Float16) kq_tile_buffer[0 * ic_tile_step + tk], v16, DV);
820
+ pv_acc1 =
821
+ __riscv_vfmacc_vf_f16m2(pv_acc1, (_Float16) kq_tile_buffer[1 * ic_tile_step + tk], v16, DV);
822
+ }
823
+
824
+ __riscv_vse16_v_f16m2(kq16_buffer + 0 * DV, pv_acc0, DV);
825
+ __riscv_vse16_v_f16m2(kq16_buffer + 1 * DV, pv_acc1, DV);
826
+ }
827
+
828
+ ic_step = 0;
829
+ }
830
+ }
831
+ } else {
832
+ for (int64_t ic = 0; ic < nek1; ++ic) {
833
+ const float mv = mp ? slope * ((_Float16 *) mp)[ic] : 0.0f;
834
+
835
+ const char * k_data = k_data_row + ic * nbk1;
836
+ const char * v_data = v_data_row + ic * nbv1;
837
+
838
+ vfloat16m2_t k_data_v;
839
+ vfloat16m2_t v_data_v;
840
+
841
+ if (mv != -INFINITY) {
842
+ k_data_v = __riscv_vle16_v_f16m2((_Float16 *) k_data, DK);
843
+ v_data_v = __riscv_vle16_v_f16m2((_Float16 *) v_data, DV);
844
+ } else {
845
+ continue;
846
+ }
847
+
848
+ for (int i = 0; i < QLEN; ++i) {
849
+ vfloat16m2_t Q_q_v = __riscv_vle16_v_f16m2(qv_buffer + i * DK, DK);
850
+ vfloat32m4_t qk_acc_v = __riscv_vfwmul_vv_f32m4(k_data_v, Q_q_v, DK);
851
+ float s = reduce_sum_f32m4_vlen1024(qk_acc_v, DK);
852
+ s = s * scale;
853
+ if (logit_softcap != 0.0f) {
854
+ s = logit_softcap * tanhf(s);
855
+ }
856
+ s += mv;
857
+
858
+ const float Mold = M[i];
859
+
860
+ float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
861
+ float vs = 1.0f; // post-softmax KQ value, expf(s - M)
862
+
863
+ vfloat16m2_t VKQ16_v = __riscv_vle16_v_f16m2(kq16_buffer + i * DV, DV);
864
+ if (s > M[i]) {
865
+ // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
866
+ M[i] = s;
867
+ ms = expf(Mold - M[i]);
868
+
869
+ // V = V*expf(Mold - M)
870
+ VKQ16_v = __riscv_vfmul_vf_f16m2(VKQ16_v, ms, DV);
871
+ } else {
872
+ // no new maximum, ms == 1.0f, vs != 1.0f
873
+ vs = expf(s - M[i]);
874
+ }
875
+ VKQ16_v = __riscv_vfmacc_vf_f16m2(VKQ16_v, vs, v_data_v, DV);
876
+ __riscv_vse16_v_f16m2(kq16_buffer + i * DV, VKQ16_v, DV);
877
+ S[i] = S[i] * ms + vs; // scale and increment sum with partial sum
878
+ }
879
+ }
880
+ }
881
+
882
+ for (int i = 0; i < QLEN; ++i) {
883
+ vfloat16m2_t VKQ16_v = __riscv_vle16_v_f16m2(kq16_buffer + i * DV, DV);
884
+ vfloat32m4_t VKQ32_v = __riscv_vfwcvt_f_f_v_f32m4(VKQ16_v, DV);
885
+
886
+ // sinks
887
+ if (sinks[i]) {
888
+ const float s = *(sinks[i]);
889
+
890
+ float ms = 1.0f;
891
+ float vs = 1.0f;
892
+
893
+ if (s > M[i]) {
894
+ ms = expf(M[i] - s);
895
+ M[i] = s;
896
+ VKQ32_v = __riscv_vfmul_vf_f32m4(VKQ32_v, ms, DV);
897
+ } else {
898
+ vs = expf(s - M[i]);
899
+ }
900
+
901
+ S[i] = S[i] * ms + vs;
902
+ }
903
+
904
+ // V /= S
905
+ const float S_inv = S[i] == 0.0f ? 0.0f : 1.0f / S[i];
906
+
907
+ VKQ32_v = __riscv_vfmul_vf_f32m4(VKQ32_v, S_inv, DV);
908
+
909
+ __riscv_vse32_v_f32m4(dst[i], VKQ32_v, DV);
910
+ }
911
+ }
912
+
913
+ static void flash_attn_ext_f16_one_chunk_inner_vlen1024_vf16_m1(const float * pq,
914
+ const char * k_data_row,
915
+ const char * v_data_row,
916
+ const ggml_fp16_t * mp,
917
+ const float * sinks,
918
+ float * dst,
919
+ float scale,
920
+ float logit_softcap,
921
+ float slope,
922
+ int64_t nek1,
923
+ int64_t nbk1,
924
+ int64_t nbv1,
925
+ int64_t DV,
926
+ int64_t DK) {
927
+ GGML_ASSERT(flash_attn_ext_supported_shape_vlen1024_vf16(DK, DV));
928
+
929
+ float S = 0.0f; // sum
930
+ float M = -INFINITY; // maximum KQ value
931
+
932
+ vfloat16m2_t VKQ16_v = __riscv_vfmv_v_f_f16m2(0.0f, DV);
933
+
934
+ vfloat16m2_t Q_q_v = __riscv_vfncvt_f_f_w_f16m2(__riscv_vle32_v_f32m4(pq, DK), DK);
935
+
936
+ for (int64_t ic = 0; ic < nek1; ++ic) {
937
+ const float mv = mp ? slope * ((_Float16 *) mp)[ic] : 0.0f;
938
+ if (mv == -INFINITY) {
939
+ continue;
940
+ }
941
+
942
+ const char * k_data = k_data_row + ic * nbk1;
943
+
944
+ vfloat16m2_t k_data_v = __riscv_vle16_v_f16m2((_Float16 *) k_data, DK);
945
+
946
+ vfloat32m4_t qk_acc_v = __riscv_vfwmul_vv_f32m4(k_data_v, Q_q_v, DK);
947
+ float s = reduce_sum_f32m4_vlen1024(qk_acc_v, DK);
948
+
949
+ s = s * scale; // scale KQ value
950
+
951
+ if (logit_softcap != 0.0f) {
952
+ s = logit_softcap * tanhf(s);
953
+ }
954
+
955
+ s += mv; // apply mask
956
+
957
+ const float Mold = M;
958
+
959
+ float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
960
+ float vs = 1.0f; // post-softmax KQ value, expf(s - M)
961
+
962
+ const char * v_data = v_data_row + ic * nbv1;
963
+
964
+ vfloat16m2_t v_data_v = __riscv_vle16_v_f16m2((_Float16 *) v_data, DV);
965
+
966
+ if (s > M) {
967
+ // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
968
+ M = s;
969
+ ms = expf(Mold - M);
970
+
971
+ // V = V*expf(Mold - M)
972
+ VKQ16_v = __riscv_vfmul_vf_f16m2(VKQ16_v, ms, DV);
973
+ } else {
974
+ // no new maximum, ms == 1.0f, vs != 1.0f
975
+ vs = expf(s - M);
976
+ }
977
+
978
+ VKQ16_v = __riscv_vfmacc_vf_f16m2(VKQ16_v, vs, v_data_v, DV);
979
+
980
+ S = S * ms + vs; // scale and increment sum with partial sum
981
+ }
982
+
983
+ vfloat32m4_t VKQ32_v = __riscv_vfwcvt_f_f_v_f32m4(VKQ16_v, DV);
984
+
985
+ // sinks
986
+ if (sinks) {
987
+ const float s = *sinks;
988
+
989
+ float ms = 1.0f;
990
+ float vs = 1.0f;
991
+
992
+ if (s > M) {
993
+ ms = expf(M - s);
994
+ M = s;
995
+ VKQ32_v = __riscv_vfmul_vf_f32m4(VKQ32_v, ms, DV);
996
+ } else {
997
+ vs = expf(s - M);
998
+ }
999
+
1000
+ S = S * ms + vs;
1001
+ }
1002
+
1003
+ // V /= S
1004
+ const float S_inv = S == 0.0f ? 0.0f : 1.0f / S;
1005
+
1006
+ VKQ32_v = __riscv_vfmul_vf_f32m4(VKQ32_v, S_inv, DV);
1007
+
1008
+ __riscv_vse32_v_f32m4(dst, VKQ32_v, DV);
1009
+ }
1010
+
1011
+ } // namespace
1012
+
1013
+ void memcpy1d(void * dst, const void * src, int64_t size) {
1014
+ size_t byte_size_all = size;
1015
+ size_t vlen = __riscv_vlenb() * 8;
1016
+ if (vlen == 256) {
1017
+ // 1024 bytes
1018
+ __asm__ volatile(
1019
+ //
1020
+ "srli t0, %[size], 10 \n\t"
1021
+ "blez t0, memcpy_tail%= \n\t"
1022
+ "vsetvli t1, x0, e8, m8, tu, mu \n\t"
1023
+ "memcpy_main_loop%=: \n\t"
1024
+ "addi t0, t0, -1 \n\t"
1025
+ "vle8.v v0, (%[s]) \n\t"
1026
+ "addi %[s], %[s], 256 \n\t"
1027
+ "vle8.v v8, (%[s]) \n\t"
1028
+ "addi %[s], %[s], 256 \n\t"
1029
+ "vle8.v v16, (%[s]) \n\t"
1030
+ "addi %[s], %[s], 256 \n\t"
1031
+ "vle8.v v24, (%[s]) \n\t"
1032
+ "addi %[s], %[s], 256 \n\t"
1033
+ //
1034
+ "vse8.v v0, (%[d]) \n\t"
1035
+ "addi %[d], %[d], 256 \n\t"
1036
+ "vse8.v v8, (%[d]) \n\t"
1037
+ "addi %[d], %[d], 256 \n\t"
1038
+ "vse8.v v16, (%[d]) \n\t"
1039
+ "addi %[d], %[d], 256 \n\t"
1040
+ "vse8.v v24, (%[d]) \n\t"
1041
+ "addi %[d], %[d], 256 \n\t"
1042
+ //
1043
+ "bnez t0, memcpy_main_loop%= \n\t"
1044
+ "memcpy_tail%=: \n\t"
1045
+ "andi t1, %[size], 1023 \n\t"
1046
+ "blez t1, out%= \n\t"
1047
+ "memcpy_tail_loop%=: \n\t"
1048
+ "vsetvli t0, t1, e8, m8, tu, mu \n\t"
1049
+ "sub t1, t1, t0 \n\t"
1050
+ "vle8.v v0, (%[s]) \n\t"
1051
+ "add %[s], %[s], t0 \n\t"
1052
+ "vse8.v v0, (%[d]) \n\t"
1053
+ "add %[d], %[d], t0 \n\t"
1054
+ "bnez t1, memcpy_tail_loop%= \n\t"
1055
+ "out%=: \n\t"
1056
+ : [s] "+r"(src), [d] "+r"(dst)
1057
+ : [size] "r"(byte_size_all)
1058
+ : "cc", "t0", "t1");
1059
+ } else if (vlen == 1024) {
1060
+ // 2048 bytes
1061
+ __asm__ volatile(
1062
+ //
1063
+ "srli t0, %[size], 11 \n\t"
1064
+ "blez t0, memcpy_tail%= \n\t"
1065
+ "vsetvli t1, x0, e8, m8, tu, mu \n\t"
1066
+ "addi t2, %[s], 1024 \n\t"
1067
+ "addi t3, %[d], 1024 \n\t"
1068
+ "li t5, 2048 \n\t"
1069
+ "memcpy_main_loop%=: \n\t"
1070
+ "addi t0, t0, -1 \n\t"
1071
+ "vle8.v v0, (%[s]) \n\t"
1072
+ "add %[s], %[s], t5 \n\t"
1073
+ "vle8.v v8, (t2) \n\t"
1074
+ "add t2, t2, t5 \n\t"
1075
+ //
1076
+ "vse8.v v0, (%[d]) \n\t"
1077
+ "add %[d], %[d], t5 \n\t"
1078
+ "vse8.v v8, (t3) \n\t"
1079
+ "add t3, t3, t5 \n\t"
1080
+ //
1081
+ "bnez t0, memcpy_main_loop%= \n\t"
1082
+ "memcpy_tail%=: \n\t"
1083
+ "andi t1, %[size], 2047 \n\t"
1084
+ "blez t1, out%= \n\t"
1085
+ "memcpy_tail_loop%=: \n\t"
1086
+ "vsetvli t0, t1, e8, m2, tu, mu \n\t"
1087
+ "sub t1, t1, t0 \n\t"
1088
+ "vle8.v v0, (%[s]) \n\t"
1089
+ "add %[s], %[s], t0 \n\t"
1090
+ "vse8.v v0, (%[d]) \n\t"
1091
+ "add %[d], %[d], t0 \n\t"
1092
+ "bnez t1, memcpy_tail_loop%= \n\t"
1093
+ "out%=: \n\t"
1094
+ : [s] "+r"(src), [d] "+r"(dst)
1095
+ : [size] "r"(byte_size_all)
1096
+ : "cc", "t0", "t1", "t2", "t3", "t5");
1097
+ } else {
1098
+ __asm__ volatile(
1099
+ //
1100
+ "add t1, %[size], zero \n\t"
1101
+ "memcpy_tail_loop%=: \n\t"
1102
+ "vsetvli t0, t1, e8, m8, tu, mu \n\t"
1103
+ "sub t1, t1, t0 \n\t"
1104
+ "vle8.v v0, (%[s]) \n\t"
1105
+ "add %[s], %[s], t0 \n\t"
1106
+ "vse8.v v0, (%[d]) \n\t"
1107
+ "add %[d], %[d], t0 \n\t"
1108
+ "bnez t1, memcpy_tail_loop%= \n\t"
1109
+ : [s] "+r"(src), [d] "+r"(dst)
1110
+ : [size] "r"(byte_size_all)
1111
+ : "cc", "t0", "t1", "t2", "t4", "t3");
1112
+ }
1113
+ }
1114
+
1115
+ void memcpy2d(void * dst, int64_t dst_stride, const void * src, int64_t src_stride, int64_t tile_rows, int64_t size) {
1116
+ for (int64_t i = 0; i < tile_rows; ++i) {
1117
+ memcpy1d((char *) dst + i * dst_stride, (const char *) src + i * src_stride, size);
1118
+ }
1119
+ }
1120
+
1121
+ void forward_flash_attn_ext_f16_one_chunk_vlen1024_vf16(const ggml_compute_params * params,
1122
+ ggml_tensor * dst,
1123
+ int ir0,
1124
+ int ir1,
1125
+ void * tcm_buffer,
1126
+ size_t tcm_buffer_size) {
1127
+ const ggml_tensor * q = dst->src[0];
1128
+ const ggml_tensor * k = dst->src[1];
1129
+ const ggml_tensor * v = dst->src[2];
1130
+ const ggml_tensor * mask = dst->src[3];
1131
+ const ggml_tensor * sinks = dst->src[4];
1132
+
1133
+ GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
1134
+ GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
1135
+ GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
1136
+ GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
1137
+ GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
1138
+ GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
1139
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
1140
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
1141
+
1142
+ const int64_t DK = nek0;
1143
+ const int64_t DV = nev0;
1144
+ const int64_t N = neq1;
1145
+
1146
+ GGML_ASSERT(flash_attn_ext_supported_shape_vlen1024_vf16(DK, DV));
1147
+
1148
+ // broadcast factors
1149
+ const int64_t rk2 = neq2 / nek2;
1150
+ const int64_t rk3 = neq3 / nek3;
1151
+
1152
+ const int64_t rv2 = neq2 / nev2;
1153
+ const int64_t rv3 = neq3 / nev3;
1154
+
1155
+ // parallelize by q rows using ggml_vec_dot_f32
1156
+
1157
+ float scale = *((float *) dst->op_params + 0);
1158
+ float max_bias = *((float *) dst->op_params + 1);
1159
+ float logit_softcap = *((float *) dst->op_params + 2);
1160
+
1161
+ if (logit_softcap != 0) {
1162
+ scale /= logit_softcap;
1163
+ }
1164
+
1165
+ const uint32_t n_head = neq2;
1166
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
1167
+
1168
+ const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
1169
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
1170
+
1171
+ const int KV_row_size = DK * sizeof(_Float16) + DV * sizeof(_Float16);
1172
+
1173
+ int ith = params->ith;
1174
+ int ir_step = 1;
1175
+ for (int ir = ir0; ir < ir1; ir += ir_step) {
1176
+ // q indices
1177
+ const int iq3 = ir / (neq2 * neq1);
1178
+ const int iq2 = (ir - iq3 * neq2 * neq1) / neq1;
1179
+ const int iq1 = (ir - iq3 * neq2 * neq1 - iq2 * neq1);
1180
+
1181
+ const int iq3_1 = (ir + 1) / (neq2 * neq1);
1182
+ const int iq2_1 = (ir + 1 - iq3_1 * neq2 * neq1) / neq1;
1183
+ const int iq1_1 = (ir + 1 - iq3_1 * neq2 * neq1 - iq2_1 * neq1);
1184
+
1185
+ const int iq3_2 = (ir + 2) / (neq2 * neq1);
1186
+ const int iq2_2 = (ir + 2 - iq3_2 * neq2 * neq1) / neq1;
1187
+ const int iq1_2 = (ir + 2 - iq3_2 * neq2 * neq1 - iq2_2 * neq1);
1188
+
1189
+ const int iq3_3 = (ir + 3) / (neq2 * neq1);
1190
+ const int iq2_3 = (ir + 3 - iq3_3 * neq2 * neq1) / neq1;
1191
+ const int iq1_3 = (ir + 3 - iq3_3 * neq2 * neq1 - iq2_3 * neq1);
1192
+
1193
+ const uint32_t h = iq2; // head index
1194
+ const float slope =
1195
+ (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
1196
+
1197
+ const ggml_fp16_t * mp =
1198
+ mask ? (ggml_fp16_t *) ((char *) mask->data + iq1 * mask->nb[1] + (iq2 % mask->ne[2]) * mask->nb[2] +
1199
+ (iq3 % mask->ne[3]) * mask->nb[3]) :
1200
+ NULL;
1201
+
1202
+ const bool mp_equal_2 = iq1_1 == iq1 && (iq2 % mask->ne[2]) == (iq2_1 % mask->ne[2]) &&
1203
+ (iq3 % mask->ne[3]) == (iq3_1 % mask->ne[3]);
1204
+
1205
+ const bool mp_equal_4 = mp_equal_2 && iq1_2 == iq1 && (iq2 % mask->ne[2]) == (iq2_2 % mask->ne[2]) &&
1206
+ (iq3 % mask->ne[3]) == (iq3_2 % mask->ne[3]) && iq1_3 == iq1 &&
1207
+ (iq2 % mask->ne[2]) == (iq2_3 % mask->ne[2]) &&
1208
+ (iq3 % mask->ne[3]) == (iq3_3 % mask->ne[3]);
1209
+
1210
+ // k indices
1211
+ const int ik3 = iq3 / rk3;
1212
+ const int ik2 = iq2 / rk2;
1213
+
1214
+ const int ik3_1 = iq3_1 / rk3;
1215
+ const int ik2_1 = iq2_1 / rk2;
1216
+
1217
+ const int ik3_2 = iq3_2 / rk3;
1218
+ const int ik2_2 = iq2_2 / rk2;
1219
+
1220
+ const int ik3_3 = iq3_3 / rk3;
1221
+ const int ik2_3 = iq2_3 / rk2;
1222
+
1223
+ // v indices
1224
+ const int iv3 = iq3 / rv3;
1225
+ const int iv2 = iq2 / rv2;
1226
+
1227
+ const int iv3_1 = iq3_1 / rv3;
1228
+ const int iv2_1 = iq2_1 / rv2;
1229
+
1230
+ const int iv3_2 = iq3_2 / rv3;
1231
+ const int iv2_2 = iq2_2 / rv2;
1232
+
1233
+ const int iv3_3 = iq3_3 / rv3;
1234
+ const int iv2_3 = iq2_3 / rv2;
1235
+
1236
+ const float * pq = (const float *) ((char *) q->data + (iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3));
1237
+
1238
+ std::array<float *, 4> pq_buffer;
1239
+ std::array<float *, 4> sinks_buffer;
1240
+ std::array<float *, 4> dst_buffer;
1241
+
1242
+ if (tcm_buffer != nullptr && 4 * KV_row_size < tcm_buffer_size && ir < (ir1 - 3) && mp_equal_4 &&
1243
+ ik3_3 == ik3 && ik2_3 == ik2 && iv3_3 == iv3 && iv2_3 == iv2 && ik3_2 == ik3 && ik2_2 == ik2 &&
1244
+ iv3_2 == iv3 && iv2_2 == iv2 && ik3_1 == ik3 && ik2_1 == ik2 && iv3_1 == iv3 && iv2_1 == iv2) {
1245
+ ir_step = 4;
1246
+
1247
+ pq_buffer[0] = (float *) ((char *) q->data + (iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3));
1248
+ pq_buffer[1] = (float *) ((char *) q->data + (iq1_1 * nbq1 + iq2_1 * nbq2 + iq3_1 * nbq3));
1249
+ pq_buffer[2] = (float *) ((char *) q->data + (iq1_2 * nbq1 + iq2_2 * nbq2 + iq3_2 * nbq3));
1250
+ pq_buffer[3] = (float *) ((char *) q->data + (iq1_3 * nbq1 + iq2_3 * nbq2 + iq3_3 * nbq3));
1251
+
1252
+ sinks_buffer[0] = sinks ? ((float *) ((char *) sinks->data)) + iq2 : nullptr;
1253
+ sinks_buffer[1] = sinks ? ((float *) ((char *) sinks->data)) + iq2_1 : nullptr;
1254
+ sinks_buffer[2] = sinks ? ((float *) ((char *) sinks->data)) + iq2_2 : nullptr;
1255
+ sinks_buffer[3] = sinks ? ((float *) ((char *) sinks->data)) + iq2_3 : nullptr;
1256
+
1257
+ dst_buffer[0] = (float *) ((char *) dst->data + (iq3 * ne2 * ne1 + iq2 + iq1 * ne1) * nb1);
1258
+ dst_buffer[1] = (float *) ((char *) dst->data + (iq3_1 * ne2 * ne1 + iq2_1 + iq1_1 * ne1) * nb1);
1259
+ dst_buffer[2] = (float *) ((char *) dst->data + (iq3_2 * ne2 * ne1 + iq2_2 + iq1_2 * ne1) * nb1);
1260
+ dst_buffer[3] = (float *) ((char *) dst->data + (iq3_3 * ne2 * ne1 + iq2_3 + iq1_3 * ne1) * nb1);
1261
+
1262
+ flash_attn_ext_f16_one_chunk_inner_vlen1024_vf16_mrow<4>( //
1263
+ pq_buffer.data(), //
1264
+ (const char *) k->data + (ik2 * nbk2 + ik3 * nbk3), //
1265
+ (const char *) v->data + (iv2 * nbv2 + iv3 * nbv3), //
1266
+ mp, //
1267
+ sinks_buffer.data(), //
1268
+ dst_buffer.data(), //
1269
+ scale, logit_softcap, slope, nek1, nbk1, nbv1, DV, DK, tcm_buffer, tcm_buffer_size);
1270
+ } else if (tcm_buffer != nullptr && 2 * KV_row_size < tcm_buffer_size && ir < (ir1 - 1) && mp_equal_2 &&
1271
+ ik3_1 == ik3 && ik2_1 == ik2 && iv3_1 == iv3 && iv2_1 == iv2) {
1272
+ ir_step = 2;
1273
+
1274
+ pq_buffer[0] = (float *) ((char *) q->data + (iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3));
1275
+ pq_buffer[1] = (float *) ((char *) q->data + (iq1_1 * nbq1 + iq2_1 * nbq2 + iq3_1 * nbq3));
1276
+
1277
+ sinks_buffer[0] = sinks ? ((float *) ((char *) sinks->data)) + iq2 : nullptr;
1278
+ sinks_buffer[1] = sinks ? ((float *) ((char *) sinks->data)) + iq2_1 : nullptr;
1279
+
1280
+ dst_buffer[0] = (float *) ((char *) dst->data + (iq3 * ne2 * ne1 + iq2 + iq1 * ne1) * nb1);
1281
+ dst_buffer[1] = (float *) ((char *) dst->data + (iq3_1 * ne2 * ne1 + iq2_1 + iq1_1 * ne1) * nb1);
1282
+
1283
+ flash_attn_ext_f16_one_chunk_inner_vlen1024_vf16_mrow<2>( //
1284
+ pq_buffer.data(), //
1285
+ (const char *) k->data + (ik2 * nbk2 + ik3 * nbk3), //
1286
+ (const char *) v->data + (iv2 * nbv2 + iv3 * nbv3), //
1287
+ mp, //
1288
+ sinks_buffer.data(), //
1289
+ dst_buffer.data(), //
1290
+ scale, logit_softcap, slope, nek1, nbk1, nbv1, DV, DK, tcm_buffer, tcm_buffer_size);
1291
+ } else {
1292
+ ir_step = 1;
1293
+ flash_attn_ext_f16_one_chunk_inner_vlen1024_vf16_m1( //
1294
+ pq, //
1295
+ (const char *) k->data + (ik2 * nbk2 + ik3 * nbk3), //
1296
+ (const char *) v->data + (iv2 * nbv2 + iv3 * nbv3), //
1297
+ mp, //
1298
+ sinks ? ((float *) ((char *) sinks->data)) + h : nullptr, //
1299
+ (float *) ((char *) dst->data + (iq3 * ne2 * ne1 + iq2 + iq1 * ne1) * nb1), //
1300
+ scale, logit_softcap, slope, nek1, nbk1, nbv1, DV, DK);
1301
+ }
1302
+ }
1303
+ }
1304
+
1305
+ void forward_flash_attn_ext_f16_tiled_vlen1024_vf16(const ggml_compute_params * params, // Tiled attention forward for flattened query rows [ir0, ir1): streams K/V in KV_TILE_SZ chunks, keeps a running max/sum per query row, and writes normalized rows to dst.
1306
+ ggml_tensor * dst, // output tensor; src[0..4] = q, k, v, mask (may be null), sinks (may be null)
1307
+ int ir0, // first flattened query row handled by this call (inclusive)
1308
+ int ir1, // end of the flattened query row range (exclusive)
1309
+ void * tcm_buffer, // optional scratch; replaces the params->wdata slice when non-null and large enough
1310
+ size_t tcm_buffer_size) { // capacity of tcm_buffer, compared against base_size below
1311
+ const ggml_tensor * q = dst->src[0];
1312
+ const ggml_tensor * k = dst->src[1];
1313
+ const ggml_tensor * v = dst->src[2];
1314
+ const ggml_tensor * mask = dst->src[3];
1315
+ const ggml_tensor * sinks = dst->src[4];
1316
+
1317
+ GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
1318
+ GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
1319
+ GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
1320
+ GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
1321
+ GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
1322
+ GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
1323
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
1324
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
1325
+
1326
+ const int64_t DK = nek0; // elements per K row (must equal neq0, asserted below)
1327
+ const int64_t DV = nev0; // elements per V row (must equal ne0 of dst, asserted below)
1328
+ const int64_t N = neq1; // query rows per head
1329
+
1330
+ GGML_ASSERT(flash_attn_ext_supported_shape_vlen1024_vf16(DK, DV));
1331
+
1332
+ GGML_ASSERT(ne0 == DV);
1333
+ GGML_ASSERT(ne2 == N);
1334
+
1335
+ // input tensor rows must be contiguous
1336
+ GGML_ASSERT(nbq0 == ggml_type_size(q->type));
1337
+ GGML_ASSERT(nbk0 == ggml_type_size(k->type));
1338
+ GGML_ASSERT(nbv0 == ggml_type_size(v->type));
1339
+
1340
+ GGML_ASSERT(neq0 == DK);
1341
+ GGML_ASSERT(nek0 == DK);
1342
+ GGML_ASSERT(nev0 == DV);
1343
+
1344
+ GGML_ASSERT(neq1 == N);
1345
+
1346
+ // dst cannot be transposed or permuted
1347
+ GGML_ASSERT(nb0 == sizeof(float));
1348
+ GGML_ASSERT(nb0 <= nb1);
1349
+ GGML_ASSERT(nb1 <= nb2);
1350
+ GGML_ASSERT(nb2 <= nb3);
1351
+
1352
+ GGML_ASSERT(k->type == v->type);
1353
+ const ggml_type kv_type = k->type; // F16 selects the half-precision kernels; any other type is read as f32 below
1354
+
1355
+ // broadcast factors
1356
+ const int64_t rk2 = neq2 / nek2;
1357
+ const int64_t rk3 = neq3 / nek3;
1358
+
1359
+ const int64_t rv2 = neq2 / nev2;
1360
+ const int64_t rv3 = neq3 / nev3;
1361
+
1362
+ float * param_list = (float *) dst->op_params; // first three op_params are read as floats: scale, max_bias, logit_softcap
1363
+ float scale = param_list[0];
1364
+ float max_bias = param_list[1];
1365
+ float logit_softcap = param_list[2];
1366
+
1367
+ if (logit_softcap != 0) {
1368
+ scale /= logit_softcap; // logits are produced pre-divided by the cap; presumably rvv_softcap_tanh_inplace_f32 multiplies it back - TODO confirm
1369
+ }
1370
+
1371
+ const uint32_t n_head = neq2;
1372
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); // largest power of two <= n_head
1373
+
1374
+ const float m0 = powf(2.0f, -(max_bias) / n_head_log2); // slope base for heads below n_head_log2
1375
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); // slope base for the remaining heads
1376
+
1377
+ int ith = params->ith;
1378
+
1379
+ static constexpr int Q_TILE_SZ = ggml_fa_tile_config::Q;
1380
+ static constexpr int KV_TILE_SZ = ggml_fa_tile_config::KV;
1381
+
1382
+ // Per-thread scratch layout:
1383
+ // Q_f32: Q_TILE_SZ * DK
1384
+ // KQ: Q_TILE_SZ * KV_TILE_SZ
1385
+ // mask32: Q_TILE_SZ * KV_TILE_SZ
1386
+ // VKQ32: Q_TILE_SZ * DV
1387
+ // V32: KV_TILE_SZ * DV
1388
+ // K_f32: DK * KV_TILE_SZ (transposed K tile)
1389
+ float * base = (float *) params->wdata + ith * (Q_TILE_SZ * DK + 2 * Q_TILE_SZ * KV_TILE_SZ + Q_TILE_SZ * DV +
1390
+ KV_TILE_SZ * DV + KV_TILE_SZ * DK + CACHE_LINE_SIZE_F32); // offset is in floats: each thread gets its own slice of wdata
1391
+ const size_t base_size =
1392
+ (Q_TILE_SZ * DK + 2 * Q_TILE_SZ * KV_TILE_SZ + Q_TILE_SZ * DV + KV_TILE_SZ * DV + KV_TILE_SZ * DK) *
1393
+ sizeof(float) +
1394
+ CACHE_LINE_SIZE_F32; // NOTE(review): added to a byte count here but used as a float count in the offset above - confirm intended units
1395
+
1396
+ if (base_size <= tcm_buffer_size && tcm_buffer != nullptr) {
1397
+ base = (float *) tcm_buffer; // no ith offset is applied here; presumably tcm_buffer is already per-thread - verify against caller
1398
+ }
1399
+
1400
+ float S_M_Buf[Q_TILE_SZ * 2]; // holds S (running softmax sum) and M (running max) for every row of one Q tile
1401
+ float * S = S_M_Buf;
1402
+ float * M = S_M_Buf + Q_TILE_SZ;
1403
+
1404
+ int ir = ir0;
1405
+ while (ir < ir1) {
1406
+ // q indices for the start of this tile
1407
+ const int iq3 = ir / (neq2 * neq1);
1408
+ const int iq2 = (ir - iq3 * neq2 * neq1) / neq1;
1409
+ const int iq1 = (ir - iq3 * neq2 * neq1 - iq2 * neq1);
1410
+
1411
+ // Number of valid rows in this tile:
1412
+ // - limited by tile size (Q_TILE_SZ)
1413
+ // - limited by chunk boundary (ir1 - ir)
1414
+ // - limited by head boundary (neq1 - iq1) to avoid crossing into next head
1415
+ const int tile_rows = MIN(Q_TILE_SZ, MIN((int) (ir1 - ir), (int) (neq1 - iq1)));
1416
+ GGML_ASSERT(tile_rows > 0);
1417
+
1418
+ const uint32_t h = iq2; // head index
1419
+ const float slope =
1420
+ (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; // per-head mask multiplier; 1 when max_bias is not positive
1421
+
1422
+ for (int i = 0; i < Q_TILE_SZ; ++i) { // reset the running softmax state for every row of the new tile
1423
+ S[i] = 0.;
1424
+ M[i] = -INFINITY;
1425
+ }
1426
+
1427
+ float * Q_f32 = base;
1428
+ float * KQ = (float *) ((char *) base + Q_TILE_SZ * DK * sizeof(float));
1429
+ float * mask32 = KQ + Q_TILE_SZ * KV_TILE_SZ;
1430
+ float * VKQ32 = mask32 + Q_TILE_SZ * KV_TILE_SZ;
1431
+ float * V32 = VKQ32 + Q_TILE_SZ * DV;
1432
+ float * K_f32 = V32 + KV_TILE_SZ * DV;
1433
+ _Float16 * Q_f16 = (_Float16 *) Q_f32; // the f16 views alias the f32 regions; only one representation is used per call
1434
+ _Float16 * V_f16 = (_Float16 *) V32;
1435
+ _Float16 * K_f16 = (_Float16 *) K_f32;
1436
+
1437
+ rvv_zero_f32(VKQ32, Q_TILE_SZ * DV); // output accumulator starts at zero for each Q tile
1438
+
1439
+ // k indices
1440
+ const int ik3 = iq3 / rk3;
1441
+ const int ik2 = iq2 / rk2;
1442
+
1443
+ // v indices
1444
+ const int iv3 = iq3 / rv3;
1445
+ const int iv2 = iq2 / rv2;
1446
+
1447
+ const float * pq = (const float *) ((char *) q->data + (iq1 * nbq1 + iq2 * nbq2 + iq3 * nbq3));
1448
+ if (kv_type == GGML_TYPE_F16) {
1449
+ rvv_pack_f32_as_scaled_f16((uint8_t *) Q_f16, DK * sizeof(_Float16), (uint8_t *) pq, nbq1, tile_rows, DK,
1450
+ scale); // scale is passed at pack time in this path; the f16 dot kernels below take no scale argument
1451
+ } else {
1452
+ memcpy2d(Q_f32, DK * sizeof(float), pq, nbq1, tile_rows, DK * sizeof(float)); // f32 path keeps Q unscaled; scale is passed to rvv_qk_dot_tile
1453
+ }
1454
+
1455
+ for (int64_t ic = 0; ic < nek1; ic += KV_TILE_SZ) {
1456
+ const int kv_tile = (int) std::min((int64_t) KV_TILE_SZ, nek1 - ic); // valid K/V rows in this tile (the last tile may be partial)
1457
+
1458
+ rvv_zero_f32(K_f32, DK * KV_TILE_SZ); // zero first so the padded tail of a partial tile holds zeros
1459
+ rvv_zero_f32(V32, KV_TILE_SZ * DV);
1460
+
1461
+ // skip the tile entirely if all the masks are -inf
1462
+ if (mask) {
1463
+ bool can_skip = true;
1464
+ const ggml_fp16_t * mp_row =
1465
+ (const ggml_fp16_t *) ((const char *) mask->data + iq1 * mask->nb[1] +
1466
+ (iq2 % mask->ne[2]) * mask->nb[2] + (iq3 % mask->ne[3]) * mask->nb[3]); // mask dims 2 and 3 wrap (modulo) over the q dims
1467
+ rvv_pack_scaled_f16_as_f32(mask32, KV_TILE_SZ * sizeof(float), mp_row + ic, mask->nb[1], tile_rows,
1468
+ kv_tile, slope);
1469
+
1470
+ for (int tq = 0; tq < tile_rows; tq++) {
1471
+ for (int tk = 0; tk < kv_tile; tk++) {
1472
+ if (mask32[tq * KV_TILE_SZ + tk] != -INFINITY) {
1473
+ can_skip = false;
1474
+ }
1475
+ }
1476
+ // Pad remaining mask entries with -inf
1477
+ for (int tk = kv_tile; tk < KV_TILE_SZ; tk++) {
1478
+ mask32[tq * KV_TILE_SZ + tk] = -INFINITY;
1479
+ }
1480
+ }
1481
+
1482
+ if (can_skip) {
1483
+ continue;
1484
+ }
1485
+ }
1486
+
1487
+ if (kv_type == GGML_TYPE_F16) {
1488
+ rvv_transposed_s16_mn_to_nm((int8_t *) K_f16, KV_TILE_SZ * sizeof(_Float16),
1489
+ (int8_t *) k->data + ic * nbk1 + ik2 * nbk2 + ik3 * nbk3, nbk1, kv_tile,
1490
+ DK);
1491
+
1492
+ int tq = 0;
1493
+ for (; tq + 3 < tile_rows; tq += 4) { // four query rows per call while at least four remain
1494
+ rvv_qk_dot_tile_f16_x4(KQ + (tq + 0) * KV_TILE_SZ, KQ + (tq + 1) * KV_TILE_SZ,
1495
+ KQ + (tq + 2) * KV_TILE_SZ, KQ + (tq + 3) * KV_TILE_SZ,
1496
+ Q_f16 + (tq + 0) * DK, Q_f16 + (tq + 1) * DK, Q_f16 + (tq + 2) * DK,
1497
+ Q_f16 + (tq + 3) * DK, K_f16, DK, kv_tile);
1498
+ }
1499
+ for (; tq < tile_rows; ++tq) { // leftover rows one at a time
1500
+ rvv_qk_dot_tile_f16_x1(KQ + tq * KV_TILE_SZ, Q_f16 + tq * DK, K_f16, DK, kv_tile);
1501
+ }
1502
+ } else {
1503
+ for (int tk = 0; tk < kv_tile; tk++) { // scalar transpose: K row (ic + tk) becomes column tk of K_f32
1504
+ const char * k_data = (const char *) k->data + (ic + tk) * nbk1 + ik2 * nbk2 + ik3 * nbk3;
1505
+ float * k_col = K_f32 + tk;
1506
+ const float * k_src = (const float *) k_data;
1507
+ for (int64_t dk = 0; dk < DK; ++dk) {
1508
+ k_col[dk * KV_TILE_SZ] = k_src[dk];
1509
+ }
1510
+ }
1511
+
1512
+ for (int tq = 0; tq < tile_rows; ++tq) {
1513
+ rvv_qk_dot_tile(KQ + tq * KV_TILE_SZ, Q_f32 + tq * DK, K_f32, DK, KV_TILE_SZ, scale);
1514
+ }
1515
+ }
1516
+
1517
+ // Set padded KQ entries to -inf so softmax gives them zero weight
1518
+ if (kv_tile < KV_TILE_SZ) {
1519
+ for (int tq = 0; tq < tile_rows; tq++) {
1520
+ for (int tk = kv_tile; tk < KV_TILE_SZ; tk++) {
1521
+ KQ[tq * KV_TILE_SZ + tk] = -INFINITY;
1522
+ }
1523
+ }
1524
+ }
1525
+
1526
+ if (logit_softcap != 0.0f) {
1527
+ rvv_softcap_tanh_inplace_f32(KQ, KV_TILE_SZ, tile_rows, KV_TILE_SZ, logit_softcap);
1528
+ }
1529
+
1530
+ if (mask) {
1531
+ rvv_add_inplace_f32(KQ, KV_TILE_SZ, mask32, KV_TILE_SZ, tile_rows, KV_TILE_SZ); // mask32 was already multiplied by slope when packed
1532
+ }
1533
+
1534
+ bool skip[Q_TILE_SZ] = {}; // rows whose logits in this KV tile are all -inf contribute nothing
1535
+
1536
+ for (int tq = 0; tq < tile_rows; tq++) {
1537
+ float * kq_row = KQ + tq * KV_TILE_SZ;
1538
+
1539
+ const float tile_max = rvv_max_f32(kq_row, KV_TILE_SZ);
1540
+
1541
+ if (tile_max == -INFINITY) {
1542
+ skip[tq] = true;
1543
+ continue;
1544
+ }
1545
+
1546
+ const float Mold = M[tq];
1547
+ const float Mnew = fmaxf(Mold, tile_max);
1548
+
1549
+ if (Mnew > Mold) { // running max grew: rescale what was accumulated under the old max
1550
+ const float ms = expf(Mold - Mnew);
1551
+ rvv_scale_f32(VKQ32 + tq * DV, ms, DV);
1552
+ S[tq] *= ms;
1553
+ }
1554
+ M[tq] = Mnew;
1555
+
1556
+ S[tq] += rvv_softmax_exp_inplace_f32(kq_row, KV_TILE_SZ, Mnew); // presumably overwrites the row with exp(x - Mnew) and returns its sum - TODO confirm
1557
+ }
1558
+
1559
+ // Pack V as contiguous [KV_TILE_SZ][DV].
1560
+ if (kv_type == GGML_TYPE_F16) {
1561
+ const char * v_data = (const char *) v->data + ic * nbv1 + iv2 * nbv2 + iv3 * nbv3;
1562
+ memcpy2d(V_f16, DV * sizeof(_Float16), v_data, nbv1, kv_tile, DV * sizeof(_Float16));
1563
+
1564
+ int tq = 0;
1565
+ for (; tq + 3 < tile_rows; tq += 4) {
1566
+ if (skip[tq + 0] || skip[tq + 1] || skip[tq + 2] || skip[tq + 3]) { // a skipped row in the group: accumulate the other rows one by one
1567
+ for (int i = 0; i < 4; ++i) {
1568
+ if (!skip[tq + i]) {
1569
+ rvv_pv_accumulate_f16_x1(VKQ32 + (tq + i) * DV, KQ + (tq + i) * KV_TILE_SZ, V_f16,
1570
+ KV_TILE_SZ, DV);
1571
+ }
1572
+ }
1573
+ continue;
1574
+ }
1575
+
1576
+ rvv_pv_accumulate_f16_x4(VKQ32 + (tq + 0) * DV, VKQ32 + (tq + 1) * DV, VKQ32 + (tq + 2) * DV,
1577
+ VKQ32 + (tq + 3) * DV, KQ + (tq + 0) * KV_TILE_SZ,
1578
+ KQ + (tq + 1) * KV_TILE_SZ, KQ + (tq + 2) * KV_TILE_SZ,
1579
+ KQ + (tq + 3) * KV_TILE_SZ, V_f16, KV_TILE_SZ, DV);
1580
+ }
1581
+ for (; tq < tile_rows; ++tq) {
1582
+ if (!skip[tq]) {
1583
+ rvv_pv_accumulate_f16_x1(VKQ32 + tq * DV, KQ + tq * KV_TILE_SZ, V_f16, KV_TILE_SZ, DV);
1584
+ }
1585
+ }
1586
+ } else {
1587
+ const char * v_data = (const char *) v->data + ic * nbv1 + iv2 * nbv2 + iv3 * nbv3;
1588
+ memcpy2d(V32, DV * sizeof(float), v_data, nbv1, kv_tile, DV * sizeof(float));
1589
+
1590
+ for (int tq = 0; tq < tile_rows; ++tq) {
1591
+ if (!skip[tq]) {
1592
+ rvv_pv_accumulate(VKQ32 + tq * DV, KQ + tq * KV_TILE_SZ, V32, KV_TILE_SZ, DV);
1593
+ }
1594
+ }
1595
+ }
1596
+ }
1597
+
1598
+ // sinks (apply only to valid rows in the tile)
1599
+ if (sinks) {
1600
+ const float s = ((float *) ((char *) sinks->data))[h]; // one sink value per head, read as f32
1601
+
1602
+ for (int tq = 0; tq < tile_rows; tq++) {
1603
+ float ms = 1.0f;
1604
+ float vs = 1.0f;
1605
+
1606
+ if (s > M[tq]) { // sink exceeds the running max: rescale the accumulator relative to s
1607
+ ms = expf(M[tq] - s);
1608
+ rvv_scale_f32(VKQ32 + tq * DV, ms, DV);
1609
+ } else {
1610
+ vs = expf(s - M[tq]);
1611
+ }
1612
+
1613
+ float S_temp = S[tq] * ms + vs; // the sink adds to the denominator only, never to VKQ32
1614
+ S[tq] = S_temp == 0.0f ? 0.0f : 1.0f / S_temp;
1615
+ }
1616
+ } else {
1617
+ for (int tq = 0; tq < tile_rows; tq++) {
1618
+ const float S_inv = S[tq] == 0.0f ? 0.0f : 1.0f / S[tq]; // a row with zero sum gets factor 0 instead of a division by zero
1619
+ S[tq] = S_inv;
1620
+ }
1621
+ }
1622
+
1623
+ float * dst_ptr = (float *) ((char *) dst->data + (iq3 * ne2 * ne1 + iq2 + (iq1) *ne1) * nb1);
1624
+ rvv_pack_scaled_f32_as_f32(dst_ptr, nb1 * ne1, VKQ32, DV * sizeof(float), tile_rows, DV, S); // S now holds reciprocal sums; presumably each row is multiplied by its S entry - TODO confirm
1625
+
1626
+ ir += tile_rows;
1627
+ }
1628
+ }
1629
+
1630
+ void forward_rms_norm_f32(ggml_compute_params * params, ggml_tensor * op) { // RMS-normalizes every f32 row of op->src[0] into op: y = x / sqrt(mean(x^2) + epsilon); rows are split across threads.
1631
+ const ggml_tensor * src0 = op->src[0];
1632
+ ggml_tensor * dst = op;
1633
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
1634
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
1635
+
1636
+ int ith = params->ith;
1637
+ int nth = params->nth;
1638
+
1639
+ GGML_TENSOR_UNARY_OP_LOCALS
1640
+
1641
+ float epsilon = *((float *) dst->op_params); // first op_param is read as the f32 epsilon
1642
+
1643
+ GGML_ASSERT(epsilon > 0.0f);
1644
+
1645
+ auto * input = (char *) src0->data;
1646
+ auto * output = (char *) dst->data;
1647
+
1648
+ const auto hidden_size = ne00;
1649
+ const auto task_count = ne01 * ne02 * ne03; // one task per row
1650
+ const auto task_per_thread = (task_count + nth - 1) / nth; // ceiling division
1651
+
1652
+ const auto task_begin = ith * task_per_thread;
1653
+ const auto task_end = std::min((ith + 1) * task_per_thread, task_count); // trailing threads may get an empty range
1654
+
1655
+ for (auto task_idx = task_begin; task_idx < task_end; task_idx++) {
1656
+ int64_t i03 = task_idx / (ne02 * ne01);
1657
+ int64_t i02 = (task_idx - i03 * ne02 * ne01) / ne01;
1658
+ int64_t i01 = (task_idx - i03 * ne02 * ne01 - i02 * ne01);
1659
+
1660
+ auto * p_input = (float *) (input + i01 * nb01 + i02 * nb02 + i03 * nb03);
1661
+ auto * p_output = (float *) (output + i01 * nb1 + i02 * nb2 + i03 * nb3);
1662
+ auto * p_temp_output = p_output;
1663
+
1664
+ size_t gvl = __riscv_vsetvlmax_e32m4();
1665
+ vfloat32m4_t sum_sq = __riscv_vfmv_v_f_f32m4(0.f, gvl); // per-lane partial sums of squares
1666
+ int64_t length = hidden_size;
1667
+ while (length > 0) { // pass 1: accumulate x^2 and copy the row to the output
1668
+ gvl = __riscv_vsetvl_e32m4(length);
1669
+ vfloat32m4_t src_data = __riscv_vle32_v_f32m4(p_input, gvl);
1670
+ sum_sq = __riscv_vfmacc_vv_f32m4(sum_sq, src_data, src_data, gvl);
1671
+ __riscv_vse32_v_f32m4(p_temp_output, src_data, gvl);
1672
+
1673
+ p_input += gvl;
1674
+ p_temp_output += gvl;
1675
+ length -= gvl;
1676
+ }
1677
+
1678
+ gvl = __riscv_vsetvlmax_e32m1();
1679
+ vfloat32m1_t zero_v = __riscv_vfmv_v_f_f32m1(0.f, gvl);
1680
+ vfloat32m1_t mean_square_v =
1681
+ __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m4_f32m1(sum_sq, 0), __riscv_vget_v_f32m4_f32m1(sum_sq, 1), gvl); // fold the four m1 parts of the m4 accumulator
1682
+
1683
+ mean_square_v = __riscv_vfadd_vv_f32m1(mean_square_v, __riscv_vget_v_f32m4_f32m1(sum_sq, 2), gvl);
1684
+ mean_square_v = __riscv_vfadd_vv_f32m1(mean_square_v, __riscv_vget_v_f32m4_f32m1(sum_sq, 3), gvl);
1685
+ mean_square_v = __riscv_vfredusum_vs_f32m1_f32m1(mean_square_v, zero_v, gvl); // horizontal sum into element 0
1686
+
1687
+ float mean_square = __riscv_vfmv_f_s_f32m1_f32(mean_square_v);
1688
+ mean_square /= hidden_size;
1689
+
1690
+ mean_square = sqrt(mean_square + epsilon);
1691
+
1692
+ mean_square = 1.0f / mean_square; // from here on the variable holds 1 / rms
1693
+ length = hidden_size;
1694
+ p_temp_output = p_output;
1695
+
1696
+ while (length > 0) { // pass 2: scale the copied row in place
1697
+ gvl = __riscv_vsetvl_e32m4(length);
1698
+ vfloat32m4_t src_data = __riscv_vle32_v_f32m4(p_temp_output, gvl);
1699
+ src_data = __riscv_vfmul_vf_f32m4(src_data, mean_square, gvl);
1700
+ __riscv_vse32_v_f32m4(p_output, src_data, gvl);
1701
+ p_temp_output += gvl;
1702
+ p_output += gvl;
1703
+ length -= gvl;
1704
+ }
1705
+ }
1706
+ }
1707
+
1708
+ template <size_t MB_ROWS> // Scalar reference quantizer: converts MB_ROWS f32 rows (row stride count_k) to int8 in blocks of blk_len elements.
1709
+ void quantize_a_nrow_i8_ref(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) { // each output group is [MB_ROWS float scales][MB_ROWS int16 negated sums][MB_ROWS * blk_len int8 values]
1710
+ int64_t a_blk_stride = q8_blk_size(blk_len, true);
1711
+ int64_t a_nrow_block_stride = a_blk_stride * MB_ROWS;
1712
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
1713
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
1714
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float) * MB_ROWS);
1715
+ int8_t * quant_a_blk =
1716
+ reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) * MB_ROWS + sizeof(int16_t) * MB_ROWS);
1717
+
1718
+ for (size_t row = 0; row < MB_ROWS; row++) {
1719
+ float max_abs_a = 0.0f;
1720
+ for (size_t bk = 0; bk < blk_len; bk++) {
1721
+ max_abs_a = std::max(max_abs_a, std::abs(a_ptr[row * count_k + k + bk]));
1722
+ }
1723
+
1724
+ float rep_scale_a = max_abs_a ? ((1 << 7) - 1) / max_abs_a : 0.0f; // all-zero block: avoid 127 / 0 = inf, which turned 0 * inf into NaN before the int8 cast
1725
+ scale_a_ptr[row] = max_abs_a ? 1 / rep_scale_a : 0.0f; // stored scale is max_abs_a / 127, and 0 for an all-zero block (same as the vector paths)
1726
+
1727
+ int16_t a_sum = 0;
1728
+ for (size_t bk = 0; bk < blk_len; bk++) {
1729
+ const int8_t quantized = static_cast<int8_t>(
1730
+ std::clamp(std::nearbyintf(a_ptr[row * count_k + k + bk] * rep_scale_a), -128.0f, 127.0f));
1731
+ quant_a_blk[row * blk_len + bk] = quantized;
1732
+ a_sum += quantized;
1733
+ }
1734
+ a_sum_ptr[row] = -a_sum; // the sum of the quantized values is stored negated
1735
+ }
1736
+ }
1737
+ }
1738
+
1739
+ template <size_t MB_ROWS> // Scalar reference for the "hp" int8 layout: 256-element blocks split into 32-element sub-blocks whose scale is shared by all MB_ROWS rows.
1740
+ void quantize_a_nrow_i8_hp_ref(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) { // a_ptr rows are count_k floats apart; quant_a_ptr advances by one MB_ROWS-wide block per 256 inputs
1741
+ constexpr size_t k_subblk_len = 32;
1742
+ const size_t subblk_count = blk_len / k_subblk_len;
1743
+
1744
+ GGML_ASSERT(blk_len == 256);
1745
+
1746
+ float scale_temp[8] = { 0.0f }; // one absolute scale per sub-block (256 / 32 = 8)
1747
+ int64_t a_blk_stride = q8_hp_blk_size(blk_len, true, true);
1748
+ int64_t a_nrow_block_stride = a_blk_stride * MB_ROWS;
1749
+ int64_t a_subblk_stride = q8_hp_blk_size(k_subblk_len, false, false) * MB_ROWS;
1750
+
1751
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
1752
+ _Float16 * a_sum_ptr = reinterpret_cast<_Float16 *>(quant_a_ptr + a_subblk_stride * subblk_count); // the sums follow the last sub-block
1753
+
1754
+ float scale_avg = 0.0f;
1755
+ for (size_t kk = 0; kk < subblk_count; kk++) {
1756
+ float max_abs_a = 0.0f;
1757
+ for (size_t row = 0; row < MB_ROWS; row++) { // the maximum is taken over all rows, so every row of a sub-block shares one scale
1758
+ for (size_t bk = 0; bk < k_subblk_len; bk++) {
1759
+ max_abs_a = std::max(max_abs_a, std::abs(a_ptr[row * count_k + k + bk + kk * k_subblk_len]));
1760
+ }
1761
+ }
1762
+ scale_temp[kk] = max_abs_a / ((1 << 7) - 1);
1763
+ scale_avg += scale_temp[kk];
1764
+ }
1765
+
1766
+ scale_avg /= subblk_count;
1767
+ float scale_factor = scale_avg ? 1.0f / scale_avg : 0.0f; // all-zero block: avoid 1 / 0 = inf and the 0 * inf = NaN relative scales it produced
1768
+
1769
+ _Float16 * scale_avg_ptr =
1770
+ reinterpret_cast<_Float16 *>(quant_a_ptr + a_nrow_block_stride - sizeof(_Float16) * MB_ROWS);
1771
+ scale_avg_ptr[0] = static_cast<_Float16>(scale_avg); // block-level average scale; the conversion to f16 is now explicit
1772
+
1773
+ for (size_t kk = 0; kk < subblk_count; kk++) {
1774
+ uint8_t * a_subblk_base = quant_a_ptr + kk * a_subblk_stride;
1775
+ _Float16 * scale_a_ptr = reinterpret_cast<_Float16 *>(a_subblk_base);
1776
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(a_subblk_base + sizeof(_Float16) * MB_ROWS);
1777
+
1778
+ scale_a_ptr[0] = static_cast<_Float16>(scale_temp[kk] * scale_factor); // sub-block scale is stored relative to the block average
1779
+
1780
+ const float rep_scale_a = scale_temp[kk] ? 1.0f / scale_temp[kk] : 0.0f; // all-zero sub-block quantizes to zeros instead of NaN
1781
+
1782
+ for (size_t row = 0; row < MB_ROWS; row++) {
1783
+ int16_t a_sum = 0;
1784
+ for (size_t bk = 0; bk < k_subblk_len; bk++) {
1785
+ const int8_t quantized = static_cast<int8_t>(
1786
+ std::clamp(std::nearbyintf(a_ptr[row * count_k + k + bk + kk * k_subblk_len] * rep_scale_a),
1787
+ -128.0f, 127.0f));
1788
+ quant_a_blk[row * k_subblk_len + bk] = quantized;
1789
+ a_sum += quantized;
1790
+ }
1791
+ a_sum_ptr[row * subblk_count + kk] = static_cast<_Float16>(-a_sum) * static_cast<_Float16>(8.0f); // negated sum times 8, stored as f16
1792
+ }
1793
+ }
1794
+ }
1795
+ }
1796
+
1797
+ template <size_t MB_ROWS> // Scalar reference quantizer producing one float scale, 16 int16 group sums and blk_len int8 values per row and block.
1798
+ void quantize_a_nrow_i8k_ref(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) { // NOTE(review): the stride and the sum count are hard-coded for 256 elements, so blk_len is presumably always 256 - confirm against callers
1799
+ int64_t a_blk_stride = q8k_blk_size(256);
1800
+ int64_t a_nrow_block_stride = a_blk_stride * MB_ROWS;
1801
+ int64_t a_sum_size = 256 / 16; // one partial sum per group of 16 values
1802
+
1803
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
1804
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr); // layout: [MB_ROWS scales][MB_ROWS * 16 sums][MB_ROWS * blk_len values]
1805
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float) * MB_ROWS);
1806
+ int8_t * quant_a_blk =
1807
+ reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) * MB_ROWS + sizeof(int16_t) * a_sum_size * MB_ROWS);
1808
+
1809
+ for (size_t row = 0; row < MB_ROWS; row++) {
1810
+ float max_a = 0.0f; // NOTE(review): assigned below but never read in this function
1811
+ float max_abs_a = 0.0f;
1812
+ for (size_t bk = 0; bk < blk_len; bk++) {
1813
+ float ax = std::abs(a_ptr[row * count_k + k + bk]);
1814
+ if (ax > max_abs_a) {
1815
+ max_abs_a = ax;
1816
+ max_a = a_ptr[row * count_k + k + bk];
1817
+ }
1818
+ }
1819
+
1820
+ if (!max_abs_a) { // all-zero row: emit zero scale, zero values and zero sums, and skip the division
1821
+ scale_a_ptr[row] = 0;
1822
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
1823
+ for (size_t bk = bki * 16; bk < (bki + 1) * 16; bk++) {
1824
+ quant_a_blk[row * blk_len + bk] = 0;
1825
+ }
1826
+ a_sum_ptr[row * a_sum_size + bki] = 0;
1827
+ }
1828
+ continue;
1829
+ }
1830
+
1831
+ float rep_scale_a = ((1 << 7) - 1) / max_abs_a; // maps the largest magnitude to 127
1832
+ scale_a_ptr[row] = 1 / rep_scale_a;
1833
+
1834
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
1835
+ int16_t a_sum = 0;
1836
+ for (size_t bk = bki * 16; bk < (bki + 1) * 16; bk++) {
1837
+ const int8_t quantized = static_cast<int8_t>(
1838
+ std::clamp(std::nearbyintf(a_ptr[row * count_k + k + bk] * rep_scale_a), -128.0f, 127.0f));
1839
+ quant_a_blk[row * blk_len + bk] = quantized;
1840
+ a_sum += quantized;
1841
+ }
1842
+ a_sum_ptr[row * a_sum_size + bki] = -a_sum; // each 16-value group sum is stored negated
1843
+ }
1844
+ }
1845
+ }
1846
+ }
1847
+
1848
+ void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) { // Quantizes one f32 row of count_k values into 32-element blocks laid out as [float scale][int16 negated sum][32 int8 values].
1849
+ GGML_ASSERT(blk_len == 32);
1850
+ int64_t a_blk_stride = q8_blk_size(blk_len, true);
1851
+ size_t vlenb = __riscv_vlenb(); // vector register width in bytes; selects the code path
1852
+
1853
+ if (vlenb == 128) { // 128-byte registers: the 32 floats of a block fit in one m1 register
1854
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_blk_stride) {
1855
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
1856
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float));
1857
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) + sizeof(int16_t));
1858
+
1859
+ size_t vl = __riscv_vsetvl_e32m1(blk_len);
1860
+ vfloat32m1_t v_a = __riscv_vle32_v_f32m1(a_ptr + k, vl);
1861
+ vfloat32m1_t v_a_abs = __riscv_vfabs_v_f32m1(v_a, vl);
1862
+
1863
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
1864
+ vfloat32m1_t v_a_max = __riscv_vfredmax_vs_f32m1_f32m1(v_a_abs, tmp, vl); // max |a| over the block, seeded with 0
1865
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_a_max);
1866
+
1867
+ float scale_a = max_abs_a / ((1 << 7) - 1);
1868
+ float rep_scale_a = scale_a ? 1.0f / scale_a : 0.0f; // an all-zero block quantizes to zeros instead of dividing by 0
1869
+ scale_a_ptr[0] = scale_a;
1870
+
1871
+ vfloat32m1_t v_a_scale = __riscv_vfmul_vf_f32m1(v_a, rep_scale_a, vl);
1872
+ vint16mf2_t v_a_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a_scale, vl); // f32 -> int16 narrowing convert
1873
+ vint8mf4_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a_quant, vl); // int16 -> int8 narrowing
1874
+
1875
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
1876
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a_quant_i8, tmp_sum, vl); // widening sum of the int8 values
1877
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
1878
+ a_sum_ptr[0] = -a_sum; // the sum is stored negated
1879
+
1880
+ __riscv_vse8_v_i8mf4(quant_a_blk, v_a_quant_i8, vl);
1881
+ }
1882
+ } else if (vlenb == 32) { // 32-byte registers: the same steps with an m4 group holding the 32 floats
1883
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_blk_stride) {
1884
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
1885
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float));
1886
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) + sizeof(int16_t));
1887
+
1888
+ size_t vl = __riscv_vsetvl_e32m4(blk_len);
1889
+ vfloat32m4_t v_a = __riscv_vle32_v_f32m4(a_ptr + k, vl);
1890
+ vfloat32m4_t v_a_abs = __riscv_vfabs_v_f32m4(v_a, vl);
1891
+
1892
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
1893
+ vfloat32m1_t v_a_max = __riscv_vfredmax_vs_f32m4_f32m1(v_a_abs, tmp, vl);
1894
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_a_max);
1895
+
1896
+ float scale_a = max_abs_a / ((1 << 7) - 1);
1897
+ float rep_scale_a = scale_a ? 1.0f / scale_a : 0.0f;
1898
+ scale_a_ptr[0] = scale_a;
1899
+
1900
+ vfloat32m4_t v_a_scale = __riscv_vfmul_vf_f32m4(v_a, rep_scale_a, vl);
1901
+ vint16m2_t v_a_quant = __riscv_vfncvt_x_f_w_i16m2(v_a_scale, vl);
1902
+ vint8m1_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8m1(v_a_quant, vl);
1903
+
1904
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
1905
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8m1_i16m1(v_a_quant_i8, tmp_sum, vl);
1906
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
1907
+ a_sum_ptr[0] = -a_sum;
1908
+
1909
+ __riscv_vse8_v_i8m1(quant_a_blk, v_a_quant_i8, vl);
1910
+ }
1911
+ } else { // any other register width: scalar reference implementation
1912
+ quantize_a_nrow_i8_ref<1>(blk_len, a_ptr, count_k, quant_a_ptr);
1913
+ }
1914
+ }
1915
+
1916
+ void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) { // Quantizes four f32 rows (count_k floats apart) into interleaved groups laid out as [4 float scales][4 int16 negated sums][4 x 32 int8 values].
1917
+ GGML_ASSERT(blk_len == 32);
1918
+ int64_t a_blk_stride = q8_blk_size(blk_len, true);
1919
+ int64_t a_nrow_block_stride = a_blk_stride * 4; // one output group covers the same block of all four rows
1920
+ size_t vlenb = __riscv_vlenb(); // vector register width in bytes; selects the code path
1921
+
1922
+ if (vlenb == 128) { // 128-byte registers: the 32 floats of a block fit in one m1 register
1923
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
1924
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
1925
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float) * 4);
1926
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) * 4 + sizeof(int16_t) * 4);
1927
+
1928
+ for (size_t mi = 0; mi < 4; mi++) { // each row gets its own scale and sum
1929
+ size_t vl = __riscv_vsetvl_e32m1(blk_len);
1930
+ vfloat32m1_t v_a = __riscv_vle32_v_f32m1(a_ptr + mi * count_k + k, vl);
1931
+ vfloat32m1_t v_a_abs = __riscv_vfabs_v_f32m1(v_a, vl);
1932
+
1933
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
1934
+ vfloat32m1_t v_a_max = __riscv_vfredmax_vs_f32m1_f32m1(v_a_abs, tmp, vl); // max |a| over the block, seeded with 0
1935
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_a_max);
1936
+
1937
+ float scale_a = max_abs_a / ((1 << 7) - 1);
1938
+ float rep_scale_a = scale_a ? 1.0f / scale_a : 0.0f; // an all-zero block quantizes to zeros instead of dividing by 0
1939
+ scale_a_ptr[mi] = scale_a;
1940
+
1941
+ vfloat32m1_t v_a_scale = __riscv_vfmul_vf_f32m1(v_a, rep_scale_a, vl);
1942
+ vint16mf2_t v_a_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a_scale, vl); // f32 -> int16 narrowing convert
1943
+ vint8mf4_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a_quant, vl); // int16 -> int8 narrowing
1944
+
1945
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
1946
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a_quant_i8, tmp_sum, vl); // widening sum of the int8 values
1947
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
1948
+ a_sum_ptr[mi] = -a_sum; // the sum is stored negated
1949
+
1950
+ __riscv_vse8_v_i8mf4(quant_a_blk + mi * blk_len, v_a_quant_i8, vl); // row mi occupies the mi-th blk_len-sized slice
1951
+ }
1952
+ }
1953
+ } else if (vlenb == 32) { // 32-byte registers: the same steps with an m4 group holding the 32 floats
1954
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
1955
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
1956
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float) * 4);
1957
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) * 4 + sizeof(int16_t) * 4);
1958
+
1959
+ for (size_t mi = 0; mi < 4; mi++) {
1960
+ size_t vl = __riscv_vsetvl_e32m4(blk_len);
1961
+ vfloat32m4_t v_a = __riscv_vle32_v_f32m4(a_ptr + mi * count_k + k, vl);
1962
+ vfloat32m4_t v_a_abs = __riscv_vfabs_v_f32m4(v_a, vl);
1963
+
1964
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
1965
+ vfloat32m1_t v_a_max = __riscv_vfredmax_vs_f32m4_f32m1(v_a_abs, tmp, vl);
1966
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_a_max);
1967
+
1968
+ float scale_a = max_abs_a / ((1 << 7) - 1);
1969
+ float rep_scale_a = scale_a ? 1.0f / scale_a : 0.0f;
1970
+ scale_a_ptr[mi] = scale_a;
1971
+
1972
+ vfloat32m4_t v_a_scale = __riscv_vfmul_vf_f32m4(v_a, rep_scale_a, vl);
1973
+ vint16m2_t v_a_quant = __riscv_vfncvt_x_f_w_i16m2(v_a_scale, vl);
1974
+ vint8m1_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8m1(v_a_quant, vl);
1975
+
1976
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
1977
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8m1_i16m1(v_a_quant_i8, tmp_sum, vl);
1978
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
1979
+ a_sum_ptr[mi] = -a_sum;
1980
+
1981
+ __riscv_vse8_v_i8m1(quant_a_blk + mi * blk_len, v_a_quant_i8, vl);
1982
+ }
1983
+ }
1984
+ } else { // any other register width: scalar reference implementation
1985
+ quantize_a_nrow_i8_ref<4>(blk_len, a_ptr, count_k, quant_a_ptr);
1986
+ }
1987
+ }
1988
+
1989
+ void quantize_a_row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) { // Quantizes one f32 row into 256-element "hp" blocks: eight 32-value int8 sub-blocks with f16 relative scales, eight f16 sums, and one f16 average scale.
1990
+ constexpr size_t k_subblk_len = 32;
1991
+ GGML_ASSERT(blk_len == 256);
1992
+
1993
+ constexpr size_t subblk_count = 256 / k_subblk_len;
1994
+ int64_t a_blk_stride = q8_hp_blk_size(blk_len, true, true);
1995
+ int64_t a_subblk_stride = q8_hp_blk_size(k_subblk_len, false, false); // bytes per sub-block; presumably one f16 scale plus 32 int8 values - TODO confirm
1996
+ size_t vlenb = __riscv_vlenb(); // vector register width in bytes; selects the code path
1997
+ float scale_temp[subblk_count] = { 0.0f }; // absolute scale of each sub-block, filled in the first pass
1998
+
1999
+ if (vlenb == 128) { // 128-byte registers: the 32 floats of a sub-block fit in one m1 register
2000
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_blk_stride) {
2001
+ _Float16 * a_sum_ptr = reinterpret_cast<_Float16 *>(quant_a_ptr + a_subblk_stride * subblk_count); // the sums follow the last sub-block
2002
+ _Float16 * scale_avg_ptr = reinterpret_cast<_Float16 *>(quant_a_ptr + a_blk_stride - sizeof(_Float16)); // the average scale is the last f16 of the block
2003
+ float scale_avg = 0.0f;
2004
+
2005
+ for (size_t kk = 0; kk < subblk_count; ++kk) { // pass 1: per-sub-block max |a| -> absolute scale
2006
+ const float * a_src_ptr = a_ptr + k + kk * k_subblk_len;
2007
+
2008
+ size_t vl = __riscv_vsetvl_e32m1(k_subblk_len);
2009
+ vfloat32m1_t v_a = __riscv_vle32_v_f32m1(a_src_ptr, vl);
2010
+ vfloat32m1_t v_a_abs = __riscv_vfabs_v_f32m1(v_a, vl);
2011
+
2012
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2013
+ vfloat32m1_t v_a_max = __riscv_vfredmax_vs_f32m1_f32m1(v_a_abs, tmp, vl);
2014
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_a_max);
2015
+
2016
+ scale_temp[kk] = max_abs_a / ((1 << 7) - 1);
2017
+ scale_avg += scale_temp[kk];
2018
+ }
2019
+
2020
+ scale_avg /= subblk_count;
2021
+ const float scale_factor = scale_avg ? 1.0f / scale_avg : 0.0f; // an all-zero block yields zero relative scales instead of dividing by 0
2022
+ scale_avg_ptr[0] = static_cast<_Float16>(scale_avg);
2023
+
2024
+ for (size_t kk = 0; kk < subblk_count; ++kk) { // pass 2: quantize each sub-block with its own scale
2025
+ uint8_t * a_subblk_base = quant_a_ptr + kk * a_subblk_stride;
2026
+ _Float16 * scale_a_ptr = reinterpret_cast<_Float16 *>(a_subblk_base);
2027
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(a_subblk_base + sizeof(_Float16));
2028
+ const float * a_src_ptr = a_ptr + k + kk * k_subblk_len;
2029
+
2030
+ size_t vl = __riscv_vsetvl_e32m1(k_subblk_len);
2031
+ vfloat32m1_t v_a = __riscv_vle32_v_f32m1(a_src_ptr, vl);
2032
+ float rep_scale_a = scale_temp[kk] ? 1.0f / scale_temp[kk] : 0.0f;
2033
+ scale_a_ptr[0] = static_cast<_Float16>(scale_temp[kk] * scale_factor); // stored relative to the block average
2034
+
2035
+ vfloat32m1_t v_a_scale = __riscv_vfmul_vf_f32m1(v_a, rep_scale_a, vl);
2036
+ vint16mf2_t v_a_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a_scale, vl); // f32 -> int16 narrowing convert
2037
+ vint8mf4_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a_quant, vl); // int16 -> int8 narrowing
2038
+
2039
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
2040
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a_quant_i8, tmp_sum, vl);
2041
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
2042
+ a_sum_ptr[kk] = static_cast<_Float16>(-a_sum) * static_cast<_Float16>(8.0f); // negated sum times 8, stored as f16
2043
+
2044
+ __riscv_vse8_v_i8mf4(quant_a_blk, v_a_quant_i8, vl);
2045
+ }
2046
+ }
2047
+ } else if (vlenb == 32) { // 32-byte registers: the same two passes with an m4 group holding the 32 floats
2048
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_blk_stride) {
2049
+ _Float16 * a_sum_ptr = reinterpret_cast<_Float16 *>(quant_a_ptr + a_subblk_stride * subblk_count);
2050
+ _Float16 * scale_avg_ptr = reinterpret_cast<_Float16 *>(quant_a_ptr + a_blk_stride - sizeof(_Float16));
2051
+ float scale_avg = 0.0f;
2052
+
2053
+ for (size_t kk = 0; kk < subblk_count; ++kk) {
2054
+ const float * a_src_ptr = a_ptr + k + kk * k_subblk_len;
2055
+
2056
+ size_t vl = __riscv_vsetvl_e32m4(k_subblk_len);
2057
+ vfloat32m4_t v_a = __riscv_vle32_v_f32m4(a_src_ptr, vl);
2058
+ vfloat32m4_t v_a_abs = __riscv_vfabs_v_f32m4(v_a, vl);
2059
+
2060
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2061
+ vfloat32m1_t v_a_max = __riscv_vfredmax_vs_f32m4_f32m1(v_a_abs, tmp, vl);
2062
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_a_max);
2063
+
2064
+ scale_temp[kk] = max_abs_a / ((1 << 7) - 1);
2065
+ scale_avg += scale_temp[kk];
2066
+ }
2067
+
2068
+ scale_avg /= subblk_count;
2069
+ const float scale_factor = scale_avg ? 1.0f / scale_avg : 0.0f;
2070
+ scale_avg_ptr[0] = static_cast<_Float16>(scale_avg);
2071
+
2072
+ for (size_t kk = 0; kk < subblk_count; ++kk) {
2073
+ uint8_t * a_subblk_base = quant_a_ptr + kk * a_subblk_stride;
2074
+ _Float16 * scale_a_ptr = reinterpret_cast<_Float16 *>(a_subblk_base);
2075
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(a_subblk_base + sizeof(_Float16));
2076
+ const float * a_src_ptr = a_ptr + k + kk * k_subblk_len;
2077
+
2078
+ size_t vl = __riscv_vsetvl_e32m4(k_subblk_len);
2079
+ vfloat32m4_t v_a = __riscv_vle32_v_f32m4(a_src_ptr, vl);
2080
+ float rep_scale_a = scale_temp[kk] ? 1.0f / scale_temp[kk] : 0.0f;
2081
+ scale_a_ptr[0] = static_cast<_Float16>(scale_temp[kk] * scale_factor);
2082
+
2083
+ vfloat32m4_t v_a_scale = __riscv_vfmul_vf_f32m4(v_a, rep_scale_a, vl);
2084
+ vint16m2_t v_a_quant = __riscv_vfncvt_x_f_w_i16m2(v_a_scale, vl);
2085
+ vint8m1_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8m1(v_a_quant, vl);
2086
+
2087
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
2088
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8m1_i16m1(v_a_quant_i8, tmp_sum, vl);
2089
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
2090
+ a_sum_ptr[kk] = static_cast<_Float16>(-a_sum) * static_cast<_Float16>(8.0f);
2091
+
2092
+ __riscv_vse8_v_i8m1(quant_a_blk, v_a_quant_i8, vl);
2093
+ }
2094
+ }
2095
+ } else { // any other register width: scalar reference implementation
2096
+ quantize_a_nrow_i8_hp_ref<1>(blk_len, a_ptr, count_k, quant_a_ptr);
2097
+ }
2098
+ }
2099
+
2100
+ void quantize_a_4row_i8_hp(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) {
2101
+ constexpr size_t k_subblk_len = 32;
2102
+ GGML_ASSERT(blk_len == 256);
2103
+
2104
+ constexpr size_t subblk_count = 256 / k_subblk_len;
2105
+ int64_t a_blk_stride = q8_hp_blk_size(blk_len, true, true);
2106
+ int64_t a_nrow_block_stride = a_blk_stride * 4;
2107
+ int64_t a_subblk_stride = q8_hp_blk_size(k_subblk_len, false, false) * 4;
2108
+ size_t vlenb = __riscv_vlenb();
2109
+ float scale_temp[subblk_count] = { 0.0f };
2110
+
2111
+ if (vlenb == 128) {
2112
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
2113
+ _Float16 * a_sum_ptr = reinterpret_cast<_Float16 *>(quant_a_ptr + a_subblk_stride * subblk_count);
2114
+ _Float16 * scale_avg_ptr =
2115
+ reinterpret_cast<_Float16 *>(quant_a_ptr + a_nrow_block_stride - sizeof(_Float16) * 4);
2116
+ float scale_avg = 0.0f;
2117
+
2118
+ for (size_t kk = 0; kk < subblk_count; ++kk) {
2119
+ const float * a_src_ptr0 = a_ptr + 0 * count_k + k + kk * k_subblk_len;
2120
+ const float * a_src_ptr1 = a_ptr + 1 * count_k + k + kk * k_subblk_len;
2121
+ const float * a_src_ptr2 = a_ptr + 2 * count_k + k + kk * k_subblk_len;
2122
+ const float * a_src_ptr3 = a_ptr + 3 * count_k + k + kk * k_subblk_len;
2123
+
2124
+ size_t vl = __riscv_vsetvl_e32m1(k_subblk_len);
2125
+ vfloat32m1_t v_a0 = __riscv_vle32_v_f32m1(a_src_ptr0, vl);
2126
+ vfloat32m1_t v_a1 = __riscv_vle32_v_f32m1(a_src_ptr1, vl);
2127
+ vfloat32m1_t v_a2 = __riscv_vle32_v_f32m1(a_src_ptr2, vl);
2128
+ vfloat32m1_t v_a3 = __riscv_vle32_v_f32m1(a_src_ptr3, vl);
2129
+ vfloat32m1_t v_a0_abs = __riscv_vfabs_v_f32m1(v_a0, vl);
2130
+ vfloat32m1_t v_a1_abs = __riscv_vfabs_v_f32m1(v_a1, vl);
2131
+ vfloat32m1_t v_a2_abs = __riscv_vfabs_v_f32m1(v_a2, vl);
2132
+ vfloat32m1_t v_a3_abs = __riscv_vfabs_v_f32m1(v_a3, vl);
2133
+
2134
+ vfloat32m1_t v_max_abs = __riscv_vfmax_vv_f32m1(v_a0_abs, v_a1_abs, vl);
2135
+ v_max_abs = __riscv_vfmax_vv_f32m1(v_max_abs, v_a2_abs, vl);
2136
+ v_max_abs = __riscv_vfmax_vv_f32m1(v_max_abs, v_a3_abs, vl);
2137
+
2138
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2139
+ vfloat32m1_t v_a_max = __riscv_vfredmax_vs_f32m1_f32m1(v_max_abs, tmp, vl);
2140
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_a_max);
2141
+
2142
+ scale_temp[kk] = max_abs_a / ((1 << 7) - 1);
2143
+ scale_avg += scale_temp[kk];
2144
+ }
2145
+
2146
+ scale_avg /= subblk_count;
2147
+ const float scale_factor = scale_avg ? 1.0f / scale_avg : 0.0f;
2148
+ scale_avg_ptr[0] = static_cast<_Float16>(scale_avg);
2149
+
2150
+ for (size_t kk = 0; kk < subblk_count; ++kk) {
2151
+ uint8_t * a_subblk_base = quant_a_ptr + kk * a_subblk_stride;
2152
+ _Float16 * scale_a_ptr = reinterpret_cast<_Float16 *>(a_subblk_base);
2153
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(a_subblk_base + sizeof(_Float16) * 4);
2154
+ const float * a_src_ptr0 = a_ptr + 0 * count_k + k + kk * k_subblk_len;
2155
+ const float * a_src_ptr1 = a_ptr + 1 * count_k + k + kk * k_subblk_len;
2156
+ const float * a_src_ptr2 = a_ptr + 2 * count_k + k + kk * k_subblk_len;
2157
+ const float * a_src_ptr3 = a_ptr + 3 * count_k + k + kk * k_subblk_len;
2158
+
2159
+ size_t vl = __riscv_vsetvl_e32m1(k_subblk_len);
2160
+ vfloat32m1_t v_a0 = __riscv_vle32_v_f32m1(a_src_ptr0, vl);
2161
+ vfloat32m1_t v_a1 = __riscv_vle32_v_f32m1(a_src_ptr1, vl);
2162
+ vfloat32m1_t v_a2 = __riscv_vle32_v_f32m1(a_src_ptr2, vl);
2163
+ vfloat32m1_t v_a3 = __riscv_vle32_v_f32m1(a_src_ptr3, vl);
2164
+
2165
+ float rep_scale_a = scale_temp[kk] ? 1.0f / scale_temp[kk] : 0.0f;
2166
+ scale_a_ptr[0] = static_cast<_Float16>(scale_temp[kk] * scale_factor);
2167
+
2168
+ vfloat32m1_t v_a0_scale = __riscv_vfmul_vf_f32m1(v_a0, rep_scale_a, vl);
2169
+ vfloat32m1_t v_a1_scale = __riscv_vfmul_vf_f32m1(v_a1, rep_scale_a, vl);
2170
+ vfloat32m1_t v_a2_scale = __riscv_vfmul_vf_f32m1(v_a2, rep_scale_a, vl);
2171
+ vfloat32m1_t v_a3_scale = __riscv_vfmul_vf_f32m1(v_a3, rep_scale_a, vl);
2172
+ vint16mf2_t v_a0_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a0_scale, vl);
2173
+ vint16mf2_t v_a1_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a1_scale, vl);
2174
+ vint16mf2_t v_a2_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a2_scale, vl);
2175
+ vint16mf2_t v_a3_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a3_scale, vl);
2176
+ vint8mf4_t v_a0_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a0_quant, vl);
2177
+ vint8mf4_t v_a1_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a1_quant, vl);
2178
+ vint8mf4_t v_a2_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a2_quant, vl);
2179
+ vint8mf4_t v_a3_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a3_quant, vl);
2180
+
2181
+ vint16m1_t tmp_sum0 = __riscv_vmv_v_x_i16m1(0, vl);
2182
+ vint16m1_t tmp_sum1 = __riscv_vmv_v_x_i16m1(0, vl);
2183
+ vint16m1_t tmp_sum2 = __riscv_vmv_v_x_i16m1(0, vl);
2184
+ vint16m1_t tmp_sum3 = __riscv_vmv_v_x_i16m1(0, vl);
2185
+ vint16m1_t v_a0_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a0_quant_i8, tmp_sum0, vl);
2186
+ vint16m1_t v_a1_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a1_quant_i8, tmp_sum1, vl);
2187
+ vint16m1_t v_a2_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a2_quant_i8, tmp_sum2, vl);
2188
+ vint16m1_t v_a3_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a3_quant_i8, tmp_sum3, vl);
2189
+
2190
+ a_sum_ptr[0 * subblk_count + kk] =
2191
+ static_cast<_Float16>(-__riscv_vmv_x_s_i16m1_i16(v_a0_sum)) * static_cast<_Float16>(8.0f);
2192
+ a_sum_ptr[1 * subblk_count + kk] =
2193
+ static_cast<_Float16>(-__riscv_vmv_x_s_i16m1_i16(v_a1_sum)) * static_cast<_Float16>(8.0f);
2194
+ a_sum_ptr[2 * subblk_count + kk] =
2195
+ static_cast<_Float16>(-__riscv_vmv_x_s_i16m1_i16(v_a2_sum)) * static_cast<_Float16>(8.0f);
2196
+ a_sum_ptr[3 * subblk_count + kk] =
2197
+ static_cast<_Float16>(-__riscv_vmv_x_s_i16m1_i16(v_a3_sum)) * static_cast<_Float16>(8.0f);
2198
+
2199
+ __riscv_vse8_v_i8mf4(quant_a_blk + 0 * k_subblk_len, v_a0_quant_i8, vl);
2200
+ __riscv_vse8_v_i8mf4(quant_a_blk + 1 * k_subblk_len, v_a1_quant_i8, vl);
2201
+ __riscv_vse8_v_i8mf4(quant_a_blk + 2 * k_subblk_len, v_a2_quant_i8, vl);
2202
+ __riscv_vse8_v_i8mf4(quant_a_blk + 3 * k_subblk_len, v_a3_quant_i8, vl);
2203
+ }
2204
+ }
2205
+ } else if (vlenb == 32) {
2206
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
2207
+ _Float16 * a_sum_ptr = reinterpret_cast<_Float16 *>(quant_a_ptr + a_subblk_stride * subblk_count);
2208
+ _Float16 * scale_avg_ptr =
2209
+ reinterpret_cast<_Float16 *>(quant_a_ptr + a_nrow_block_stride - sizeof(_Float16) * 4);
2210
+ float scale_avg = 0.0f;
2211
+
2212
+ for (size_t kk = 0; kk < subblk_count; ++kk) {
2213
+ const float * a_src_ptr0 = a_ptr + 0 * count_k + k + kk * k_subblk_len;
2214
+ const float * a_src_ptr1 = a_ptr + 1 * count_k + k + kk * k_subblk_len;
2215
+ const float * a_src_ptr2 = a_ptr + 2 * count_k + k + kk * k_subblk_len;
2216
+ const float * a_src_ptr3 = a_ptr + 3 * count_k + k + kk * k_subblk_len;
2217
+
2218
+ size_t vl = __riscv_vsetvl_e32m4(k_subblk_len);
2219
+ vfloat32m4_t v_a0 = __riscv_vle32_v_f32m4(a_src_ptr0, vl);
2220
+ vfloat32m4_t v_a1 = __riscv_vle32_v_f32m4(a_src_ptr1, vl);
2221
+ vfloat32m4_t v_a2 = __riscv_vle32_v_f32m4(a_src_ptr2, vl);
2222
+ vfloat32m4_t v_a3 = __riscv_vle32_v_f32m4(a_src_ptr3, vl);
2223
+
2224
+ vfloat32m4_t v_a0_abs = __riscv_vfabs_v_f32m4(v_a0, vl);
2225
+ vfloat32m4_t v_a1_abs = __riscv_vfabs_v_f32m4(v_a1, vl);
2226
+ vfloat32m4_t v_a2_abs = __riscv_vfabs_v_f32m4(v_a2, vl);
2227
+ vfloat32m4_t v_a3_abs = __riscv_vfabs_v_f32m4(v_a3, vl);
2228
+
2229
+ vfloat32m4_t v_max_abs = __riscv_vfmax_vv_f32m4(v_a0_abs, v_a1_abs, vl);
2230
+ v_max_abs = __riscv_vfmax_vv_f32m4(v_max_abs, v_a2_abs, vl);
2231
+ v_max_abs = __riscv_vfmax_vv_f32m4(v_max_abs, v_a3_abs, vl);
2232
+
2233
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2234
+ vfloat32m1_t v_a_max = __riscv_vfredmax_vs_f32m4_f32m1(v_max_abs, tmp, vl);
2235
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_a_max);
2236
+
2237
+ scale_temp[kk] = max_abs_a / ((1 << 7) - 1);
2238
+ scale_avg += scale_temp[kk];
2239
+ }
2240
+
2241
+ scale_avg /= subblk_count;
2242
+ const float scale_factor = scale_avg ? 1.0f / scale_avg : 0.0f;
2243
+ scale_avg_ptr[0] = static_cast<_Float16>(scale_avg);
2244
+
2245
+ for (size_t kk = 0; kk < subblk_count; ++kk) {
2246
+ uint8_t * a_subblk_base = quant_a_ptr + kk * a_subblk_stride;
2247
+ _Float16 * scale_a_ptr = reinterpret_cast<_Float16 *>(a_subblk_base);
2248
+ int8_t * quant_a_blk = reinterpret_cast<int8_t *>(a_subblk_base + sizeof(_Float16) * 4);
2249
+ const float * a_src_ptr0 = a_ptr + 0 * count_k + k + kk * k_subblk_len;
2250
+ const float * a_src_ptr1 = a_ptr + 1 * count_k + k + kk * k_subblk_len;
2251
+ const float * a_src_ptr2 = a_ptr + 2 * count_k + k + kk * k_subblk_len;
2252
+ const float * a_src_ptr3 = a_ptr + 3 * count_k + k + kk * k_subblk_len;
2253
+
2254
+ size_t vl = __riscv_vsetvl_e32m4(k_subblk_len);
2255
+ vfloat32m4_t v_a0 = __riscv_vle32_v_f32m4(a_src_ptr0, vl);
2256
+ vfloat32m4_t v_a1 = __riscv_vle32_v_f32m4(a_src_ptr1, vl);
2257
+ vfloat32m4_t v_a2 = __riscv_vle32_v_f32m4(a_src_ptr2, vl);
2258
+ vfloat32m4_t v_a3 = __riscv_vle32_v_f32m4(a_src_ptr3, vl);
2259
+
2260
+ float rep_scale_a = scale_temp[kk] ? 1.0f / scale_temp[kk] : 0.0f;
2261
+ scale_a_ptr[0] = static_cast<_Float16>(scale_temp[kk] * scale_factor);
2262
+
2263
+ vfloat32m4_t v_a0_scale = __riscv_vfmul_vf_f32m4(v_a0, rep_scale_a, vl);
2264
+ vfloat32m4_t v_a1_scale = __riscv_vfmul_vf_f32m4(v_a1, rep_scale_a, vl);
2265
+ vfloat32m4_t v_a2_scale = __riscv_vfmul_vf_f32m4(v_a2, rep_scale_a, vl);
2266
+ vfloat32m4_t v_a3_scale = __riscv_vfmul_vf_f32m4(v_a3, rep_scale_a, vl);
2267
+ vint16m2_t v_a0_quant = __riscv_vfncvt_x_f_w_i16m2(v_a0_scale, vl);
2268
+ vint16m2_t v_a1_quant = __riscv_vfncvt_x_f_w_i16m2(v_a1_scale, vl);
2269
+ vint16m2_t v_a2_quant = __riscv_vfncvt_x_f_w_i16m2(v_a2_scale, vl);
2270
+ vint16m2_t v_a3_quant = __riscv_vfncvt_x_f_w_i16m2(v_a3_scale, vl);
2271
+ vint8m1_t v_a0_quant_i8 = __riscv_vncvt_x_x_w_i8m1(v_a0_quant, vl);
2272
+ vint8m1_t v_a1_quant_i8 = __riscv_vncvt_x_x_w_i8m1(v_a1_quant, vl);
2273
+ vint8m1_t v_a2_quant_i8 = __riscv_vncvt_x_x_w_i8m1(v_a2_quant, vl);
2274
+ vint8m1_t v_a3_quant_i8 = __riscv_vncvt_x_x_w_i8m1(v_a3_quant, vl);
2275
+
2276
+ vint16m1_t tmp_sum0 = __riscv_vmv_v_x_i16m1(0, vl);
2277
+ vint16m1_t tmp_sum1 = __riscv_vmv_v_x_i16m1(0, vl);
2278
+ vint16m1_t tmp_sum2 = __riscv_vmv_v_x_i16m1(0, vl);
2279
+ vint16m1_t tmp_sum3 = __riscv_vmv_v_x_i16m1(0, vl);
2280
+ vint16m1_t v_a0_sum = __riscv_vwredsum_vs_i8m1_i16m1(v_a0_quant_i8, tmp_sum0, vl);
2281
+ vint16m1_t v_a1_sum = __riscv_vwredsum_vs_i8m1_i16m1(v_a1_quant_i8, tmp_sum1, vl);
2282
+ vint16m1_t v_a2_sum = __riscv_vwredsum_vs_i8m1_i16m1(v_a2_quant_i8, tmp_sum2, vl);
2283
+ vint16m1_t v_a3_sum = __riscv_vwredsum_vs_i8m1_i16m1(v_a3_quant_i8, tmp_sum3, vl);
2284
+
2285
+ a_sum_ptr[0 * subblk_count + kk] =
2286
+ static_cast<_Float16>(-__riscv_vmv_x_s_i16m1_i16(v_a0_sum)) * static_cast<_Float16>(8.0f);
2287
+ a_sum_ptr[1 * subblk_count + kk] =
2288
+ static_cast<_Float16>(-__riscv_vmv_x_s_i16m1_i16(v_a1_sum)) * static_cast<_Float16>(8.0f);
2289
+ a_sum_ptr[2 * subblk_count + kk] =
2290
+ static_cast<_Float16>(-__riscv_vmv_x_s_i16m1_i16(v_a2_sum)) * static_cast<_Float16>(8.0f);
2291
+ a_sum_ptr[3 * subblk_count + kk] =
2292
+ static_cast<_Float16>(-__riscv_vmv_x_s_i16m1_i16(v_a3_sum)) * static_cast<_Float16>(8.0f);
2293
+
2294
+ __riscv_vse8_v_i8m1(quant_a_blk + 0 * k_subblk_len, v_a0_quant_i8, vl);
2295
+ __riscv_vse8_v_i8m1(quant_a_blk + 1 * k_subblk_len, v_a1_quant_i8, vl);
2296
+ __riscv_vse8_v_i8m1(quant_a_blk + 2 * k_subblk_len, v_a2_quant_i8, vl);
2297
+ __riscv_vse8_v_i8m1(quant_a_blk + 3 * k_subblk_len, v_a3_quant_i8, vl);
2298
+ }
2299
+ }
2300
+ } else {
2301
+ quantize_a_nrow_i8_hp_ref<4>(blk_len, a_ptr, count_k, quant_a_ptr);
2302
+ }
2303
+ }
2304
+
2305
+ void quantize_a_row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) {
2306
+ GGML_ASSERT(blk_len == 256);
2307
+ constexpr int64_t a_blk_stride = q8k_blk_size(256);
2308
+ constexpr int64_t a_sum_size = 256 / 16;
2309
+ size_t vlenb = __riscv_vlenb();
2310
+
2311
+ if (vlenb == 128) {
2312
+ // vlen = 1024 bits, can process 32 float32 elements with m1
2313
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_blk_stride) {
2314
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
2315
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float));
2316
+ int8_t * quant_a_blk =
2317
+ reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) + sizeof(int16_t) * a_sum_size);
2318
+
2319
+ // Find max absolute value across all 256 elements
2320
+ size_t vl = __riscv_vsetvl_e32m1(16);
2321
+ vfloat32m1_t v_max_abs = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2322
+
2323
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
2324
+ vfloat32m1_t v_a = __riscv_vle32_v_f32m1(a_ptr + k + bki * 16, vl);
2325
+ vfloat32m1_t v_a_abs = __riscv_vfabs_v_f32m1(v_a, vl);
2326
+ v_max_abs = __riscv_vfmax_vv_f32m1(v_a_abs, v_max_abs, vl);
2327
+ }
2328
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2329
+ vfloat32m1_t v_local_max = __riscv_vfredmax_vs_f32m1_f32m1(v_max_abs, tmp, vl);
2330
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_local_max);
2331
+
2332
+ float scale_a = max_abs_a / ((1 << 7) - 1);
2333
+ float rep_scale_a = scale_a ? 1.0f / scale_a : 0.0f;
2334
+ scale_a_ptr[0] = scale_a;
2335
+
2336
+ // Quantize and compute sums for each 16-element group
2337
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
2338
+ vfloat32m1_t v_a = __riscv_vle32_v_f32m1(a_ptr + k + bki * 16, vl);
2339
+ vfloat32m1_t v_a_scale = __riscv_vfmul_vf_f32m1(v_a, rep_scale_a, vl);
2340
+ vint16mf2_t v_a_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a_scale, vl);
2341
+ vint8mf4_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a_quant, vl);
2342
+
2343
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
2344
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a_quant_i8, tmp_sum, vl);
2345
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
2346
+ a_sum_ptr[bki] = -a_sum;
2347
+
2348
+ __riscv_vse8_v_i8mf4(quant_a_blk + bki * 16, v_a_quant_i8, vl);
2349
+ }
2350
+ }
2351
+ } else if (vlenb == 32) {
2352
+ // vlen = 256 bits, can process 8 float32 elements with m1
2353
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_blk_stride) {
2354
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
2355
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float));
2356
+ int8_t * quant_a_blk =
2357
+ reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) + sizeof(int16_t) * a_sum_size);
2358
+
2359
+ // Find max absolute value across all 256 elements
2360
+ size_t vl = __riscv_vsetvl_e32m2(16);
2361
+ vfloat32m2_t v_max_abs = __riscv_vfmv_v_f_f32m2(0.0f, vl);
2362
+
2363
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
2364
+ vfloat32m2_t v_a = __riscv_vle32_v_f32m2(a_ptr + k + bki * 16, vl);
2365
+ vfloat32m2_t v_a_abs = __riscv_vfabs_v_f32m2(v_a, vl);
2366
+ v_max_abs = __riscv_vfmax_vv_f32m2(v_a_abs, v_max_abs, vl);
2367
+ }
2368
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2369
+ vfloat32m1_t v_local_max = __riscv_vfredmax_vs_f32m2_f32m1(v_max_abs, tmp, vl);
2370
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_local_max);
2371
+
2372
+ float scale_a = max_abs_a / ((1 << 7) - 1);
2373
+ float rep_scale_a = scale_a ? 1.0f / scale_a : 0.0f;
2374
+ scale_a_ptr[0] = scale_a;
2375
+
2376
+ // Quantize and compute sums for each 16-element group
2377
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
2378
+ vfloat32m2_t v_a = __riscv_vle32_v_f32m2(a_ptr + k + bki * 16, vl);
2379
+ vfloat32m2_t v_a_scale = __riscv_vfmul_vf_f32m2(v_a, rep_scale_a, vl);
2380
+ vint16m1_t v_a_quant = __riscv_vfncvt_x_f_w_i16m1(v_a_scale, vl);
2381
+ vint8mf2_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8mf2(v_a_quant, vl);
2382
+
2383
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
2384
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8mf2_i16m1(v_a_quant_i8, tmp_sum, vl);
2385
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
2386
+ a_sum_ptr[bki] = -a_sum;
2387
+
2388
+ __riscv_vse8_v_i8mf2(quant_a_blk + bki * 16, v_a_quant_i8, vl);
2389
+ }
2390
+ }
2391
+ } else {
2392
+ quantize_a_nrow_i8k_ref<1>(blk_len, a_ptr, count_k, quant_a_ptr);
2393
+ }
2394
+ }
2395
+
2396
+ void quantize_a_4row_i8k(size_t blk_len, const float * a_ptr, size_t count_k, uint8_t * quant_a_ptr) {
2397
+ GGML_ASSERT(blk_len == 256);
2398
+ constexpr int64_t a_blk_stride = q8k_blk_size(256);
2399
+ constexpr int64_t a_nrow_block_stride = a_blk_stride * 4;
2400
+ constexpr int64_t a_sum_size = 256 / 16;
2401
+ size_t vlenb = __riscv_vlenb();
2402
+
2403
+ if (vlenb == 128) {
2404
+ // vlen = 1024 bits
2405
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
2406
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
2407
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float) * 4);
2408
+ int8_t * quant_a_blk =
2409
+ reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) * 4 + sizeof(int16_t) * a_sum_size * 4);
2410
+
2411
+ for (size_t mi = 0; mi < 4; mi++) {
2412
+ // Find max absolute value across all 256 elements for this row
2413
+ size_t vl = __riscv_vsetvl_e32m1(16);
2414
+ vfloat32m1_t v_max_abs = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2415
+
2416
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
2417
+ vfloat32m1_t v_a = __riscv_vle32_v_f32m1(a_ptr + mi * count_k + k + bki * 16, vl);
2418
+ vfloat32m1_t v_a_abs = __riscv_vfabs_v_f32m1(v_a, vl);
2419
+ v_max_abs = __riscv_vfmax_vv_f32m1(v_a_abs, v_max_abs, vl);
2420
+ }
2421
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2422
+ vfloat32m1_t v_local_max = __riscv_vfredmax_vs_f32m1_f32m1(v_max_abs, tmp, vl);
2423
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_local_max);
2424
+
2425
+ float scale_a = max_abs_a / ((1 << 7) - 1);
2426
+ float rep_scale_a = scale_a ? 1.0f / scale_a : 0.0f;
2427
+ scale_a_ptr[mi] = scale_a;
2428
+
2429
+ // Quantize and compute sums for each 16-element group
2430
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
2431
+ vfloat32m1_t v_a = __riscv_vle32_v_f32m1(a_ptr + mi * count_k + k + bki * 16, vl);
2432
+ vfloat32m1_t v_a_scale = __riscv_vfmul_vf_f32m1(v_a, rep_scale_a, vl);
2433
+ vint16mf2_t v_a_quant = __riscv_vfncvt_x_f_w_i16mf2(v_a_scale, vl);
2434
+ vint8mf4_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8mf4(v_a_quant, vl);
2435
+
2436
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
2437
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8mf4_i16m1(v_a_quant_i8, tmp_sum, vl);
2438
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
2439
+ a_sum_ptr[mi * a_sum_size + bki] = -a_sum;
2440
+
2441
+ __riscv_vse8_v_i8mf4(quant_a_blk + mi * blk_len + bki * 16, v_a_quant_i8, vl);
2442
+ }
2443
+ }
2444
+ }
2445
+ } else if (vlenb == 32) {
2446
+ // vlen = 256 bits
2447
+ for (size_t k = 0; k < count_k; k += blk_len, quant_a_ptr += a_nrow_block_stride) {
2448
+ float * scale_a_ptr = reinterpret_cast<float *>(quant_a_ptr);
2449
+ int16_t * a_sum_ptr = reinterpret_cast<int16_t *>(quant_a_ptr + sizeof(float) * 4);
2450
+ int8_t * quant_a_blk =
2451
+ reinterpret_cast<int8_t *>(quant_a_ptr + sizeof(float) * 4 + sizeof(int16_t) * a_sum_size * 4);
2452
+
2453
+ for (size_t mi = 0; mi < 4; mi++) {
2454
+ // Find max absolute value across all 256 elements for this row
2455
+ size_t vl = __riscv_vsetvl_e32m2(16);
2456
+ vfloat32m2_t v_max_abs = __riscv_vfmv_v_f_f32m2(0.0f, vl);
2457
+
2458
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
2459
+ vfloat32m2_t v_a = __riscv_vle32_v_f32m2(a_ptr + mi * count_k + k + bki * 16, vl);
2460
+ vfloat32m2_t v_a_abs = __riscv_vfabs_v_f32m2(v_a, vl);
2461
+ v_max_abs = __riscv_vfmax_vv_f32m2(v_a_abs, v_max_abs, vl);
2462
+ }
2463
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
2464
+ vfloat32m1_t v_local_max = __riscv_vfredmax_vs_f32m2_f32m1(v_max_abs, tmp, vl);
2465
+ float max_abs_a = __riscv_vfmv_f_s_f32m1_f32(v_local_max);
2466
+
2467
+ float scale_a = max_abs_a / ((1 << 7) - 1);
2468
+ float rep_scale_a = scale_a ? 1.0f / scale_a : 0.0f;
2469
+ scale_a_ptr[mi] = scale_a;
2470
+
2471
+ // Quantize and compute sums for each 16-element group
2472
+ for (size_t bki = 0; bki < a_sum_size; bki++) {
2473
+ vfloat32m2_t v_a = __riscv_vle32_v_f32m2(a_ptr + mi * count_k + k + bki * 16, vl);
2474
+ vfloat32m2_t v_a_scale = __riscv_vfmul_vf_f32m2(v_a, rep_scale_a, vl);
2475
+ vint16m1_t v_a_quant = __riscv_vfncvt_x_f_w_i16m1(v_a_scale, vl);
2476
+ vint8mf2_t v_a_quant_i8 = __riscv_vncvt_x_x_w_i8mf2(v_a_quant, vl);
2477
+
2478
+ vint16m1_t tmp_sum = __riscv_vmv_v_x_i16m1(0, vl);
2479
+ vint16m1_t v_a_sum = __riscv_vwredsum_vs_i8mf2_i16m1(v_a_quant_i8, tmp_sum, vl);
2480
+ int16_t a_sum = __riscv_vmv_x_s_i16m1_i16(v_a_sum);
2481
+ a_sum_ptr[mi * a_sum_size + bki] = -a_sum;
2482
+
2483
+ __riscv_vse8_v_i8mf2(quant_a_blk + mi * blk_len + bki * 16, v_a_quant_i8, vl);
2484
+ }
2485
+ }
2486
+ }
2487
+ } else {
2488
+ quantize_a_nrow_i8k_ref<4>(blk_len, a_ptr, count_k, quant_a_ptr);
2489
+ }
2490
+ }
2491
+
2492
+ void forward_cpy_with_permute(ggml_compute_params * params, ggml_tensor * op) {
2493
+ const ggml_tensor * src0 = op->src[0];
2494
+ ggml_tensor * dst = op;
2495
+ const int ith = params->ith;
2496
+ const int nth = params->nth;
2497
+
2498
+ // [batch, m, n] -> [batch, n, m]
2499
+ int64_t batch = src0->ne[2] * src0->ne[3];
2500
+ int64_t m = src0->ne[1];
2501
+ int64_t n = src0->ne[0];
2502
+
2503
+ int64_t batch_stride = src0->nb[2];
2504
+ int64_t m_src_stride = src0->nb[0];
2505
+ int64_t n_src_stride = src0->nb[1];
2506
+ int64_t n_dst_stride = n_src_stride * m;
2507
+
2508
+ permute_transpose_impl(src0, dst, batch, m, n, batch_stride, m_src_stride, n_src_stride, n_dst_stride, ith, nth);
2509
+ }
2510
+
2511
+ void forward_cont_with_permute(ggml_compute_params * params, ggml_tensor * op) {
2512
+ const ggml_tensor * src0 = op->src[0];
2513
+ ggml_tensor * dst = op;
2514
+ const int ith = params->ith;
2515
+ const int nth = params->nth;
2516
+
2517
+ // [batch, m, n] -> [batch, n, m]
2518
+ int64_t batch = dst->ne[2] * dst->ne[3];
2519
+ int64_t n = dst->ne[1];
2520
+ int64_t m = dst->ne[0];
2521
+
2522
+ int64_t batch_stride = dst->nb[2];
2523
+ int64_t m_src_stride = src0->nb[0];
2524
+ int64_t n_src_stride = src0->nb[1];
2525
+ int64_t n_dst_stride = dst->nb[1];
2526
+
2527
+ permute_transpose_impl(src0, dst, batch, m, n, batch_stride, m_src_stride, n_src_stride, n_dst_stride, ith, nth);
2528
+ }
2529
+
2530
+ void forward_norm_f32(ggml_compute_params * params, ggml_tensor * op) {
2531
+ const ggml_tensor * src0 = op->src[0];
2532
+ ggml_tensor * dst = op;
2533
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
2534
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
2535
+
2536
+ int ith = params->ith;
2537
+ int nth = params->nth;
2538
+
2539
+ GGML_TENSOR_UNARY_OP_LOCALS
2540
+
2541
+ float epsilon = *((float *) dst->op_params);
2542
+
2543
+ GGML_ASSERT(epsilon > 0.0f);
2544
+
2545
+ auto * input = (char *) src0->data;
2546
+ auto * output = (char *) dst->data;
2547
+
2548
+ const auto hidden_size = ne00;
2549
+ const auto task_count = ne01 * ne02 * ne03;
2550
+ const auto task_per_thread = (task_count + nth - 1) / nth;
2551
+
2552
+ const auto task_begin = ith * task_per_thread;
2553
+ const auto task_end = std::min((ith + 1) * task_per_thread, task_count);
2554
+
2555
+ for (auto task_idx = task_begin; task_idx < task_end; task_idx++) {
2556
+ int64_t i03 = task_idx / (ne02 * ne01);
2557
+ int64_t i02 = (task_idx - i03 * ne02 * ne01) / ne01;
2558
+ int64_t i01 = (task_idx - i03 * ne02 * ne01 - i02 * ne01);
2559
+
2560
+ auto * p_input = (float *) (input + i01 * nb01 + i02 * nb02 + i03 * nb03);
2561
+ auto * p_output = (float *) (output + i01 * nb1 + i02 * nb2 + i03 * nb3);
2562
+ auto * p_temp_output = p_output;
2563
+
2564
+ size_t gvl = __riscv_vsetvlmax_e32m4();
2565
+ vfloat32m4_t sum = __riscv_vfmv_v_f_f32m4(0.f, gvl);
2566
+ vfloat32m4_t sum_sq = __riscv_vfmv_v_f_f32m4(0.f, gvl);
2567
+ int64_t length = hidden_size;
2568
+ while (length > 0) {
2569
+ gvl = __riscv_vsetvl_e32m4(length);
2570
+ // load data
2571
+ vfloat32m4_t src_data = __riscv_vle32_v_f32m4(p_input, gvl);
2572
+
2573
+ sum = __riscv_vfadd_vv_f32m4(sum, src_data, gvl);
2574
+ sum_sq = __riscv_vfmacc_vv_f32m4(sum_sq, src_data, src_data, gvl);
2575
+
2576
+ __riscv_vse32_v_f32m4(p_temp_output, src_data, gvl);
2577
+
2578
+ p_input += gvl;
2579
+ p_temp_output += gvl;
2580
+ length -= gvl;
2581
+ }
2582
+
2583
+ gvl = __riscv_vsetvlmax_e32m1();
2584
+
2585
+ float mean = 0.f;
2586
+ vfloat32m1_t zero_v = __riscv_vfmv_v_f_f32m1(0.f, gvl);
2587
+ vfloat32m1_t mean_v =
2588
+ __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m4_f32m1(sum, 0), __riscv_vget_v_f32m4_f32m1(sum, 1), gvl);
2589
+ mean_v = __riscv_vfadd_vv_f32m1(mean_v, __riscv_vget_v_f32m4_f32m1(sum, 2), gvl);
2590
+ mean_v = __riscv_vfadd_vv_f32m1(mean_v, __riscv_vget_v_f32m4_f32m1(sum, 3), gvl);
2591
+ mean_v = __riscv_vfredusum_vs_f32m1_f32m1(mean_v, zero_v, gvl);
2592
+ mean = __riscv_vfmv_f_s_f32m1_f32(mean_v);
2593
+ mean /= hidden_size;
2594
+
2595
+ vfloat32m1_t mean_square_v =
2596
+ __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m4_f32m1(sum_sq, 0), __riscv_vget_v_f32m4_f32m1(sum_sq, 1), gvl);
2597
+ mean_square_v = __riscv_vfadd_vv_f32m1(mean_square_v, __riscv_vget_v_f32m4_f32m1(sum_sq, 2), gvl);
2598
+ mean_square_v = __riscv_vfadd_vv_f32m1(mean_square_v, __riscv_vget_v_f32m4_f32m1(sum_sq, 3), gvl);
2599
+ mean_square_v = __riscv_vfredusum_vs_f32m1_f32m1(mean_square_v, zero_v, gvl);
2600
+
2601
+ float mean_square = __riscv_vfmv_f_s_f32m1_f32(mean_square_v);
2602
+ mean_square /= hidden_size;
2603
+ mean_square = sqrt(mean_square - mean * mean + epsilon);
2604
+
2605
+ mean_square = 1.0f / mean_square;
2606
+ length = hidden_size;
2607
+ p_temp_output = p_output;
2608
+
2609
+ while (length > 0) {
2610
+ gvl = __riscv_vsetvl_e32m4(length);
2611
+ vfloat32m4_t src_data = __riscv_vle32_v_f32m4(p_temp_output, gvl);
2612
+ src_data = __riscv_vfsub_vf_f32m4(src_data, mean, gvl);
2613
+ src_data = __riscv_vfmul_vf_f32m4(src_data, mean_square, gvl);
2614
+ __riscv_vse32_v_f32m4(p_output, src_data, gvl);
2615
+ p_temp_output += gvl;
2616
+ p_output += gvl;
2617
+ length -= gvl;
2618
+ }
2619
+ }
2620
+ }
2621
+
2622
+ template <ggml_op op_type, typename T> void forward_binary(ggml_compute_params * params, ggml_tensor * op) {
2623
+ const ggml_tensor * src0 = op->src[0];
2624
+ const ggml_tensor * src1 = op->src[1];
2625
+ ggml_tensor * dst = op;
2626
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
2627
+
2628
+ auto src0_rows = ggml_nrows(src0);
2629
+ auto src1_rows = ggml_nrows(src1);
2630
+
2631
+ int ith = params->ith;
2632
+ int nth = params->nth;
2633
+
2634
+ GGML_TENSOR_BINARY_OP_LOCALS
2635
+
2636
+ GGML_ASSERT(nb0 == sizeof(T));
2637
+ GGML_ASSERT(nb00 == sizeof(T));
2638
+
2639
+ const auto [ir0, ir1] = get_thread_range(params, src0);
2640
+
2641
+ auto compute_func_vv = [&](int64_t blk_len, int64_t r, T * src0_ptr, T * src1_ptr, T * dst_ptr) {
2642
+ int64_t idx = 0;
2643
+ if constexpr (op_type == GGML_OP_ADD) {
2644
+ if constexpr (std::is_same_v<T, float>) {
2645
+ for (size_t vl; blk_len > 0; blk_len -= vl, idx += vl) {
2646
+ vl = __riscv_vsetvl_e32m4(blk_len);
2647
+ vfloat32m4_t lhs = __riscv_vle32_v_f32m4(src0_ptr + idx + r, vl);
2648
+ vfloat32m4_t rhs = __riscv_vle32_v_f32m4(src1_ptr + idx, vl);
2649
+ vfloat32m4_t res = __riscv_vfadd_vv_f32m4(lhs, rhs, vl);
2650
+ __riscv_vse32_v_f32m4(dst_ptr + idx + r, res, vl);
2651
+ }
2652
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2653
+ for (size_t vl; blk_len > 0; blk_len -= vl, idx += vl) {
2654
+ vl = __riscv_vsetvl_e16m4(blk_len);
2655
+ vfloat16m4_t lhs = __riscv_vle16_v_f16m4((src0_ptr + idx + r), vl);
2656
+ vfloat16m4_t rhs = __riscv_vle16_v_f16m4((src1_ptr + idx), vl);
2657
+ vfloat16m4_t res = __riscv_vfadd_vv_f16m4(lhs, rhs, vl);
2658
+ __riscv_vse16_v_f16m4((dst_ptr + idx + r), res, vl);
2659
+ }
2660
+ } else {
2661
+ GGML_ABORT("fatal error");
2662
+ }
2663
+ } else if constexpr (op_type == GGML_OP_SUB) {
2664
+ if constexpr (std::is_same_v<T, float>) {
2665
+ for (size_t vl; blk_len > 0; blk_len -= vl, idx += vl) {
2666
+ vl = __riscv_vsetvl_e32m4(blk_len);
2667
+ vfloat32m4_t lhs = __riscv_vle32_v_f32m4(src0_ptr + idx + r, vl);
2668
+ vfloat32m4_t rhs = __riscv_vle32_v_f32m4(src1_ptr + idx, vl);
2669
+ vfloat32m4_t res = __riscv_vfsub_vv_f32m4(lhs, rhs, vl);
2670
+ __riscv_vse32_v_f32m4(dst_ptr + idx + r, res, vl);
2671
+ }
2672
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2673
+ for (size_t vl; blk_len > 0; blk_len -= vl, idx += vl) {
2674
+ vl = __riscv_vsetvl_e16m4(blk_len);
2675
+ vfloat16m4_t lhs = __riscv_vle16_v_f16m4((src0_ptr + idx + r), vl);
2676
+ vfloat16m4_t rhs = __riscv_vle16_v_f16m4((src1_ptr + idx), vl);
2677
+ vfloat16m4_t res = __riscv_vfsub_vv_f16m4(lhs, rhs, vl);
2678
+ __riscv_vse16_v_f16m4((dst_ptr + idx + r), res, vl);
2679
+ }
2680
+ } else {
2681
+ GGML_ABORT("fatal error");
2682
+ }
2683
+ } else if constexpr (op_type == GGML_OP_MUL) {
2684
+ if constexpr (std::is_same_v<T, float>) {
2685
+ for (size_t vl; blk_len > 0; blk_len -= vl, idx += vl) {
2686
+ vl = __riscv_vsetvl_e32m4(blk_len);
2687
+ vfloat32m4_t lhs = __riscv_vle32_v_f32m4(src0_ptr + idx + r, vl);
2688
+ vfloat32m4_t rhs = __riscv_vle32_v_f32m4(src1_ptr + idx, vl);
2689
+ vfloat32m4_t res = __riscv_vfmul_vv_f32m4(lhs, rhs, vl);
2690
+ __riscv_vse32_v_f32m4(dst_ptr + idx + r, res, vl);
2691
+ }
2692
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2693
+ for (size_t vl; blk_len > 0; blk_len -= vl, idx += vl) {
2694
+ vl = __riscv_vsetvl_e16m4(blk_len);
2695
+ vfloat16m4_t lhs = __riscv_vle16_v_f16m4((src0_ptr + idx + r), vl);
2696
+ vfloat16m4_t rhs = __riscv_vle16_v_f16m4((src1_ptr + idx), vl);
2697
+ vfloat16m4_t res = __riscv_vfmul_vv_f16m4(lhs, rhs, vl);
2698
+ __riscv_vse16_v_f16m4((dst_ptr + idx + r), res, vl);
2699
+ }
2700
+ } else {
2701
+ GGML_ABORT("fatal error");
2702
+ }
2703
+ } else if constexpr (op_type == GGML_OP_DIV) {
2704
+ if constexpr (std::is_same_v<T, float>) {
2705
+ for (size_t vl; blk_len > 0; blk_len -= vl, idx += vl) {
2706
+ vl = __riscv_vsetvl_e32m4(blk_len);
2707
+ vfloat32m4_t lhs = __riscv_vle32_v_f32m4(src0_ptr + idx + r, vl);
2708
+ vfloat32m4_t rhs = __riscv_vle32_v_f32m4(src1_ptr + idx, vl);
2709
+ vfloat32m4_t res = __riscv_vfdiv_vv_f32m4(lhs, rhs, vl);
2710
+ __riscv_vse32_v_f32m4(dst_ptr + idx + r, res, vl);
2711
+ }
2712
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2713
+ for (size_t vl; blk_len > 0; blk_len -= vl, idx += vl) {
2714
+ vl = __riscv_vsetvl_e16m4(blk_len);
2715
+ vfloat16m4_t lhs = __riscv_vle16_v_f16m4((src0_ptr + idx + r), vl);
2716
+ vfloat16m4_t rhs = __riscv_vle16_v_f16m4((src1_ptr + idx), vl);
2717
+ vfloat16m4_t res = __riscv_vfdiv_vv_f16m4(lhs, rhs, vl);
2718
+ __riscv_vse16_v_f16m4((dst_ptr + idx + r), res, vl);
2719
+ }
2720
+ } else {
2721
+ GGML_ABORT("fatal error");
2722
+ }
2723
+ } else {
2724
+ GGML_ABORT("fatal error");
2725
+ }
2726
+ };
2727
+
2728
+ if (src0_rows == src1_rows && src0_rows == 1 && ne00 == ne10) {
2729
+ int64_t task_per_thread = (ne00 + nth - 1) / nth;
2730
+ int64_t task_begin = ith * task_per_thread;
2731
+ int64_t task_end = std::min((ith + 1) * task_per_thread, ne00);
2732
+
2733
+ T * dst_ptr = ((T *) dst->data) + task_begin;
2734
+ T * src0_ptr = ((T *) src0->data) + task_begin;
2735
+ T * src1_ptr = ((T *) src1->data) + task_begin;
2736
+
2737
+ compute_func_vv(task_end - task_begin, 0, src0_ptr, src1_ptr, dst_ptr);
2738
+ } else if (ne10 > 1) {
2739
+ for (int64_t ir = ir0; ir < ir1; ++ir) {
2740
+ const int64_t i03 = ir / (ne02 * ne01);
2741
+ const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
2742
+ const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
2743
+
2744
+ const int64_t i13 = i03 % ne13;
2745
+ const int64_t i12 = i02 % ne12;
2746
+ const int64_t i11 = i01 % ne11;
2747
+
2748
+ T * dst_ptr = (T *) ((char *) dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1);
2749
+ T * src0_ptr = (T *) ((char *) src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01);
2750
+ T * src1_ptr = (T *) ((char *) src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11);
2751
+
2752
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
2753
+ for (int64_t r = 0; r < ne00; r += ne10) {
2754
+ compute_func_vv(ne10, r, src0_ptr, src1_ptr, dst_ptr);
2755
+ }
2756
+ }
2757
+ } else {
2758
+ for (int64_t ir = ir0; ir < ir1; ++ir) {
2759
+ const int64_t i03 = ir / (ne02 * ne01);
2760
+ const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
2761
+ const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
2762
+
2763
+ const int64_t i13 = i03 % ne13;
2764
+ const int64_t i12 = i02 % ne12;
2765
+ const int64_t i11 = i01 % ne11;
2766
+
2767
+ T * dst_ptr = (T *) ((char *) dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1);
2768
+ T * src0_ptr = (T *) ((char *) src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01);
2769
+ T * src1_ptr = (T *) ((char *) src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11);
2770
+
2771
+ T rhs_scalar = src1_ptr[0];
2772
+ int64_t blk_len = ne00;
2773
+ int64_t r = 0;
2774
+
2775
+ for (size_t vl; blk_len > 0; blk_len -= vl, r += vl) {
2776
+ if constexpr (op_type == GGML_OP_ADD) {
2777
+ if constexpr (std::is_same_v<T, float>) {
2778
+ vl = __riscv_vsetvl_e32m4(blk_len);
2779
+ vfloat32m4_t lhs = __riscv_vle32_v_f32m4(src0_ptr + r, vl);
2780
+ vfloat32m4_t res = __riscv_vfadd_vf_f32m4(lhs, rhs_scalar, vl);
2781
+ __riscv_vse32_v_f32m4(dst_ptr + r, res, vl);
2782
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2783
+ vl = __riscv_vsetvl_e16m4(blk_len);
2784
+ vfloat16m4_t lhs = __riscv_vle16_v_f16m4((src0_ptr + r), vl);
2785
+ vfloat16m4_t res = __riscv_vfadd_vf_f16m4(lhs, rhs_scalar, vl);
2786
+ __riscv_vse16_v_f16m4((dst_ptr + r), res, vl);
2787
+ } else {
2788
+ GGML_ABORT("fatal error");
2789
+ }
2790
+ } else if constexpr (op_type == GGML_OP_SUB) {
2791
+ if constexpr (std::is_same_v<T, float>) {
2792
+ vl = __riscv_vsetvl_e32m4(blk_len);
2793
+ vfloat32m4_t lhs = __riscv_vle32_v_f32m4(src0_ptr + r, vl);
2794
+ vfloat32m4_t res = __riscv_vfsub_vf_f32m4(lhs, rhs_scalar, vl);
2795
+ __riscv_vse32_v_f32m4(dst_ptr + r, res, vl);
2796
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2797
+ vl = __riscv_vsetvl_e16m4(blk_len);
2798
+ vfloat16m4_t lhs = __riscv_vle16_v_f16m4((src0_ptr + r), vl);
2799
+ vfloat16m4_t res = __riscv_vfsub_vf_f16m4(lhs, rhs_scalar, vl);
2800
+ __riscv_vse16_v_f16m4((dst_ptr + r), res, vl);
2801
+ } else {
2802
+ GGML_ABORT("fatal error");
2803
+ }
2804
+ } else if constexpr (op_type == GGML_OP_MUL) {
2805
+ if constexpr (std::is_same_v<T, float>) {
2806
+ vl = __riscv_vsetvl_e32m4(blk_len);
2807
+ vfloat32m4_t lhs = __riscv_vle32_v_f32m4(src0_ptr + r, vl);
2808
+ vfloat32m4_t res = __riscv_vfmul_vf_f32m4(lhs, rhs_scalar, vl);
2809
+ __riscv_vse32_v_f32m4(dst_ptr + r, res, vl);
2810
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2811
+ vl = __riscv_vsetvl_e16m4(blk_len);
2812
+ vfloat16m4_t lhs = __riscv_vle16_v_f16m4((src0_ptr + r), vl);
2813
+ vfloat16m4_t res = __riscv_vfmul_vf_f16m4(lhs, rhs_scalar, vl);
2814
+ __riscv_vse16_v_f16m4((dst_ptr + r), res, vl);
2815
+ } else {
2816
+ GGML_ABORT("fatal error");
2817
+ }
2818
+ } else if constexpr (op_type == GGML_OP_DIV) {
2819
+ if constexpr (std::is_same_v<T, float>) {
2820
+ vl = __riscv_vsetvl_e32m4(blk_len);
2821
+ vfloat32m4_t lhs = __riscv_vle32_v_f32m4(src0_ptr + r, vl);
2822
+ vfloat32m4_t res = __riscv_vfdiv_vf_f32m4(lhs, rhs_scalar, vl);
2823
+ __riscv_vse32_v_f32m4(dst_ptr + r, res, vl);
2824
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2825
+ vl = __riscv_vsetvl_e16m4(blk_len);
2826
+ vfloat16m4_t lhs = __riscv_vle16_v_f16m4((src0_ptr + r), vl);
2827
+ vfloat16m4_t res = __riscv_vfdiv_vf_f16m4(lhs, rhs_scalar, vl);
2828
+ __riscv_vse16_v_f16m4((dst_ptr + r), res, vl);
2829
+ } else {
2830
+ GGML_ABORT("fatal error");
2831
+ }
2832
+ } else {
2833
+ GGML_ABORT("fatal error");
2834
+ }
2835
+ }
2836
+ }
2837
+ }
2838
+ }
2839
+
2840
+ template <typename T> void forward_sum_rows(const ggml_compute_params * params, ggml_tensor * op) {
2841
+ const ggml_tensor * src0 = op->src[0];
2842
+ ggml_tensor * dst = op;
2843
+
2844
+ const int ith = params->ith;
2845
+ const int nth = params->nth;
2846
+
2847
+ GGML_TENSOR_UNARY_OP_LOCALS
2848
+
2849
+ GGML_ASSERT(ne0 == 1);
2850
+ GGML_ASSERT(ne1 == ne01);
2851
+ GGML_ASSERT(ne2 == ne02);
2852
+ GGML_ASSERT(ne3 == ne03);
2853
+
2854
+ int64_t n_task = ne01 * ne02 * ne03;
2855
+ int64_t task_per_thread = (n_task + nth - 1) / nth;
2856
+ int64_t ir_start = ith * task_per_thread;
2857
+ int64_t ir_end = std::min(ir_start + task_per_thread, n_task);
2858
+
2859
+ for (int64_t ir = ir_start; ir < ir_end; ir++) {
2860
+ const int64_t i3 = ir / (ne02 * ne01);
2861
+ const int64_t i2 = (ir - i3 * ne02 * ne01) / ne01;
2862
+ const int64_t i1 = (ir - i3 * ne02 * ne01 - i2 * ne01);
2863
+
2864
+ T * src_row = (T *) ((char *) src0->data + i1 * nb01 + i2 * nb02 + i3 * nb03);
2865
+ T * dst_row = (T *) ((char *) op->data + i1 * nb1 + i2 * nb2 + i3 * nb3);
2866
+
2867
+ float row_sum = 0;
2868
+
2869
+ if constexpr (std::is_same_v<T, float>) {
2870
+ size_t gvl = __riscv_vsetvlmax_e32m4();
2871
+ vfloat32m4_t acc_vec = __riscv_vfmv_v_f_f32m4(0.0f, gvl);
2872
+ int64_t length = ne00;
2873
+ const float * p_data = src_row;
2874
+
2875
+ while (length > 0) {
2876
+ size_t vl = __riscv_vsetvl_e32m4(length);
2877
+ vfloat32m4_t vec = __riscv_vle32_v_f32m4(p_data, vl);
2878
+ acc_vec = __riscv_vfadd_vv_f32m4(acc_vec, vec, vl);
2879
+ p_data += vl;
2880
+ length -= vl;
2881
+ }
2882
+
2883
+ gvl = __riscv_vsetvlmax_e32m1();
2884
+ vfloat32m1_t zero_v = __riscv_vfmv_v_f_f32m1(0.0f, gvl);
2885
+ vfloat32m1_t sum_v = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m4_f32m1(acc_vec, 0),
2886
+ __riscv_vget_v_f32m4_f32m1(acc_vec, 1), gvl);
2887
+ sum_v = __riscv_vfadd_vv_f32m1(sum_v, __riscv_vget_v_f32m4_f32m1(acc_vec, 2), gvl);
2888
+ sum_v = __riscv_vfadd_vv_f32m1(sum_v, __riscv_vget_v_f32m4_f32m1(acc_vec, 3), gvl);
2889
+ sum_v = __riscv_vfredusum_vs_f32m1_f32m1(sum_v, zero_v, gvl);
2890
+ row_sum = __riscv_vfmv_f_s_f32m1_f32(sum_v);
2891
+ } else if constexpr (std::is_same_v<T, _Float16>) {
2892
+ size_t gvl = __riscv_vsetvlmax_e16m2();
2893
+ vfloat32m4_t acc_vec = __riscv_vfmv_v_f_f32m4(0.0f, gvl);
2894
+ int64_t length = ne00;
2895
+ const _Float16 * p_data = src_row;
2896
+
2897
+ while (length > 0) {
2898
+ size_t vl = __riscv_vsetvl_e16m2(length);
2899
+ vfloat16m2_t vec_f16 = __riscv_vle16_v_f16m2(p_data, vl);
2900
+ vfloat32m4_t vec_f32 = __riscv_vfwcvt_f_f_v_f32m4(vec_f16, vl);
2901
+ acc_vec = __riscv_vfadd_vv_f32m4(acc_vec, vec_f32, vl);
2902
+ p_data += vl;
2903
+ length -= vl;
2904
+ }
2905
+
2906
+ gvl = __riscv_vsetvlmax_e32m1();
2907
+ vfloat32m1_t zero_v = __riscv_vfmv_v_f_f32m1(0.0f, gvl);
2908
+ vfloat32m1_t sum_v = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m4_f32m1(acc_vec, 0),
2909
+ __riscv_vget_v_f32m4_f32m1(acc_vec, 1), gvl);
2910
+ sum_v = __riscv_vfadd_vv_f32m1(sum_v, __riscv_vget_v_f32m4_f32m1(acc_vec, 2), gvl);
2911
+ sum_v = __riscv_vfadd_vv_f32m1(sum_v, __riscv_vget_v_f32m4_f32m1(acc_vec, 3), gvl);
2912
+ sum_v = __riscv_vfredusum_vs_f32m1_f32m1(sum_v, zero_v, gvl);
2913
+ row_sum = __riscv_vfmv_f_s_f32m1_f32(sum_v);
2914
+ } else {
2915
+ GGML_ABORT("fatal error");
2916
+ }
2917
+
2918
+ dst_row[0] = row_sum;
2919
+ }
2920
+ }
2921
+
2922
+ template <typename T> void forward_repeat_nrows(ggml_compute_params * params, ggml_tensor * op) {
2923
+ const ggml_tensor * src0 = op->src[0];
2924
+ ggml_tensor * dst = op;
2925
+
2926
+ const int ith = params->ith;
2927
+ const int nth = params->nth;
2928
+
2929
+ int64_t nrows = ggml_nrows(src0);
2930
+ int64_t nrows_per_thread = (nrows + nth - 1) / nth;
2931
+ int64_t ir_start = ith * nrows_per_thread;
2932
+ int64_t ir_end = std::min(ir_start + nrows_per_thread, nrows);
2933
+
2934
+ if (src0->ne[0] == 1) {
2935
+ for (int64_t ir = ir_start; ir < ir_end; ir++) {
2936
+ T * src_row = (T *) ((char *) src0->data + ir * src0->nb[1]);
2937
+ T * dst_row = (T *) ((char *) dst->data + ir * dst->nb[1]);
2938
+
2939
+ T src_scalar = src_row[0];
2940
+
2941
+ int64_t length = dst->ne[0];
2942
+ int64_t idx = 0;
2943
+ size_t vl = 0;
2944
+
2945
+ while (length > 0) {
2946
+ if constexpr (std::is_same_v<T, int32_t>) {
2947
+ vl = __riscv_vsetvl_e32m4(length);
2948
+ vint32m4_t vec = __riscv_vmv_v_x_i32m4(src_scalar, vl);
2949
+ __riscv_vse32_v_i32m4(dst_row + idx, vec, vl);
2950
+ } else if constexpr (std::is_same_v<T, int16_t>) {
2951
+ vl = __riscv_vsetvl_e16m4(length);
2952
+ vint16m4_t vec = __riscv_vmv_v_x_i16m4(src_scalar, vl);
2953
+ __riscv_vse16_v_i16m4((dst_row + idx), vec, vl);
2954
+ } else {
2955
+ GGML_ABORT("fatal error");
2956
+ }
2957
+ idx += vl;
2958
+ length -= vl;
2959
+ }
2960
+ }
2961
+ } else if (src0->ne[0] == dst->ne[0]) {
2962
+ for (int64_t ir = ir_start; ir < ir_end; ir++) {
2963
+ T * src_row = (T *) ((char *) src0->data + ir * src0->nb[1]);
2964
+ T * dst_row = (T *) ((char *) dst->data + ir * dst->nb[1]);
2965
+
2966
+ int64_t length = dst->ne[0];
2967
+ int64_t idx = 0;
2968
+ size_t vl = 0;
2969
+
2970
+ while (length > 0) {
2971
+ if constexpr (std::is_same_v<T, int32_t>) {
2972
+ vl = __riscv_vsetvl_e32m4(length);
2973
+ vint32m4_t vec = __riscv_vle32_v_i32m4(src_row + idx, vl);
2974
+ __riscv_vse32_v_i32m4(dst_row + idx, vec, vl);
2975
+ } else if constexpr (std::is_same_v<T, int16_t>) {
2976
+ vl = __riscv_vsetvl_e16m4(length);
2977
+ vint16m4_t vec = __riscv_vle16_v_i16m4((src_row + idx), vl);
2978
+ __riscv_vse16_v_i16m4((dst_row + idx), vec, vl);
2979
+ } else {
2980
+ GGML_ABORT("fatal error");
2981
+ }
2982
+ idx += vl;
2983
+ length -= vl;
2984
+ }
2985
+ }
2986
+ } else {
2987
+ GGML_ABORT("fatal error");
2988
+ }
2989
+ }
2990
+
2991
+ template <typename T> void forward_repeat_dim1(ggml_compute_params * params, ggml_tensor * op) {
2992
+ const ggml_tensor * src0 = op->src[0];
2993
+ ggml_tensor * dst = op;
2994
+
2995
+ const int ith = params->ith;
2996
+ const int nth = params->nth;
2997
+
2998
+ const int64_t ne0 = dst->ne[0];
2999
+ const int64_t ne1 = dst->ne[1];
3000
+ const int64_t ne2 = dst->ne[2];
3001
+ const int64_t ne3 = dst->ne[3];
3002
+
3003
+ const int64_t total_batches = ne2 * ne3;
3004
+ const int64_t batches_per_thread = (total_batches + nth - 1) / nth;
3005
+ const int64_t batch_start = ith * batches_per_thread;
3006
+ const int64_t batch_end = std::min(batch_start + batches_per_thread, total_batches);
3007
+
3008
+ for (int64_t b = batch_start; b < batch_end; b++) {
3009
+ const int64_t i3 = b / ne2;
3010
+ const int64_t i2 = b % ne2;
3011
+
3012
+ T * src_base = (T *) ((char *) src0->data + i2 * src0->nb[2] + i3 * src0->nb[3]);
3013
+ T * dst_batch = (T *) ((char *) dst->data + i2 * dst->nb[2] + i3 * dst->nb[3]);
3014
+
3015
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
3016
+ T * dst_ptr = (T *) ((char *) dst_batch + i1 * dst->nb[1]);
3017
+ int64_t length = ne0;
3018
+ int64_t idx = 0;
3019
+
3020
+ while (length > 0) {
3021
+ if constexpr (std::is_same_v<T, int32_t>) {
3022
+ size_t vl = __riscv_vsetvl_e32m4(length);
3023
+ vint32m4_t vec = __riscv_vle32_v_i32m4(src_base + idx, vl);
3024
+ __riscv_vse32_v_i32m4(dst_ptr + idx, vec, vl);
3025
+ idx += vl;
3026
+ length -= vl;
3027
+ } else if constexpr (std::is_same_v<T, int16_t>) {
3028
+ size_t vl = __riscv_vsetvl_e16m4(length);
3029
+ vint16m4_t vec = __riscv_vle16_v_i16m4((src_base + idx), vl);
3030
+ __riscv_vse16_v_i16m4((dst_ptr + idx), vec, vl);
3031
+ idx += vl;
3032
+ length -= vl;
3033
+ } else {
3034
+ GGML_ABORT("fatal error");
3035
+ }
3036
+ }
3037
+ }
3038
+ }
3039
+ }
3040
+
3041
+ template <typename T> void forward_get_rows(ggml_compute_params * params, ggml_tensor * op) {
3042
+ const ggml_tensor * src0 = op->src[0];
3043
+ const ggml_tensor * src1 = op->src[1];
3044
+ ggml_tensor * dst = op;
3045
+
3046
+ GGML_TENSOR_BINARY_OP_LOCALS
3047
+
3048
+ const int64_t nc = ne00;
3049
+ const int64_t nr = ggml_nelements(src1);
3050
+
3051
+ assert(ne0 == nc);
3052
+ assert(ne02 == ne11);
3053
+ assert(nb00 == sizeof(float));
3054
+ assert(ggml_nrows(op) == nr);
3055
+
3056
+ const int ith = params->ith;
3057
+ const int nth = params->nth;
3058
+
3059
+ int rows_nth = nth;
3060
+ int cols_nth = 1;
3061
+
3062
+ if (nr == 1) {
3063
+ rows_nth = 1;
3064
+ cols_nth = nth;
3065
+ }
3066
+
3067
+ // rows per thread
3068
+ const int dr = (nr + rows_nth - 1) / rows_nth;
3069
+ const int dc = (nc + cols_nth - 1) / cols_nth;
3070
+
3071
+ int rows_ith = ith % rows_nth;
3072
+ int cols_ith = ith % cols_nth;
3073
+
3074
+ // row range for this thread
3075
+ const int ir0 = dr * rows_ith;
3076
+ const int ir1 = MIN(ir0 + dr, nr);
3077
+
3078
+ const int cr0 = dc * cols_ith;
3079
+ const int cr1 = MIN(cr0 + dc, nc);
3080
+
3081
+ for (int64_t i = ir0; i < ir1; ++i) {
3082
+ const int64_t i12 = i / (ne11 * ne10);
3083
+ const int64_t i11 = (i - i12 * ne11 * ne10) / ne10;
3084
+ const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10);
3085
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12);
3086
+
3087
+ GGML_ASSERT(i01 >= 0 && i01 < ne01);
3088
+
3089
+ memcpy1d(((char *) dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3) + cr0 * sizeof(T),
3090
+ ((char *) src0->data + i01 * nb01 + i11 * nb02 + i12 * nb03) + cr0 * sizeof(T),
3091
+ (cr1 - cr0) * sizeof(T));
3092
+ }
3093
+ }
3094
+
3095
+ template <typename T> void forward_concat(ggml_compute_params * params, ggml_tensor * op) {
3096
+ const ggml_tensor * src0 = op->src[0];
3097
+ const ggml_tensor * src1 = op->src[1];
3098
+ ggml_tensor * dst = op;
3099
+
3100
+ GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float));
3101
+
3102
+ GGML_TENSOR_BINARY_OP_LOCALS
3103
+
3104
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
3105
+
3106
+ GGML_ASSERT(dim == 0 && nb0 == sizeof(float) && nb1 == sizeof(float) * (ne00 + ne10));
3107
+
3108
+ const int64_t nr = ggml_nrows(dst);
3109
+ const int64_t nc = ne0;
3110
+
3111
+ const int ith = params->ith;
3112
+ const int nth = params->nth;
3113
+
3114
+ int rows_nth = nth;
3115
+ int cols_nth = 1;
3116
+
3117
+ if (nr == 1) {
3118
+ rows_nth = 1;
3119
+ cols_nth = nth;
3120
+ }
3121
+
3122
+ const int dr = (nr + rows_nth - 1) / rows_nth;
3123
+ const int dc = (nc + cols_nth - 1) / cols_nth;
3124
+
3125
+ int rows_ith = ith % rows_nth;
3126
+ int cols_ith = ith % cols_nth;
3127
+
3128
+ // row range for this thread
3129
+ const int ir0 = dr * rows_ith;
3130
+ const int ir1 = MIN(ir0 + dr, nr);
3131
+
3132
+ const int cr0 = dc * cols_ith;
3133
+ const int cr1 = MIN(cr0 + dc, nc);
3134
+
3135
+ int64_t o[4] = { 0, 0, 0, 0 };
3136
+ o[dim] = src0->ne[dim];
3137
+ const float * x;
3138
+
3139
+ for (int64_t i = ir0; i < ir1; ++i) {
3140
+ const int64_t i3 = i / (ne02 * ne01);
3141
+ const int64_t i2 = (i - i3 * ne02 * ne01) / ne01;
3142
+ const int64_t i1 = (i - i3 * ne02 * ne01 - i2 * ne01);
3143
+
3144
+ for (int i0 = cr0; i0 < cr1; i0++) {
3145
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
3146
+ x = (const float *) ((const char *) src0->data + (i0) *nb00 + (i1) *nb01 + (i2) *nb02 + (i3) *nb03);
3147
+ } else {
3148
+ x = (const float *) ((const char *) src1->data + (i0 - o[0]) * nb10 + (i1 - o[1]) * nb11 +
3149
+ (i2 - o[2]) * nb12 + (i3 - o[3]) * nb13);
3150
+ }
3151
+
3152
+ float * y = (float *) ((char *) dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
3153
+
3154
+ *y = *x;
3155
+ }
3156
+ }
3157
+ }
3158
+
3159
+ template void forward_binary<GGML_OP_ADD, float>(ggml_compute_params * params, ggml_tensor * op);  // Explicit instantiations: forward_binary for ADD/SUB/MUL/DIV on float elements.
3160
+ template void forward_binary<GGML_OP_SUB, float>(ggml_compute_params * params, ggml_tensor * op);
3161
+ template void forward_binary<GGML_OP_MUL, float>(ggml_compute_params * params, ggml_tensor * op);
3162
+ template void forward_binary<GGML_OP_DIV, float>(ggml_compute_params * params, ggml_tensor * op);
3163
+ template void forward_binary<GGML_OP_ADD, _Float16>(ggml_compute_params * params, ggml_tensor * op);  // Same four operations on _Float16 elements.
3164
+ template void forward_binary<GGML_OP_SUB, _Float16>(ggml_compute_params * params, ggml_tensor * op);
3165
+ template void forward_binary<GGML_OP_MUL, _Float16>(ggml_compute_params * params, ggml_tensor * op);
3166
+ template void forward_binary<GGML_OP_DIV, _Float16>(ggml_compute_params * params, ggml_tensor * op);
3167
+ template void forward_sum_rows<float>(const ggml_compute_params * params, ggml_tensor * op);  // Row sums for float and _Float16 inputs (note the const params pointer).
3168
+ template void forward_sum_rows<_Float16>(const ggml_compute_params * params, ggml_tensor * op);
3169
+ template void forward_repeat_nrows<int32_t>(ggml_compute_params * params, ggml_tensor * op);  // Repeat/gather/concat kernels are instantiated on integer types of 4 and 2 bytes.
3170
+ template void forward_repeat_nrows<int16_t>(ggml_compute_params * params, ggml_tensor * op);
3171
+ template void forward_repeat_dim1<int32_t>(ggml_compute_params * params, ggml_tensor * op);
3172
+ template void forward_repeat_dim1<int16_t>(ggml_compute_params * params, ggml_tensor * op);
3173
+ template void forward_get_rows<int32_t>(ggml_compute_params * params, ggml_tensor * op);
3174
+ template void forward_get_rows<int16_t>(ggml_compute_params * params, ggml_tensor * op);
3175
+ template void forward_concat<int32_t>(ggml_compute_params * params, ggml_tensor * op);
3176
+ template void forward_concat<int16_t>(ggml_compute_params * params, ggml_tensor * op);
3177
+
3178
+ } // namespace spacemit_kernels::rvv