toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,3680 @@
1
+ #include <assert.h>
2
+ #include <inttypes.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <time.h>
7
+
8
+ #include <atomic>
9
+ #include <chrono>
10
+ #include <mutex>
11
+ #include <thread>
12
+ #include <cstddef>
13
+ #include <stdexcept>
14
+ #include <string>
15
+ #include <sstream>
16
+ #include <iomanip>
17
+ #include <unordered_set>
18
+ #include <unordered_map>
19
+ #include <regex>
20
+ #include <queue>
21
+
22
+ #ifdef _WIN32
23
+ # include <sal.h>
24
+ #else
25
+ # include <semaphore.h>
26
+ # include <unistd.h>
27
+ #endif
28
+
29
+ #pragma clang diagnostic ignored "-Wnested-anon-types"
30
+ #pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
31
+
32
+ #include <AEEStdErr.h>
33
+ #include <dspqueue.h>
34
+ #include <rpcmem.h>
35
+
36
+ #define GGML_COMMON_IMPL_CPP
37
+ #include "ggml-backend-impl.h"
38
+ #include "ggml-common.h"
39
+ #include "ggml-hexagon.h"
40
+ #include "ggml-impl.h"
41
+ #include "ggml-quants.h"
42
+ #include "op-desc.h"
43
+ #include "htp-ops.h"
44
+ #include "htp_iface.h"
45
+ #include "htp-drv.h"
46
+
47
+ using intvec = std::vector<int>;
48
+ using uintvec = std::vector<unsigned int>;
49
+ using u32vec = std::vector<uint32_t>;
50
+
51
+ static int opt_arch = 0; // autodetect
52
+ static size_t opt_ndev = 1;
53
+ static size_t opt_nhvx = 0; // use all
54
+ static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
55
+ static size_t opt_vmem = HTP_OP_MAX_VMEM_DEFAULT; // max available va space for buffer mappings
56
+ static size_t opt_mbuf = 1ul * 1024 * 1024 * 1024; // max buffer size
57
+ static int opt_etm = 0;
58
+ static int opt_verbose = 0;
59
+ static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
60
+ static int opt_hostbuf = 1; // hostbuf ON by default
61
+
62
+ // Default PMU events, if profiling with PMU (mode=2) is enabled
63
+ // See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
64
+ // https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
65
+ static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
66
+
67
+ // Enable all stages by default
68
+ static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
69
+ static int opt_opbatch = 1024; // max number of ops in a batch
70
+ static int opt_opqueue = 16; // max number of pending batches
71
+
72
+ static std::regex* opt_opfilter = NULL; // regex of ops to not claim
73
+
74
// Emit a debug log line only when verbose logging (opt_verbose) is enabled.
// The body is wrapped in do { } while (0) so the macro expands to exactly one statement:
// a bare `if (...) call` would capture a following `else` when the macro is used inside
// an unbraced if/else at the call site.
#define HEX_VERBOSE(...)                 \
    do {                                 \
        if (opt_verbose) {               \
            GGML_LOG_DEBUG(__VA_ARGS__); \
        }                                \
    } while (0)
76
+
77
// Returns 1 when the low bits of addr selected by (align - 1) are all zero, 0 otherwise.
// The mask test only corresponds to "addr is a multiple of align" for power-of-two aligns.
static inline uint64_t hex_is_aligned(void * addr, uint32_t align) {
    const uint32_t low_bits_mask = align - 1;
    const size_t   address       = reinterpret_cast<size_t>(addr);
    return (address & low_bits_mask) == 0;
}
80
+
81
// Rounds n up to the nearest multiple of m (m must be non-zero).
static inline size_t hex_round_up(size_t n, size_t m) {
    const size_t chunks = (n + m - 1) / m;  // number of m-sized chunks needed to hold n
    return chunks * m;
}
84
+
85
+ static const char * status_to_str(uint32_t status) {
86
+ switch (status) {
87
+ case HTP_STATUS_OK:
88
+ return "OK";
89
+ case HTP_STATUS_NO_SUPPORT:
90
+ return "NO-SUPPORT";
91
+ case HTP_STATUS_INVAL_PARAMS:
92
+ return "INVAL-PARAMS";
93
+ case HTP_STATUS_VTCM_TOO_SMALL:
94
+ return "VTCM-TOO-SMALL";
95
+ case HTP_STATUS_INTERNAL_ERR:
96
+ return "INTERNAL-ERROR";
97
+ default:
98
+ return "UNKNOWN";
99
+ }
100
+ }
101
+
102
+ // ** debug helpers
103
+
104
+ static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) {
105
+ if (!opt_verbose) return;
106
+
107
+ op_desc desc(op);
108
+ GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(),
109
+ ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags);
110
+ }
111
+
112
+ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) {
113
+ if (!opt_verbose) return;
114
+
115
+ op_desc desc(op);
116
+ GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(),
117
+ ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no");
118
+ }
119
+
120
+ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
121
+ uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
122
+ if (!opt_profile) return;
123
+
124
+ char pmu_str[256] = "";
125
+ if (opt_profile > 1) {
126
+ static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
127
+ sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
128
+ pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
129
+ }
130
+
131
+ op_desc desc(op);
132
+ GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
133
+ ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
134
+ }
135
+
136
+ // ** backend sessions
137
+
138
+ struct ggml_hexagon_opbatch;
139
+ struct ggml_hexagon_opqueue;
140
+
141
// State of one backend session. All member functions are only declared here; their
// definitions are outside this excerpt, so the per-member notes below are limited to
// what the declarations themselves show.
struct ggml_hexagon_session {
    std::string name;             // exposed as a C string through c_name()
    remote_handle64 handle;
    dspqueue_t queue;
    uint32_t session_id;
    uint32_t domain_id;
    uint64_t queue_id;
    int dev_id;
    // NOTE(review): the valid_* flags presumably record which resources were set up
    // successfully so that release() can tear down only those - confirm against allocate()/release()
    bool valid_session;
    bool valid_handle;
    bool valid_queue;
    bool valid_iface;

    std::atomic<int> op_pending;
    ggml_hexagon_opbatch* op_batch;
    ggml_hexagon_opqueue* op_queue;

    ggml_backend_buffer_type buffer_type = {};
    ggml_backend_buffer_type repack_buffer_type = {};

    // Construction is declared noexcept(false), i.e. it may throw; destruction is noexcept
    ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
    ~ggml_hexagon_session() noexcept(true);

    // Session name as a NUL-terminated string; the pointer is only valid while `name` is unchanged
    const char* c_name() const { return name.c_str(); }

    // allocate() may throw, release() is declared non-throwing
    void allocate(int dev_id) noexcept(false);
    void release() noexcept(true);

    void enqueue_op(htp_op_code opcode, const ggml_tensor *op);
    void flush(bool all = true);

    // Note the different default compared to flush(): `all` defaults to false here
    void flush_pending(bool all = false);
    void flush_batch();
};
175
+
176
+ // ** backend buffers
177
+
178
+ struct ggml_backend_hexagon_buffer_type_context {
179
+ ggml_backend_hexagon_buffer_type_context(const std::string & name, ggml_hexagon_session * sess) {
180
+ this->sess = sess;
181
+ this->name = name;
182
+ }
183
+
184
+ ggml_hexagon_session * sess;
185
+ std::string name;
186
+ };
187
+
188
+ struct ggml_hexagon_shared_buffer {
189
+ ggml_hexagon_session * sess;
190
+ uint8_t * base;
191
+ size_t size;
192
+ int fd;
193
+ bool mapped;
194
+ bool pinned;
195
+
196
+ void mmap() {
197
+ fastrpc_map_flags flags = this->pinned ? FASTRPC_MAP_FD : FASTRPC_MAP_FD_DELAYED;
198
+
199
+ int err = fastrpc_mmap(sess->domain_id, this->fd, (void *) this->base, 0, this->size, flags);
200
+ if (err != 0) {
201
+ GGML_LOG_ERROR("ggml-hex: %s buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n", sess->c_name(),
202
+ sess->domain_id, this->size, this->fd, (unsigned) err);
203
+ throw std::runtime_error("ggml-hex: fastrpc_mmap failed (see log for details)");
204
+ }
205
+
206
+ HEX_VERBOSE("ggml-hex: %s mapped buffer: base %p size %zu fd %d pinned %u\n",
207
+ sess->c_name(), (void *) this->base, this->size, this->fd, pinned);
208
+
209
+ this->mapped = true;
210
+ }
211
+
212
+ void unmap() {
213
+ if (!this->mapped) return;
214
+
215
+ if (!this->pinned) {
216
+ // HTP might still hold a reference, tell it drop it
217
+ htp_iface_munmap(sess->handle, this->fd);
218
+ }
219
+
220
+ fastrpc_munmap(sess->domain_id, this->fd, (void *) this->base, this->size);
221
+
222
+ HEX_VERBOSE("ggml-hex: %s unmapped buffer: base %p size %zu fd %d\n", sess->c_name(),
223
+ (void *) this->base, size, this->fd);
224
+
225
+ this->mapped = false;
226
+ this->fd = -1;
227
+ }
228
+
229
+ void alloc(size_t size) {
230
+ if (this->base) return;
231
+
232
+ this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size);
233
+ if (!this->base) {
234
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->c_name(), size);
235
+ throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
236
+ }
237
+
238
+ this->fd = rpcmem_to_fd(this->base);
239
+ if (this->fd < 0) {
240
+ GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->c_name(), (void *) this->base);
241
+ throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)");
242
+ }
243
+ this->size = size;
244
+
245
+ HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d pinned %d\n", sess->c_name(),
246
+ (void *) this->base, this->size, this->fd, (int) pinned);
247
+ mmap();
248
+ }
249
+
250
+ void free() {
251
+ if (!this->base) return;
252
+
253
+ unmap();
254
+ rpcmem_free(this->base);
255
+
256
+ HEX_VERBOSE("ggml-hex: %s freed buffer: base %p size %zu fd %d\n", sess->c_name(),
257
+ (void *) this->base, size, this->fd);
258
+
259
+ this->base = NULL;
260
+ }
261
+
262
+ ggml_hexagon_shared_buffer(ggml_hexagon_session * sess, size_t size, bool pinned = false) {
263
+ this->sess = sess;
264
+ this->size = 0;
265
+ this->base = nullptr;
266
+ this->fd = -1;
267
+ this->mapped = false;
268
+ this->pinned = pinned;
269
+
270
+ alloc(size);
271
+ }
272
+
273
+ ~ggml_hexagon_shared_buffer() {
274
+ free();
275
+ }
276
+ };
277
+
278
+ static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
279
+ return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer->buft->context)->sess;
280
+ }
281
+
282
+ static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) {
283
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
284
+ delete sbuf;
285
+ }
286
+
287
+ static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) {
288
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
289
+ return sbuf->base;
290
+ }
291
+
292
+ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
293
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(buffer->context);
294
+ auto sess = sbuf->sess;
295
+
296
+ HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d\n", sess->c_name(),
297
+ tensor->name, (void *) sbuf->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage);
298
+
299
+ if (tensor->view_src != NULL && tensor->view_offs == 0) {
300
+ return GGML_STATUS_SUCCESS; // nothing to do for the view
301
+ }
302
+
303
+ return GGML_STATUS_SUCCESS;
304
+ }
305
+
306
+ // ======== Q4x4x2 ====================
307
// Pair of signed 4-bit values taken from one byte
struct x2_q4 {
    int v[2];
};

// Splits a byte into its low nibble (v[0]) and high nibble (v[1]), each shifted
// from the stored range [0, 15] to the signed range [-8, 7].
static x2_q4 unpack_q4(uint8_t v) {
    const int lo_nibble = (int) (v & 0x0f);
    const int hi_nibble = (int) (v >> 4);

    x2_q4 pair;
    pair.v[0] = lo_nibble - 8;
    pair.v[1] = hi_nibble - 8;
    return pair;
}
315
+
316
+ static void dump_block_q4_0(const block_q4_0 * b, int i) {
317
+ HEX_VERBOSE("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0],
318
+ unpack_q4(b->qs[1]).v[0], unpack_q4(b->qs[2]).v[0], unpack_q4(b->qs[3]).v[0], unpack_q4(b->qs[12]).v[1],
319
+ unpack_q4(b->qs[13]).v[1], unpack_q4(b->qs[14]).v[1], unpack_q4(b->qs[15]).v[1],
320
+ GGML_FP16_TO_FP32(b->d));
321
+ }
322
+
323
+ static void dump_packed_block_q4x4x2(const uint8_t * v, unsigned int i, size_t k) {
324
+ static const int qk = QK_Q4_0x4x2;
325
+ const int dblk_size = 8 * 2; // 8x __fp16
326
+ const int qblk_size = qk / 2; // int4
327
+ const int qrow_size = k / 2; // int4 (not padded)
328
+
329
+ const uint8_t * v_q = v + 0; // quants first
330
+ const uint8_t * v_d = v + qrow_size; // then scales
331
+
332
+ const uint8_t * q = v_q + i * qblk_size;
333
+ const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size);
334
+
335
+ HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
336
+ unpack_q4(q[0]).v[0], unpack_q4(q[1]).v[0], unpack_q4(q[2]).v[0], unpack_q4(q[3]).v[0],
337
+ unpack_q4(q[60]).v[0], unpack_q4(q[61]).v[0], unpack_q4(q[62]).v[0], unpack_q4(q[63]).v[0],
338
+ unpack_q4(q[124]).v[0], unpack_q4(q[125]).v[0], unpack_q4(q[126]).v[0], unpack_q4(q[127]).v[0],
339
+ GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3]));
340
+
341
+ HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
342
+ i + 1, unpack_q4(q[0]).v[1], unpack_q4(q[1]).v[1], unpack_q4(q[2]).v[1], unpack_q4(q[3]).v[1],
343
+ unpack_q4(q[60]).v[1], unpack_q4(q[61]).v[1], unpack_q4(q[62]).v[1], unpack_q4(q[63]).v[1],
344
+ unpack_q4(q[124]).v[1], unpack_q4(q[125]).v[1], unpack_q4(q[126]).v[1], unpack_q4(q[127]).v[1],
345
+ GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7]));
346
+ }
347
+
348
+ static void unpack_q4_0_quants(uint8_t * qs, const block_q4_0 * x, unsigned int bi) {
349
+ static const int qk = QK4_0;
350
+
351
+ for (unsigned int i = 0; i < qk / 2; ++i) {
352
+ const int x0 = (x->qs[i] & 0x0F);
353
+ const int x1 = (x->qs[i] >> 4);
354
+ qs[bi * qk + i + 0] = x0;
355
+ qs[bi * qk + i + qk / 2] = x1;
356
+ }
357
+ }
358
+
359
+ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi) {
360
+ static const int qk = QK4_0;
361
+
362
+ for (unsigned int i = 0; i < qk / 2; ++i) {
363
+ const uint8_t x0 = qs[bi * qk + i + 0];
364
+ const uint8_t x1 = qs[bi * qk + i + qk / 2];
365
+ x->qs[i] = x0 | (x1 << 4);
366
+ }
367
+ }
368
+
369
+ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
370
+ static const int qk = QK_Q4_0x4x2;
371
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
372
+ const int nloe = k % qk; // leftovers
373
+
374
+ const int dblk_size = 8 * 2; // 8x __fp16
375
+ const int qblk_size = qk / 2; // int4
376
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
377
+
378
+ uint8_t * y_q = y + 0; // quants first
379
+ uint8_t * y_d = y + qrow_size; // then scales
380
+
381
+ if (opt_verbose > 2) {
382
+ for (int i = 0; i < nb; i++) {
383
+ dump_block_q4_0(&x[i * 8 + 0], 0);
384
+ dump_block_q4_0(&x[i * 8 + 1], 1);
385
+ dump_block_q4_0(&x[i * 8 + 2], 2);
386
+ dump_block_q4_0(&x[i * 8 + 3], 3);
387
+ dump_block_q4_0(&x[i * 8 + 4], 4);
388
+ dump_block_q4_0(&x[i * 8 + 5], 5);
389
+ dump_block_q4_0(&x[i * 8 + 6], 6);
390
+ dump_block_q4_0(&x[i * 8 + 7], 7);
391
+ }
392
+ }
393
+
394
+ // Repack the quants
395
+ for (int i = 0; i < nb; i++) {
396
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
397
+ unpack_q4_0_quants(qs, &x[i * 8 + 0], 0);
398
+ unpack_q4_0_quants(qs, &x[i * 8 + 1], 1);
399
+ unpack_q4_0_quants(qs, &x[i * 8 + 2], 2);
400
+ unpack_q4_0_quants(qs, &x[i * 8 + 3], 3);
401
+ unpack_q4_0_quants(qs, &x[i * 8 + 4], 4);
402
+ unpack_q4_0_quants(qs, &x[i * 8 + 5], 5);
403
+ unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
404
+ unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
405
+
406
+ bool partial = (nloe && i == nb-1);
407
+
408
+ uint8_t * q = y_q + (i * qblk_size);
409
+ for (int j = 0; j < qk / 2; j++) {
410
+ q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
411
+ }
412
+ }
413
+
414
+ // Repack the scales
415
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
416
+ // the last block is truncated and overridden by the scales.
417
+ for (int i = 0; i < nb; i++) {
418
+ // Repack the scales
419
+ ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
420
+ d[0] = x[i * 8 + 0].d;
421
+ d[1] = x[i * 8 + 1].d;
422
+ d[2] = x[i * 8 + 2].d;
423
+ d[3] = x[i * 8 + 3].d;
424
+ d[4] = x[i * 8 + 4].d;
425
+ d[5] = x[i * 8 + 5].d;
426
+ d[6] = x[i * 8 + 6].d;
427
+ d[7] = x[i * 8 + 7].d;
428
+ }
429
+
430
+ if (opt_verbose > 2) {
431
+ for (int i = 0; i < nb; i++) {
432
+ dump_packed_block_q4x4x2(y, i, k);
433
+ }
434
+ }
435
+ }
436
+
437
+ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
438
+ static const int qk = QK_Q4_0x4x2;
439
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
440
+ const int nloe = k % qk; // leftovers
441
+
442
+ const int dblk_size = 8 * 2; // 8x __fp16
443
+ const int qblk_size = qk / 2; // int4
444
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
445
+
446
+ const uint8_t * y_q = y + 0; // quants first
447
+ const uint8_t * y_d = y + qrow_size; // then scales
448
+
449
+ if (opt_verbose > 2) {
450
+ for (int i = 0; i < nb; i++) {
451
+ dump_packed_block_q4x4x2(y, i, k);
452
+ }
453
+ }
454
+
455
+ // Unpack the quants
456
+ for (int i = 0; i < nb; i++) {
457
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
458
+
459
+ bool partial = (nloe && i == nb-1);
460
+
461
+ const uint8_t * q = y_q + (i * qblk_size);
462
+ for (int j = 0; j < qk / 2; j++) {
463
+ if (partial) {
464
+ qs[j*2+0] = q[j] & 0xf;
465
+ qs[j*2+1] = q[j] >> 4;
466
+ } else {
467
+ qs[j+000] = q[j] & 0xf;
468
+ qs[j+128] = q[j] >> 4;
469
+ }
470
+ }
471
+
472
+ pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
473
+ pack_q4_0_quants(&x[i * 8 + 1], qs, 1);
474
+ pack_q4_0_quants(&x[i * 8 + 2], qs, 2);
475
+ pack_q4_0_quants(&x[i * 8 + 3], qs, 3);
476
+ pack_q4_0_quants(&x[i * 8 + 4], qs, 4);
477
+ pack_q4_0_quants(&x[i * 8 + 5], qs, 5);
478
+ pack_q4_0_quants(&x[i * 8 + 6], qs, 6);
479
+ pack_q4_0_quants(&x[i * 8 + 7], qs, 7);
480
+ }
481
+
482
+ // Repack the scales
483
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
484
+ // the last block is truncated and overridden by the scales.
485
+ for (int i = 0; i < nb; i++) {
486
+ // Unpack the scales
487
+ const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
488
+ x[i * 8 + 0].d = d[0];
489
+ x[i * 8 + 1].d = d[1];
490
+ x[i * 8 + 2].d = d[2];
491
+ x[i * 8 + 3].d = d[3];
492
+ x[i * 8 + 4].d = d[4];
493
+ x[i * 8 + 5].d = d[5];
494
+ x[i * 8 + 6].d = d[6];
495
+ x[i * 8 + 7].d = d[7];
496
+ }
497
+
498
+ if (opt_verbose > 2) {
499
+ for (int i = 0; i < nb; i++) {
500
+ dump_block_q4_0(&x[i * 8 + 0], 0);
501
+ dump_block_q4_0(&x[i * 8 + 1], 1);
502
+ dump_block_q4_0(&x[i * 8 + 2], 2);
503
+ dump_block_q4_0(&x[i * 8 + 3], 3);
504
+ dump_block_q4_0(&x[i * 8 + 4], 4);
505
+ dump_block_q4_0(&x[i * 8 + 5], 5);
506
+ dump_block_q4_0(&x[i * 8 + 6], 6);
507
+ dump_block_q4_0(&x[i * 8 + 7], 7);
508
+ }
509
+ }
510
+ }
511
+
512
+ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
513
+ static const int qk = QK_Q4_0x4x2;
514
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
515
+
516
+ // Init the quants such that they unpack into zeros
517
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
518
+ memset(qs, 8, sizeof(qs));
519
+
520
+ for (int i = 0; i < nb; i++) {
521
+ pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
522
+ pack_q4_0_quants(&x[i * 8 + 1], qs, 1);
523
+ pack_q4_0_quants(&x[i * 8 + 2], qs, 2);
524
+ pack_q4_0_quants(&x[i * 8 + 3], qs, 3);
525
+ pack_q4_0_quants(&x[i * 8 + 4], qs, 4);
526
+ pack_q4_0_quants(&x[i * 8 + 5], qs, 5);
527
+ pack_q4_0_quants(&x[i * 8 + 6], qs, 6);
528
+ pack_q4_0_quants(&x[i * 8 + 7], qs, 7);
529
+ }
530
+
531
+ // Init the scales
532
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
533
+ // the last block is truncated and overridden by the scales.
534
+ for (int i = 0; i < nb; i++) {
535
+ // Unpack the scales
536
+ x[i * 8 + 0].d = 0;
537
+ x[i * 8 + 1].d = 0;
538
+ x[i * 8 + 2].d = 0;
539
+ x[i * 8 + 3].d = 0;
540
+ x[i * 8 + 4].d = 0;
541
+ x[i * 8 + 5].d = 0;
542
+ x[i * 8 + 6].d = 0;
543
+ x[i * 8 + 7].d = 0;
544
+ }
545
+ }
546
+
547
+ // repack q4_0 data into q4x4x2 tensor
548
+ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
549
+ int64_t nrows = ggml_nrows(t);
550
+
551
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
552
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
553
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
554
+
555
+ // Ensure we don't try to read more data than is available in the source buffer 'data'
556
+ // or write more than the tensor can hold.
557
+ const size_t total_tensor_size = (size_t)nrows * row_size;
558
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
559
+
560
+ // Calculate how many full rows and how many remaining bytes we need to process.
561
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
562
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
563
+
564
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
565
+ GGML_ASSERT(buf_pd != NULL);
566
+
567
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
568
+ GGML_ASSERT(buf_rp != NULL);
569
+
570
+ HEX_VERBOSE("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
571
+ t->ne[0], nrows, row_size);
572
+
573
+ init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
574
+
575
+ // 1. Process all the full rows
576
+ for (int64_t i = 0; i < n_full_rows; i++) {
577
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
578
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
579
+
580
+ memcpy(buf_pd, src, row_size);
581
+ repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
582
+ memcpy(dst, buf_rp, row_size);
583
+ }
584
+
585
+ // 2. Process the final, potentially partial, row
586
+ if (n_rem_bytes > 0) {
587
+ const int64_t i = n_full_rows;
588
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
589
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
590
+
591
+ // re-init the row because we are potentially copying a partial row
592
+ init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]);
593
+
594
+ // Copy only the remaining bytes from the source.
595
+ memcpy(buf_pd, src, n_rem_bytes);
596
+
597
+ // Repack the entire buffer
598
+ repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
599
+
600
+ // Write only the corresponding remaining bytes to the destination tensor.
601
+ memcpy(dst, buf_rp, n_rem_bytes);
602
+ }
603
+
604
+ ggml_aligned_free(buf_pd, row_size_pd);
605
+ ggml_aligned_free(buf_rp, row_size_rp);
606
+ }
607
+
608
+ // repack q4x4x2 tensor into q4_0 data
609
+ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) {
610
+ int64_t nrows = ggml_nrows(t);
611
+
612
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
613
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
614
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
615
+
616
+ // Ensure we don't try to copy more data than the tensor actually contains.
617
+ const size_t total_tensor_size = (size_t)nrows * row_size;
618
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
619
+
620
+ // Calculate how many full rows and how many remaining bytes we need to process.
621
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
622
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
623
+
624
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
625
+ GGML_ASSERT(buf_pd != NULL);
626
+
627
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
628
+ GGML_ASSERT(buf_rp != NULL);
629
+
630
+ HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
631
+ t->ne[0], nrows, row_size);
632
+
633
+ memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
634
+
635
+ // 1. Process all the full rows
636
+ for (int64_t i = 0; i < n_full_rows; i++) {
637
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
638
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
639
+
640
+ memcpy(buf_pd, src, row_size);
641
+ unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
642
+ memcpy(dst, buf_rp, row_size);
643
+ }
644
+
645
+ // 2. Process the final, potentially partial, row
646
+ if (n_rem_bytes > 0) {
647
+ const int64_t i = n_full_rows;
648
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
649
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
650
+
651
+ // We still need to read and unpack the entire source row because quantization is block-based.
652
+ memcpy(buf_pd, src, row_size);
653
+ unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
654
+
655
+ // But we only copy the remaining number of bytes to the destination.
656
+ memcpy(dst, buf_rp, n_rem_bytes);
657
+ }
658
+
659
+ ggml_aligned_free(buf_pd, row_size_pd);
660
+ ggml_aligned_free(buf_rp, row_size_rp);
661
+ }
662
+
663
+ // ======== Q8x4x2 ====================
664
+ static void dump_block_q8_0(const block_q8_0 * b, int i) {
665
+ HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
666
+ b->qs[3], b->qs[28], b->qs[29], b->qs[30], b->qs[31], GGML_FP16_TO_FP32(b->d));
667
+ }
668
+
669
+ static void dump_packed_block_q8x4x2(const uint8_t * v, unsigned int i, size_t k) {
670
+ static const int qk = QK_Q8_0x4x2;
671
+ const int dblk_size = 8 * 2; // 8x __fp16
672
+ const int qblk_size = qk; // int8
673
+ const int qrow_size = k; // int8 (not padded)
674
+
675
+ const uint8_t * v_q = v + 0; // quants first
676
+ const uint8_t * v_d = v + qrow_size; // then scales
677
+
678
+ const uint8_t * q = v_q + i * qblk_size;
679
+ const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size);
680
+
681
+ HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
682
+ q[0], q[1], q[2], q[3], q[60], q[61], q[62], q[63], q[124], q[125], q[126], q[127],
683
+ GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3]));
684
+
685
+ HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
686
+ i + 1, q[128], q[129], q[130], q[131], q[192], q[193], q[194], q[195], q[252], q[253], q[254], q[255],
687
+ GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7]));
688
+ }
689
+
690
+ static void unpack_q8_0_quants(uint8_t * qs, const block_q8_0 * x, unsigned int bi) {
691
+ static const int qk = QK8_0;
692
+
693
+ for (unsigned int i = 0; i < qk; ++i) {
694
+ qs[bi * qk + i] = x->qs[i];
695
+ }
696
+ }
697
+
698
+ static void pack_q8_0_quants(block_q8_0 * x, const uint8_t * qs, unsigned int bi) {
699
+ static const int qk = QK8_0;
700
+
701
+ for (unsigned int i = 0; i < qk; ++i) {
702
+ x->qs[i] = qs[bi * qk + i];
703
+ }
704
+ }
705
+
706
+ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
707
+ static const int qk = QK_Q8_0x4x2;
708
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
709
+
710
+ const int dblk_size = 8 * 2; // 8x __fp16
711
+ const int qblk_size = qk; // int8
712
+ const int qrow_size = k; // int8 (not padded to blocks)
713
+
714
+ uint8_t * y_q = y + 0; // quants first
715
+ uint8_t * y_d = y + qrow_size; // then scales
716
+
717
+ if (opt_verbose > 2) {
718
+ for (int i = 0; i < nb; i++) {
719
+ dump_block_q8_0(&x[i * 8 + 0], 0);
720
+ dump_block_q8_0(&x[i * 8 + 1], 1);
721
+ dump_block_q8_0(&x[i * 8 + 2], 2);
722
+ dump_block_q8_0(&x[i * 8 + 3], 3);
723
+ dump_block_q8_0(&x[i * 8 + 4], 4);
724
+ dump_block_q8_0(&x[i * 8 + 5], 5);
725
+ dump_block_q8_0(&x[i * 8 + 6], 6);
726
+ dump_block_q8_0(&x[i * 8 + 7], 7);
727
+ }
728
+ }
729
+
730
+ // Repack the quants
731
+ for (int i = 0; i < nb; i++) {
732
+ uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
733
+
734
+ unpack_q8_0_quants(qs, &x[i * 8 + 0], 0);
735
+ unpack_q8_0_quants(qs, &x[i * 8 + 1], 1);
736
+ unpack_q8_0_quants(qs, &x[i * 8 + 2], 2);
737
+ unpack_q8_0_quants(qs, &x[i * 8 + 3], 3);
738
+ unpack_q8_0_quants(qs, &x[i * 8 + 4], 4);
739
+ unpack_q8_0_quants(qs, &x[i * 8 + 5], 5);
740
+ unpack_q8_0_quants(qs, &x[i * 8 + 6], 6);
741
+ unpack_q8_0_quants(qs, &x[i * 8 + 7], 7);
742
+
743
+ uint8_t * q = y_q + (i * qblk_size);
744
+ for (int j = 0; j < qk; j++) {
745
+ q[j] = qs[j];
746
+ }
747
+ }
748
+
749
+ // Repack the scales
750
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
751
+ // the last block is truncated and overridden by the scales.
752
+ for (int i = 0; i < nb; i++) {
753
+ // Repack the scales
754
+ ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
755
+ d[0] = x[i * 8 + 0].d;
756
+ d[1] = x[i * 8 + 1].d;
757
+ d[2] = x[i * 8 + 2].d;
758
+ d[3] = x[i * 8 + 3].d;
759
+ d[4] = x[i * 8 + 4].d;
760
+ d[5] = x[i * 8 + 5].d;
761
+ d[6] = x[i * 8 + 6].d;
762
+ d[7] = x[i * 8 + 7].d;
763
+ }
764
+
765
+ if (opt_verbose > 2) {
766
+ for (int i = 0; i < nb; i++) {
767
+ dump_packed_block_q8x4x2(y, i, k);
768
+ }
769
+ }
770
+ }
771
+
772
+ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
773
+ static const int qk = QK_Q8_0x4x2;
774
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
775
+
776
+ const int dblk_size = 8 * 2; // 8x __fp16
777
+ const int qblk_size = qk; // int8
778
+ const int qrow_size = k; // int8 (not padded to blocks)
779
+
780
+ const uint8_t * y_q = y + 0; // quants first
781
+ const uint8_t * y_d = y + qrow_size; // then scales
782
+
783
+ if (opt_verbose > 2) {
784
+ for (int i = 0; i < nb; i++) {
785
+ dump_packed_block_q8x4x2(y, i, k);
786
+ }
787
+ }
788
+
789
+ // Unpack the quants
790
+ for (int i = 0; i < nb; i++) {
791
+ uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
792
+
793
+ const uint8_t * q = y_q + (i * qblk_size);
794
+ for (int j = 0; j < qk; j++) {
795
+ qs[j] = q[j];
796
+ }
797
+
798
+ pack_q8_0_quants(&x[i * 8 + 0], qs, 0);
799
+ pack_q8_0_quants(&x[i * 8 + 1], qs, 1);
800
+ pack_q8_0_quants(&x[i * 8 + 2], qs, 2);
801
+ pack_q8_0_quants(&x[i * 8 + 3], qs, 3);
802
+ pack_q8_0_quants(&x[i * 8 + 4], qs, 4);
803
+ pack_q8_0_quants(&x[i * 8 + 5], qs, 5);
804
+ pack_q8_0_quants(&x[i * 8 + 6], qs, 6);
805
+ pack_q8_0_quants(&x[i * 8 + 7], qs, 7);
806
+ }
807
+
808
+ // Repack the scales
809
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
810
+ // the last block is truncated and overridden by the scales.
811
+ for (int i = 0; i < nb; i++) {
812
+ // Unpack the scales
813
+ const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
814
+ x[i * 8 + 0].d = d[0];
815
+ x[i * 8 + 1].d = d[1];
816
+ x[i * 8 + 2].d = d[2];
817
+ x[i * 8 + 3].d = d[3];
818
+ x[i * 8 + 4].d = d[4];
819
+ x[i * 8 + 5].d = d[5];
820
+ x[i * 8 + 6].d = d[6];
821
+ x[i * 8 + 7].d = d[7];
822
+ }
823
+
824
+ if (opt_verbose > 2) {
825
+ for (int i = 0; i < nb; i++) {
826
+ dump_block_q8_0(&x[i * 8 + 0], 0);
827
+ dump_block_q8_0(&x[i * 8 + 1], 1);
828
+ dump_block_q8_0(&x[i * 8 + 2], 2);
829
+ dump_block_q8_0(&x[i * 8 + 3], 3);
830
+ dump_block_q8_0(&x[i * 8 + 4], 4);
831
+ dump_block_q8_0(&x[i * 8 + 5], 5);
832
+ dump_block_q8_0(&x[i * 8 + 6], 6);
833
+ dump_block_q8_0(&x[i * 8 + 7], 7);
834
+ }
835
+ }
836
+ }
837
+
838
+ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
839
+ static const int qk = QK_Q8_0x4x2;
840
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
841
+
842
+ // Init the quants such that they unpack into zeros
843
+ uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
844
+ memset(qs, 0, sizeof(qs));
845
+
846
+ for (int i = 0; i < nb; i++) {
847
+ pack_q8_0_quants(&x[i * 8 + 0], qs, 0);
848
+ pack_q8_0_quants(&x[i * 8 + 1], qs, 1);
849
+ pack_q8_0_quants(&x[i * 8 + 2], qs, 2);
850
+ pack_q8_0_quants(&x[i * 8 + 3], qs, 3);
851
+ pack_q8_0_quants(&x[i * 8 + 4], qs, 4);
852
+ pack_q8_0_quants(&x[i * 8 + 5], qs, 5);
853
+ pack_q8_0_quants(&x[i * 8 + 6], qs, 6);
854
+ pack_q8_0_quants(&x[i * 8 + 7], qs, 7);
855
+ }
856
+
857
+ // Init the scales
858
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
859
+ // the last block is truncated and overridden by the scales.
860
+ for (int i = 0; i < nb; i++) {
861
+ // Unpack the scales
862
+ x[i * 8 + 0].d = 0;
863
+ x[i * 8 + 1].d = 0;
864
+ x[i * 8 + 2].d = 0;
865
+ x[i * 8 + 3].d = 0;
866
+ x[i * 8 + 4].d = 0;
867
+ x[i * 8 + 5].d = 0;
868
+ x[i * 8 + 6].d = 0;
869
+ x[i * 8 + 7].d = 0;
870
+ }
871
+ }
872
+
873
+ // repack q8_0 data into q8x4x2 tensor
874
+ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) {
875
+ int64_t nrows = ggml_nrows(t);
876
+
877
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
878
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
879
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
880
+
881
+ // Ensure we don't try to read more data than is available in the source buffer 'data'
882
+ // or write more than the tensor can hold.
883
+ const size_t total_tensor_size = (size_t)nrows * row_size;
884
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
885
+
886
+ // Calculate how many full rows and how many remaining bytes we need to process.
887
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
888
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
889
+
890
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
891
+ GGML_ASSERT(buf_pd != NULL);
892
+
893
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
894
+ GGML_ASSERT(buf_rp != NULL);
895
+
896
+ HEX_VERBOSE("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
897
+ t->ne[0], nrows, row_size);
898
+
899
+ init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
900
+
901
+ // 1. Process all the full rows
902
+ for (int64_t i = 0; i < n_full_rows; i++) {
903
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
904
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
905
+
906
+ memcpy(buf_pd, src, row_size);
907
+ repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
908
+ memcpy(dst, buf_rp, row_size);
909
+ }
910
+
911
+ // 2. Process the final, potentially partial, row
912
+ if (n_rem_bytes > 0) {
913
+ const int64_t i = n_full_rows;
914
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
915
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
916
+
917
+ // re-init the row because we are potentially copying a partial row
918
+ init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]);
919
+
920
+ // Copy only the remaining bytes from the source.
921
+ memcpy(buf_pd, src, n_rem_bytes);
922
+
923
+ // Repack the entire buffer
924
+ repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
925
+
926
+ // Write only the corresponding remaining bytes to the destination tensor.
927
+ memcpy(dst, buf_rp, n_rem_bytes);
928
+ }
929
+
930
+ ggml_aligned_free(buf_pd, row_size_pd);
931
+ ggml_aligned_free(buf_rp, row_size_rp);
932
+ }
933
+
934
+ // repack q8x4x2 tensor into q8_0 data
935
+ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) {
936
+ int64_t nrows = ggml_nrows(t);
937
+
938
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
939
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
940
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
941
+
942
+ // Ensure we don't try to copy more data than the tensor actually contains.
943
+ const size_t total_tensor_size = (size_t)nrows * row_size;
944
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
945
+
946
+ // Calculate how many full rows and how many remaining bytes we need to process.
947
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
948
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
949
+
950
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
951
+ GGML_ASSERT(buf_pd != NULL);
952
+
953
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
954
+ GGML_ASSERT(buf_rp != NULL);
955
+
956
+ HEX_VERBOSE("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
957
+ t->ne[0], nrows, row_size);
958
+
959
+ memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
960
+
961
+ // 1. Process all the full rows
962
+ for (int64_t i = 0; i < n_full_rows; i++) {
963
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
964
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
965
+
966
+ memcpy(buf_pd, src, row_size);
967
+ unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
968
+ memcpy(dst, buf_rp, row_size);
969
+ }
970
+
971
+ // 2. Process the final, potentially partial, row
972
+ if (n_rem_bytes > 0) {
973
+ const int64_t i = n_full_rows;
974
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
975
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
976
+
977
+ // We still need to read and unpack the entire source row because quantization is block-based.
978
+ memcpy(buf_pd, src, row_size);
979
+ unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
980
+
981
+ // But we only copy the remaining number of bytes to the destination.
982
+ memcpy(dst, buf_rp, n_rem_bytes);
983
+ }
984
+
985
+ ggml_aligned_free(buf_pd, row_size_pd);
986
+ ggml_aligned_free(buf_rp, row_size_rp);
987
+ }
988
+
989
+ // ======== MXFP4x4x2 ====================
990
+ struct x2_mxfp4 {
991
+ int v[2];
992
+ };
993
+
994
+ static x2_mxfp4 unpack_mxfp4(uint8_t v) {
995
+ x2_mxfp4 x;
996
+ x.v[0] = kvalues_mxfp4[(v & 0x0f)];
997
+ x.v[1] = kvalues_mxfp4[(v >> 4)];
998
+ return x;
999
+ }
1000
+
1001
+ static void dump_block_mxfp4(const block_mxfp4 * b, int i) {
1002
+ HEX_VERBOSE("ggml-hex: repack mxfp4 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0],
1003
+ unpack_mxfp4(b->qs[1]).v[0], unpack_mxfp4(b->qs[2]).v[0], unpack_mxfp4(b->qs[3]).v[0],
1004
+ unpack_mxfp4(b->qs[12]).v[1], unpack_mxfp4(b->qs[13]).v[1], unpack_mxfp4(b->qs[14]).v[1],
1005
+ unpack_mxfp4(b->qs[15]).v[1], GGML_E8M0_TO_FP32_HALF(b->e));
1006
+ }
1007
+
1008
+ static void dump_packed_block_mxfp4x4x2(const uint8_t * v, unsigned int i, size_t k) {
1009
+ static const int qk = QK_MXFP4x4x2;
1010
+ const int eblk_size = 8 * 1; // 8x E8M0
1011
+ const int qblk_size = qk / 2; // int4
1012
+ const int qrow_size = k / 2; // int4 (not padded)
1013
+
1014
+ const uint8_t * v_q = v + 0; // quants first
1015
+ const uint8_t * v_e = v + qrow_size; // then scales
1016
+
1017
+ const uint8_t * q = v_q + i * qblk_size;
1018
+ const uint8_t * e = (const uint8_t *) (v_e + i * eblk_size);
1019
+
1020
+ HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
1021
+ unpack_mxfp4(q[0]).v[0], unpack_mxfp4(q[1]).v[0], unpack_mxfp4(q[2]).v[0], unpack_mxfp4(q[3]).v[0],
1022
+ unpack_mxfp4(q[60]).v[0], unpack_mxfp4(q[61]).v[0], unpack_mxfp4(q[62]).v[0], unpack_mxfp4(q[63]).v[0],
1023
+ unpack_mxfp4(q[124]).v[0], unpack_mxfp4(q[125]).v[0], unpack_mxfp4(q[126]).v[0],
1024
+ unpack_mxfp4(q[127]).v[0], GGML_E8M0_TO_FP32_HALF(e[0]), GGML_E8M0_TO_FP32_HALF(e[1]),
1025
+ GGML_E8M0_TO_FP32_HALF(e[2]), GGML_E8M0_TO_FP32_HALF(e[3]));
1026
+
1027
+ HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
1028
+ i + 1, unpack_mxfp4(q[0]).v[1], unpack_mxfp4(q[1]).v[1], unpack_mxfp4(q[2]).v[1],
1029
+ unpack_mxfp4(q[3]).v[1], unpack_mxfp4(q[60]).v[1], unpack_mxfp4(q[61]).v[1], unpack_mxfp4(q[62]).v[1],
1030
+ unpack_mxfp4(q[63]).v[1], unpack_mxfp4(q[124]).v[1], unpack_mxfp4(q[125]).v[1],
1031
+ unpack_mxfp4(q[126]).v[1], unpack_mxfp4(q[127]).v[1], GGML_E8M0_TO_FP32_HALF(e[4]),
1032
+ GGML_E8M0_TO_FP32_HALF(e[5]), GGML_E8M0_TO_FP32_HALF(e[6]), GGML_E8M0_TO_FP32_HALF(e[7]));
1033
+ }
1034
+
1035
+ static void unpack_mxfp4_quants(uint8_t * qs, const block_mxfp4 * x, unsigned int bi) {
1036
+ static const int qk = QK_MXFP4;
1037
+
1038
+ for (unsigned int i = 0; i < qk / 2; ++i) {
1039
+ const uint8_t x0 = (x->qs[i] & 0x0F);
1040
+ const uint8_t x1 = (x->qs[i] >> 4);
1041
+ qs[bi * qk + i + 0] = x0;
1042
+ qs[bi * qk + i + qk / 2] = x1;
1043
+ }
1044
+ }
1045
+
1046
+ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int bi) {
1047
+ static const int qk = QK4_0;
1048
+
1049
+ for (unsigned int i = 0; i < qk / 2; ++i) {
1050
+ const uint8_t x0 = qs[bi * qk + i + 0];
1051
+ const uint8_t x1 = qs[bi * qk + i + qk / 2];
1052
+ x->qs[i] = x0 | (x1 << 4);
1053
+ }
1054
+ }
1055
+
1056
+ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
1057
+ static const int qk = QK_MXFP4x4x2;
1058
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1059
+ const int nloe = k % qk; // leftovers
1060
+
1061
+ const int eblk_size = 8 * 1; // 8x E8M0
1062
+ const int qblk_size = qk / 2; // int4
1063
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
1064
+
1065
+ uint8_t * y_q = y + 0; // quants first
1066
+ uint8_t * y_e = y + qrow_size; // then scales
1067
+
1068
+ if (opt_verbose > 2) {
1069
+ for (int i = 0; i < nb; i++) {
1070
+ dump_block_mxfp4(&x[i * 8 + 0], 0);
1071
+ dump_block_mxfp4(&x[i * 8 + 1], 1);
1072
+ dump_block_mxfp4(&x[i * 8 + 2], 2);
1073
+ dump_block_mxfp4(&x[i * 8 + 3], 3);
1074
+ dump_block_mxfp4(&x[i * 8 + 4], 4);
1075
+ dump_block_mxfp4(&x[i * 8 + 5], 5);
1076
+ dump_block_mxfp4(&x[i * 8 + 6], 6);
1077
+ dump_block_mxfp4(&x[i * 8 + 7], 7);
1078
+ }
1079
+ }
1080
+
1081
+ // Repack the quants
1082
+ for (int i = 0; i < nb; i++) {
1083
+ uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
1084
+
1085
+ unpack_mxfp4_quants(qs, &x[i * 8 + 0], 0);
1086
+ unpack_mxfp4_quants(qs, &x[i * 8 + 1], 1);
1087
+ unpack_mxfp4_quants(qs, &x[i * 8 + 2], 2);
1088
+ unpack_mxfp4_quants(qs, &x[i * 8 + 3], 3);
1089
+ unpack_mxfp4_quants(qs, &x[i * 8 + 4], 4);
1090
+ unpack_mxfp4_quants(qs, &x[i * 8 + 5], 5);
1091
+ unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
1092
+ unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
1093
+
1094
+ bool partial = (nloe && i == nb-1);
1095
+
1096
+ uint8_t * q = y_q + (i * qblk_size);
1097
+ for (int j = 0; j < qk / 2; j++) {
1098
+ q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
1099
+ }
1100
+ }
1101
+
1102
+ // Repack the scales
1103
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
1104
+ // the last block is truncated and overridden by the scales.
1105
+ for (int i = 0; i < nb; i++) {
1106
+ // Repack the scales
1107
+ uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
1108
+ e[0] = x[i * 8 + 0].e;
1109
+ e[1] = x[i * 8 + 1].e;
1110
+ e[2] = x[i * 8 + 2].e;
1111
+ e[3] = x[i * 8 + 3].e;
1112
+ e[4] = x[i * 8 + 4].e;
1113
+ e[5] = x[i * 8 + 5].e;
1114
+ e[6] = x[i * 8 + 6].e;
1115
+ e[7] = x[i * 8 + 7].e;
1116
+ }
1117
+
1118
+ if (opt_verbose > 2) {
1119
+ for (int i = 0; i < nb; i++) {
1120
+ dump_packed_block_mxfp4x4x2(y, i, k);
1121
+ }
1122
+ }
1123
+ }
1124
+
1125
+ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
1126
+ static const int qk = QK_MXFP4x4x2;
1127
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1128
+ const int nloe = k % qk; // leftovers
1129
+
1130
+ const int eblk_size = 8 * 1; // 8x E8M0
1131
+ const int qblk_size = qk / 2; // int4
1132
+ const int qrow_size = k / 2; // int4 (not padded to blocks)
1133
+
1134
+ const uint8_t * y_q = y + 0; // quants first
1135
+ const uint8_t * y_e = y + qrow_size; // then scales
1136
+
1137
+ if (opt_verbose > 2) {
1138
+ for (int i = 0; i < nb; i++) {
1139
+ dump_packed_block_mxfp4x4x2(y, i, k);
1140
+ }
1141
+ }
1142
+
1143
+ // Unpack the quants
1144
+ for (int i = 0; i < nb; i++) {
1145
+ uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
1146
+
1147
+ bool partial = (nloe && i == nb-1);
1148
+
1149
+ const uint8_t * q = y_q + (i * qblk_size);
1150
+ for (int j = 0; j < qk / 2; j++) {
1151
+ if (partial) {
1152
+ qs[j*2+0] = q[j] & 0xf;
1153
+ qs[j*2+1] = q[j] >> 4;
1154
+ } else {
1155
+ qs[j+000] = q[j] & 0xf;
1156
+ qs[j+128] = q[j] >> 4;
1157
+ }
1158
+ }
1159
+
1160
+ pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
1161
+ pack_mxfp4_quants(&x[i * 8 + 1], qs, 1);
1162
+ pack_mxfp4_quants(&x[i * 8 + 2], qs, 2);
1163
+ pack_mxfp4_quants(&x[i * 8 + 3], qs, 3);
1164
+ pack_mxfp4_quants(&x[i * 8 + 4], qs, 4);
1165
+ pack_mxfp4_quants(&x[i * 8 + 5], qs, 5);
1166
+ pack_mxfp4_quants(&x[i * 8 + 6], qs, 6);
1167
+ pack_mxfp4_quants(&x[i * 8 + 7], qs, 7);
1168
+ }
1169
+
1170
+ // Repack the scales
1171
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
1172
+ // the last block is truncated and overridden by the scales.
1173
+ for (int i = 0; i < nb; i++) {
1174
+ // Unpack the scales
1175
+ const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
1176
+ x[i * 8 + 0].e = e[0];
1177
+ x[i * 8 + 1].e = e[1];
1178
+ x[i * 8 + 2].e = e[2];
1179
+ x[i * 8 + 3].e = e[3];
1180
+ x[i * 8 + 4].e = e[4];
1181
+ x[i * 8 + 5].e = e[5];
1182
+ x[i * 8 + 6].e = e[6];
1183
+ x[i * 8 + 7].e = e[7];
1184
+ }
1185
+
1186
+ if (opt_verbose > 2) {
1187
+ for (int i = 0; i < nb; i++) {
1188
+ dump_block_mxfp4(&x[i * 8 + 0], 0);
1189
+ dump_block_mxfp4(&x[i * 8 + 1], 1);
1190
+ dump_block_mxfp4(&x[i * 8 + 2], 2);
1191
+ dump_block_mxfp4(&x[i * 8 + 3], 3);
1192
+ dump_block_mxfp4(&x[i * 8 + 4], 4);
1193
+ dump_block_mxfp4(&x[i * 8 + 5], 5);
1194
+ dump_block_mxfp4(&x[i * 8 + 6], 6);
1195
+ dump_block_mxfp4(&x[i * 8 + 7], 7);
1196
+ }
1197
+ }
1198
+ }
1199
+
1200
+ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
1201
+ static const int qk = QK_MXFP4x4x2;
1202
+ const int nb = (k + qk - 1) / qk; // number of blocks (padded)
1203
+
1204
+ // Init the quants such that they unpack into zeros
1205
+ uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
1206
+ memset(qs, 0, sizeof(qs));
1207
+
1208
+ for (int i = 0; i < nb; i++) {
1209
+ pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
1210
+ pack_mxfp4_quants(&x[i * 8 + 1], qs, 1);
1211
+ pack_mxfp4_quants(&x[i * 8 + 2], qs, 2);
1212
+ pack_mxfp4_quants(&x[i * 8 + 3], qs, 3);
1213
+ pack_mxfp4_quants(&x[i * 8 + 4], qs, 4);
1214
+ pack_mxfp4_quants(&x[i * 8 + 5], qs, 5);
1215
+ pack_mxfp4_quants(&x[i * 8 + 6], qs, 6);
1216
+ pack_mxfp4_quants(&x[i * 8 + 7], qs, 7);
1217
+ }
1218
+
1219
+ // Init the scales
1220
+ // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
1221
+ // the last block is truncated and overridden by the scales.
1222
+ for (int i = 0; i < nb; i++) {
1223
+ // Unpack the scales
1224
+ x[i * 8 + 0].e = 0;
1225
+ x[i * 8 + 1].e = 0;
1226
+ x[i * 8 + 2].e = 0;
1227
+ x[i * 8 + 3].e = 0;
1228
+ x[i * 8 + 4].e = 0;
1229
+ x[i * 8 + 5].e = 0;
1230
+ x[i * 8 + 6].e = 0;
1231
+ x[i * 8 + 7].e = 0;
1232
+ }
1233
+ }
1234
+
1235
+ // repack mxfp4 data into mxfp4x4x2 tensor
1236
+ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size) {
1237
+ int64_t nrows = ggml_nrows(t);
1238
+
1239
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
1240
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
1241
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1242
+
1243
+ // Ensure we don't try to read more data than is available in the source buffer 'data'
1244
+ // or write more than the tensor can hold.
1245
+ const size_t total_tensor_size = (size_t)nrows * row_size;
1246
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
1247
+
1248
+ // Calculate how many full rows and how many remaining bytes we need to process.
1249
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
1250
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
1251
+
1252
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
1253
+ GGML_ASSERT(buf_pd != NULL);
1254
+
1255
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
1256
+ GGML_ASSERT(buf_rp != NULL);
1257
+
1258
+ HEX_VERBOSE("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data,
1259
+ size, t->ne[0], nrows, row_size);
1260
+
1261
+ init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
1262
+
1263
+ // 1. Process all the full rows
1264
+ for (int64_t i = 0; i < n_full_rows; i++) {
1265
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
1266
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
1267
+
1268
+ memcpy(buf_pd, src, row_size);
1269
+ repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);
1270
+ memcpy(dst, buf_rp, row_size);
1271
+ }
1272
+
1273
+ // 2. Process the final, potentially partial, row
1274
+ if (n_rem_bytes > 0) {
1275
+ const int64_t i = n_full_rows;
1276
+ const uint8_t * src = (const uint8_t *) data + (i * row_size);
1277
+ uint8_t * dst = (uint8_t *) t->data + (i * row_size);
1278
+
1279
+ // re-init the row because we are potentially copying a partial row
1280
+ init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]);
1281
+
1282
+ // Copy only the remaining bytes from the source.
1283
+ memcpy(buf_pd, src, n_rem_bytes);
1284
+
1285
+ // Repack the entire buffer (partial data + zero padding).
1286
+ repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);
1287
+
1288
+ // Write only the corresponding remaining bytes to the destination tensor.
1289
+ memcpy(dst, buf_rp, n_rem_bytes);
1290
+ }
1291
+
1292
+ ggml_aligned_free(buf_pd, row_size_pd);
1293
+ ggml_aligned_free(buf_rp, row_size_rp);
1294
+ }
1295
+
1296
+ // repack mxfp4x4x2 tensor into mxfp4 data
1297
+ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size) {
1298
+ int64_t nrows = ggml_nrows(t);
1299
+
1300
+ size_t row_size = ggml_row_size(t->type, t->ne[0]);
1301
+ size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
1302
+ size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
1303
+
1304
+ // Ensure we don't try to copy more data than the tensor actually contains.
1305
+ const size_t total_tensor_size = (size_t)nrows * row_size;
1306
+ const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
1307
+
1308
+ // Calculate how many full rows and how many remaining bytes we need to process.
1309
+ const int64_t n_full_rows = n_bytes_to_copy / row_size;
1310
+ const size_t n_rem_bytes = n_bytes_to_copy % row_size;
1311
+
1312
+ void * buf_pd = ggml_aligned_malloc(row_size_pd);
1313
+ GGML_ASSERT(buf_pd != NULL);
1314
+
1315
+ void * buf_rp = ggml_aligned_malloc(row_size_rp);
1316
+ GGML_ASSERT(buf_rp != NULL);
1317
+
1318
+ HEX_VERBOSE("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data,
1319
+ size, t->ne[0], nrows, row_size);
1320
+
1321
+ memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
1322
+
1323
+ // 1. Process all the full rows
1324
+ for (int64_t i = 0; i < n_full_rows; i++) {
1325
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
1326
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
1327
+
1328
+ memcpy(buf_pd, src, row_size);
1329
+ unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
1330
+ memcpy(dst, buf_rp, row_size);
1331
+ }
1332
+
1333
+ // 2. Process the final, potentially partial, row
1334
+ if (n_rem_bytes > 0) {
1335
+ const int64_t i = n_full_rows;
1336
+ const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
1337
+ uint8_t * dst = (uint8_t *) data + (i * row_size);
1338
+
1339
+ // We still need to read and unpack the entire source row because the format is block-based.
1340
+ memcpy(buf_pd, src, row_size);
1341
+ unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
1342
+
1343
+ // But we only copy the remaining number of bytes to the destination to respect the size limit.
1344
+ memcpy(dst, buf_rp, n_rem_bytes);
1345
+ }
1346
+
1347
+ ggml_aligned_free(buf_pd, row_size_pd);
1348
+ ggml_aligned_free(buf_rp, row_size_rp);
1349
+ }
1350
+
1351
+ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
1352
+ ggml_tensor * tensor,
1353
+ const void * data,
1354
+ size_t offset,
1355
+ size_t size) {
1356
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1357
+ auto sess = sbuf->sess;
1358
+
1359
+ HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
1360
+
1361
+ switch (tensor->type) {
1362
+ case GGML_TYPE_Q4_0:
1363
+ GGML_ASSERT(offset == 0);
1364
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1365
+ repack_q4_0_q4x4x2(tensor, data, size);
1366
+ break;
1367
+
1368
+ case GGML_TYPE_Q8_0:
1369
+ GGML_ASSERT(offset == 0);
1370
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1371
+ repack_q8_0_q8x4x2(tensor, data, size);
1372
+ break;
1373
+
1374
+ case GGML_TYPE_IQ4_NL:
1375
+ GGML_ASSERT(offset == 0);
1376
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1377
+ // IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
1378
+ repack_q4_0_q4x4x2(tensor, data, size);
1379
+ break;
1380
+
1381
+ case GGML_TYPE_MXFP4:
1382
+ GGML_ASSERT(offset == 0);
1383
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1384
+ repack_mxfp4_mxfp4x4x2(tensor, data, size);
1385
+ break;
1386
+
1387
+ default:
1388
+ memcpy((char *) tensor->data + offset, data, size);
1389
+ break;
1390
+ }
1391
+ }
1392
+
1393
+ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
1394
+ const ggml_tensor * tensor,
1395
+ void * data,
1396
+ size_t offset,
1397
+ size_t size) {
1398
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1399
+ auto sess = sbuf->sess;
1400
+
1401
+ HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->c_name(), tensor->name, data, offset, size);
1402
+
1403
+ switch (tensor->type) {
1404
+ case GGML_TYPE_Q4_0:
1405
+ GGML_ASSERT(offset == 0);
1406
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1407
+ repack_q4x4x2_q4_0(data, tensor, size);
1408
+ break;
1409
+
1410
+ case GGML_TYPE_Q8_0:
1411
+ GGML_ASSERT(offset == 0);
1412
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1413
+ repack_q8x4x2_q8_0(data, tensor, size);
1414
+ break;
1415
+
1416
+ case GGML_TYPE_IQ4_NL:
1417
+ GGML_ASSERT(offset == 0);
1418
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1419
+ repack_q4x4x2_q4_0(data, tensor, size);
1420
+ break;
1421
+
1422
+ case GGML_TYPE_MXFP4:
1423
+ GGML_ASSERT(offset == 0);
1424
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
1425
+ repack_mxfp4x4x2_mxfp4(data, tensor, size);
1426
+ break;
1427
+
1428
+ default:
1429
+ memcpy(data, (const char *) tensor->data + offset, size);
1430
+ break;
1431
+ }
1432
+ }
1433
+
1434
+ static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
1435
+ const struct ggml_tensor * src,
1436
+ struct ggml_tensor * dst) {
1437
+ GGML_UNUSED(buffer);
1438
+ GGML_UNUSED(src);
1439
+ GGML_UNUSED(dst);
1440
+ // we might optimize this later, for now take the slow path (ie get/set_tensor)
1441
+ return false;
1442
+ }
1443
+
1444
+ static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1445
+ auto sbuf = (ggml_hexagon_shared_buffer *) buffer->context;
1446
+ auto sess = sbuf->sess;
1447
+ HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->c_name(), (void *) sbuf->base, sbuf->size);
1448
+ memset(sbuf->base, value, sbuf->size);
1449
+ }
1450
+
1451
+ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
1452
+ /* .free_buffer = */ ggml_backend_hexagon_buffer_free_buffer,
1453
+ /* .get_base = */ ggml_backend_hexagon_buffer_get_base,
1454
+ /* .init_tensor = */ ggml_backend_hexagon_buffer_init_tensor,
1455
+ /* .memset_tensor = */ NULL,
1456
+ /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor,
1457
+ /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor,
1458
+ /* .set_tensor_2d = */ NULL,
1459
+ /* .get_tensor_2d = */ NULL,
1460
+ /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor,
1461
+ /* .clear = */ ggml_backend_hexagon_buffer_clear,
1462
+ /* .reset = */ NULL,
1463
+ };
1464
+
1465
+ // ** backend buffer type
1466
+
1467
+ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
1468
+ return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->name.c_str();
1469
+ }
1470
+
1471
+ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
1472
+ ggml_backend_buffer_type_t buffer_type, size_t size) {
1473
+ auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
1474
+ try {
1475
+ size += 4 * 1024; // guard page
1476
+ ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
1477
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
1478
+ } catch (const std::exception & exc) {
1479
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (host): %s\n", sess->c_name(), exc.what());
1480
+ return nullptr;
1481
+ }
1482
+ }
1483
+
1484
+ static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer(
1485
+ ggml_backend_buffer_type_t buffer_type, size_t size) {
1486
+ auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
1487
+ try {
1488
+ size += 4 * 1024; // guard page
1489
+ ggml_hexagon_shared_buffer * sbuf = new ggml_hexagon_shared_buffer(sess, size);
1490
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, sbuf, size);
1491
+ } catch (const std::exception & exc) {
1492
+ GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context (repack): %s\n", sess->c_name(), exc.what());
1493
+ return nullptr;
1494
+ }
1495
+ }
1496
+
1497
+ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
1498
+ return 128; // HVX alignment
1499
+ GGML_UNUSED(buffer_type);
1500
+ }
1501
+
1502
+ static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) {
1503
+ return ggml_nbytes(t);
1504
+ }
1505
+
1506
+ static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
1507
+ return opt_mbuf; // typically 1GB per buffer
1508
+ GGML_UNUSED(buffer_type);
1509
+ }
1510
+
1511
+ static bool ggml_backend_hexagon_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1512
+ return opt_hostbuf;
1513
+ GGML_UNUSED(buft);
1514
+ }
1515
+
1516
+ static bool ggml_backend_hexagon_repack_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
1517
+ return false;
1518
+ GGML_UNUSED(buft);
1519
+ }
1520
+
1521
+ static ggml_backend_buffer_type_i ggml_backend_hexagon_buffer_type_interface = {
1522
+ /* .get_name = */ ggml_backend_hexagon_buffer_type_name,
1523
+ /* .alloc_buffer = */ ggml_backend_hexagon_buffer_type_alloc_buffer,
1524
+ /* .get_alignment = */ ggml_backend_hexagon_buffer_type_get_alignment,
1525
+ /* .get_max_size = */ ggml_backend_hexagon_buffer_type_get_max_size,
1526
+ /* .get_alloc_size = */ ggml_backend_hexagon_buffer_type_get_alloc_size,
1527
+ /* .is_host = */ ggml_backend_hexagon_buffer_type_is_host,
1528
+ };
1529
+
1530
+ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interface = {
1531
+ /* .get_name = */ ggml_backend_hexagon_buffer_type_name,
1532
+ /* .alloc_buffer = */ ggml_backend_hexagon_repack_buffer_type_alloc_buffer,
1533
+ /* .get_alignment = */ ggml_backend_hexagon_buffer_type_get_alignment,
1534
+ /* .get_max_size = */ ggml_backend_hexagon_buffer_type_get_max_size,
1535
+ /* .get_alloc_size = */ ggml_backend_hexagon_buffer_type_get_alloc_size,
1536
+ /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host,
1537
+ };
1538
+
1539
+ // Backend session implementation
1540
+
1541
+ struct ggml_hexagon_opbatch {
1542
+ ggml_hexagon_session* sess;
1543
+
1544
+ std::vector<const ggml_tensor*> ops; // pointers to original ops
1545
+
1546
+ std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
1547
+ std::vector<htp_tensor> h_tens; // htp tensor descriptors
1548
+ std::vector<htp_op_desc> h_ops; // htp op descriptors
1549
+
1550
+ std::unordered_map<int, int> b_map; // buffer fd to index
1551
+ std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
1552
+ std::unordered_multimap<void*, int> d_map; // tensor data to index
1553
+
1554
+ unsigned int n_bufs; // num buffers in the batch
1555
+ unsigned int n_tens; // num tensors ...
1556
+ unsigned int n_ops; // num ops ...
1557
+ size_t b_vmem; // sum of all buffer sizes
1558
+
1559
+ unsigned int n_bufs_max;
1560
+ unsigned int n_tens_max;
1561
+ unsigned int n_ops_max;
1562
+ size_t b_vmem_max;
1563
+
1564
+ void reset() {
1565
+ n_bufs = 0;
1566
+ n_tens = 0;
1567
+ n_ops = 0;
1568
+ b_vmem = 0;
1569
+
1570
+ b_map.clear();
1571
+ t_map.clear();
1572
+ d_map.clear();
1573
+ }
1574
+
1575
+ ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size, size_t max_vmem) {
1576
+ this->sess = sess;
1577
+
1578
+ n_bufs_max = HTP_OP_MAX_BUFS;
1579
+ n_ops_max = batch_size;
1580
+ n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
1581
+
1582
+ b_vmem_max = max_vmem;
1583
+
1584
+ ops.resize(n_ops_max);
1585
+
1586
+ h_bufs.resize(n_bufs_max);
1587
+ h_tens.resize(n_tens_max);
1588
+ h_ops.resize(n_ops_max);
1589
+
1590
+ b_map.reserve(n_bufs_max);
1591
+ t_map.reserve(n_tens_max);
1592
+ d_map.reserve(n_tens_max);
1593
+
1594
+ GGML_LOG_INFO("ggml-hex: %s op batching: n-bufs %u n-tensors %u n-ops %u vmem %zu\n",
1595
+ sess->c_name(), n_bufs_max, n_tens_max, n_ops_max, b_vmem_max);
1596
+
1597
+ reset();
1598
+ }
1599
+
1600
+ bool empty() const { return n_ops == 0; }
1601
+
1602
+ // add buffer and return its index
1603
+ int add_buffer(ggml_hexagon_shared_buffer * sbuf) {
1604
+ // Lookup by fd
1605
+ auto it = b_map.find(sbuf->fd);
1606
+ if (it != b_map.end()) { return it->second; }
1607
+
1608
+ // Add new buffer to the batch
1609
+ int bi = n_bufs++;
1610
+ GGML_ASSERT(n_bufs < HTP_OP_MAX_BUFS);
1611
+
1612
+ b_map.insert({sbuf->fd, bi});
1613
+
1614
+ htp_buf_desc &b = h_bufs[bi];
1615
+ b.base = (uint64_t) sbuf->base;
1616
+ b.fd = sbuf->fd;
1617
+ b.size = sbuf->size;
1618
+
1619
+ b_vmem += b.size;
1620
+
1621
+ HEX_VERBOSE("ggml-hex: add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem);
1622
+
1623
+ return bi;
1624
+ }
1625
+
1626
+ bool same_shape(const htp_tensor * h, const ggml_tensor * t) const {
1627
+ return (h->ne[0] == t->ne[0]) && (h->ne[1] == t->ne[1]) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) &&
1628
+ (h->nb[0] == t->nb[0]) && (h->nb[1] == t->nb[1]) && (h->nb[2] == t->nb[2]) && (h->nb[3] == t->nb[3]);
1629
+ }
1630
+
1631
+ // add tensor and return its index
1632
+ int add_tensor(const ggml_tensor * t) {
1633
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
1634
+
1635
+ // First lookup by tensor data
1636
+ auto range = d_map.equal_range(t->data);
1637
+ for (auto it = range.first; it != range.second; ++it) {
1638
+ htp_tensor * h = &h_tens[it->second];
1639
+ if (same_shape(h, t)) { return it->second; }
1640
+ }
1641
+
1642
+ // Lookup by tensor ptr
1643
+ auto it = t_map.find(t);
1644
+ if (it != t_map.end()) { return it->second; }
1645
+
1646
+ // Add new tensor to the batch
1647
+ int ti = n_tens++;
1648
+ GGML_ASSERT(n_tens <= n_tens_max);
1649
+
1650
+ t_map.insert({t, ti});
1651
+ d_map.insert({t->data, ti});
1652
+
1653
+ uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
1654
+ size_t t_size = ggml_nbytes(t);
1655
+
1656
+ htp_tensor &h = h_tens[ti];
1657
+ h.bi = add_buffer(sbuf);
1658
+ h.data = t_offset;
1659
+ h.size = t_size;
1660
+ h.type = t->type;
1661
+ h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3];
1662
+ h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3];
1663
+
1664
+ h.flags = 0;
1665
+ if (ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
1666
+ h.flags |= HTP_TENSOR_COMPUTE;
1667
+ }
1668
+
1669
+ HEX_VERBOSE("ggml-hex: add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n",
1670
+ ti, t->name, h.bi, (void*) t->data, (size_t) t_offset, t_size, h.flags,
1671
+ (size_t) t->ne[0], (size_t) t->ne[1], (size_t) t->ne[2], (size_t) t->ne[3]);
1672
+
1673
+ return ti;
1674
+ }
1675
+
1676
+ bool fit_op(const struct ggml_tensor *t) const {
1677
+ if (n_ops >= n_ops_max ) return false;
1678
+
1679
+ // check how much extras we will need
1680
+ size_t extra_bufs = 0;
1681
+ size_t extra_vmem = 0;
1682
+ size_t extra_tens = 0;
1683
+
1684
+ auto fit_tensor = [&](const ggml_tensor *t) {
1685
+ if (!t_map.count(t)) {
1686
+ extra_tens++;
1687
+
1688
+ auto sbuf = static_cast<ggml_hexagon_shared_buffer *>(t->buffer->context);
1689
+ if (!b_map.count(sbuf->fd)) {
1690
+ extra_vmem += sbuf->size;
1691
+ extra_bufs += 1;
1692
+ }
1693
+ }
1694
+ };
1695
+
1696
+ for (unsigned int i=0; i < HTP_OP_MAX_INPUTS && t->src[i]; i++) {
1697
+ fit_tensor(t->src[i]);
1698
+ }
1699
+ fit_tensor(t);
1700
+
1701
+ if ((extra_bufs + n_bufs) > n_bufs_max) return false;
1702
+ if ((extra_tens + n_tens) > n_tens_max) return false;
1703
+ if ((extra_vmem + b_vmem) > b_vmem_max) return false;
1704
+
1705
+ return true;
1706
+ }
1707
+
1708
+ // assumes that fit_op() was called first and returned true
1709
+ void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
1710
+ // Add new op
1711
+
1712
+ unsigned int n = n_ops++;
1713
+ GGML_ASSERT(n_ops <= n_ops_max);
1714
+
1715
+ ops[n] = t;
1716
+
1717
+ htp_op_desc &o = h_ops[n];
1718
+ memcpy(&o.params, &t->op_params, sizeof(t->op_params));
1719
+ o.opcode = opcode;
1720
+ o.flags = 0;
1721
+
1722
+ if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
1723
+ o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
1724
+ }
1725
+
1726
+ ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
1727
+
1728
+ for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
1729
+ o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
1730
+ }
1731
+ o.dst = add_tensor(t);
1732
+ }
1733
+ };
1734
+
1735
+ struct ggml_hexagon_opqueue {
1736
+ // Shared buffer for storing batches
1737
+ ggml_hexagon_shared_buffer *shm_buf;
1738
+ size_t shm_blk_size;
1739
+
1740
+ using opvec = std::vector<const ggml_tensor*>;
1741
+
1742
+ std::queue<unsigned int> done; // completed batch ids
1743
+ std::vector<opvec> op_cache; // per batch op cache
1744
+ std::vector<uint64_t> start_usec; // per batch start time
1745
+
1746
+ ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
1747
+ size_t n_bufs = HTP_OP_MAX_BUFS;
1748
+ size_t n_ops = batch_size;
1749
+ size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
1750
+
1751
+ shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
1752
+ sizeof(htp_tensor) * n_tensors +
1753
+ sizeof(htp_op_desc) * n_ops +
1754
+ sizeof(htp_prof_desc) * n_ops;
1755
+
1756
+ shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
1757
+
1758
+ op_cache.resize(depth);
1759
+ start_usec.resize(depth, 0);
1760
+
1761
+ // init done queue
1762
+ for (unsigned int i = 0; i < depth; i++) { done.push(i); }
1763
+
1764
+ if (opt_verbose) {
1765
+ GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
1766
+ sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
1767
+ }
1768
+ }
1769
+
1770
+ ~ggml_hexagon_opqueue() {
1771
+ delete shm_buf;
1772
+ }
1773
+
1774
+ // push new batch
1775
+ bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
1776
+ static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
1777
+ static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
1778
+ static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
1779
+ static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
1780
+ static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
1781
+ static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
1782
+
1783
+ if (done.empty()) { return false; }
1784
+
1785
+ req.id = done.front(); done.pop(); // batch id
1786
+ req.n_bufs = op_batch->n_bufs;
1787
+ req.n_tensors = op_batch->n_tens;
1788
+ req.n_ops = op_batch->n_ops;
1789
+
1790
+ op_cache[req.id] = op_batch->ops;
1791
+ start_usec[req.id] = ggml_time_us();
1792
+
1793
+ const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
1794
+ const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
1795
+ const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
1796
+ const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
1797
+
1798
+ dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
1799
+ dbuf.fd = shm_buf->fd;
1800
+ dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
1801
+ dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
1802
+ dbuf.size = b_size + t_size + o_size + p_size;
1803
+
1804
+ GGML_ASSERT(dbuf.size <= shm_blk_size);
1805
+
1806
+ uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
1807
+ uint8_t * b_ptr = m_ptr; m_ptr += b_size;
1808
+ uint8_t * t_ptr = m_ptr; m_ptr += t_size;
1809
+ uint8_t * o_ptr = m_ptr;
1810
+
1811
+ memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
1812
+ memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
1813
+ memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
1814
+
1815
+ HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
1816
+ shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
1817
+ b_size, t_size, o_size, (size_t) dbuf.size);
1818
+
1819
+ op_batch->reset();
1820
+
1821
+ if (opt_verbose > 1) {
1822
+ htp_buf_desc *b = (htp_buf_desc*) b_ptr;
1823
+ for (unsigned int i=0; i < req.n_bufs; i++) {
1824
+ GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
1825
+ b[i].fd, (void *) b[i].base, (size_t) b[i].size);
1826
+ }
1827
+ htp_tensor *t = (htp_tensor*) t_ptr;
1828
+ for (unsigned int i=0; i < req.n_tensors; i++) {
1829
+ GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
1830
+ shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
1831
+ (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
1832
+ }
1833
+ }
1834
+
1835
+ return true;
1836
+ }
1837
+
1838
+ void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
1839
+ GGML_ASSERT(rsp.id < op_cache.size());
1840
+
1841
+ done.push(rsp.id);
1842
+
1843
+ const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
1844
+ const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
1845
+ const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
1846
+ const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
1847
+
1848
+ const size_t m_size = b_size + t_size + o_size + p_size;
1849
+ GGML_ASSERT(m_size <= shm_blk_size);
1850
+
1851
+ HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
1852
+ shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
1853
+ (size_t) dbuf.size, b_size, t_size, o_size);
1854
+
1855
+ uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
1856
+ uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
1857
+
1858
+ if (opt_profile && rsp.n_ops > 0) {
1859
+ auto & ops = op_cache[rsp.id];
1860
+
1861
+ uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
1862
+ uint32_t htp_usec = 0;
1863
+
1864
+ GGML_ASSERT(rsp.n_ops <= ops.size());
1865
+
1866
+ const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
1867
+ for (uint32_t i = 0; i < rsp.n_ops; i++) {
1868
+ htp_usec += pd[i].usecs;
1869
+ ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
1870
+ }
1871
+
1872
+ GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
1873
+ shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
1874
+ }
1875
+ }
1876
+ };
1877
+
1878
+ // Flush HTP response queue i.e wait for all outstanding requests to complete
1879
+ void ggml_hexagon_session::flush_pending(bool all) {
1880
+ while (this->op_pending) {
1881
+ struct htp_opbatch_rsp rsp;
1882
+ uint32_t rsp_size;
1883
+ uint32_t flags;
1884
+
1885
+ struct dspqueue_buffer dbuf;
1886
+ uint32_t n_dbufs;
1887
+
1888
+ // Read response packet from queue
1889
+ int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, DSPQUEUE_TIMEOUT);
1890
+ if (err == AEE_EEXPIRED) {
1891
+ continue;
1892
+ }
1893
+
1894
+ if (err != 0) {
1895
+ GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
1896
+ }
1897
+
1898
+ // Basic sanity checks
1899
+ if (rsp_size != sizeof(rsp) || n_dbufs != 1) {
1900
+ GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
1901
+ }
1902
+
1903
+ if (rsp.status != HTP_STATUS_OK) {
1904
+ GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
1905
+ // TODO: handle errors
1906
+ }
1907
+
1908
+ op_queue->pop(rsp, dbuf);
1909
+
1910
+ this->op_pending--; // atomic dec
1911
+
1912
+ if (!all) break;
1913
+ }
1914
+ }
1915
+
1916
+ void ggml_hexagon_session::flush_batch() {
1917
+ if (op_batch->empty()) { return; }
1918
+
1919
+ htp_opbatch_req req {};
1920
+ dspqueue_buffer dbuf{};
1921
+
1922
+ if (!op_queue->push(req, dbuf, op_batch)) {
1923
+ flush_pending(false);
1924
+ op_queue->push(req, dbuf, op_batch);
1925
+ }
1926
+
1927
+ // Bump pending flag (cleared in the session::flush once we get the response)
1928
+ this->op_pending++; // atomic inc
1929
+
1930
+ HEX_VERBOSE("ggml-hex: %s queue-opbatch: %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
1931
+
1932
+ int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
1933
+ if (err != 0) {
1934
+ GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
1935
+ }
1936
+ }
1937
+
1938
+ void ggml_hexagon_session::enqueue_op(htp_op_code opcode, const ggml_tensor *op) {
1939
+ if (!op_batch->fit_op(op)) {
1940
+ flush_batch();
1941
+ }
1942
+ op_batch->add_op(opcode, op);
1943
+ }
1944
+
1945
+ // Flush HTP response queue i.e wait for all outstanding requests to complete
1946
+ void ggml_hexagon_session::flush(bool all) {
1947
+ flush_batch();
1948
+ flush_pending(all);
1949
+ }
1950
+
1951
+ static size_t ggml_hexagon_measure_max_vmem(ggml_hexagon_session *sess) {
1952
+ // Allocate a bunch pinned buffers till failure.
1953
+ // This is kind of expensive but handy for figuring out exactly how much we can mmap on a specific device.
1954
+ // Typically we're going to allocate all/most of these buffers anyway for the model weights.
1955
+
1956
+ std::vector<ggml_hexagon_shared_buffer *> sbufs;
1957
+
1958
+ const size_t MiB = 1024 * 1024;
1959
+ const size_t GiB = MiB * 1024;
1960
+
1961
+ size_t vmem = 0;
1962
+ size_t step = 256u * MiB;
1963
+
1964
+ try {
1965
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
1966
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
1967
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, GiB, true)); vmem += GiB;
1968
+
1969
+ while (1) {
1970
+ sbufs.push_back(new ggml_hexagon_shared_buffer(sess, step, true));
1971
+ vmem += step;
1972
+ }
1973
+ } catch (...) { }
1974
+
1975
+ for (auto b : sbufs) { delete b; }
1976
+
1977
+ return vmem - step; // backoff to account for overhead from internal mappings
1978
+ }
1979
+
1980
+ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
1981
+ this->valid_session = false;
1982
+ this->valid_handle = false;
1983
+ this->valid_queue = false;
1984
+ this->valid_iface = false;
1985
+
1986
+ this->domain_id = 3; // Default for CDSP, updated after the session is created
1987
+ this->session_id = 0; // Default for CDSP, updated after the session is created
1988
+ this->dev_id = dev_id;
1989
+ this->name = std::string("HTP") + std::to_string(dev_id);
1990
+
1991
+ this->op_pending = 0;
1992
+
1993
+ GGML_LOG_DEBUG("ggml-hex: %s allocating new session\n", this->name.c_str());
1994
+
1995
+ domain * my_domain = get_domain(this->domain_id);
1996
+ if (my_domain == NULL) {
1997
+ GGML_LOG_ERROR("ggml-hex: unable to get domain struct for CDSP\n");
1998
+ throw std::runtime_error("ggml-hex: failed to get CDSP domain (see log for details)");
1999
+ }
2000
+
2001
+ // Create new session
2002
+ if (dev_id != 0) {
2003
+ struct remote_rpc_reserve_new_session n;
2004
+ n.domain_name_len = strlen(CDSP_DOMAIN_NAME);
2005
+ n.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
2006
+ n.session_name = const_cast<char *>(this->name.c_str());
2007
+ n.session_name_len = this->name.size();
2008
+
2009
+ int err = remote_session_control(FASTRPC_RESERVE_NEW_SESSION, (void *) &n, sizeof(n));
2010
+ if (err != AEE_SUCCESS) {
2011
+ GGML_LOG_ERROR("ggml-hex: failed to reserve new session %d : error 0x%x\n", dev_id, err);
2012
+ throw std::runtime_error("ggml-hex: remote_session_control(new-sess) failed (see log for details)");
2013
+ }
2014
+
2015
+ // Save the IDs
2016
+ this->session_id = n.session_id;
2017
+ this->domain_id = n.effective_domain_id;
2018
+ this->valid_session = true;
2019
+ }
2020
+
2021
+ // Get session URI
2022
+
2023
+ char session_uri[256];
2024
+ {
2025
+ char htp_uri[256];
2026
+ snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);
2027
+
2028
+ struct remote_rpc_get_uri u = {};
2029
+ u.session_id = this->session_id;
2030
+ u.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
2031
+ u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
2032
+ u.module_uri = const_cast<char *>(htp_uri);
2033
+ u.module_uri_len = strlen(htp_uri);
2034
+ u.uri = session_uri;
2035
+ u.uri_len = sizeof(session_uri);
2036
+
2037
+ int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u));
2038
+ if (err != AEE_SUCCESS) {
2039
+ // fallback to single session uris
2040
+ int htp_URI_domain_len = strlen(htp_uri) + MAX_DOMAIN_NAMELEN;
2041
+
2042
+ snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri);
2043
+
2044
+ GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri);
2045
+ }
2046
+ }
2047
+
2048
+ // Enable Unsigned PD
2049
+ {
2050
+ struct remote_rpc_control_unsigned_module u;
2051
+ u.domain = this->domain_id;
2052
+ u.enable = 1;
2053
+ int err = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *) &u, sizeof(u));
2054
+ if (err != AEE_SUCCESS) {
2055
+ GGML_LOG_ERROR("ggml-hex: failed to enable unsigned PD for session %d : error 0x%x\n", dev_id, err);
2056
+ throw std::runtime_error("ggml-hex: remote_session_control(unsign) failed (see log for details)");
2057
+ }
2058
+ }
2059
+
2060
+ // Open session
2061
+ int err = htp_iface_open(session_uri, &this->handle);
2062
+ if (err != AEE_SUCCESS) {
2063
+ GGML_LOG_ERROR("ggml-hex: failed to open session %d : error 0x%x\n", dev_id, err);
2064
+ throw std::runtime_error("ggml-hex: failed to open session (see log for details)");
2065
+ }
2066
+
2067
+ this->valid_handle = true;
2068
+
2069
+ // Enable FastRPC QoS mode
2070
+ {
2071
+ struct remote_rpc_control_latency l;
2072
+ l.enable = 1;
2073
+
2074
+ int err = remote_handle64_control(this->handle, DSPRPC_CONTROL_LATENCY, (void *) &l, sizeof(l));
2075
+ if (err != 0) {
2076
+ GGML_LOG_WARN("ggml-hex: failed to enable fastrpc QOS mode: 0x%08x\n", (unsigned) err);
2077
+ }
2078
+ }
2079
+
2080
+ GGML_LOG_INFO("ggml-hex: %s new session : session-id %d domain-id %d uri %s handle 0x%lx\n", this->c_name(),
2081
+ this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
2082
+
2083
+ const size_t req_q_size = (sizeof(htp_opbatch_req) * opt_opqueue * 2) + 1024;
2084
+ const size_t rsp_q_size = (sizeof(htp_opbatch_rsp) * opt_opqueue * 2) + 1024;
2085
+
2086
+ // Now let's setup the DSP queue
2087
+ err = dspqueue_create(this->domain_id,
2088
+ 0, // Flags
2089
+ req_q_size, // Request queue size (in bytes)
2090
+ rsp_q_size, // Response queue size (in bytes)
2091
+ nullptr, // Read packet callback (we handle reads explicitly)
2092
+ nullptr, // Error callback (we handle errors during reads)
2093
+ (void *) this, // Callback context
2094
+ &queue);
2095
+ if (err != 0) {
2096
+ GGML_LOG_ERROR("ggml-hex: %s dspqueue_create failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
2097
+ throw std::runtime_error("ggml-hex: failed to create dspqueue (see log for details)");
2098
+ }
2099
+
2100
+ this->valid_queue = true;
2101
+
2102
+ // Export queue for use on the DSP
2103
+ err = dspqueue_export(queue, &this->queue_id);
2104
+ if (err != 0) {
2105
+ GGML_LOG_ERROR("ggml-hex: dspqueue_export failed: 0x%08x\n", (unsigned) err);
2106
+ throw std::runtime_error("ggml-hex: dspqueue export failed (see log for details)");
2107
+ }
2108
+
2109
+ if (opt_etm) {
2110
+ err = htp_iface_etm(this->handle, 1);
2111
+ if (err != 0) {
2112
+ GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
2113
+ }
2114
+ }
2115
+
2116
+ if (opt_profile) {
2117
+ htp_iface_pmu_conf pmu_conf{};
2118
+ std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
2119
+
2120
+ err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
2121
+ if (err != 0) {
2122
+ GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
2123
+ }
2124
+ }
2125
+
2126
+ // Allocate buffers and state for op batching
2127
+ this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
2128
+
2129
+ if (!opt_vmem) {
2130
+ opt_vmem = ggml_hexagon_measure_max_vmem(this);
2131
+ GGML_LOG_INFO("ggml-hex: %s measured max vmem %zu\n", this->c_name(), opt_vmem);
2132
+ }
2133
+
2134
+ this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch, opt_vmem);
2135
+
2136
+ // Start dspqueue/opbatch processing
2137
+ err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx, opt_vmem);
2138
+ if (err != 0) {
2139
+ GGML_LOG_ERROR("ggml-hex: %s failed to start session: 0x%08x\n", this->c_name(), (unsigned) err);
2140
+ throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
2141
+ }
2142
+ this->valid_iface = true;
2143
+ }
2144
+
2145
+ void ggml_hexagon_session::release() noexcept(true) {
2146
+ GGML_LOG_INFO("ggml-hex: releasing session: %s\n", this->name.c_str());
2147
+
2148
+ int err;
2149
+
2150
+ if (this->valid_iface) {
2151
+ // Stop dspqueue/opbatch processing
2152
+ err = htp_iface_stop(this->handle);
2153
+ if (err != 0) {
2154
+ GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err);
2155
+ }
2156
+ }
2157
+
2158
+ delete this->op_batch;
2159
+ delete this->op_queue;
2160
+
2161
+ if (opt_etm) {
2162
+ err = htp_iface_etm(this->handle, 0);
2163
+ if (err != 0) {
2164
+ GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
2165
+ }
2166
+ }
2167
+
2168
+ if (opt_profile) {
2169
+ htp_iface_pmu_conf pmu_conf{};
2170
+ err = htp_iface_profiler(this->handle, 0, &pmu_conf);
2171
+ if (err != 0) {
2172
+ GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
2173
+ }
2174
+ }
2175
+
2176
+ if (this->valid_queue) {
2177
+ err = dspqueue_close(queue);
2178
+ if (err != 0) {
2179
+ GGML_ABORT("ggml-hex: dspqueue_close failed: 0x%08x\n", (unsigned) err);
2180
+ }
2181
+ }
2182
+
2183
+ if (this->valid_handle) {
2184
+ htp_iface_close(this->handle);
2185
+ }
2186
+ }
2187
+
2188
+ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
2189
+ buffer_type.device = dev;
2190
+ repack_buffer_type.device = dev;
2191
+
2192
+ op_batch = nullptr;
2193
+ op_queue = nullptr;
2194
+
2195
+ try {
2196
+ allocate(dev_id);
2197
+
2198
+ buffer_type.iface = ggml_backend_hexagon_buffer_type_interface;
2199
+ buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name, this);
2200
+
2201
+ repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface;
2202
+ repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
2203
+ } catch (const std::exception & exc) {
2204
+ release();
2205
+ throw;
2206
+ }
2207
+ }
2208
+
2209
+ ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) {
2210
+ release();
2211
+
2212
+ delete static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type.context);
2213
+ delete static_cast<ggml_backend_hexagon_buffer_type_context *>(repack_buffer_type.context);
2214
+ }
2215
+
2216
+ // ** backend interface
2217
+
2218
+ static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b) {
2219
+ return b->buft->iface.get_alignment == ggml_backend_hexagon_buffer_type_get_alignment;
2220
+ }
2221
+
2222
+ static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
2223
+ if (!opt_hostbuf) {
2224
+ return ggml_backend_buffer_is_hexagon(b);
2225
+ }
2226
+ return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
2227
+ }
2228
+
2229
+ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2230
+ const struct ggml_tensor * src0 = op->src[0];
2231
+ const struct ggml_tensor * src1 = op->src[1];
2232
+ const struct ggml_tensor * src2 = op->src[2];
2233
+ const struct ggml_tensor * src3 = op->src[3];
2234
+ const struct ggml_tensor * src4 = op->src[4];
2235
+ const struct ggml_tensor * dst = op;
2236
+
2237
+ // Check for F16 support only as requested
2238
+ if ((src0->type != GGML_TYPE_F16 && src0->type != GGML_TYPE_F32) || src1->type != GGML_TYPE_F16 || src2->type != GGML_TYPE_F16) {
2239
+ return false;
2240
+ }
2241
+
2242
+ if (src3 && src3->type != GGML_TYPE_F16) { // mask
2243
+ return false;
2244
+ }
2245
+
2246
+ if (src4 && src4->type != GGML_TYPE_F32) { // sinks
2247
+ return false;
2248
+ }
2249
+
2250
+ // For now we support F32 or F16 output as htp backend often converts output on the fly if needed,
2251
+ // but the op implementation writes to F16 or F32.
2252
+ // Let's assume dst can be F32 or F16.
2253
+ if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
2254
+ return false;
2255
+ }
2256
+
2257
+ if (dst->ne[3] != 1) {
2258
+ return false;
2259
+ }
2260
+
2261
+ return true;
2262
+ }
2263
+
2264
+ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2265
+ const struct ggml_tensor * q = op->src[0];
2266
+ const struct ggml_tensor * k = op->src[1];
2267
+ const struct ggml_tensor * v = op->src[2];
2268
+ const struct ggml_tensor * g = op->src[3];
2269
+ const struct ggml_tensor * beta = op->src[4];
2270
+ const struct ggml_tensor * state = op->src[5];
2271
+ const struct ggml_tensor * dst = op;
2272
+
2273
+ if (!q || !k || !v || !g || !beta || !state) {
2274
+ return false;
2275
+ }
2276
+
2277
+ if (q->type != GGML_TYPE_F32 || k->type != GGML_TYPE_F32 || v->type != GGML_TYPE_F32 ||
2278
+ g->type != GGML_TYPE_F32 || beta->type != GGML_TYPE_F32 || state->type != GGML_TYPE_F32 ||
2279
+ dst->type != GGML_TYPE_F32) {
2280
+ return false;
2281
+ }
2282
+
2283
+ if (!ggml_is_contiguous_rows(q) || !ggml_is_contiguous_rows(k) || !ggml_is_contiguous_rows(v) ||
2284
+ !ggml_is_contiguous(g) || !ggml_is_contiguous(beta) || !ggml_is_contiguous(state) ||
2285
+ !ggml_is_contiguous(dst)) {
2286
+ return false;
2287
+ }
2288
+
2289
+ const int64_t S_v = v->ne[0];
2290
+ const int64_t H = v->ne[1];
2291
+ const int64_t n_tokens = v->ne[2];
2292
+ const int64_t n_seqs = v->ne[3];
2293
+
2294
+ if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
2295
+ return false;
2296
+ }
2297
+ if (q->ne[0] != S_v || k->ne[0] != S_v || q->ne[1] <= 0 || k->ne[1] <= 0 ||
2298
+ q->ne[2] != n_tokens || k->ne[2] != n_tokens || q->ne[3] <= 0 || k->ne[3] <= 0 ||
2299
+ (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
2300
+ return false;
2301
+ }
2302
+ if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
2303
+ return false;
2304
+ }
2305
+ if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
2306
+ return false;
2307
+ }
2308
+ if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
2309
+ return false;
2310
+ }
2311
+
2312
+ GGML_UNUSED(sess);
2313
+ return true;
2314
+ }
2315
+
2316
+ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
2317
+ const struct ggml_tensor * src0 = dst->src[0];
2318
+ const struct ggml_tensor * src1 = dst->src[1];
2319
+
2320
+ if (dst->type != GGML_TYPE_F32) {
2321
+ return false;
2322
+ }
2323
+
2324
+ if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
2325
+ return false;
2326
+ }
2327
+
2328
+ switch (src0->type) {
2329
+ case GGML_TYPE_Q4_0:
2330
+ case GGML_TYPE_Q8_0:
2331
+ case GGML_TYPE_IQ4_NL:
2332
+ case GGML_TYPE_MXFP4:
2333
+ if (src0->ne[0] % 32) {
2334
+ return false;
2335
+ }
2336
+
2337
+ if (ggml_nrows(src0) > 16 * 1024) {
2338
+ return false; // typically the lm-head which would be too large for VTCM
2339
+ }
2340
+
2341
+ if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) {
2342
+ return false; // no huge batches or broadcasting (for now)
2343
+ }
2344
+
2345
+ // src0 (weights) must be repacked
2346
+ if (src0->buffer && !ggml_backend_buffer_is_hexagon_repack(src0->buffer)) {
2347
+ return false;
2348
+ }
2349
+ break;
2350
+
2351
+ case GGML_TYPE_F16:
2352
+ if (src0->nb[1] < src0->nb[0]) {
2353
+ GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
2354
+ return false;
2355
+ }
2356
+ if (ggml_nrows(src1) > 1024) {
2357
+ return false; // no huge batches (for now)
2358
+ }
2359
+ break;
2360
+
2361
+ default:
2362
+ return false;
2363
+ }
2364
+
2365
+ return true;
2366
+ }
2367
+
2368
+ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2369
+ const struct ggml_tensor * src0 = op->src[0];
2370
+ const struct ggml_tensor * src1 = op->src[1];
2371
+ const struct ggml_tensor * src2 = op->src[2];
2372
+ const struct ggml_tensor * dst = op;
2373
+
2374
+ if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32 || src2->type != GGML_TYPE_I32) {
2375
+ return false;
2376
+ }
2377
+
2378
+ switch (src0->type) {
2379
+ case GGML_TYPE_Q4_0:
2380
+ case GGML_TYPE_Q8_0:
2381
+ case GGML_TYPE_IQ4_NL:
2382
+ case GGML_TYPE_MXFP4:
2383
+ if ((src0->ne[0] % 32)) {
2384
+ return false;
2385
+ }
2386
+
2387
+ // src0 (weights) must be repacked
2388
+ if (src0->buffer && !ggml_backend_buffer_is_hexagon_repack(src0->buffer)) {
2389
+ return false;
2390
+ }
2391
+ break;
2392
+
2393
+ default:
2394
+ return false;
2395
+ }
2396
+
2397
+ return true;
2398
+ }
2399
+
2400
+ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2401
+ const struct ggml_tensor * src0 = op->src[0];
2402
+ const struct ggml_tensor * src1 = op->src[1];
2403
+ const struct ggml_tensor * dst = op;
2404
+
2405
+ if (src0->type == GGML_TYPE_F32) {
2406
+ if (src1->type != GGML_TYPE_F32) {
2407
+ return false;
2408
+ }
2409
+ if (dst->type != GGML_TYPE_F32) {
2410
+ return false;
2411
+ }
2412
+ }
2413
+ else if (src0->type == GGML_TYPE_F16) {
2414
+ if (src1->type != GGML_TYPE_F16) {
2415
+ return false;
2416
+ }
2417
+ if (dst->type != GGML_TYPE_F16) {
2418
+ return false;
2419
+ }
2420
+ }
2421
+ else {
2422
+ return false;
2423
+ }
2424
+
2425
+ if (!ggml_are_same_shape(src0, dst)) {
2426
+ return false;
2427
+ }
2428
+ if (!ggml_can_repeat(src1, src0) || ggml_is_permuted(src1)) {
2429
+ return false;
2430
+ }
2431
+
2432
+ return true;
2433
+ }
2434
+
2435
+ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2436
+ const struct ggml_tensor * src0 = op->src[0];
2437
+ const struct ggml_tensor * src1 = op->src[1];
2438
+ const struct ggml_tensor * dst = op;
2439
+
2440
+ if (src0->type != GGML_TYPE_F32) {
2441
+ return false;
2442
+ }
2443
+ if (src1->type != GGML_TYPE_F32) {
2444
+ return false;
2445
+ }
2446
+ if (dst->type != GGML_TYPE_F32) {
2447
+ return false;
2448
+ }
2449
+ if (!ggml_are_same_shape(src0, dst)) {
2450
+ return false;
2451
+ }
2452
+
2453
+ // REVISIT: add support for non-contigiuos tensors
2454
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
2455
+ return false;
2456
+ }
2457
+
2458
+ return true;
2459
+ }
2460
+
2461
+ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2462
+ const struct ggml_tensor * src0 = op->src[0];
2463
+ const struct ggml_tensor * dst = op;
2464
+
2465
+ if (src0->type != GGML_TYPE_F32) {
2466
+ return false;
2467
+ }
2468
+ if (dst->type != GGML_TYPE_F32) {
2469
+ return false;
2470
+ }
2471
+ if (!ggml_are_same_shape(src0, dst)) {
2472
+ return false;
2473
+ }
2474
+
2475
+ // dst must be contiguous; src0 may be non-contiguous
2476
+ if (!ggml_is_contiguous(dst)) {
2477
+ return false;
2478
+ }
2479
+
2480
+ return true;
2481
+ }
2482
+
2483
+ static bool ggml_hexagon_supported_sum_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2484
+ const struct ggml_tensor * src0 = op->src[0];
2485
+ const struct ggml_tensor * dst = op;
2486
+
2487
+ if (src0->type != GGML_TYPE_F32) {
2488
+ return false;
2489
+ }
2490
+ if (dst->type != GGML_TYPE_F32) {
2491
+ return false;
2492
+ }
2493
+
2494
+ // TODO: add support for non-contigiuos tensors
2495
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
2496
+ return false;
2497
+ }
2498
+
2499
+ return true;
2500
+ }
2501
+
2502
+ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session * sess,
2503
+ const struct ggml_tensor * op) {
2504
+ const struct ggml_tensor * src0 = op->src[0];
2505
+ const struct ggml_tensor * src1 = op->src[1];
2506
+ const struct ggml_tensor * dst = op;
2507
+
2508
+ if (src0->type != GGML_TYPE_F32) {
2509
+ return false;
2510
+ }
2511
+ if (dst->type != GGML_TYPE_F32) {
2512
+ return false;
2513
+ }
2514
+
2515
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
2516
+ return false;
2517
+ }
2518
+
2519
+ if (src1) {
2520
+ if (src1->type != GGML_TYPE_F32) {
2521
+ return false;
2522
+ }
2523
+ if (!ggml_are_same_shape(src0, src1)) {
2524
+ return false;
2525
+ }
2526
+ if (!ggml_is_contiguous(src1)) {
2527
+ return false;
2528
+ }
2529
+ }
2530
+
2531
+ return true;
2532
+ }
2533
+
2534
+ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2535
+ const struct ggml_tensor * src0 = op->src[0];
2536
+ const struct ggml_tensor * src1 = op->src[1];
2537
+ const struct ggml_tensor * src2 = op->src[2];
2538
+ const struct ggml_tensor * dst = op;
2539
+
2540
+ if (src2) {
2541
+ return false; // FIXME: add support for sinks
2542
+ }
2543
+
2544
+ if (src0->type != GGML_TYPE_F32) {
2545
+ return false;
2546
+ }
2547
+ if (dst->type != GGML_TYPE_F32) {
2548
+ return false;
2549
+ }
2550
+
2551
+ if (src1) {
2552
+ if (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) {
2553
+ return false;
2554
+ }
2555
+ if (src0->ne[0] != src1->ne[0]) {
2556
+ return false;
2557
+ }
2558
+ if (src1->ne[1] < src0->ne[1]) {
2559
+ return false;
2560
+ }
2561
+ if (src0->ne[2] % src1->ne[2] != 0) {
2562
+ return false;
2563
+ }
2564
+ if (src0->ne[3] % src1->ne[3] != 0) {
2565
+ return false;
2566
+ }
2567
+ }
2568
+
2569
+ if (src1) {
2570
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
2571
+ return false;
2572
+ }
2573
+ } else {
2574
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
2575
+ return false;
2576
+ }
2577
+ }
2578
+
2579
+ // Reject non-HVX-aligned sizes when ne[0] > HVX_F32_LANES
2580
+ // The HVX softmax implementation has issues with tail handling for larger non-aligned sizes
2581
+ // Small sizes (ne[0] <= 32) work correctly with tail-only processing
2582
+ const int64_t ne0 = src0->ne[0];
2583
+ if (ne0 > 32 && (ne0 & (32 - 1)) != 0) {
2584
+ return false;
2585
+ }
2586
+
2587
+ // HVX vector size constraints for softmax
2588
+ #define SOFTMAX_MAX_ROW_SIZE 131072 // 128K elements max for numerical precision
2589
+
2590
+ // Reject very large row sizes to avoid numerical precision issues
2591
+ // Softmax accumulation over many elements can lead to precision loss
2592
+ if (ne0 > SOFTMAX_MAX_ROW_SIZE) {
2593
+ return false;
2594
+ }
2595
+
2596
+ return true;
2597
+ }
2598
+
2599
+ static bool ggml_hexagon_supported_set_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2600
+ const struct ggml_tensor * src0 = op->src[0]; // values
2601
+ const struct ggml_tensor * src1 = op->src[1]; // indices
2602
+ const struct ggml_tensor * dst = op;
2603
+
2604
+ if (src0->type != GGML_TYPE_F32) {
2605
+ return false;
2606
+ }
2607
+
2608
+ if (src1->type != GGML_TYPE_I32 && src1->type != GGML_TYPE_I64) {
2609
+ return false;
2610
+ }
2611
+
2612
+ if (dst->type != GGML_TYPE_F16) {
2613
+ return false;
2614
+ }
2615
+
2616
+ return true;
2617
+ }
2618
+
2619
+ static bool ggml_hexagon_supported_get_rows(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2620
+ const struct ggml_tensor * src0 = op->src[0]; // values
2621
+ const struct ggml_tensor * src1 = op->src[1]; // indices
2622
+ const struct ggml_tensor * dst = op;
2623
+
2624
+ if (src0->type != GGML_TYPE_F32) {
2625
+ return false;
2626
+ }
2627
+
2628
+ if (src1->type != GGML_TYPE_I32 && src1->type != GGML_TYPE_I64) {
2629
+ return false;
2630
+ }
2631
+
2632
+ if (dst->type != GGML_TYPE_F32) {
2633
+ return false;
2634
+ }
2635
+
2636
+ return true;
2637
+ }
2638
+
2639
+ static bool ggml_hexagon_supported_argsort(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2640
+ const struct ggml_tensor * src0 = op->src[0]; // values
2641
+ const struct ggml_tensor * dst = op; // indices
2642
+
2643
+ if (src0->type != GGML_TYPE_F32) {
2644
+ return false;
2645
+ }
2646
+
2647
+ if (dst->type != GGML_TYPE_I32) {
2648
+ return false;
2649
+ }
2650
+
2651
+ if (src0->ne[0] > (16*1024)) {
2652
+ // reject tensors with huge rows for now
2653
+ return false;
2654
+ }
2655
+
2656
+ return true;
2657
+ }
2658
+
2659
+ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2660
+ const int32_t * op_params = &op->op_params[0];
2661
+
2662
+ int mode = op_params[2];
2663
+
2664
+ if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
2665
+ return false;
2666
+ }
2667
+ if (mode & 1) {
2668
+ return false;
2669
+ }
2670
+
2671
+ const struct ggml_tensor * src0 = op->src[0];
2672
+ const struct ggml_tensor * src1 = op->src[1];
2673
+ const struct ggml_tensor * src2 = op->src[2];
2674
+ const struct ggml_tensor * dst = op;
2675
+
2676
+ if (src0->type != GGML_TYPE_F32) {
2677
+ return false; // FIXME: add support for GGML_TYPE_F16 for src0
2678
+ }
2679
+ if (dst->type != GGML_TYPE_F32) {
2680
+ return false;
2681
+ }
2682
+ if (src1->type != GGML_TYPE_I32) {
2683
+ return false;
2684
+ }
2685
+ if (src2) {
2686
+ if (src2->type != GGML_TYPE_F32) {
2687
+ return false;
2688
+ }
2689
+ int n_dims = op_params[1];
2690
+ if (src2->ne[0] < (n_dims / 2)) {
2691
+ return false;
2692
+ }
2693
+ }
2694
+
2695
+ if (src2) {
2696
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(src2) ||
2697
+ !ggml_is_contiguous(dst)) {
2698
+ return false;
2699
+ }
2700
+ } else {
2701
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
2702
+ return false;
2703
+ }
2704
+ }
2705
+
2706
+ return true;
2707
+ }
2708
+
2709
+ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2710
+ const struct ggml_tensor * src0 = op->src[0];
2711
+ const struct ggml_tensor * src1 = op->src[1];
2712
+ const struct ggml_tensor * dst = op;
2713
+
2714
+ // Only support FP32 for now
2715
+ if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
2716
+ return false;
2717
+ }
2718
+
2719
+ // Check IO tensor shapes and dims
2720
+ if (src0->ne[3] != 1 || src1->ne[2] != 1 || src1->ne[3] != 1 || dst->ne[3] != 1) {
2721
+ return false; // src0 should be effectively 3D
2722
+ }
2723
+
2724
+ const int d_conv = src1->ne[0];
2725
+ const int d_inner = src0->ne[1];
2726
+ const int n_t = dst->ne[1];
2727
+ const int n_s = dst->ne[2];
2728
+
2729
+ if (src0->ne[0] != d_conv - 1 + n_t || src0->ne[1] != d_inner || src0->ne[2] != n_s) {
2730
+ return false;
2731
+ }
2732
+ if (src1->ne[0] != d_conv || src1->ne[1] != d_inner) {
2733
+ return false;
2734
+ }
2735
+ if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) {
2736
+ return false;
2737
+ }
2738
+
2739
+ // TODO: add support for non-contiguous tensors
2740
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
2741
+ return false;
2742
+ }
2743
+
2744
+ return true;
2745
+ }
2746
+
2747
+ static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2748
+ const struct ggml_tensor * src0 = op->src[0];
2749
+ const struct ggml_tensor * dst = op;
2750
+
2751
+ if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
2752
+ return false;
2753
+ }
2754
+
2755
+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
2756
+ return false;
2757
+ }
2758
+
2759
+ GGML_UNUSED(sess);
2760
+ return true;
2761
+ }
2762
+
2763
+ static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2764
+ const struct ggml_tensor * src0 = op->src[0];
2765
+ const struct ggml_tensor * dst = op;
2766
+
2767
+ // diag only supports F32 currently
2768
+ if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
2769
+ return false;
2770
+ }
2771
+
2772
+ // Input must have ne[1] == 1 (vector input)
2773
+ if (src0->ne[1] != 1) {
2774
+ return false;
2775
+ }
2776
+
2777
+ // Output must be square in first two dimensions
2778
+ if (dst->ne[0] != dst->ne[1] || dst->ne[0] != src0->ne[0]) {
2779
+ return false;
2780
+ }
2781
+
2782
+ GGML_UNUSED(sess);
2783
+ return true;
2784
+ }
2785
+
2786
+ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
2787
+ const struct ggml_tensor * src0 = op->src[0]; // A
2788
+ const struct ggml_tensor * src1 = op->src[1]; // B
2789
+ const struct ggml_tensor * dst = op; // X
2790
+
2791
+ if (!src0 || !src1) {
2792
+ return false;
2793
+ }
2794
+
2795
+ if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
2796
+ return false;
2797
+ }
2798
+
2799
+ if (src0->ne[0] != src0->ne[1]) {
2800
+ return false;
2801
+ }
2802
+
2803
+ if (src0->ne[1] != src1->ne[1]) {
2804
+ return false;
2805
+ }
2806
+
2807
+ if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
2808
+ return false;
2809
+ }
2810
+
2811
+ if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
2812
+ return false;
2813
+ }
2814
+
2815
+ GGML_UNUSED(sess);
2816
+ return true;
2817
+ }
2818
+
2819
+ static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
2820
+ auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2821
+ return sess->c_name();
2822
+ }
2823
+
2824
+ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
2825
+ // we just need to delete the backend here
2826
+ // the sessions are allocated & freed as part of the registry
2827
+ delete backend;
2828
+ }
2829
+
2830
+ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
2831
+ switch (t->op) {
2832
+ case GGML_OP_FLASH_ATTN_EXT: return HTP_OP_FLASH_ATTN_EXT;
2833
+ case GGML_OP_MUL_MAT: return HTP_OP_MUL_MAT;
2834
+ case GGML_OP_MUL_MAT_ID: return HTP_OP_MUL_MAT_ID;
2835
+ case GGML_OP_MUL: return HTP_OP_MUL;
2836
+ case GGML_OP_ADD: return HTP_OP_ADD;
2837
+ case GGML_OP_ADD_ID: return HTP_OP_ADD_ID;
2838
+ case GGML_OP_SUB: return HTP_OP_SUB;
2839
+ case GGML_OP_DIV: return HTP_OP_DIV;
2840
+ case GGML_OP_CPY: return HTP_OP_CPY;
2841
+ case GGML_OP_CONT: return HTP_OP_CPY;
2842
+ case GGML_OP_GET_ROWS: return HTP_OP_GET_ROWS;
2843
+ case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS;
2844
+ case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS;
2845
+ case GGML_OP_ARGSORT: return HTP_OP_ARGSORT;
2846
+ case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
2847
+ case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
2848
+ case GGML_OP_SCALE: return HTP_OP_SCALE;
2849
+ case GGML_OP_SQR: return HTP_OP_SQR;
2850
+ case GGML_OP_SQRT: return HTP_OP_SQRT;
2851
+ case GGML_OP_SOFT_MAX: return HTP_OP_SOFTMAX;
2852
+ case GGML_OP_SSM_CONV: return HTP_OP_SSM_CONV;
2853
+ case GGML_OP_GATED_DELTA_NET: return HTP_OP_GATED_DELTA_NET;
2854
+ case GGML_OP_ROPE: return HTP_OP_ROPE;
2855
+ case GGML_OP_REPEAT: return HTP_OP_REPEAT;
2856
+ case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
2857
+ case GGML_OP_FILL: return HTP_OP_FILL;
2858
+ case GGML_OP_DIAG: return HTP_OP_DIAG;
2859
+ case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
2860
+ case GGML_OP_UNARY:
2861
+ switch (ggml_get_unary_op(t)) {
2862
+ case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
2863
+ case GGML_UNARY_OP_GELU: return HTP_OP_UNARY_GELU;
2864
+ case GGML_UNARY_OP_SIGMOID: return HTP_OP_UNARY_SIGMOID;
2865
+ case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
2866
+ case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
2867
+ case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
2868
+ case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
2869
+ default:
2870
+ break;
2871
+ }
2872
+ break;
2873
+
2874
+ case GGML_OP_GLU:
2875
+ switch (ggml_get_glu_op(t)) {
2876
+ case GGML_GLU_OP_SWIGLU: return HTP_OP_GLU_SWIGLU;
2877
+ case GGML_GLU_OP_SWIGLU_OAI: return HTP_OP_GLU_SWIGLU_OAI;
2878
+ case GGML_GLU_OP_GEGLU: return HTP_OP_GLU_GEGLU;
2879
+ default: break;
2880
+ }
2881
+ break;
2882
+
2883
+ default:
2884
+ GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(t));
2885
+ }
2886
+ return HTP_OP_INVALID;
2887
+ }
2888
+
2889
+ static inline bool op_is_compute(ggml_tensor *node)
2890
+ {
2891
+ return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
2892
+ }
2893
+
2894
+ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
2895
+ auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2896
+
2897
+ HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes);
2898
+
2899
+ for (int i = 0; i < graph->n_nodes; ++i) {
2900
+ ggml_tensor * n = graph->nodes[i];
2901
+ if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
2902
+ sess->enqueue_op(op_remap_to_htp(n), n);
2903
+ }
2904
+ }
2905
+
2906
+ // Wait until all pending ops complete
2907
+ sess->flush();
2908
+
2909
+ return GGML_STATUS_SUCCESS;
2910
+ }
2911
+
2912
+ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
2913
+ auto sess = static_cast<ggml_hexagon_session *>(backend->context);
2914
+
2915
+ HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->c_name());
2916
+
2917
+ // Wait until all pending ops complete
2918
+ sess->flush();
2919
+ }
2920
+
2921
+ struct node_info {
2922
+ ggml_tensor * node;
2923
+
2924
+ std::vector<ggml_tensor *> fused;
2925
+
2926
+ ggml_op op() const {
2927
+ return node->op;
2928
+ }
2929
+
2930
+ const ggml_tensor * dst() const {
2931
+ return fused.empty() ? node : fused.back();
2932
+ }
2933
+
2934
+ const ggml_tensor * src0() const {
2935
+ return node->src[0];
2936
+ }
2937
+
2938
+ const ggml_tensor * src1() const {
2939
+ return node->src[1];
2940
+ }
2941
+
2942
+ bool is_empty() const {
2943
+ return ggml_op_is_empty(node->op);
2944
+ }
2945
+
2946
+ void add_fused(ggml_tensor * t) {
2947
+ fused.push_back(t);
2948
+ }
2949
+
2950
+ bool stackable() const {
2951
+ switch (this->op()) {
2952
+ case GGML_OP_MUL_MAT:
2953
+ case GGML_OP_MUL_MAT_ID:
2954
+ return ggml_is_quantized(this->src0()->type);
2955
+ default:
2956
+ return false;
2957
+ }
2958
+ }
2959
+
2960
+ bool same_input(const node_info& n) const {
2961
+ return n.src1() == this->src1();
2962
+ }
2963
+ };
2964
+
2965
+ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
2966
+ const int n = nodes.size();
2967
+
2968
+ std::vector<int> res;
2969
+ res.reserve(n);
2970
+
2971
+ std::vector<bool> used(n, false);
2972
+
2973
+ // The main goal here is to stack the MUL_MAT ops with the same src1 input.
2974
+ // This allows use to reuse dynamically quantized src1 in VTCM.
2975
+
2976
+ // TODO: the current version might do incorrect reordering in cases where quantized src0
2977
+ // input is an output of another Op.
2978
+
2979
+ for (int i0 = 0; i0 < n; i0++) {
2980
+ if (used[i0]) {
2981
+ continue;
2982
+ }
2983
+
2984
+ res.push_back(i0);
2985
+
2986
+ const auto & node0 = nodes[i0];
2987
+
2988
+ if (!node0.stackable()) {
2989
+ continue;
2990
+ }
2991
+
2992
+ // that many nodes forward to search for stackable nodes that can reuse VTCM
2993
+ constexpr int N_FORWARD = 16;
2994
+
2995
+ for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
2996
+ if (used[i1]) {
2997
+ continue;
2998
+ }
2999
+
3000
+ const auto & node1 = nodes[i1];
3001
+
3002
+ if (node1.stackable() && node1.same_input(node0)) {
3003
+ res.push_back(i1);
3004
+ used[i1] = true;
3005
+ }
3006
+ }
3007
+ }
3008
+
3009
+ return res;
3010
+ }
3011
+
3012
+ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgraph * gf) {
3013
+ const int n = gf->n_nodes;
3014
+
3015
+ constexpr int MAX_FUSE = 16;
3016
+
3017
+ enum ggml_op ops[MAX_FUSE];
3018
+
3019
+ std::vector<node_info> nodes;
3020
+ nodes.reserve(gf->n_nodes);
3021
+
3022
+ // fuse nodes:
3023
+ // we don't want to make reorders that break fusing, so we first pack all fusable tensors
3024
+ // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
3025
+ for (int i = 0; i < n; i++) {
3026
+ node_info node = {
3027
+ /*.node =*/gf->nodes[i],
3028
+ /*.fused =*/{},
3029
+ };
3030
+
3031
+ // fuse only ops that start with these operations
3032
+ // can be expanded when needed
3033
+ if (node.op() == GGML_OP_ADD ||
3034
+ node.op() == GGML_OP_NORM ||
3035
+ node.op() == GGML_OP_RMS_NORM) {
3036
+ ops[0] = node.op();
3037
+
3038
+ int f = i + 1;
3039
+ while (f < n && f < i + MAX_FUSE) {
3040
+ // conservatively allow fusing only these ops
3041
+ // can be expanded when needed
3042
+ if (gf->nodes[f]->op != GGML_OP_ADD &&
3043
+ gf->nodes[f]->op != GGML_OP_MUL &&
3044
+ gf->nodes[f]->op != GGML_OP_NORM &&
3045
+ gf->nodes[f]->op != GGML_OP_RMS_NORM) {
3046
+ break;
3047
+ }
3048
+ ops[f - i] = gf->nodes[f]->op;
3049
+ f++;
3050
+ }
3051
+
3052
+ f -= i;
3053
+ for (; f > 1; f--) {
3054
+ if (ggml_can_fuse(gf, i, ops, f)) {
3055
+ break;
3056
+ }
3057
+ }
3058
+
3059
+ // add the fused tensors into the node info so we can unfuse them later
3060
+ for (int k = 1; k < f; k++) {
3061
+ ++i;
3062
+
3063
+ // the .dst() becomes the last fused tensor
3064
+ node.add_fused(gf->nodes[i]);
3065
+ }
3066
+ }
3067
+
3068
+ nodes.push_back(std::move(node));
3069
+ }
3070
+
3071
+ const auto order = ggml_hexagon_graph_optimize_reorder(nodes);
3072
+
3073
+ // unfuse
3074
+ {
3075
+ int j = 0;
3076
+ for (const auto i : order) {
3077
+ const auto & node = nodes[i];
3078
+
3079
+ gf->nodes[j++] = node.node;
3080
+
3081
+ for (auto * fused : node.fused) {
3082
+ gf->nodes[j++] = fused;
3083
+ }
3084
+ }
3085
+ }
3086
+ }
3087
+
3088
+ static struct ggml_backend_i hexagon_backend_i = {
3089
+ /* .get_name = */ ggml_backend_hexagon_name,
3090
+ /* .free = */ ggml_backend_hexagon_free,
3091
+ /* .set_tensor_async = */ NULL,
3092
+ /* .get_tensor_async = */ NULL,
3093
+ /* .set_tensor_2d_async = */ NULL,
3094
+ /* .get_tensor_2d_async = */ NULL,
3095
+ /* .cpy_tensor_async = */ NULL,
3096
+ /* .synchronize = */ ggml_backend_hexagon_synchronize,
3097
+ /* .graph_plan_create = */ NULL,
3098
+ /* .graph_plan_free = */ NULL,
3099
+ /* .graph_plan_update = */ NULL,
3100
+ /* .graph_plan_compute = */ NULL,
3101
+ /* .graph_compute = */ ggml_backend_hexagon_graph_compute,
3102
+ /* .event_record = */ NULL,
3103
+ /* .event_wait = */ NULL,
3104
+ /* .graph_optimize = */ ggml_backend_hexagon_graph_optimize,
3105
+ };
3106
+
3107
+ static ggml_guid_t ggml_backend_hexagon_guid() {
3108
+ static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49,
3109
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 };
3110
+ return &guid;
3111
+ }
3112
+
3113
+ bool ggml_backend_is_hexagon(ggml_backend_t backend) {
3114
+ return backend && backend->iface.get_name == ggml_backend_hexagon_name;
3115
+ }
3116
+
3117
+ // device interface
3118
+
3119
+ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, const char * params) {
3120
+ auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3121
+
3122
+ return new ggml_backend{
3123
+ /* .guid = */ ggml_backend_hexagon_guid(),
3124
+ /* .interface = */ hexagon_backend_i,
3125
+ /* .device = */ dev,
3126
+ /* .context = */ sess,
3127
+ };
3128
+
3129
+ GGML_UNUSED(params);
3130
+ }
3131
+
3132
+ static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
3133
+ auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3134
+ return sess->c_name();
3135
+
3136
+ GGML_UNUSED(dev);
3137
+ }
3138
+
3139
+ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) {
3140
+ return "Hexagon";
3141
+ GGML_UNUSED(dev);
3142
+ }
3143
+
3144
+ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
3145
+ *free = 0;
3146
+ *total = *free;
3147
+
3148
+ GGML_UNUSED(dev);
3149
+ }
3150
+
3151
+ static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) {
3152
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
3153
+
3154
+ GGML_UNUSED(dev);
3155
+ }
3156
+
3157
+ static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
3158
+ props->name = ggml_backend_hexagon_device_get_name(dev);
3159
+ props->description = ggml_backend_hexagon_device_get_description(dev);
3160
+ props->type = ggml_backend_hexagon_device_get_type(dev);
3161
+ ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total);
3162
+ props->caps = {
3163
+ /* .async = */ true,
3164
+ /* .host_buffer = */ (bool) opt_hostbuf,
3165
+ /* .buffer_from_host_ptr = */ false,
3166
+ /* .events = */ false,
3167
+ };
3168
+ }
3169
+
3170
+ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) {
3171
+ auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3172
+ return &sess->buffer_type;
3173
+ }
3174
+
3175
+ static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_type(ggml_backend_dev_t dev) {
3176
+ auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3177
+ return &sess->repack_buffer_type;
3178
+ }
3179
+
3180
+ static bool ggml_hexagon_supported_buffer(ggml_hexagon_session *sess, const struct ggml_tensor * t) {
3181
+ if (t && t->buffer) {
3182
+ if (ggml_backend_buffer_is_hexagon(t->buffer) == false) return false; // not our buffer
3183
+ if (ggml_backend_hexagon_buffer_get_sess(t->buffer) != sess) return false; // wrong session
3184
+ }
3185
+ return true;
3186
+ }
3187
+
3188
+ static bool ggml_hexagon_supported_buffers(ggml_hexagon_session *sess, const struct ggml_tensor * t) {
3189
+ // all srcs & dsts must be mapped to the same session
3190
+ if (!ggml_hexagon_supported_buffer(sess, t)) {
3191
+ return false;
3192
+ }
3193
+
3194
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
3195
+ if (!ggml_hexagon_supported_buffer(sess, t->src[i])) {
3196
+ return false;
3197
+ }
3198
+ }
3199
+
3200
+ return true;
3201
+ }
3202
+
3203
+ static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3204
+ const struct ggml_tensor * src0 = op->src[0];
3205
+ const struct ggml_tensor * dst = op;
3206
+
3207
+ // for now we can do f32 -> f16 and f16 -> f32 (without reshaping)
3208
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3209
+ if ( dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) return false;
3210
+
3211
+ const bool sametype = (src0->type == dst->type);
3212
+ const bool transposed = ggml_is_transposed(src0) || ggml_is_transposed(dst);
3213
+ const bool sameshape = !transposed && ggml_are_same_shape(src0, dst);
3214
+
3215
+ // can handle any shape and any same-type (pretty slow if reshaping is required)
3216
+ if (sametype) return true;
3217
+
3218
+ // cannot handle re-shaping and type conversion at the same time
3219
+ if (!sameshape) return false;
3220
+
3221
+ return true;
3222
+ }
3223
+
3224
+ static bool ggml_hexagon_supported_cont(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3225
+ GGML_UNUSED(sess);
3226
+ const struct ggml_tensor * src0 = op->src[0];
3227
+
3228
+ // CONT is same-type only, supports f32 and f16
3229
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3230
+
3231
+ return true;
3232
+ }
3233
+
3234
+ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3235
+ GGML_UNUSED(sess);
3236
+ const struct ggml_tensor * src0 = op->src[0];
3237
+ const struct ggml_tensor * dst = op;
3238
+
3239
+ // Support f32 and f16
3240
+ if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3241
+
3242
+ // src and dst must be the same type
3243
+ if (src0->type != dst->type) return false;
3244
+
3245
+ // dst dims must be multiples of src dims
3246
+ if (dst->ne[0] % src0->ne[0] != 0) return false;
3247
+ if (dst->ne[1] % src0->ne[1] != 0) return false;
3248
+ if (dst->ne[2] % src0->ne[2] != 0) return false;
3249
+ if (dst->ne[3] % src0->ne[3] != 0) return false;
3250
+
3251
+ // require contiguous tensors (no transposition)
3252
+ if (ggml_is_transposed(src0) || ggml_is_transposed(dst)) return false;
3253
+
3254
+ return true;
3255
+ }
3256
+
3257
+ static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3258
+ const struct ggml_tensor * dst = op;
3259
+
3260
+ if (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) {
3261
+ return false;
3262
+ }
3263
+
3264
+ GGML_UNUSED(sess);
3265
+ return true;
3266
+ }
3267
+
3268
+ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
3269
+ auto sess = static_cast<ggml_hexagon_session *>(dev->context);
3270
+
3271
+ // reject ops that match the filter
3272
+ if (opt_opfilter && std::regex_match(ggml_op_desc(op), *opt_opfilter)) {
3273
+ return false;
3274
+ }
3275
+
3276
+ // all srcs & dsts must be mapped to the same session
3277
+ if (!ggml_hexagon_supported_buffers(sess, op)) {
3278
+ ggml_hexagon_dump_op_supp(sess->name, op, false);
3279
+ return false;
3280
+ }
3281
+
3282
+ bool supp = false;
3283
+ switch (op->op) {
3284
+ case GGML_OP_NONE:
3285
+ case GGML_OP_RESHAPE:
3286
+ case GGML_OP_VIEW:
3287
+ case GGML_OP_PERMUTE:
3288
+ case GGML_OP_TRANSPOSE:
3289
+ supp = true;
3290
+ break;
3291
+
3292
+ case GGML_OP_MUL:
3293
+ case GGML_OP_ADD:
3294
+ case GGML_OP_SUB:
3295
+ case GGML_OP_DIV:
3296
+ supp = ggml_hexagon_supported_binary(sess, op);
3297
+ break;
3298
+
3299
+ case GGML_OP_MUL_MAT:
3300
+ supp = ggml_hexagon_supported_mul_mat(sess, op);
3301
+ break;
3302
+
3303
+ case GGML_OP_MUL_MAT_ID:
3304
+ supp = ggml_hexagon_supported_mul_mat_id(sess, op);
3305
+ break;
3306
+
3307
+ case GGML_OP_ADD_ID:
3308
+ supp = ggml_hexagon_supported_add_id(sess, op);
3309
+ break;
3310
+
3311
+ case GGML_OP_L2_NORM:
3312
+ supp = ggml_hexagon_supported_unary(sess, op);
3313
+ break;
3314
+
3315
+ case GGML_OP_RMS_NORM:
3316
+ case GGML_OP_SCALE:
3317
+ supp = ggml_hexagon_supported_unary(sess, op);
3318
+ break;
3319
+
3320
+ case GGML_OP_SQR:
3321
+ case GGML_OP_SQRT:
3322
+ supp = ggml_hexagon_supported_unary(sess, op);
3323
+ break;
3324
+
3325
+ case GGML_OP_SUM_ROWS:
3326
+ supp = ggml_hexagon_supported_sum_rows(sess, op);
3327
+ break;
3328
+
3329
+ case GGML_OP_SOFT_MAX:
3330
+ supp = ggml_hexagon_supported_softmax(sess, op);
3331
+ break;
3332
+
3333
+ case GGML_OP_UNARY:
3334
+ switch (ggml_get_unary_op(op)) {
3335
+ case GGML_UNARY_OP_NEG:
3336
+ case GGML_UNARY_OP_EXP:
3337
+ case GGML_UNARY_OP_SIGMOID:
3338
+ case GGML_UNARY_OP_SOFTPLUS:
3339
+ case GGML_UNARY_OP_TANH:
3340
+ supp = ggml_hexagon_supported_unary(sess, op);
3341
+ break;
3342
+ case GGML_UNARY_OP_SILU:
3343
+ case GGML_UNARY_OP_GELU:
3344
+ supp = ggml_hexagon_supported_activations(sess, op);
3345
+ break;
3346
+ default:
3347
+ break;
3348
+ }
3349
+ break;
3350
+
3351
+ case GGML_OP_GLU:
3352
+ switch (ggml_get_glu_op(op)) {
3353
+ case GGML_GLU_OP_SWIGLU:
3354
+ case GGML_GLU_OP_SWIGLU_OAI:
3355
+ case GGML_GLU_OP_GEGLU:
3356
+ supp = ggml_hexagon_supported_activations(sess, op);
3357
+ break;
3358
+ default:
3359
+ break;
3360
+ }
3361
+ break;
3362
+
3363
+ case GGML_OP_ROPE:
3364
+ supp = ggml_hexagon_supported_rope(sess, op);
3365
+ break;
3366
+
3367
+ case GGML_OP_FLASH_ATTN_EXT:
3368
+ supp = ggml_hexagon_supported_flash_attn_ext(sess, op);
3369
+ break;
3370
+
3371
+ case GGML_OP_SET_ROWS:
3372
+ supp = ggml_hexagon_supported_set_rows(sess, op);
3373
+ break;
3374
+
3375
+ case GGML_OP_GET_ROWS:
3376
+ supp = ggml_hexagon_supported_get_rows(sess, op);
3377
+ break;
3378
+
3379
+ case GGML_OP_CPY:
3380
+ supp = ggml_hexagon_supported_cpy(sess, op);
3381
+ break;
3382
+
3383
+ case GGML_OP_CONT:
3384
+ supp = ggml_hexagon_supported_cont(sess, op);
3385
+ break;
3386
+
3387
+ case GGML_OP_REPEAT:
3388
+ supp = ggml_hexagon_supported_repeat(sess, op);
3389
+ break;
3390
+
3391
+ case GGML_OP_ARGSORT:
3392
+ supp = ggml_hexagon_supported_argsort(sess, op);
3393
+ break;
3394
+
3395
+ case GGML_OP_SSM_CONV:
3396
+ supp = ggml_hexagon_supported_ssm_conv(sess, op);
3397
+ break;
3398
+
3399
+ case GGML_OP_GATED_DELTA_NET:
3400
+ supp = ggml_hexagon_supported_gated_delta_net(sess, op);
3401
+ break;
3402
+
3403
+ case GGML_OP_CUMSUM:
3404
+ supp = ggml_hexagon_supported_cumsum(sess, op);
3405
+ break;
3406
+
3407
+ case GGML_OP_FILL:
3408
+ supp = ggml_hexagon_supported_fill(sess, op);
3409
+ break;
3410
+
3411
+ case GGML_OP_DIAG:
3412
+ supp = ggml_hexagon_supported_diag(sess, op);
3413
+ break;
3414
+
3415
+ case GGML_OP_SOLVE_TRI:
3416
+ supp = ggml_hexagon_supported_solve_tri(sess, op);
3417
+ break;
3418
+
3419
+ default:
3420
+ break;
3421
+ }
3422
+
3423
+ ggml_hexagon_dump_op_supp(sess->name, op, supp);
3424
+ return supp;
3425
+ }
3426
+
3427
+ static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
3428
+ if (buft->iface.get_alignment != ggml_backend_hexagon_buffer_type_get_alignment) {
3429
+ return false;
3430
+ }
3431
+
3432
+ auto s0 = static_cast<ggml_hexagon_session *>(dev->context);
3433
+ auto s1 = static_cast<ggml_backend_hexagon_buffer_type_context *>(buft->context)->sess;
3434
+
3435
+ // Need session/domain-id for buffers to be compatible
3436
+ bool supp = (s0->session_id == s1->session_id);
3437
+
3438
+ HEX_VERBOSE("ggml-hex: %s device-supports-buft %s (%d)\n", s0->name.c_str(), s1->name.c_str(), (int) supp);
3439
+
3440
+ return supp;
3441
+ }
3442
+
3443
+ static ggml_backend_buffer_type_t * ggml_backend_hexagon_device_get_extra_buffers_type(ggml_backend_dev_t dev) {
3444
+ auto s0 = static_cast<ggml_hexagon_session *>(dev->context);
3445
+ HEX_VERBOSE("ggml-hex: device-get-extra-buft : %s \n", s0->name.c_str());
3446
+
3447
+ static ggml_backend_buffer_type_t bufts[2];
3448
+ bufts[0] = ggml_backend_hexagon_device_get_repack_buffer_type(dev);
3449
+ bufts[1] = NULL;
3450
+ return bufts;
3451
+ }
3452
+
3453
+ static const struct ggml_backend_device_i ggml_backend_hexagon_device_i = {
3454
+ /* .get_name = */ ggml_backend_hexagon_device_get_name,
3455
+ /* .get_description = */ ggml_backend_hexagon_device_get_description,
3456
+ /* .get_memory = */ ggml_backend_hexagon_device_get_memory,
3457
+ /* .get_type = */ ggml_backend_hexagon_device_get_type,
3458
+ /* .get_props = */ ggml_backend_hexagon_device_get_props,
3459
+ /* .init_backend = */ ggml_backend_hexagon_device_init,
3460
+ /* .get_buffer_type = */ ggml_backend_hexagon_device_get_buffer_type,
3461
+ /* .get_host_buffer_type = */ NULL, // ggml_backend_hexagon_device_get_host_buffer_type,
3462
+ /* .buffer_from_host_ptr = */ NULL, // ggml_backend_hexagon_device_buffer_from_ptr,
3463
+ /* .supports_op = */ ggml_backend_hexagon_device_supports_op,
3464
+ /* .supports_buft = */ ggml_backend_hexagon_device_supports_buft,
3465
+ /* .offload_op = */ NULL, // ggml_backend_hexagon_device_offload_op,
3466
+ /* .event_new = */ NULL,
3467
+ /* .event_free = */ NULL,
3468
+ /* .event_synchronize = */ NULL,
3469
+ };
3470
+
3471
+ //** backend registry
3472
+
3473
+ #define GGML_HEXAGON_MAX_SESSIONS 16
3474
+
3475
+ struct ggml_hexagon_registry {
3476
+ ggml_hexagon_registry(ggml_backend_reg_t reg);
3477
+ ~ggml_hexagon_registry();
3478
+
3479
+ ggml_backend_device devices[GGML_HEXAGON_MAX_SESSIONS];
3480
+ };
3481
+
3482
+ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
3483
+ GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev);
3484
+
3485
+ GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
3486
+
3487
+ // Create devices / sessions
3488
+ for (size_t i = 0; i < opt_ndev; i++) {
3489
+ devices[i].iface = ggml_backend_hexagon_device_i;
3490
+ devices[i].reg = reg;
3491
+ try {
3492
+ devices[i].context = new ggml_hexagon_session(i, &devices[i]);
3493
+ } catch (const std::exception & exc) {
3494
+ GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
3495
+ devices[i].context = nullptr;
3496
+ }
3497
+ }
3498
+ }
3499
+
3500
+ ggml_hexagon_registry::~ggml_hexagon_registry() {
3501
+ GGML_LOG_INFO("ggml-hex: releasing registry\n");
3502
+
3503
+ // Release devices / sessions
3504
+ for (size_t i = 0; i < opt_ndev; i++) {
3505
+ auto sess = static_cast<ggml_hexagon_session *>(devices[i].context);
3506
+ delete sess;
3507
+ }
3508
+ }
3509
+
3510
+ static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) {
3511
+ return "HTP";
3512
+ GGML_UNUSED(reg);
3513
+ }
3514
+
3515
+ static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) {
3516
+ return opt_ndev;
3517
+ GGML_UNUSED(reg);
3518
+ }
3519
+
3520
+ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) {
3521
+ auto hreg = static_cast<ggml_hexagon_registry *>(reg->context);
3522
+
3523
+ if (index >= opt_ndev || !hreg->devices[index].context) {
3524
+ return nullptr;
3525
+ }
3526
+
3527
+ return &hreg->devices[index];
3528
+ }
3529
+
3530
+ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) {
3531
+ if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0 && opt_hostbuf) {
3532
+ ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type;
3533
+ return (void *) fct;
3534
+ }
3535
+
3536
+ return NULL;
3537
+ }
3538
+
3539
// Parses a comma-separated list of unsigned integers into a vector<T>.
// Each token is converted with std::stoul using base auto-detection
// (decimal, 0x-prefixed hex, 0-prefixed octal) and then cast to T.
// Returns an empty vector when `str` is null, empty, or contains a token that
// cannot be converted (non-numeric, empty, or out of range for unsigned long),
// so malformed input never propagates an exception to the caller.
template<typename T> std::vector<T> str_to_vec(const char* str) {
    std::vector<T> v;
    if (!str) {
        return v;
    }

    std::stringstream ss(str);
    std::string       t;

    try {
        while (std::getline(ss, t, ',')) {
            v.push_back(static_cast<T>(std::stoul(t, nullptr, 0)));
        }
    } catch (const std::exception &) {
        // std::invalid_argument / std::out_of_range : treat the whole input as garbage
        v.clear();
    }

    return v;
}
3550
+
3551
// Formats the elements of `v` as a comma-separated string in the given BASE
// (with the base prefix shown, e.g. 0x for BASE=16).
// An empty vector yields an empty string.
template<typename T, int BASE=10> std::string vec_to_str(const std::vector<T> & v) {
    std::stringstream ss;
    ss << std::setbase(BASE) << std::showbase;

    // emit the separator before every element except the first, so there is
    // no trailing comma to strip (and nothing to strip from an empty result)
    const char * sep = "";
    for (const auto & item : v) {
        ss << sep << item;
        sep = ",";
    }

    return ss.str();
}
3558
+
3559
+ static void ggml_hexagon_init(ggml_backend_reg * reg) {
3560
+ // Basic sanity checks to make sure definitions match
3561
+ static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
3562
+ "please update hexagon_type to match ggml_type");
3563
+ static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
3564
+ "please update hexagon_type to match ggml_type");
3565
+ static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
3566
+ "please update hexagon_type to match ggml_type");
3567
+ static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
3568
+ "please update hexagon_type to match ggml_type");
3569
+
3570
+ const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
3571
+ const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
3572
+ const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
3573
+ const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
3574
+ const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
3575
+ const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
3576
+ const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
3577
+ const char * str_etm = getenv("GGML_HEXAGON_ETM");
3578
+ const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
3579
+ const char * str_use_hmx = getenv("GGML_HEXAGON_USE_HMX");
3580
+ const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
3581
+ const char * str_arch = getenv("GGML_HEXAGON_ARCH");
3582
+ const char * str_vmem = getenv("GGML_HEXAGON_VMEM");
3583
+ const char * str_mbuf = getenv("GGML_HEXAGON_MBUF");
3584
+
3585
+ // Init Arch first since it affects other defaults
3586
+ if (!str_arch) {
3587
+ int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
3588
+ if (err != 0) {
3589
+ GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
3590
+ opt_arch = 73;
3591
+ }
3592
+ } else {
3593
+ if (str_arch[0] == 'v' || str_arch[0] == 'V') {
3594
+ str_arch++;
3595
+ }
3596
+ opt_arch = strtoul(str_arch, NULL, 0);
3597
+ }
3598
+
3599
+ size_t MiB = 1024 * 1024;
3600
+
3601
+ // Update vmem default
3602
+ opt_vmem = opt_arch >= 75 ? HTP_OP_MAX_VMEM_DEFAULT : 3000 * MiB;
3603
+
3604
+ auto RE_ICASE = std::regex_constants::icase;
3605
+
3606
+ opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
3607
+ opt_verbose = str_verbose ? atoi(str_verbose) : 0;
3608
+ opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
3609
+ opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
3610
+ opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
3611
+ opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
3612
+ opt_profile = str_profile ? atoi(str_profile) : 0;
3613
+ opt_etm = str_etm ? atoi(str_etm) : 0;
3614
+ opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
3615
+ opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
3616
+ opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
3617
+ opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
3618
+ opt_mbuf = str_mbuf ? strtoul(str_mbuf, NULL, 0) * MiB : opt_mbuf;
3619
+ opt_vmem = str_vmem ? strtoul(str_vmem, NULL, 0) * MiB : opt_vmem;
3620
+
3621
+ if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
3622
+ opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
3623
+ }
3624
+
3625
+ #if defined(__ANDROID__)
3626
+ if (opt_arch < 75) {
3627
+ opt_ndev = 1;
3628
+ GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
3629
+ }
3630
+ #endif
3631
+
3632
+ if (str_profile) {
3633
+ opt_pmu_evt = [&]() -> std::vector<uint32_t> {
3634
+ auto v = str_to_vec<uint32_t>(str_profile);
3635
+ switch (v.size()) {
3636
+ case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
3637
+ case 8: opt_profile = 2; return v; // mode with custom pmu events
3638
+ default: opt_profile = 0; return {}; // garbage input
3639
+ }}();
3640
+ if (opt_profile == 1) opt_pmu_evt = {};
3641
+ GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
3642
+ vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
3643
+ }
3644
+
3645
+ reg->context = new ggml_hexagon_registry(reg);
3646
+ }
3647
+
3648
+ static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = {
3649
+ /* .get_name = */ ggml_backend_hexagon_reg_get_name,
3650
+ /* .get_device_count = */ ggml_backend_hexagon_reg_get_device_count,
3651
+ /* .get_device = */ ggml_backend_hexagon_reg_get_device,
3652
+ /* .get_proc_address = */ ggml_backend_hexagon_get_proc_address,
3653
+ };
3654
+
3655
+ ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
3656
+ static bool initialized = false;
3657
+
3658
+ static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION,
3659
+ /* .iface = */ ggml_backend_hexagon_reg_i,
3660
+ /* .context = */ NULL };
3661
+
3662
+ {
3663
+ static std::mutex mutex;
3664
+ std::lock_guard<std::mutex> lock(mutex);
3665
+ if (!initialized) {
3666
+ auto nErr = htpdrv_init();
3667
+ if (nErr != AEE_SUCCESS) {
3668
+ return NULL;
3669
+ }
3670
+
3671
+ ggml_hexagon_init(&reg);
3672
+ }
3673
+
3674
+ initialized = true;
3675
+ }
3676
+
3677
+ return &reg;
3678
+ }
3679
+
3680
+ GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg)