toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,3835 @@
1
+ #define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
2
+ #define _USE_MATH_DEFINES // For M_PI on MSVC
3
+
4
+ #include "ggml-backend-impl.h"
5
+ #include "ggml-backend.h"
6
+ #include "traits.h"
7
+ #include "ggml-cpu-impl.h"
8
+ #include "ggml-impl.h"
9
+ #include "quants.h"
10
+ #include "ggml-threading.h"
11
+ #include "unary-ops.h"
12
+ #include "binary-ops.h"
13
+ #include "vec.h"
14
+ #include "ops.h"
15
+ #include "ggml.h"
16
+ #include "common.h"
17
+
18
+ #if defined(_MSC_VER) || defined(__MINGW32__)
19
+ #include <malloc.h> // using malloc.h with MSC/MINGW
20
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
21
+ #include <alloca.h>
22
+ #endif
23
+
24
+ #include <assert.h>
25
+ #include <errno.h>
26
+ #include <time.h>
27
+ #include <math.h>
28
+ #include <stdlib.h>
29
+ #include <string.h>
30
+ #include <stdint.h>
31
+ #include <inttypes.h>
32
+ #include <stdio.h>
33
+ #include <float.h>
34
+ #include <limits.h>
35
+ #include <stdarg.h>
36
+ #include <signal.h>
37
+ #if defined(__gnu_linux__)
38
+ #include <syscall.h>
39
+ #endif
40
+
41
+ #ifdef GGML_USE_OPENMP
42
+ #include <omp.h>
43
+ #endif
44
+
45
+ #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
46
+ #undef GGML_USE_LLAMAFILE
47
+ #endif
48
+
49
+ #ifdef GGML_USE_LLAMAFILE
50
+ #include "llamafile/sgemm.h"
51
+ #endif
52
+
53
+ #ifdef GGML_USE_CPU_RISCV64_SPACEMIT
54
+ # include "spacemit/ime.h"
55
+ #endif
56
+
57
+ // Note: once we move threading into a separate C++ file, we
58
+ // will use std::hardware_destructive_interference_size instead of hardcoding it here,
59
+ // and we'll use C++ attribute syntax.
60
+ #define GGML_CACHE_LINE 64
61
+
62
+ #if defined(__clang__) || defined(__GNUC__)
63
+ #define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
64
+ #endif
65
+
66
+ #if defined(__has_feature)
67
+ #if __has_feature(thread_sanitizer)
68
+ #define GGML_TSAN_ENABLED 1
69
+ #endif
70
+ #else // __has_feature
71
+ #if defined(__SANITIZE_THREAD__)
72
+ #define GGML_TSAN_ENABLED 1
73
+ #endif
74
+ #endif // __has_feature
75
+
76
+ #define UNUSED GGML_UNUSED
77
+ #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
78
+
79
+ // precomputed f32 table for f16 (256 KB) (simd-mappings.h)
80
+ float ggml_table_f32_f16[1 << 16];
81
+
82
+ // precomputed f32 table for e8m0 half (1 KB) (simd-mappings.h)
83
+ float ggml_table_f32_e8m0_half[1 << 8];
84
+
85
+ #if defined(__ARM_ARCH)
86
+ struct ggml_arm_arch_features_type {
87
+ int sve_cnt;
88
+ } ggml_arm_arch_features = { 0 };
89
+ #endif
90
+
91
+ #if defined(__riscv)
92
+ struct ggml_riscv_arch_features_type {
93
+ int rvv_vlen;
94
+ } ggml_riscv_arch_features = { 0 };
95
+ #endif
96
+
97
+ #if defined(_WIN32)
98
+
99
+ #define WIN32_LEAN_AND_MEAN
100
+ #ifndef NOMINMAX
101
+ #define NOMINMAX
102
+ #endif
103
+ #include <windows.h>
104
+
105
+ #if defined(_MSC_VER) && !defined(__clang__)
106
+ #define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
107
+
108
+ typedef volatile LONG atomic_int;
109
+ typedef atomic_int atomic_bool;
110
+ typedef atomic_int atomic_flag;
111
+
112
+ #define ATOMIC_FLAG_INIT 0
113
+
114
+ typedef enum {
115
+ memory_order_relaxed,
116
+ memory_order_consume,
117
+ memory_order_acquire,
118
+ memory_order_release,
119
+ memory_order_acq_rel,
120
+ memory_order_seq_cst
121
+ } memory_order;
122
+
123
+ static void atomic_store(atomic_int * ptr, LONG val) {
124
+ InterlockedExchange(ptr, val);
125
+ }
126
+ static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
127
+ // TODO: add support for explicit memory order
128
+ InterlockedExchange(ptr, val);
129
+ }
130
+ static LONG atomic_load(atomic_int * ptr) {
131
+ return InterlockedCompareExchange(ptr, 0, 0);
132
+ }
133
+ static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
134
+ // TODO: add support for explicit memory order
135
+ return InterlockedCompareExchange(ptr, 0, 0);
136
+ }
137
+ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
138
+ return InterlockedExchangeAdd(ptr, inc);
139
+ }
140
+ static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
141
+ // TODO: add support for explicit memory order
142
+ return InterlockedExchangeAdd(ptr, inc);
143
+ }
144
+ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
145
+ return InterlockedExchange(ptr, 1);
146
+ }
147
+ static void atomic_flag_clear(atomic_flag * ptr) {
148
+ InterlockedExchange(ptr, 0);
149
+ }
150
+ static void atomic_thread_fence(memory_order mo) {
151
+ MemoryBarrier();
152
+ }
153
+ #else // clang
154
+ #include <stdatomic.h>
155
+ #endif
156
+
157
+ typedef HANDLE pthread_t;
158
+
159
+ typedef DWORD thread_ret_t;
160
+ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
161
+ (void) unused;
162
+ HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
163
+ if (handle == NULL)
164
+ {
165
+ return EAGAIN;
166
+ }
167
+
168
+ *out = handle;
169
+ return 0;
170
+ }
171
+
172
+ static int pthread_join(pthread_t thread, void * unused) {
173
+ (void) unused;
174
+ int ret = (int) WaitForSingleObject(thread, INFINITE);
175
+ CloseHandle(thread);
176
+ return ret;
177
+ }
178
+
179
+ static int sched_yield (void) {
180
+ Sleep (0);
181
+ return 0;
182
+ }
183
+ #else
184
+
185
+ #include <pthread.h>
186
+ #include <stdatomic.h>
187
+ #include <sched.h>
188
+ #if defined(__FreeBSD__)
189
+ #include <pthread_np.h>
190
+ #endif
191
+
192
+ typedef void * thread_ret_t;
193
+
194
+ #include <sys/types.h>
195
+ #include <sys/stat.h>
196
+ #include <unistd.h>
197
+
198
+ #endif
199
+
200
+ typedef pthread_t ggml_thread_t;
201
+
202
+ #define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
203
+ #define GGML_THREADPOOL_N_THREADS_BITS (16)
204
+
205
+ #if defined(__APPLE__)
206
+ #include <unistd.h>
207
+ #include <mach/mach.h>
208
+ #include <TargetConditionals.h>
209
+ #endif
210
+
211
+ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
212
+ [GGML_TYPE_F32] = {
213
+ .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
214
+ .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
215
+ .vec_dot_type = GGML_TYPE_F32,
216
+ .nrows = 1,
217
+ },
218
+ [GGML_TYPE_F16] = {
219
+ .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
220
+ .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
221
+ .vec_dot_type = GGML_TYPE_F16,
222
+ .nrows = 1,
223
+ },
224
+ [GGML_TYPE_Q1_0] = {
225
+ .from_float = quantize_row_q1_0,
226
+ .vec_dot = ggml_vec_dot_q1_0_q8_0,
227
+ .vec_dot_type = GGML_TYPE_Q8_0,
228
+ .nrows = 1,
229
+ },
230
+ [GGML_TYPE_Q4_0] = {
231
+ .from_float = quantize_row_q4_0,
232
+ .vec_dot = ggml_vec_dot_q4_0_q8_0,
233
+ .vec_dot_type = GGML_TYPE_Q8_0,
234
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
235
+ .nrows = 2,
236
+ #else
237
+ .nrows = 1,
238
+ #endif
239
+ },
240
+ [GGML_TYPE_Q4_1] = {
241
+ .from_float = quantize_row_q4_1,
242
+ .vec_dot = ggml_vec_dot_q4_1_q8_1,
243
+ .vec_dot_type = GGML_TYPE_Q8_1,
244
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
245
+ .nrows = 2,
246
+ #else
247
+ .nrows = 1,
248
+ #endif
249
+ },
250
+ [GGML_TYPE_Q5_0] = {
251
+ .from_float = quantize_row_q5_0,
252
+ .vec_dot = ggml_vec_dot_q5_0_q8_0,
253
+ .vec_dot_type = GGML_TYPE_Q8_0,
254
+ .nrows = 1,
255
+ },
256
+ [GGML_TYPE_Q5_1] = {
257
+ .from_float = quantize_row_q5_1,
258
+ .vec_dot = ggml_vec_dot_q5_1_q8_1,
259
+ .vec_dot_type = GGML_TYPE_Q8_1,
260
+ .nrows = 1,
261
+ },
262
+ [GGML_TYPE_Q8_0] = {
263
+ .from_float = quantize_row_q8_0,
264
+ .vec_dot = ggml_vec_dot_q8_0_q8_0,
265
+ .vec_dot_type = GGML_TYPE_Q8_0,
266
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
267
+ .nrows = 2,
268
+ #else
269
+ .nrows = 1,
270
+ #endif
271
+ },
272
+ [GGML_TYPE_Q8_1] = {
273
+ .from_float = quantize_row_q8_1,
274
+ .vec_dot_type = GGML_TYPE_Q8_1,
275
+ .nrows = 1,
276
+ },
277
+ [GGML_TYPE_MXFP4] = {
278
+ .from_float = quantize_row_mxfp4,
279
+ .vec_dot = ggml_vec_dot_mxfp4_q8_0,
280
+ .vec_dot_type = GGML_TYPE_Q8_0,
281
+ .nrows = 1,
282
+ },
283
+ [GGML_TYPE_NVFP4] = {
284
+ .from_float = quantize_row_nvfp4,
285
+ .vec_dot = ggml_vec_dot_nvfp4_q8_0,
286
+ .vec_dot_type = GGML_TYPE_Q8_0,
287
+ .nrows = 1,
288
+ },
289
+ [GGML_TYPE_Q2_K] = {
290
+ .from_float = quantize_row_q2_K,
291
+ .vec_dot = ggml_vec_dot_q2_K_q8_K,
292
+ .vec_dot_type = GGML_TYPE_Q8_K,
293
+ .nrows = 1,
294
+ },
295
+ [GGML_TYPE_Q3_K] = {
296
+ .from_float = quantize_row_q3_K,
297
+ .vec_dot = ggml_vec_dot_q3_K_q8_K,
298
+ .vec_dot_type = GGML_TYPE_Q8_K,
299
+ .nrows = 1,
300
+ },
301
+ [GGML_TYPE_Q4_K] = {
302
+ .from_float = quantize_row_q4_K,
303
+ .vec_dot = ggml_vec_dot_q4_K_q8_K,
304
+ .vec_dot_type = GGML_TYPE_Q8_K,
305
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
306
+ .nrows = 2,
307
+ #else
308
+ .nrows = 1,
309
+ #endif
310
+ },
311
+ [GGML_TYPE_Q5_K] = {
312
+ .from_float = quantize_row_q5_K,
313
+ .vec_dot = ggml_vec_dot_q5_K_q8_K,
314
+ .vec_dot_type = GGML_TYPE_Q8_K,
315
+ .nrows = 1,
316
+ },
317
+ [GGML_TYPE_Q6_K] = {
318
+ .from_float = quantize_row_q6_K,
319
+ .vec_dot = ggml_vec_dot_q6_K_q8_K,
320
+ .vec_dot_type = GGML_TYPE_Q8_K,
321
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
322
+ .nrows = 2,
323
+ #else
324
+ .nrows = 1,
325
+ #endif
326
+ },
327
+ [GGML_TYPE_IQ2_XXS] = {
328
+ .from_float = NULL,
329
+ .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
330
+ .vec_dot_type = GGML_TYPE_Q8_K,
331
+ .nrows = 1,
332
+ },
333
+ [GGML_TYPE_IQ2_XS] = {
334
+ .from_float = NULL,
335
+ .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
336
+ .vec_dot_type = GGML_TYPE_Q8_K,
337
+ .nrows = 1,
338
+ },
339
+ [GGML_TYPE_IQ3_XXS] = {
340
+ // NOTE: from_float for iq3 and iq2_s was removed because these quants require initialization in ggml_quantize_init
341
+ //.from_float = quantize_row_iq3_xxs,
342
+ .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
343
+ .vec_dot_type = GGML_TYPE_Q8_K,
344
+ .nrows = 1,
345
+ },
346
+ [GGML_TYPE_IQ3_S] = {
347
+ //.from_float = quantize_row_iq3_s,
348
+ .vec_dot = ggml_vec_dot_iq3_s_q8_K,
349
+ .vec_dot_type = GGML_TYPE_Q8_K,
350
+ .nrows = 1,
351
+ },
352
+ [GGML_TYPE_IQ2_S] = {
353
+ //.from_float = quantize_row_iq2_s,
354
+ .vec_dot = ggml_vec_dot_iq2_s_q8_K,
355
+ .vec_dot_type = GGML_TYPE_Q8_K,
356
+ .nrows = 1,
357
+ },
358
+ [GGML_TYPE_IQ1_S] = {
359
+ .from_float = NULL,
360
+ .vec_dot = ggml_vec_dot_iq1_s_q8_K,
361
+ .vec_dot_type = GGML_TYPE_Q8_K,
362
+ .nrows = 1,
363
+ },
364
+ [GGML_TYPE_IQ1_M] = {
365
+ .from_float = NULL,
366
+ .vec_dot = ggml_vec_dot_iq1_m_q8_K,
367
+ .vec_dot_type = GGML_TYPE_Q8_K,
368
+ .nrows = 1,
369
+ },
370
+ [GGML_TYPE_IQ4_NL] = {
371
+ .from_float = quantize_row_iq4_nl,
372
+ .vec_dot = ggml_vec_dot_iq4_nl_q8_0,
373
+ .vec_dot_type = GGML_TYPE_Q8_0,
374
+ .nrows = 1,
375
+ },
376
+ [GGML_TYPE_IQ4_XS] = {
377
+ .from_float = quantize_row_iq4_xs,
378
+ .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
379
+ .vec_dot_type = GGML_TYPE_Q8_K,
380
+ .nrows = 1,
381
+ },
382
+ [GGML_TYPE_Q8_K] = {
383
+ .from_float = quantize_row_q8_K,
384
+ },
385
+ [GGML_TYPE_BF16] = {
386
+ .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
387
+ .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
388
+ .vec_dot_type = GGML_TYPE_BF16,
389
+ .nrows = 1,
390
+ },
391
+ [GGML_TYPE_TQ1_0] = {
392
+ .from_float = quantize_row_tq1_0,
393
+ .vec_dot = ggml_vec_dot_tq1_0_q8_K,
394
+ .vec_dot_type = GGML_TYPE_Q8_K,
395
+ .nrows = 1,
396
+ },
397
+ [GGML_TYPE_TQ2_0] = {
398
+ .from_float = quantize_row_tq2_0,
399
+ .vec_dot = ggml_vec_dot_tq2_0_q8_K,
400
+ .vec_dot_type = GGML_TYPE_Q8_K,
401
+ .nrows = 1,
402
+ },
403
+ [GGML_TYPE_I32] = {
404
+ .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
405
+ },
406
+ };
407
+
408
+ const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
409
+ return &type_traits_cpu[type];
410
+ }
411
+
412
+ //
413
+ // Threading defs
414
+ //
415
+
416
+ typedef pthread_t ggml_thread_t;
417
+
418
+ #if defined(_WIN32)
419
+
420
+ typedef CONDITION_VARIABLE ggml_cond_t;
421
+ typedef SRWLOCK ggml_mutex_t;
422
+
423
+ #define ggml_mutex_init(m) InitializeSRWLock(m)
424
+ #define ggml_mutex_destroy(m)
425
+ #define ggml_mutex_lock(m) AcquireSRWLockExclusive(m)
426
+ #define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m)
427
+ #define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m)
428
+ #define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)
429
+
430
+ #define ggml_cond_init(c) InitializeConditionVariable(c)
431
+ #define ggml_cond_destroy(c)
432
+ #define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
433
+ #define ggml_cond_broadcast(c) WakeAllConditionVariable(c)
434
+
435
+ #define ggml_thread_create pthread_create
436
+ #define ggml_thread_join pthread_join
437
+
438
+ #else
439
+
440
+ typedef pthread_cond_t ggml_cond_t;
441
+ typedef pthread_mutex_t ggml_mutex_t;
442
+
443
+ #define ggml_mutex_init(m) pthread_mutex_init(m, NULL)
444
+ #define ggml_mutex_destroy(m) pthread_mutex_destroy(m)
445
+ #define ggml_mutex_lock(m) pthread_mutex_lock(m)
446
+ #define ggml_mutex_unlock(m) pthread_mutex_unlock(m)
447
+ #define ggml_mutex_lock_shared(m) pthread_mutex_lock(m)
448
+ #define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)
449
+
450
+ #define ggml_lock_init(x) UNUSED(x)
451
+ #define ggml_lock_destroy(x) UNUSED(x)
452
+ #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
453
+ #define ggml_lock_lock(x) _mm_pause()
454
+ #else
455
+ #define ggml_lock_lock(x) UNUSED(x)
456
+ #endif
457
+ #define ggml_lock_unlock(x) UNUSED(x)
458
+
459
+ #define GGML_LOCK_INITIALIZER 0
460
+ #define ggml_cond_init(c) pthread_cond_init(c, NULL)
461
+ #define ggml_cond_destroy(c) pthread_cond_destroy(c)
462
+ #define ggml_cond_wait(c, m) pthread_cond_wait(c, m)
463
+ #define ggml_cond_broadcast(c) pthread_cond_broadcast(c)
464
+
465
+ #define ggml_thread_create pthread_create
466
+ #define ggml_thread_join pthread_join
467
+
468
+ #endif
469
+
470
+ // Threadpool def
471
+ struct ggml_threadpool {
472
+ ggml_mutex_t mutex; // mutex for cond.var
473
+ ggml_cond_t cond; // cond.var for waiting for new work
474
+
475
+ struct ggml_cgraph * cgraph;
476
+ struct ggml_cplan * cplan;
477
+
478
+ // synchronization primitives
479
+ atomic_int n_graph; // updated when there is work to be done (i.e. for each graph); holds graph and active thread counts (thread count is in the GGML_THREADPOOL_N_THREADS_MASK bits)
480
+ atomic_int GGML_CACHE_ALIGN n_barrier;
481
+ atomic_int GGML_CACHE_ALIGN n_barrier_passed;
482
+ atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
483
+
484
+ // these are atomic as an annotation for thread-sanitizer
485
+ atomic_bool stop; // Used for stopping the threadpool altogether
486
+ atomic_bool pause; // Used for pausing the threadpool or individual threads
487
+ atomic_int abort; // Used for aborting processing of a graph
488
+
489
+ struct ggml_compute_state * workers; // per thread state
490
+ int n_threads; // Number of threads in the pool
491
+ int32_t prio; // Scheduling priority
492
+ uint32_t poll; // Polling level (0 - no polling)
493
+
494
+ enum ggml_status ec;
495
+ };
496
+
497
+ // Per-thread state
498
+ struct ggml_compute_state {
499
+ #ifndef GGML_USE_OPENMP
500
+ ggml_thread_t thrd;
501
+ int last_graph;
502
+ bool pending;
503
+ #endif
504
+ bool cpumask[GGML_MAX_N_THREADS];
505
+ struct ggml_threadpool * threadpool;
506
+ int ith;
507
+ };
508
+
509
+ // Helpers for polling loops
510
+ #if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
511
+ static inline void ggml_thread_cpu_relax(void) {
512
+ __asm__ volatile("yield" ::: "memory");
513
+ }
514
+ #elif defined(__x86_64__)
515
+ static inline void ggml_thread_cpu_relax(void) {
516
+ _mm_pause();
517
+ }
518
+ #elif defined(__riscv)
519
+ static inline void ggml_thread_cpu_relax(void) {
520
+ #ifdef __riscv_zihintpause
521
+ __asm__ __volatile__ ("pause");
522
+ #else
523
+ /* Encoding of the pause instruction */
524
+ __asm__ __volatile__ (".4byte 0x100000F");
525
+ #endif
526
+ }
527
+ #else
528
+ static inline void ggml_thread_cpu_relax(void) {;}
529
+ #endif
530
+
531
+ //
532
+ // NUMA support
533
+ //
534
+
535
+ #define GGML_NUMA_MAX_NODES 8
536
+ #define GGML_NUMA_MAX_CPUS 512
537
+
538
+ struct ggml_numa_node {
539
+ uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
540
+ uint32_t n_cpus;
541
+ };
542
+
543
+ struct ggml_numa_nodes {
543
+ enum ggml_numa_strategy numa_strategy;
544
+ struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
545
+ uint32_t n_nodes;
546
+ uint32_t total_cpus; // hardware threads on system
547
+ uint32_t current_node; // node on which main process is executing
548
+ #if defined(__gnu_linux__)
549
+ cpu_set_t cpuset; // cpuset from numactl
550
+ #else
551
+ uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
552
+ #endif
553
+ };
555
+
556
+ //
557
+ // ggml state
558
+ //
559
+
560
+ struct ggml_state {
561
+ struct ggml_numa_nodes numa;
562
+ };
563
+
564
+ static struct ggml_state g_state = {0};
565
+
566
+ void ggml_barrier(struct ggml_threadpool * tp) {
567
+ int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
568
+ if (n_threads == 1) {
569
+ return;
570
+ }
571
+
572
+ #ifdef GGML_USE_OPENMP
573
+ #pragma omp barrier
574
+ #else
575
+ int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
576
+
577
+ // enter barrier (full seq-cst fence)
578
+ int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
579
+
580
+ if (n_barrier == (n_threads - 1)) {
581
+ // last thread
582
+ atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
583
+
584
+ // exit barrier (full seq-cst fence)
585
+ atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
586
+ return;
587
+ }
588
+
589
+ // wait for other threads
590
+ while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
591
+ ggml_thread_cpu_relax();
592
+ }
593
+
594
+ // exit barrier (full seq-cst fence)
595
+ // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
596
+ #ifdef GGML_TSAN_ENABLED
597
+ atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
598
+ #else
599
+ atomic_thread_fence(memory_order_seq_cst);
600
+ #endif
601
+ #endif
602
+ }
603
+
604
+ void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) { // overwrites the threadpool's shared chunk counter with `value`
605
+ atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed); // relaxed: only atomicity of the store is requested here
606
+ }
607
+
608
+ int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) { // atomically adds `value` to the shared chunk counter
609
+ return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed); // fetch-add returns the counter value from before the addition
610
+ }
611
+
612
#if defined(__gnu_linux__)
// Returns the CPU affinity mask of the calling thread.
// The mask is cleared first and the result of pthread_getaffinity_np() is not checked,
// exactly as before.
static cpu_set_t ggml_get_numa_affinity(void) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask);
    return mask;
}
#else
// Placeholder for platforms other than GNU/Linux: there is no affinity mask to report.
static uint32_t ggml_get_numa_affinity(void) {
    const uint32_t no_affinity = 0; // no NUMA support
    return no_affinity;
}
#endif
626
+
627
+ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
628
+ if (g_state.numa.n_nodes > 0) {
629
+ fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
630
+
631
+ return;
632
+ }
633
+
634
+ #if defined(__gnu_linux__)
635
+ struct stat st;
636
+ char path[256];
637
+ int rv;
638
+
639
+ // set numa scheme
640
+ g_state.numa.numa_strategy = numa_flag;
641
+
642
+ GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
643
+
644
+ g_state.numa.cpuset = ggml_get_numa_affinity();
645
+
646
+ // enumerate nodes
647
+ while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
648
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
649
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
650
+ if (stat(path, &st) != 0) { break; }
651
+ ++g_state.numa.n_nodes;
652
+ }
653
+
654
+ // enumerate CPUs
655
+ while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
656
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
657
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
658
+ if (stat(path, &st) != 0) { break; }
659
+ ++g_state.numa.total_cpus;
660
+ }
661
+
662
+ GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
663
+
664
+ // figure out which node we're on
665
+ uint current_cpu;
666
+ int getcpu_ret = 0;
667
+ #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
668
+ getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
669
+ #else
670
+ // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
671
+ # if !defined(SYS_getcpu) && defined(SYS_get_cpu)
672
+ # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
673
+ # endif
674
+ getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
675
+ #endif
676
+
677
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
678
+ g_state.numa.n_nodes = 0;
679
+ return;
680
+ }
681
+
682
+ GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
683
+
684
+ for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
685
+ struct ggml_numa_node * node = &g_state.numa.nodes[n];
686
+ GGML_PRINT_DEBUG("CPUs on node %u:", n);
687
+ node->n_cpus = 0;
688
+ for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
689
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
690
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
691
+ if (stat(path, &st) == 0) {
692
+ node->cpus[node->n_cpus++] = c;
693
+ GGML_PRINT_DEBUG(" %u", c);
694
+ }
695
+ }
696
+ GGML_PRINT_DEBUG("\n");
697
+ }
698
+
699
+ if (ggml_is_numa()) {
700
+ FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
701
+ if (fptr != NULL) {
702
+ char buf[42];
703
+ if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
704
+ GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
705
+ }
706
+ fclose(fptr);
707
+ }
708
+ }
709
+ #else
710
+ UNUSED(numa_flag);
711
+ // TODO
712
+ #endif
713
+ }
714
+
715
+ bool ggml_is_numa(void) { // reports whether the recorded topology has more than one NUMA node
716
+ return g_state.numa.n_nodes > 1; // n_nodes stays 0 until ggml_numa_init() has detected nodes
717
+ }
718
+
719
+ #if defined(__ARM_ARCH)
720
+ #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
721
+ #include <arm_sve.h>
722
+ static void ggml_init_arm_arch_features(void) { // stores the SVE parameter queried below in ggml_arm_arch_features
723
+ ggml_arm_arch_features.sve_cnt = svcntb(); // svcntb() is the ACLE intrinsic for the SVE vector size in bytes - NOTE(review): confirm against ACLE
724
+ }
725
+ #else
726
+ static void ggml_init_arm_arch_features(void) {} // built without SVE: nothing to record
727
+ #endif
728
+ #endif // __ARM_ARCH
729
+
730
+ #if defined(__riscv) && defined(__riscv_v_intrinsic)
731
+ #include <riscv_vector.h>
732
+ static void ggml_init_riscv_arch_features(void) { // stores the vector-length parameter queried below in ggml_riscv_arch_features
733
+ ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb(); // presumably the vector register length in bytes - NOTE(review): confirm against the RVV intrinsics spec
734
+ }
735
+ #else
736
+ static void ggml_init_riscv_arch_features(void) {} // stub; unlike the ARM variant it is defined on every target without RVV intrinsics
737
+ #endif
738
+
739
+ struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
740
+ GGML_ASSERT(!ggml_get_no_alloc(ctx));
741
+
742
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
743
+
744
+ ggml_set_i32(result, value);
745
+
746
+ return result;
747
+ }
748
+
749
+ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
750
+ GGML_ASSERT(!ggml_get_no_alloc(ctx));
751
+
752
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
753
+
754
+ ggml_set_f32(result, value);
755
+
756
+ return result;
757
+ }
758
+
759
+ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
760
+ const int n = ggml_nrows(tensor);
761
+ const int nc = tensor->ne[0];
762
+ const size_t n1 = tensor->nb[1];
763
+
764
+ char * const data = tensor->data;
765
+
766
+ switch (tensor->type) {
767
+ case GGML_TYPE_I8:
768
+ {
769
+ assert(tensor->nb[0] == sizeof(int8_t));
770
+ for (int i = 0; i < n; i++) {
771
+ ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
772
+ }
773
+ } break;
774
+ case GGML_TYPE_I16:
775
+ {
776
+ assert(tensor->nb[0] == sizeof(int16_t));
777
+ for (int i = 0; i < n; i++) {
778
+ ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
779
+ }
780
+ } break;
781
+ case GGML_TYPE_I32:
782
+ {
783
+ assert(tensor->nb[0] == sizeof(int32_t));
784
+ for (int i = 0; i < n; i++) {
785
+ ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
786
+ }
787
+ } break;
788
+ case GGML_TYPE_F16:
789
+ {
790
+ assert(tensor->nb[0] == sizeof(ggml_fp16_t));
791
+ for (int i = 0; i < n; i++) {
792
+ ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
793
+ }
794
+ } break;
795
+ case GGML_TYPE_BF16:
796
+ {
797
+ assert(tensor->nb[0] == sizeof(ggml_fp16_t));
798
+ for (int i = 0; i < n; i++) {
799
+ ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
800
+ }
801
+ } break;
802
+ case GGML_TYPE_F32:
803
+ {
804
+ assert(tensor->nb[0] == sizeof(float));
805
+ for (int i = 0; i < n; i++) {
806
+ ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
807
+ }
808
+ } break;
809
+ default:
810
+ {
811
+ GGML_ABORT("fatal error");
812
+ }
813
+ }
814
+
815
+ return tensor;
816
+ }
817
+
818
+ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
819
+ const int n = ggml_nrows(tensor);
820
+ const int nc = tensor->ne[0];
821
+ const size_t n1 = tensor->nb[1];
822
+
823
+ char * const data = tensor->data;
824
+
825
+ switch (tensor->type) {
826
+ case GGML_TYPE_I8:
827
+ {
828
+ assert(tensor->nb[0] == sizeof(int8_t));
829
+ for (int i = 0; i < n; i++) {
830
+ ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
831
+ }
832
+ } break;
833
+ case GGML_TYPE_I16:
834
+ {
835
+ assert(tensor->nb[0] == sizeof(int16_t));
836
+ for (int i = 0; i < n; i++) {
837
+ ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
838
+ }
839
+ } break;
840
+ case GGML_TYPE_I32:
841
+ {
842
+ assert(tensor->nb[0] == sizeof(int32_t));
843
+ for (int i = 0; i < n; i++) {
844
+ ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
845
+ }
846
+ } break;
847
+ case GGML_TYPE_F16:
848
+ {
849
+ assert(tensor->nb[0] == sizeof(ggml_fp16_t));
850
+ for (int i = 0; i < n; i++) {
851
+ ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
852
+ }
853
+ } break;
854
+ case GGML_TYPE_BF16:
855
+ {
856
+ assert(tensor->nb[0] == sizeof(ggml_bf16_t));
857
+ for (int i = 0; i < n; i++) {
858
+ ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
859
+ }
860
+ } break;
861
+ case GGML_TYPE_F32:
862
+ {
863
+ assert(tensor->nb[0] == sizeof(float));
864
+ for (int i = 0; i < n; i++) {
865
+ ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
866
+ }
867
+ } break;
868
+ default:
869
+ {
870
+ GGML_ABORT("fatal error");
871
+ }
872
+ }
873
+
874
+ return tensor;
875
+ }
876
+
877
+ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
878
+ if (!ggml_is_contiguous(tensor)) {
879
+ int64_t id[4] = { 0, 0, 0, 0 };
880
+ ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
881
+ return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]);
882
+ }
883
+ switch (tensor->type) {
884
+ case GGML_TYPE_I8:
885
+ {
886
+ GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
887
+ return ((int8_t *)(tensor->data))[i];
888
+ }
889
+ case GGML_TYPE_I16:
890
+ {
891
+ GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
892
+ return ((int16_t *)(tensor->data))[i];
893
+ }
894
+ case GGML_TYPE_I32:
895
+ {
896
+ GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
897
+ return ((int32_t *)(tensor->data))[i];
898
+ }
899
+ case GGML_TYPE_F16:
900
+ {
901
+ GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
902
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
903
+ }
904
+ case GGML_TYPE_BF16:
905
+ {
906
+ GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
907
+ return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
908
+ }
909
+ case GGML_TYPE_F32:
910
+ {
911
+ GGML_ASSERT(tensor->nb[0] == sizeof(float));
912
+ return ((float *)(tensor->data))[i];
913
+ }
914
+ default:
915
+ {
916
+ GGML_ABORT("fatal error");
917
+ }
918
+ }
919
+ }
920
+
921
+ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
922
+ if (!ggml_is_contiguous(tensor)) {
923
+ int64_t id[4] = { 0, 0, 0, 0 };
924
+ ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
925
+ ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value);
926
+ return;
927
+ }
928
+ switch (tensor->type) {
929
+ case GGML_TYPE_I8:
930
+ {
931
+ GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
932
+ ((int8_t *)(tensor->data))[i] = value;
933
+ } break;
934
+ case GGML_TYPE_I16:
935
+ {
936
+ GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
937
+ ((int16_t *)(tensor->data))[i] = value;
938
+ } break;
939
+ case GGML_TYPE_I32:
940
+ {
941
+ GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
942
+ ((int32_t *)(tensor->data))[i] = value;
943
+ } break;
944
+ case GGML_TYPE_F16:
945
+ {
946
+ GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
947
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
948
+ } break;
949
+ case GGML_TYPE_BF16:
950
+ {
951
+ GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
952
+ ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
953
+ } break;
954
+ case GGML_TYPE_F32:
955
+ {
956
+ GGML_ASSERT(tensor->nb[0] == sizeof(float));
957
+ ((float *)(tensor->data))[i] = value;
958
+ } break;
959
+ default:
960
+ {
961
+ GGML_ABORT("fatal error");
962
+ }
963
+ }
964
+ }
965
+
966
+ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
967
+ void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
968
+ switch (tensor->type) {
969
+ case GGML_TYPE_I8:
970
+ return ((int8_t *) data)[0];
971
+ case GGML_TYPE_I16:
972
+ return ((int16_t *) data)[0];
973
+ case GGML_TYPE_I32:
974
+ return ((int32_t *) data)[0];
975
+ case GGML_TYPE_F16:
976
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
977
+ case GGML_TYPE_BF16:
978
+ return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
979
+ case GGML_TYPE_F32:
980
+ return ((float *) data)[0];
981
+ default:
982
+ GGML_ABORT("fatal error");
983
+ }
984
+ }
985
+
986
+ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
987
+ void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
988
+ switch (tensor->type) {
989
+ case GGML_TYPE_I8:
990
+ {
991
+ ((int8_t *)(data))[0] = value;
992
+ } break;
993
+ case GGML_TYPE_I16:
994
+ {
995
+ ((int16_t *)(data))[0] = value;
996
+ } break;
997
+ case GGML_TYPE_I32:
998
+ {
999
+ ((int32_t *)(data))[0] = value;
1000
+ } break;
1001
+ case GGML_TYPE_F16:
1002
+ {
1003
+ ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
1004
+ } break;
1005
+ case GGML_TYPE_BF16:
1006
+ {
1007
+ ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
1008
+ } break;
1009
+ case GGML_TYPE_F32:
1010
+ {
1011
+ ((float *)(data))[0] = value;
1012
+ } break;
1013
+ default:
1014
+ {
1015
+ GGML_ABORT("fatal error");
1016
+ }
1017
+ }
1018
+ }
1019
+
1020
+ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
1021
+ if (!ggml_is_contiguous(tensor)) {
1022
+ int64_t id[4] = { 0, 0, 0, 0 };
1023
+ ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
1024
+ return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]);
1025
+ }
1026
+ switch (tensor->type) {
1027
+ case GGML_TYPE_I8:
1028
+ {
1029
+ return ((int8_t *)(tensor->data))[i];
1030
+ }
1031
+ case GGML_TYPE_I16:
1032
+ {
1033
+ return ((int16_t *)(tensor->data))[i];
1034
+ }
1035
+ case GGML_TYPE_I32:
1036
+ {
1037
+ return ((int32_t *)(tensor->data))[i];
1038
+ }
1039
+ case GGML_TYPE_F16:
1040
+ {
1041
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
1042
+ }
1043
+ case GGML_TYPE_BF16:
1044
+ {
1045
+ return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
1046
+ }
1047
+ case GGML_TYPE_F32:
1048
+ {
1049
+ return ((float *)(tensor->data))[i];
1050
+ }
1051
+ default:
1052
+ {
1053
+ GGML_ABORT("fatal error");
1054
+ }
1055
+ }
1056
+ }
1057
+
1058
+ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
1059
+ if (!ggml_is_contiguous(tensor)) {
1060
+ int64_t id[4] = { 0, 0, 0, 0 };
1061
+ ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
1062
+ ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
1063
+ return;
1064
+ }
1065
+ switch (tensor->type) {
1066
+ case GGML_TYPE_I8:
1067
+ {
1068
+ ((int8_t *)(tensor->data))[i] = value;
1069
+ } break;
1070
+ case GGML_TYPE_I16:
1071
+ {
1072
+ ((int16_t *)(tensor->data))[i] = value;
1073
+ } break;
1074
+ case GGML_TYPE_I32:
1075
+ {
1076
+ ((int32_t *)(tensor->data))[i] = value;
1077
+ } break;
1078
+ case GGML_TYPE_F16:
1079
+ {
1080
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
1081
+ } break;
1082
+ case GGML_TYPE_BF16:
1083
+ {
1084
+ ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
1085
+ } break;
1086
+ case GGML_TYPE_F32:
1087
+ {
1088
+ ((float *)(tensor->data))[i] = value;
1089
+ } break;
1090
+ default:
1091
+ {
1092
+ GGML_ABORT("fatal error");
1093
+ }
1094
+ }
1095
+ }
1096
+
1097
+ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
1098
+ void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
1099
+ switch (tensor->type) {
1100
+ case GGML_TYPE_I8:
1101
+ return ((int8_t *) data)[0];
1102
+ case GGML_TYPE_I16:
1103
+ return ((int16_t *) data)[0];
1104
+ case GGML_TYPE_I32:
1105
+ return ((int32_t *) data)[0];
1106
+ case GGML_TYPE_F16:
1107
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
1108
+ case GGML_TYPE_BF16:
1109
+ return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
1110
+ case GGML_TYPE_F32:
1111
+ return ((float *) data)[0];
1112
+ default:
1113
+ GGML_ABORT("fatal error");
1114
+ }
1115
+ }
1116
+
1117
+ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
1118
+ void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
1119
+ switch (tensor->type) {
1120
+ case GGML_TYPE_I8:
1121
+ {
1122
+ ((int8_t *)(data))[0] = value;
1123
+ } break;
1124
+ case GGML_TYPE_I16:
1125
+ {
1126
+ ((int16_t *)(data))[0] = value;
1127
+ } break;
1128
+ case GGML_TYPE_I32:
1129
+ {
1130
+ ((int32_t *)(data))[0] = value;
1131
+ } break;
1132
+ case GGML_TYPE_F16:
1133
+ {
1134
+ ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
1135
+ } break;
1136
+ case GGML_TYPE_BF16:
1137
+ {
1138
+ ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
1139
+ } break;
1140
+ case GGML_TYPE_F32:
1141
+ {
1142
+ ((float *)(data))[0] = value;
1143
+ } break;
1144
+ default:
1145
+ {
1146
+ GGML_ABORT("fatal error");
1147
+ }
1148
+ }
1149
+ }
1150
+
1151
+ ////////////////////////////////////////////////////////////////////////////////
1152
+
1153
+ // ggml_compute_forward_mul_mat
1154
+
1155
+ static void ggml_compute_forward_mul_mat_one_chunk(
1156
+ const struct ggml_compute_params * params,
1157
+ struct ggml_tensor * dst,
1158
+ const enum ggml_type type,
1159
+ const int64_t num_rows_per_vec_dot,
1160
+ const int64_t ir0_start,
1161
+ const int64_t ir0_end,
1162
+ const int64_t ir1_start,
1163
+ const int64_t ir1_end) {
1164
+
1165
+ const struct ggml_tensor * src0 = dst->src[0];
1166
+ const struct ggml_tensor * src1 = dst->src[1];
1167
+
1168
+ GGML_TENSOR_BINARY_OP_LOCALS
1169
+
1170
+ const bool src1_cont = ggml_is_contiguous(src1);
1171
+
1172
+ ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
1173
+ enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
1174
+
1175
+ // broadcast factors
1176
+ const int64_t r2 = ne12 / ne02;
1177
+ const int64_t r3 = ne13 / ne03;
1178
+
1179
+ //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
1180
+
1181
+ // threads with no work simply yield (not sure if it helps)
1182
+ if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
1183
+ return;
1184
+ }
1185
+
1186
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
1187
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
1188
+
1189
+ assert(ne12 % ne02 == 0);
1190
+ assert(ne13 % ne03 == 0);
1191
+
1192
+ // block-tiling attempt
1193
+ const int64_t blck_0 = 16;
1194
+ const int64_t blck_1 = 16;
1195
+
1196
+ const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
1197
+
1198
+ // attempt to reduce false-sharing (does not seem to make a difference)
1199
+ // 16 * 2, accounting for mmla kernels
1200
+ float tmp[32];
1201
+
1202
+ for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
1203
+ for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
1204
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
1205
+ const int64_t i13 = (ir1 / (ne12 * ne1));
1206
+ const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
1207
+ const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
1208
+
1209
+ // broadcast src0 into src1
1210
+ const int64_t i03 = i13 / r3;
1211
+ const int64_t i02 = i12 / r2;
1212
+
1213
+ const int64_t i1 = i11;
1214
+ const int64_t i2 = i12;
1215
+ const int64_t i3 = i13;
1216
+
1217
+ const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
1218
+
1219
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
1220
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
1221
+ // the original src1 data pointer, so we should index using the indices directly
1222
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
1223
+ const char * src1_col = (const char*)wdata +
1224
+ (src1_cont || src1->type != vec_dot_type
1225
+ ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
1226
+ : (i11 * nb11 + i12 * nb12 + i13 * nb13));
1227
+ float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
1228
+
1229
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
1230
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
1231
+ //}
1232
+
1233
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
1234
+ vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
1235
+ }
1236
+
1237
+ for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
1238
+ memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
1239
+ }
1240
+ }
1241
+ }
1242
+ }
1243
+ }
1244
+
1245
+ void ggml_compute_forward_mul_mat(
1246
+ const struct ggml_compute_params * params,
1247
+ struct ggml_tensor * dst) {
1248
+
1249
+ const struct ggml_tensor * src0 = dst->src[0];
1250
+ const struct ggml_tensor * src1 = dst->src[1];
1251
+
1252
+ const int32_t hint = ggml_get_op_params_i32(dst, 1);
1253
+ if (hint == GGML_HINT_SRC0_IS_HADAMARD && !params->use_ref) {
1254
+ ggml_compute_forward_fwht(params, dst);
1255
+ return;
1256
+ }
1257
+
1258
+ GGML_TENSOR_BINARY_OP_LOCALS
1259
+
1260
+ const int ith = params->ith;
1261
+ const int nth = params->nth;
1262
+
1263
+ enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
1264
+ ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
1265
+ int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
1266
+
1267
+ GGML_ASSERT(ne0 == ne01);
1268
+ GGML_ASSERT(ne1 == ne11);
1269
+ GGML_ASSERT(ne2 == ne12);
1270
+ GGML_ASSERT(ne3 == ne13);
1271
+
1272
+ // we don't support permuted src0 or src1
1273
+ GGML_ASSERT(nb00 == ggml_type_size(src0->type));
1274
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
1275
+
1276
+ // dst cannot be transposed or permuted
1277
+ GGML_ASSERT(nb0 == sizeof(float));
1278
+ GGML_ASSERT(nb0 <= nb1);
1279
+ GGML_ASSERT(nb1 <= nb2);
1280
+ GGML_ASSERT(nb2 <= nb3);
1281
+
1282
+ // nb01 >= nb00 - src0 is not transposed
1283
+ // compute by src0 rows
1284
+
1285
+ // TODO: extract to "extra_op"
1286
+ #if GGML_USE_LLAMAFILE
1287
+ // broadcast factors
1288
+ const int64_t r2 = ne12 / ne02;
1289
+ const int64_t r3 = ne13 / ne03;
1290
+
1291
+ const bool src1_cont = ggml_is_contiguous(src1);
1292
+
1293
+ if (src1_cont) {
1294
+ for (int64_t i13 = 0; i13 < ne13; i13++)
1295
+ for (int64_t i12 = 0; i12 < ne12; i12++)
1296
+ if (!llamafile_sgemm(params,
1297
+ ne01, ne11, ne00/ggml_blck_size(src0->type),
1298
+ (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
1299
+ nb01/ggml_type_size(src0->type),
1300
+ (const char *)src1->data + i12*nb12 + i13*nb13,
1301
+ nb11/ggml_type_size(src1->type),
1302
+ (char *)dst->data + i12*nb2 + i13*nb3,
1303
+ nb1/ggml_type_size(dst->type),
1304
+ src0->type,
1305
+ src1->type,
1306
+ dst->type))
1307
+ goto UseGgmlGemm1;
1308
+ return;
1309
+ }
1310
+ UseGgmlGemm1:;
1311
+ #endif
1312
+
1313
+ if (src1->type != vec_dot_type) {
1314
+ char * wdata = params->wdata;
1315
+
1316
+ const size_t nbw0 = ggml_type_size(vec_dot_type);
1317
+ const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
1318
+ const size_t nbw2 = nbw1*ne11;
1319
+ const size_t nbw3 = nbw2*ne12;
1320
+
1321
+ assert(params->wsize >= ne13*nbw3);
1322
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
1323
+
1324
+ #if 0
1325
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
1326
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
1327
+ for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
1328
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
1329
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
1330
+ ne10);
1331
+ }
1332
+ }
1333
+ }
1334
+ #else
1335
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
1336
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
1337
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
1338
+ size_t bs = ggml_blck_size(vec_dot_type);
1339
+ int64_t ne10_block_start = (ith * ne10/bs) / nth;
1340
+ int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
1341
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
1342
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
1343
+ (ne10_block_end - ne10_block_start) * bs);
1344
+ }
1345
+ }
1346
+ }
1347
+ #endif
1348
+ }
1349
+
1350
+ if (ith == 0) {
1351
+ // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
1352
+ atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
1353
+ }
1354
+
1355
+ ggml_barrier(params->threadpool);
1356
+
1357
+ #if GGML_USE_LLAMAFILE
1358
+ if (src1->type != vec_dot_type) {
1359
+ const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
1360
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
1361
+
1362
+ for (int64_t i13 = 0; i13 < ne13; i13++)
1363
+ for (int64_t i12 = 0; i12 < ne12; i12++)
1364
+ if (!llamafile_sgemm(params,
1365
+ ne01, ne11, ne00/ggml_blck_size(src0->type),
1366
+ (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
1367
+ nb01/ggml_type_size(src0->type),
1368
+ (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
1369
+ row_size/ggml_type_size(vec_dot_type),
1370
+ (char *)dst->data + i12*nb2 + i13*nb3,
1371
+ nb1/ggml_type_size(dst->type),
1372
+ src0->type,
1373
+ vec_dot_type,
1374
+ dst->type))
1375
+ goto UseGgmlGemm2;
1376
+ return;
1377
+ }
1378
+ UseGgmlGemm2:;
1379
+ #endif
1380
+
1381
+ // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
1382
+ const int64_t nr0 = ne0;
1383
+
1384
+ // This is the size of the rest of the dimensions of the result
1385
+ const int64_t nr1 = ne1 * ne2 * ne3;
1386
+
1387
+ // Now select a reasonable chunk size.
1388
+ int chunk_size = 16;
1389
+
1390
+ // We need to step up the size if it's small
1391
+ if (nr0 == 1 || nr1 == 1) {
1392
+ chunk_size = 64;
1393
+ }
1394
+
1395
+ // distribute the work across the inner or outer loop based on which one is larger
1396
+ // The number of chunks in the 0/1 dim.
1397
+ // CEIL(nr0/chunk_size)
1398
+ int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
1399
+ int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
1400
+
1401
+ // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
1402
+ // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
1403
+ // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
1404
+ if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
1405
+ // distribute the thread work across the inner or outer loop based on which one is larger
1406
+ nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
1407
+ nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
1408
+ }
1409
+
1410
+ // The number of elements in each chunk
1411
+ const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
1412
+ const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
1413
+
1414
+ // The first chunk comes from our thread_id, the rest will get auto-assigned.
1415
+ int current_chunk = ith;
1416
+
1417
+ while (current_chunk < nchunk0 * nchunk1) {
1418
+ const int64_t ith0 = current_chunk % nchunk0;
1419
+ const int64_t ith1 = current_chunk / nchunk0;
1420
+
1421
+ const int64_t ir0_start = dr0 * ith0;
1422
+ const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
1423
+
1424
+ const int64_t ir1_start = dr1 * ith1;
1425
+ const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
1426
+
1427
+ // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
1428
+ int64_t num_rows_per_vec_dot = vec_dot_num_rows;
1429
+
1430
+ // these checks are needed to avoid crossing dim1 boundaries
1431
+ // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
1432
+ if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
1433
+ num_rows_per_vec_dot = 1;
1434
+ }
1435
+ ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
1436
+
1437
+ if (nth >= nchunk0 * nchunk1) {
1438
+ break;
1439
+ }
1440
+
1441
+ current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
1442
+ }
1443
+ }
1444
+
1445
+ // ggml_compute_forward_mul_mat_id
1446
+
1447
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)] // expects `matrix_rows` and `ids` to be in scope at the point of use
1448
+
1449
+ struct mmid_row_mapping {
1450
+ int32_t i1; // read as the selected expert index by ggml_compute_forward_mul_mat_id_one_chunk
1451
+ int32_t i2; // read as the row index in src1 by ggml_compute_forward_mul_mat_id_one_chunk
1452
+ };
1453
+
1454
+ static void ggml_compute_forward_mul_mat_id_one_chunk(
1455
+ struct ggml_tensor * dst,
1456
+ const struct ggml_tensor * src0,
1457
+ const struct ggml_tensor * src1,
1458
+ const struct ggml_tensor * ids,
1459
+ const int64_t cur_a,
1460
+ const int64_t ir0_start,
1461
+ const int64_t ir0_end,
1462
+ const int64_t ir1_start,
1463
+ const int64_t ir1_end,
1464
+ const char * src0_cur,
1465
+ const struct mmid_row_mapping * matrix_rows,
1466
+ const size_t row_size,
1467
+ const bool src1_cont,
1468
+ const void * wdata) {
1469
+
1470
+ GGML_TENSOR_BINARY_OP_LOCALS
1471
+
1472
+ const enum ggml_type type = src0->type;
1473
+
1474
+ ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
1475
+ enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
1476
+
1477
+ const int64_t blck_0 = 16;
1478
+ const int64_t blck_1 = 16;
1479
+
1480
+ float tmp[16];
1481
+
1482
+ for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
1483
+ for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
1484
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
1485
+ const int64_t _i12 = ir1; // logical row index for this expert
1486
+
1487
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
1488
+ const int id = row_mapping.i1; // selected expert index
1489
+
1490
+ const int64_t i11 = id % ne11;
1491
+ const int64_t i12 = row_mapping.i2; // row index in src1
1492
+
1493
+ const int64_t i1 = id; // selected expert index
1494
+ const int64_t i2 = i12; // row
1495
+
1496
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
1497
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
1498
+ // the original src1 data pointer, so we should index using the indices directly
1499
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
1500
+ const char * src1_col = (const char *) wdata +
1501
+ (src1_cont || src1->type != vec_dot_type
1502
+ ? (i11 + i12*ne11)*row_size
1503
+ : (i11*nb11 + i12*nb12));
1504
+
1505
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
1506
+
1507
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
1508
+ vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
1509
+ }
1510
+
1511
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
1512
+ }
1513
+ }
1514
+ }
1515
+ }
1516
+
1517
// Reserves `size` bytes from the cursor `*p`.
// The cursor is first rounded with GGML_PAD(..., align); that rounded address is returned
// and `*p` is moved to `size` bytes past it.
static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
    char * const base = (char *) GGML_PAD((uintptr_t) *p, align);

    *p = (void *) (base + size);

    return (void *) base;
}
1524
+
1525
+ static void ggml_compute_forward_mul_mat_id(
1526
+ const struct ggml_compute_params * params,
1527
+ struct ggml_tensor * dst) {
1528
+
1529
+ const struct ggml_tensor * src0 = dst->src[0];
1530
+ const struct ggml_tensor * src1 = dst->src[1];
1531
+ const struct ggml_tensor * ids = dst->src[2];
1532
+
1533
+ GGML_TENSOR_BINARY_OP_LOCALS
1534
+
1535
+ const int ith = params->ith;
1536
+ const int nth = params->nth;
1537
+
1538
+ const enum ggml_type type = src0->type;
1539
+
1540
+ const bool src1_cont = ggml_is_contiguous(src1);
1541
+
1542
+ enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
1543
+ ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
1544
+
1545
+ // we don't support permuted src0 or src1
1546
+ GGML_ASSERT(nb00 == ggml_type_size(type));
1547
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
1548
+
1549
+ // dst cannot be transposed or permuted
1550
+ GGML_ASSERT(nb0 == sizeof(float));
1551
+ GGML_ASSERT(nb0 <= nb1);
1552
+ GGML_ASSERT(nb1 <= nb2);
1553
+ GGML_ASSERT(nb2 <= nb3);
1554
+
1555
+ // row groups
1556
+ const int n_ids = ids->ne[0]; // n_expert_used
1557
+ const int n_as = ne02; // n_expert
1558
+
1559
+ void * wdata_cur = params->wdata;
1560
+
1561
+ if (src1->type != vec_dot_type) {
1562
+ incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
1563
+ }
1564
+
1565
+ int64_t * matrix_row_counts = // [n_as]
1566
+ incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));
1567
+
1568
+ struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
1569
+ incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));
1570
+
1571
+ char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
1572
+ incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);
1573
+
1574
+ GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));
1575
+
1576
+ if (src1->type != vec_dot_type) {
1577
+ char * wdata = params->wdata;
1578
+
1579
+ const size_t nbw0 = ggml_type_size(vec_dot_type);
1580
+ const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
1581
+ const size_t nbw2 = nbw1*ne11;
1582
+ const size_t nbw3 = nbw2*ne12;
1583
+
1584
+ assert(params->wsize >= ne13*nbw3);
1585
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
1586
+
1587
+ #if 0
1588
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
1589
+ for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
1590
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
1591
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
1592
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
1593
+ ne10);
1594
+ }
1595
+ }
1596
+ }
1597
+ #else
1598
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
1599
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
1600
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
1601
+ size_t bs = ggml_blck_size(vec_dot_type);
1602
+ int64_t ne10_block_start = (ith * ne10/bs) / nth;
1603
+ int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
1604
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
1605
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
1606
+ (ne10_block_end - ne10_block_start) * bs);
1607
+ }
1608
+ }
1609
+ }
1610
+ #endif
1611
+ }
1612
+
1613
+ if (ith == 0) {
1614
+ // initialize matrix_row_counts
1615
+ memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
1616
+
1617
+ // group rows by src0 matrix
1618
+ for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
1619
+ for (int id = 0; id < n_ids; ++id) {
1620
+ const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
1621
+
1622
+ assert(i02 >= 0 && i02 < n_as);
1623
+
1624
+ MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
1625
+ matrix_row_counts[i02] += 1;
1626
+ }
1627
+ }
1628
+ }
1629
+
1630
+ // reset current_chunk
1631
+ for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
1632
+ atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
1633
+ *current_chunk_ctr = nth;
1634
+ }
1635
+
1636
+ ggml_barrier(params->threadpool);
1637
+
1638
+ for (int cur_a = 0; cur_a < n_as; ++cur_a) {
1639
+ const int64_t cne1 = matrix_row_counts[cur_a];
1640
+
1641
+ if (cne1 == 0) {
1642
+ continue;
1643
+ }
1644
+
1645
+ const char * src0_cur = (const char *) src0->data + cur_a * nb02;
1646
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
1647
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
1648
+
1649
+ const int64_t nr0 = ne01;
1650
+ const int64_t nr1 = cne1;
1651
+
1652
+ int chunk_size = 16;
1653
+ if (nr0 == 1 || nr1 == 1) {
1654
+ chunk_size = 64;
1655
+ }
1656
+
1657
+ // disable for NUMA
1658
+ const bool disable_chunking = ggml_is_numa();
1659
+
1660
+ int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
1661
+ int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
1662
+
1663
+ if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) {
1664
+ nchunk0 = nr0 > nr1 ? nth : 1;
1665
+ nchunk1 = nr0 > nr1 ? 1 : nth;
1666
+ }
1667
+
1668
+ const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
1669
+ const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
1670
+
1671
+ int current_chunk = ith;
1672
+
1673
+ atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
1674
+
1675
+ while (current_chunk < nchunk0 * nchunk1) {
1676
+ const int64_t ith0 = current_chunk % nchunk0;
1677
+ const int64_t ith1 = current_chunk / nchunk0;
1678
+
1679
+ const int64_t ir0_start = dr0 * ith0;
1680
+ const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
1681
+
1682
+ const int64_t ir1_start = dr1 * ith1;
1683
+ const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
1684
+
1685
+ ggml_compute_forward_mul_mat_id_one_chunk(
1686
+ dst, src0, src1, ids, cur_a,
1687
+ ir0_start, ir0_end, ir1_start, ir1_end,
1688
+ src0_cur, matrix_rows, row_size, src1_cont, wdata
1689
+ );
1690
+
1691
+ if (nth >= nchunk0 * nchunk1) {
1692
+ break;
1693
+ }
1694
+
1695
+ current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
1696
+ }
1697
+ }
1698
+ }
1699
+
1700
+ /////////////////////////////////
1701
+
1702
+ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
1703
+ GGML_ASSERT(params);
1704
+
1705
+ if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
1706
+ return;
1707
+ }
1708
+
1709
+ // extra_buffer op?
1710
+ if (ggml_cpu_extra_compute_forward(params, tensor)) {
1711
+ return;
1712
+ }
1713
+
1714
+ switch (tensor->op) {
1715
+ case GGML_OP_DUP:
1716
+ {
1717
+ ggml_compute_forward_dup(params, tensor);
1718
+ } break;
1719
+ case GGML_OP_ADD:
1720
+ {
1721
+ ggml_compute_forward_add(params, tensor);
1722
+ } break;
1723
+ case GGML_OP_ADD_ID:
1724
+ {
1725
+ ggml_compute_forward_add_id(params, tensor);
1726
+ } break;
1727
+ case GGML_OP_ADD1:
1728
+ {
1729
+ ggml_compute_forward_add1(params, tensor);
1730
+ } break;
1731
+ case GGML_OP_ACC:
1732
+ {
1733
+ ggml_compute_forward_acc(params, tensor);
1734
+ } break;
1735
+ case GGML_OP_SUB:
1736
+ {
1737
+ ggml_compute_forward_sub(params, tensor);
1738
+ } break;
1739
+ case GGML_OP_MUL:
1740
+ {
1741
+ ggml_compute_forward_mul(params, tensor);
1742
+ } break;
1743
+ case GGML_OP_DIV:
1744
+ {
1745
+ ggml_compute_forward_div(params, tensor);
1746
+ } break;
1747
+ case GGML_OP_SQR:
1748
+ {
1749
+ ggml_compute_forward_sqr(params, tensor);
1750
+ } break;
1751
+ case GGML_OP_SQRT:
1752
+ {
1753
+ ggml_compute_forward_sqrt(params, tensor);
1754
+ } break;
1755
+ case GGML_OP_LOG:
1756
+ {
1757
+ ggml_compute_forward_log(params, tensor);
1758
+ } break;
1759
+ case GGML_OP_SIN:
1760
+ {
1761
+ ggml_compute_forward_sin(params, tensor);
1762
+ } break;
1763
+ case GGML_OP_COS:
1764
+ {
1765
+ ggml_compute_forward_cos(params, tensor);
1766
+ } break;
1767
+ case GGML_OP_SUM:
1768
+ {
1769
+ ggml_compute_forward_sum(params, tensor);
1770
+ } break;
1771
+ case GGML_OP_SUM_ROWS:
1772
+ {
1773
+ ggml_compute_forward_sum_rows(params, tensor);
1774
+ } break;
1775
+ case GGML_OP_CUMSUM:
1776
+ {
1777
+ ggml_compute_forward_cumsum(params, tensor);
1778
+ } break;
1779
+ case GGML_OP_MEAN:
1780
+ {
1781
+ ggml_compute_forward_mean(params, tensor);
1782
+ } break;
1783
+ case GGML_OP_ARGMAX:
1784
+ {
1785
+ ggml_compute_forward_argmax(params, tensor);
1786
+ } break;
1787
+ case GGML_OP_COUNT_EQUAL:
1788
+ {
1789
+ ggml_compute_forward_count_equal(params, tensor);
1790
+ } break;
1791
+ case GGML_OP_REPEAT:
1792
+ {
1793
+ ggml_compute_forward_repeat(params, tensor);
1794
+ } break;
1795
+ case GGML_OP_REPEAT_BACK:
1796
+ {
1797
+ ggml_compute_forward_repeat_back(params, tensor);
1798
+ } break;
1799
+ case GGML_OP_CONCAT:
1800
+ {
1801
+ ggml_compute_forward_concat(params, tensor);
1802
+ } break;
1803
+ case GGML_OP_SILU_BACK:
1804
+ {
1805
+ ggml_compute_forward_silu_back(params, tensor);
1806
+ } break;
1807
+ case GGML_OP_NORM:
1808
+ {
1809
+ ggml_compute_forward_norm(params, tensor);
1810
+ } break;
1811
+ case GGML_OP_RMS_NORM:
1812
+ {
1813
+ ggml_compute_forward_rms_norm(params, tensor);
1814
+ } break;
1815
+ case GGML_OP_RMS_NORM_BACK:
1816
+ {
1817
+ ggml_compute_forward_rms_norm_back(params, tensor);
1818
+ } break;
1819
+ case GGML_OP_GROUP_NORM:
1820
+ {
1821
+ ggml_compute_forward_group_norm(params, tensor);
1822
+ } break;
1823
+ case GGML_OP_L2_NORM:
1824
+ {
1825
+ ggml_compute_forward_l2_norm(params, tensor);
1826
+ } break;
1827
+ case GGML_OP_MUL_MAT:
1828
+ {
1829
+ ggml_compute_forward_mul_mat(params, tensor);
1830
+ } break;
1831
+ case GGML_OP_MUL_MAT_ID:
1832
+ {
1833
+ ggml_compute_forward_mul_mat_id(params, tensor);
1834
+ } break;
1835
+ case GGML_OP_OUT_PROD:
1836
+ {
1837
+ ggml_compute_forward_out_prod(params, tensor);
1838
+ } break;
1839
+ case GGML_OP_SCALE:
1840
+ {
1841
+ ggml_compute_forward_scale(params, tensor);
1842
+ } break;
1843
+ case GGML_OP_SET:
1844
+ {
1845
+ ggml_compute_forward_set(params, tensor);
1846
+ } break;
1847
+ case GGML_OP_CPY:
1848
+ {
1849
+ ggml_compute_forward_cpy(params, tensor);
1850
+ } break;
1851
+ case GGML_OP_CONT:
1852
+ {
1853
+ ggml_compute_forward_cont(params, tensor);
1854
+ } break;
1855
+ case GGML_OP_GET_ROWS:
1856
+ {
1857
+ ggml_compute_forward_get_rows(params, tensor);
1858
+ } break;
1859
+ case GGML_OP_GET_ROWS_BACK:
1860
+ {
1861
+ ggml_compute_forward_get_rows_back(params, tensor);
1862
+ } break;
1863
+ case GGML_OP_SET_ROWS:
1864
+ {
1865
+ ggml_compute_forward_set_rows(params, tensor);
1866
+ } break;
1867
+ case GGML_OP_DIAG:
1868
+ {
1869
+ ggml_compute_forward_diag(params, tensor);
1870
+ } break;
1871
+ case GGML_OP_DIAG_MASK_INF:
1872
+ {
1873
+ ggml_compute_forward_diag_mask_inf(params, tensor);
1874
+ } break;
1875
+ case GGML_OP_DIAG_MASK_ZERO:
1876
+ {
1877
+ ggml_compute_forward_diag_mask_zero(params, tensor);
1878
+ } break;
1879
+ case GGML_OP_SOFT_MAX:
1880
+ {
1881
+ ggml_compute_forward_soft_max(params, tensor);
1882
+ } break;
1883
+ case GGML_OP_SOFT_MAX_BACK:
1884
+ {
1885
+ ggml_compute_forward_soft_max_ext_back(params, tensor);
1886
+ } break;
1887
+ case GGML_OP_ROPE:
1888
+ {
1889
+ ggml_compute_forward_rope(params, tensor);
1890
+ } break;
1891
+ case GGML_OP_ROPE_BACK:
1892
+ {
1893
+ ggml_compute_forward_rope_back(params, tensor);
1894
+ } break;
1895
+ case GGML_OP_CLAMP:
1896
+ {
1897
+ ggml_compute_forward_clamp(params, tensor);
1898
+ } break;
1899
+ case GGML_OP_CONV_TRANSPOSE_1D:
1900
+ {
1901
+ ggml_compute_forward_conv_transpose_1d(params, tensor);
1902
+ } break;
1903
+ case GGML_OP_IM2COL:
1904
+ {
1905
+ ggml_compute_forward_im2col(params, tensor);
1906
+ } break;
1907
+ case GGML_OP_IM2COL_BACK:
1908
+ {
1909
+ ggml_compute_forward_im2col_back_f32(params, tensor);
1910
+ } break;
1911
+ case GGML_OP_IM2COL_3D:
1912
+ {
1913
+ ggml_compute_forward_im2col_3d(params, tensor);
1914
+ } break;
1915
+ case GGML_OP_CONV_2D:
1916
+ {
1917
+ ggml_compute_forward_conv_2d(params, tensor);
1918
+ } break;
1919
+ case GGML_OP_CONV_3D:
1920
+ {
1921
+ ggml_compute_forward_conv_3d(params, tensor);
1922
+ } break;
1923
+ case GGML_OP_CONV_2D_DW:
1924
+ {
1925
+ ggml_compute_forward_conv_2d_dw(params, tensor);
1926
+ } break;
1927
+ case GGML_OP_CONV_TRANSPOSE_2D:
1928
+ {
1929
+ ggml_compute_forward_conv_transpose_2d(params, tensor);
1930
+ } break;
1931
+ case GGML_OP_POOL_1D:
1932
+ {
1933
+ ggml_compute_forward_pool_1d(params, tensor);
1934
+ } break;
1935
+ case GGML_OP_POOL_2D:
1936
+ {
1937
+ ggml_compute_forward_pool_2d(params, tensor);
1938
+ } break;
1939
+ case GGML_OP_POOL_2D_BACK:
1940
+ {
1941
+ ggml_compute_forward_pool_2d_back(params, tensor);
1942
+ } break;
1943
+ case GGML_OP_UPSCALE:
1944
+ {
1945
+ ggml_compute_forward_upscale(params, tensor);
1946
+ } break;
1947
+ case GGML_OP_PAD:
1948
+ {
1949
+ ggml_compute_forward_pad(params, tensor);
1950
+ } break;
1951
+ case GGML_OP_PAD_REFLECT_1D:
1952
+ {
1953
+ ggml_compute_forward_pad_reflect_1d(params, tensor);
1954
+ } break;
1955
+ case GGML_OP_ROLL:
1956
+ {
1957
+ ggml_compute_forward_roll(params, tensor);
1958
+ } break;
1959
+ case GGML_OP_ARANGE:
1960
+ {
1961
+ ggml_compute_forward_arange(params, tensor);
1962
+ } break;
1963
+ case GGML_OP_TIMESTEP_EMBEDDING:
1964
+ {
1965
+ ggml_compute_forward_timestep_embedding(params, tensor);
1966
+ } break;
1967
+ case GGML_OP_ARGSORT:
1968
+ {
1969
+ ggml_compute_forward_argsort(params, tensor);
1970
+ } break;
1971
+ case GGML_OP_TOP_K:
1972
+ {
1973
+ ggml_compute_forward_top_k(params, tensor);
1974
+ } break;
1975
+ case GGML_OP_LEAKY_RELU:
1976
+ {
1977
+ ggml_compute_forward_leaky_relu(params, tensor);
1978
+ } break;
1979
+ case GGML_OP_TRI:
1980
+ {
1981
+ ggml_compute_forward_tri(params, tensor);
1982
+ } break;
1983
+ case GGML_OP_FILL:
1984
+ {
1985
+ ggml_compute_forward_fill(params, tensor);
1986
+ } break;
1987
+ case GGML_OP_FLASH_ATTN_EXT:
1988
+ {
1989
+ ggml_compute_forward_flash_attn_ext(params, tensor);
1990
+ } break;
1991
+ case GGML_OP_FLASH_ATTN_BACK:
1992
+ {
1993
+ int32_t t = ggml_get_op_params_i32(tensor, 0);
1994
+ GGML_ASSERT(t == 0 || t == 1);
1995
+ bool masked = t != 0;
1996
+ ggml_compute_forward_flash_attn_back(params, masked, tensor);
1997
+ } break;
1998
+ case GGML_OP_SSM_CONV:
1999
+ {
2000
+ ggml_compute_forward_ssm_conv(params, tensor);
2001
+ } break;
2002
+ case GGML_OP_SSM_SCAN:
2003
+ {
2004
+ ggml_compute_forward_ssm_scan(params, tensor);
2005
+ } break;
2006
+ case GGML_OP_WIN_PART:
2007
+ {
2008
+ ggml_compute_forward_win_part(params, tensor);
2009
+ } break;
2010
+ case GGML_OP_WIN_UNPART:
2011
+ {
2012
+ ggml_compute_forward_win_unpart(params, tensor);
2013
+ } break;
2014
+ case GGML_OP_UNARY:
2015
+ {
2016
+ ggml_compute_forward_unary(params, tensor);
2017
+ } break;
2018
+ case GGML_OP_GLU:
2019
+ {
2020
+ ggml_compute_forward_glu(params, tensor);
2021
+ } break;
2022
+ case GGML_OP_GET_REL_POS:
2023
+ {
2024
+ ggml_compute_forward_get_rel_pos(params, tensor);
2025
+ } break;
2026
+ case GGML_OP_ADD_REL_POS:
2027
+ {
2028
+ ggml_compute_forward_add_rel_pos(params, tensor);
2029
+ } break;
2030
+ case GGML_OP_RWKV_WKV6:
2031
+ {
2032
+ ggml_compute_forward_rwkv_wkv6(params, tensor);
2033
+ } break;
2034
+ case GGML_OP_GATED_LINEAR_ATTN:
2035
+ {
2036
+ ggml_compute_forward_gla(params, tensor);
2037
+ } break;
2038
+ case GGML_OP_RWKV_WKV7:
2039
+ {
2040
+ ggml_compute_forward_rwkv_wkv7(params, tensor);
2041
+ } break;
2042
+ case GGML_OP_SOLVE_TRI:
2043
+ {
2044
+ ggml_compute_forward_solve_tri(params, tensor);
2045
+ } break;
2046
+ case GGML_OP_GATED_DELTA_NET:
2047
+ {
2048
+ ggml_compute_forward_gated_delta_net(params, tensor);
2049
+ } break;
2050
+ case GGML_OP_MAP_CUSTOM1:
2051
+ {
2052
+ ggml_compute_forward_map_custom1(params, tensor);
2053
+ }
2054
+ break;
2055
+ case GGML_OP_MAP_CUSTOM2:
2056
+ {
2057
+ ggml_compute_forward_map_custom2(params, tensor);
2058
+ }
2059
+ break;
2060
+ case GGML_OP_MAP_CUSTOM3:
2061
+ {
2062
+ ggml_compute_forward_map_custom3(params, tensor);
2063
+ }
2064
+ break;
2065
+ case GGML_OP_CUSTOM:
2066
+ {
2067
+ ggml_compute_forward_custom(params, tensor);
2068
+ }
2069
+ break;
2070
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2071
+ {
2072
+ ggml_compute_forward_cross_entropy_loss(params, tensor);
2073
+ }
2074
+ break;
2075
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
2076
+ {
2077
+ ggml_compute_forward_cross_entropy_loss_back(params, tensor);
2078
+ }
2079
+ break;
2080
+ case GGML_OP_OPT_STEP_ADAMW:
2081
+ {
2082
+ ggml_compute_forward_opt_step_adamw(params, tensor);
2083
+ }
2084
+ break;
2085
+ case GGML_OP_OPT_STEP_SGD:
2086
+ {
2087
+ ggml_compute_forward_opt_step_sgd(params, tensor);
2088
+ }
2089
+ break;
2090
+ case GGML_OP_NONE:
2091
+ {
2092
+ // nop
2093
+ } break;
2094
+ case GGML_OP_RESHAPE:
2095
+ {
2096
+ // nop
2097
+ } break;
2098
+ case GGML_OP_PERMUTE:
2099
+ {
2100
+ // nop
2101
+ } break;
2102
+ case GGML_OP_VIEW:
2103
+ {
2104
+ // nop
2105
+ } break;
2106
+ case GGML_OP_TRANSPOSE:
2107
+ {
2108
+ // nop
2109
+ } break;
2110
+ case GGML_OP_COUNT:
2111
+ {
2112
+ GGML_ABORT("fatal error");
2113
+ }
2114
+ }
2115
+ }
2116
+
2117
+ // Android's libc implementation "bionic" does not support setting affinity
2118
+ #if defined(__gnu_linux__)
2119
+ static void set_numa_thread_affinity(int thread_n) {
2120
+ if (!ggml_is_numa()) {
2121
+ return;
2122
+ }
2123
+
2124
+ int node_num;
2125
+ int rv;
2126
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
2127
+
2128
+ switch(g_state.numa.numa_strategy) {
2129
+ case GGML_NUMA_STRATEGY_DISTRIBUTE:
2130
+ // run thread on node_num thread_n / (threads per node)
2131
+ node_num = thread_n % g_state.numa.n_nodes;
2132
+ break;
2133
+ case GGML_NUMA_STRATEGY_ISOLATE:
2134
+ // run thread on current_node
2135
+ node_num = g_state.numa.current_node;
2136
+ break;
2137
+ case GGML_NUMA_STRATEGY_NUMACTL:
2138
+ // use the cpuset that numactl gave us
2139
+ rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
2140
+ if (rv) {
2141
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
2142
+ }
2143
+ return;
2144
+ default:
2145
+ return;
2146
+ }
2147
+
2148
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
2149
+
2150
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
2151
+ CPU_ZERO_S(setsize, cpus);
2152
+ for (size_t i = 0; i < node->n_cpus; ++i) {
2153
+ CPU_SET_S(node->cpus[i], setsize, cpus);
2154
+ }
2155
+
2156
+ rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
2157
+ if (rv) {
2158
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
2159
+ }
2160
+
2161
+ CPU_FREE(cpus);
2162
+ }
2163
+
2164
+ static void clear_numa_thread_affinity(void) {
2165
+ if (!ggml_is_numa()) {
2166
+ return;
2167
+ }
2168
+
2169
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
2170
+
2171
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
2172
+ CPU_ZERO_S(setsize, cpus);
2173
+ for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
2174
+ CPU_SET_S(i, setsize, cpus);
2175
+ }
2176
+
2177
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
2178
+ if (rv) {
2179
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
2180
+ }
2181
+
2182
+ CPU_FREE(cpus);
2183
+ }
2184
+ #else
2185
+ // TODO: Windows etc.
2186
+ // (the linux implementation may also work on BSD, someone should test)
2187
// Fallback for platforms other than GNU/Linux: intentionally does nothing.
static void set_numa_thread_affinity(int thread_n) {
    UNUSED(thread_n);
}
2188
// Fallback for platforms other than GNU/Linux: intentionally does nothing.
static void clear_numa_thread_affinity(void) {
}
2189
+ #endif
2190
+
2191
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
2192
+ int n_tasks = 0;
2193
+
2194
+ if (ggml_is_empty(node)) {
2195
+ // no need to multi-thread a no-op
2196
+ n_tasks = 1;
2197
+ return n_tasks;
2198
+ }
2199
+
2200
+ switch (node->op) {
2201
+ case GGML_OP_CPY:
2202
+ case GGML_OP_DUP:
2203
+ case GGML_OP_CONT:
2204
+ case GGML_OP_ADD:
2205
+ case GGML_OP_ADD_ID:
2206
+ case GGML_OP_ADD1:
2207
+ case GGML_OP_ACC:
2208
+ case GGML_OP_CUMSUM:
2209
+ case GGML_OP_TRI:
2210
+ case GGML_OP_FILL:
2211
+ {
2212
+ n_tasks = n_threads;
2213
+ } break;
2214
+ case GGML_OP_SUB:
2215
+ case GGML_OP_SQR:
2216
+ case GGML_OP_SQRT:
2217
+ case GGML_OP_LOG:
2218
+ case GGML_OP_SIN:
2219
+ case GGML_OP_COS:
2220
+ case GGML_OP_SUM:
2221
+ case GGML_OP_SUM_ROWS:
2222
+ case GGML_OP_MEAN:
2223
+ case GGML_OP_ARGMAX:
2224
+ {
2225
+ n_tasks = 1;
2226
+ } break;
2227
+ case GGML_OP_COUNT_EQUAL:
2228
+ case GGML_OP_SOLVE_TRI:
2229
+ case GGML_OP_GATED_DELTA_NET:
2230
+ {
2231
+ n_tasks = n_threads;
2232
+ } break;
2233
+ case GGML_OP_REPEAT:
2234
+ case GGML_OP_REPEAT_BACK:
2235
+ case GGML_OP_LEAKY_RELU:
2236
+ {
2237
+ n_tasks = 1;
2238
+ } break;
2239
+ case GGML_OP_UNARY:
2240
+ switch (ggml_get_unary_op(node)) {
2241
+ case GGML_UNARY_OP_ABS:
2242
+ case GGML_UNARY_OP_SGN:
2243
+ case GGML_UNARY_OP_NEG:
2244
+ case GGML_UNARY_OP_STEP:
2245
+ case GGML_UNARY_OP_TANH:
2246
+ case GGML_UNARY_OP_ELU:
2247
+ case GGML_UNARY_OP_RELU:
2248
+ case GGML_UNARY_OP_SIGMOID:
2249
+ case GGML_UNARY_OP_HARDSWISH:
2250
+ case GGML_UNARY_OP_HARDSIGMOID:
2251
+ case GGML_UNARY_OP_EXP:
2252
+ case GGML_UNARY_OP_SOFTPLUS:
2253
+ case GGML_UNARY_OP_EXPM1:
2254
+ case GGML_UNARY_OP_FLOOR:
2255
+ case GGML_UNARY_OP_CEIL:
2256
+ case GGML_UNARY_OP_ROUND:
2257
+ case GGML_UNARY_OP_TRUNC:
2258
+ {
2259
+ n_tasks = 1;
2260
+ } break;
2261
+
2262
+ case GGML_UNARY_OP_GELU:
2263
+ case GGML_UNARY_OP_GELU_ERF:
2264
+ case GGML_UNARY_OP_GELU_QUICK:
2265
+ case GGML_UNARY_OP_SILU:
2266
+ case GGML_UNARY_OP_XIELU:
2267
+ {
2268
+ n_tasks = n_threads;
2269
+ } break;
2270
+ default:
2271
+ GGML_ABORT("fatal error");
2272
+ }
2273
+ break;
2274
+ case GGML_OP_GLU:
2275
+ switch (ggml_get_glu_op(node)) {
2276
+ case GGML_GLU_OP_REGLU:
2277
+ case GGML_GLU_OP_GEGLU:
2278
+ case GGML_GLU_OP_SWIGLU:
2279
+ case GGML_GLU_OP_SWIGLU_OAI:
2280
+ case GGML_GLU_OP_GEGLU_ERF:
2281
+ case GGML_GLU_OP_GEGLU_QUICK:
2282
+ {
2283
+ n_tasks = n_threads;
2284
+ } break;
2285
+ default:
2286
+ GGML_ABORT("fatal error");
2287
+ }
2288
+ break;
2289
+ case GGML_OP_SILU_BACK:
2290
+ case GGML_OP_MUL:
2291
+ case GGML_OP_DIV:
2292
+ case GGML_OP_NORM:
2293
+ case GGML_OP_RMS_NORM:
2294
+ case GGML_OP_RMS_NORM_BACK:
2295
+ case GGML_OP_L2_NORM:
2296
+ case GGML_OP_GROUP_NORM:
2297
+ case GGML_OP_CONCAT:
2298
+ case GGML_OP_MUL_MAT:
2299
+ case GGML_OP_MUL_MAT_ID:
2300
+ case GGML_OP_OUT_PROD:
2301
+ {
2302
+ n_tasks = n_threads;
2303
+ } break;
2304
+ case GGML_OP_GET_ROWS:
2305
+ case GGML_OP_SET_ROWS:
2306
+ {
2307
+ // FIXME: get_rows can use additional threads, but the cost of launching additional threads
2308
+ // decreases performance with GPU offloading
2309
+ //n_tasks = n_threads;
2310
+ n_tasks = 1;
2311
+ } break;
2312
+ case GGML_OP_SCALE:
2313
+ case GGML_OP_SET:
2314
+ case GGML_OP_RESHAPE:
2315
+ case GGML_OP_VIEW:
2316
+ case GGML_OP_PERMUTE:
2317
+ case GGML_OP_TRANSPOSE:
2318
+ case GGML_OP_GET_ROWS_BACK:
2319
+ case GGML_OP_DIAG:
2320
+ {
2321
+ n_tasks = 1;
2322
+ } break;
2323
+ case GGML_OP_DIAG_MASK_ZERO:
2324
+ case GGML_OP_DIAG_MASK_INF:
2325
+ case GGML_OP_SOFT_MAX_BACK:
2326
+ case GGML_OP_ROPE:
2327
+ case GGML_OP_ROPE_BACK:
2328
+ case GGML_OP_ADD_REL_POS:
2329
+ {
2330
+ n_tasks = n_threads;
2331
+ } break;
2332
+ case GGML_OP_CLAMP:
2333
+ {
2334
+ n_tasks = 1; //TODO
2335
+ } break;
2336
+ case GGML_OP_SOFT_MAX:
2337
+ {
2338
+ n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
2339
+ } break;
2340
+ case GGML_OP_IM2COL:
2341
+ case GGML_OP_IM2COL_BACK:
2342
+ case GGML_OP_IM2COL_3D:
2343
+ case GGML_OP_CONV_2D:
2344
+ case GGML_OP_CONV_3D:
2345
+ case GGML_OP_CONV_2D_DW:
2346
+ case GGML_OP_CONV_TRANSPOSE_1D:
2347
+ case GGML_OP_CONV_TRANSPOSE_2D:
2348
+ {
2349
+ n_tasks = n_threads;
2350
+ } break;
2351
+ case GGML_OP_POOL_1D:
2352
+ case GGML_OP_POOL_2D:
2353
+ case GGML_OP_POOL_2D_BACK:
2354
+ {
2355
+ n_tasks = 1;
2356
+ } break;
2357
+ case GGML_OP_UPSCALE:
2358
+ case GGML_OP_PAD:
2359
+ case GGML_OP_PAD_REFLECT_1D:
2360
+ case GGML_OP_ROLL:
2361
+ case GGML_OP_ARANGE:
2362
+ case GGML_OP_TIMESTEP_EMBEDDING:
2363
+ case GGML_OP_ARGSORT:
2364
+ case GGML_OP_TOP_K:
2365
+ case GGML_OP_FLASH_ATTN_EXT:
2366
+ case GGML_OP_FLASH_ATTN_BACK:
2367
+ case GGML_OP_SSM_CONV:
2368
+ case GGML_OP_SSM_SCAN:
2369
+ {
2370
+ n_tasks = n_threads;
2371
+ } break;
2372
+ case GGML_OP_RWKV_WKV6:
2373
+ case GGML_OP_GATED_LINEAR_ATTN:
2374
+ case GGML_OP_RWKV_WKV7:
2375
+ {
2376
+ const int64_t n_heads = node->src[1]->ne[1];
2377
+ n_tasks = MIN(n_threads, n_heads);
2378
+ } break;
2379
+ case GGML_OP_WIN_PART:
2380
+ case GGML_OP_WIN_UNPART:
2381
+ case GGML_OP_GET_REL_POS:
2382
+ {
2383
+ n_tasks = 1;
2384
+ } break;
2385
+ case GGML_OP_MAP_CUSTOM1:
2386
+ {
2387
+ struct ggml_map_custom1_op_params p;
2388
+ memcpy(&p, node->op_params, sizeof(p));
2389
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
2390
+ n_tasks = n_threads;
2391
+ } else {
2392
+ n_tasks = MIN(p.n_tasks, n_threads);
2393
+ }
2394
+ } break;
2395
+ case GGML_OP_MAP_CUSTOM2:
2396
+ {
2397
+ struct ggml_map_custom2_op_params p;
2398
+ memcpy(&p, node->op_params, sizeof(p));
2399
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
2400
+ n_tasks = n_threads;
2401
+ } else {
2402
+ n_tasks = MIN(p.n_tasks, n_threads);
2403
+ }
2404
+ } break;
2405
+ case GGML_OP_MAP_CUSTOM3:
2406
+ {
2407
+ struct ggml_map_custom3_op_params p;
2408
+ memcpy(&p, node->op_params, sizeof(p));
2409
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
2410
+ n_tasks = n_threads;
2411
+ } else {
2412
+ n_tasks = MIN(p.n_tasks, n_threads);
2413
+ }
2414
+ } break;
2415
+ case GGML_OP_CUSTOM:
2416
+ {
2417
+ struct ggml_custom_op_params p;
2418
+ memcpy(&p, node->op_params, sizeof(p));
2419
+ if (p.n_tasks == GGML_N_TASKS_MAX) {
2420
+ n_tasks = n_threads;
2421
+ } else {
2422
+ n_tasks = MIN(p.n_tasks, n_threads);
2423
+ }
2424
+ } break;
2425
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2426
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
2427
+ case GGML_OP_OPT_STEP_ADAMW:
2428
+ case GGML_OP_OPT_STEP_SGD:
2429
+ {
2430
+ n_tasks = n_threads;
2431
+ } break;
2432
+ case GGML_OP_NONE:
2433
+ {
2434
+ n_tasks = 1;
2435
+ } break;
2436
+ case GGML_OP_COUNT:
2437
+ {
2438
+ GGML_ABORT("fatal error");
2439
+ }
2440
+ default:
2441
+ {
2442
+ fprintf(stderr, "%s: op not implemented: ", __func__);
2443
+ if (node->op < GGML_OP_COUNT) {
2444
+ fprintf(stderr, "%s\n", ggml_op_name(node->op));
2445
+ } else {
2446
+ fprintf(stderr, "%d\n", node->op);
2447
+ }
2448
+ GGML_ABORT("fatal error");
2449
+ }
2450
+ }
2451
+
2452
+ assert(n_tasks > 0);
2453
+
2454
+ return n_tasks;
2455
+ }
2456
+
2457
+ static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
2458
+
2459
+ #if defined(_WIN32)
2460
+ #include "windows.h"
2461
+
2462
+ // TODO: support > 64 CPUs
2463
+ static bool ggml_thread_apply_affinity(bool * mask) {
2464
+ HANDLE h = GetCurrentThread();
2465
+ uint64_t bitmask = 0ULL;
2466
+
2467
+ assert(GGML_MAX_N_THREADS >= 64);
2468
+
2469
+ for (int32_t i = 0; i < 8; i++) {
2470
+ int32_t idx = i * 8;
2471
+ uint8_t val = 0;
2472
+ val |= mask[idx + 0] << 0;
2473
+ val |= mask[idx + 1] << 1;
2474
+ val |= mask[idx + 2] << 2;
2475
+ val |= mask[idx + 3] << 3;
2476
+ val |= mask[idx + 4] << 4;
2477
+ val |= mask[idx + 5] << 5;
2478
+ val |= mask[idx + 6] << 6;
2479
+ val |= mask[idx + 7] << 7;
2480
+ bitmask |= (uint64_t)val << idx;
2481
+ }
2482
+
2483
+ for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) {
2484
+ if (mask[i]) {
2485
+ fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
2486
+ break;
2487
+ }
2488
+ }
2489
+
2490
+ DWORD_PTR m = (DWORD_PTR)bitmask;
2491
+
2492
+ m = SetThreadAffinityMask(h, m);
2493
+
2494
+ return m != 0;
2495
+ }
2496
+
2497
+ static bool ggml_thread_apply_priority(int32_t prio) {
2498
+ // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
2499
+ // This is up to the applications.
2500
+ DWORD p = THREAD_PRIORITY_NORMAL;
2501
+ switch (prio) {
2502
+ case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break;
2503
+ case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
2504
+ case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
2505
+ case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
2506
+ case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
2507
+ }
2508
+
2509
+ if (prio != GGML_SCHED_PRIO_LOW) {
2510
+ // Tell Windows that this thread should not be throttled (needs its own CPU core).
2511
+ // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
2512
+ // all our threads onto the first 4 cores which results in terrible performance with
2513
+ // n_threads > 4
2514
+ #if _WIN32_WINNT >= 0x0602
2515
+ THREAD_POWER_THROTTLING_STATE t;
2516
+ ZeroMemory(&t, sizeof(t));
2517
+ t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
2518
+ t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
2519
+ t.StateMask = 0;
2520
+
2521
+ if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
2522
+ GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
2523
+ return false;
2524
+ }
2525
+ #endif
2526
+ }
2527
+
2528
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
2529
+ // Keep inherited policy/priority
2530
+ return true;
2531
+ }
2532
+
2533
+ if (!SetThreadPriority(GetCurrentThread(), p)) {
2534
+ fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
2535
+ return false;
2536
+ }
2537
+
2538
+ return true;
2539
+ }
2540
+
2541
+ #elif defined(__APPLE__)
2542
+ #include <sys/types.h>
2543
+ #include <sys/resource.h>
2544
+
2545
// Apple platforms: thread affinity is not supported, so the mask is ignored
// and success is always reported.
static bool ggml_thread_apply_affinity(const bool * mask) {
    UNUSED(mask); // Not supported on Apple platforms

    return true;
}
2550
+
2551
+ static bool ggml_thread_apply_priority(int32_t prio) {
2552
+ struct sched_param p;
2553
+ int32_t policy = SCHED_OTHER;
2554
+ switch (prio) {
2555
+ // TODO: there seems to be no way to set lower prio on Apple platforms
2556
+ case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break;
2557
+ case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
2558
+ case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
2559
+ case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
2560
+ case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
2561
+ }
2562
+
2563
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
2564
+ // Keep inherited policy/priority
2565
+ return true;
2566
+ }
2567
+
2568
+ int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
2569
+ if (err != 0) {
2570
+ fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
2571
+ return false;
2572
+ }
2573
+
2574
+ return true;
2575
+ }
2576
+
2577
+ #elif defined(__gnu_linux__)
2578
+ // TODO: this may not work on BSD, to be verified
2579
+
2580
+ static bool ggml_thread_apply_affinity(const bool * mask) {
2581
+ cpu_set_t cpuset;
2582
+ int err;
2583
+
2584
+ CPU_ZERO(&cpuset);
2585
+
2586
+ for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
2587
+ if (mask[i]) {
2588
+ GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
2589
+ CPU_SET(i, &cpuset);
2590
+ }
2591
+ }
2592
+
2593
+ #ifdef __ANDROID__
2594
+ err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
2595
+ if (err < 0) {
2596
+ err = errno;
2597
+ }
2598
+ #else
2599
+ err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
2600
+ #endif
2601
+ if (err != 0) {
2602
+ fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err);
2603
+ return false;
2604
+ }
2605
+
2606
+ return true;
2607
+ }
2608
+
2609
+ static bool ggml_thread_apply_priority(int32_t prio) {
2610
+ struct sched_param p;
2611
+ int32_t policy = SCHED_OTHER;
2612
+ switch (prio) {
2613
+ case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break;
2614
+ case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
2615
+ case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
2616
+ case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
2617
+ case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
2618
+ }
2619
+
2620
+ if (prio == GGML_SCHED_PRIO_NORMAL) {
2621
+ // Keep inherited policy/priority
2622
+ return true;
2623
+ }
2624
+
2625
+ int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
2626
+ if (err != 0) {
2627
+ fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
2628
+ return false;
2629
+ }
2630
+
2631
+ return true;
2632
+ }
2633
+
2634
+ #else // unsupported platforms
2635
+
2636
// Fallback for unsupported platforms: the mask is ignored and success is
// reported unconditionally.
static bool ggml_thread_apply_affinity(const bool * mask) {
    UNUSED(mask); // no affinity API is used on this platform
    return true;
}
2640
+
2641
// Fallback for unsupported platforms: the requested priority is ignored and
// success is reported unconditionally.
static bool ggml_thread_apply_priority(int32_t prio) {
    UNUSED(prio); // no priority API is used on this platform
    return true;
}
2645
+
2646
+ #endif
2647
+
2648
+ static bool ggml_thread_cpumask_is_valid(const bool * mask) {
2649
+ for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
2650
+ if (mask[i]) { return true; }
2651
+ }
2652
+ return false;
2653
+ }
2654
+
2655
+ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
2656
+ if (!strict) {
2657
+ memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
2658
+ return;
2659
+ } else {
2660
+ memset(local_mask, 0, GGML_MAX_N_THREADS);
2661
+ int32_t base_idx = *iter;
2662
+ for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
2663
+ int32_t idx = base_idx + i;
2664
+ if (idx >= GGML_MAX_N_THREADS) {
2665
+ // Just a cheaper modulo
2666
+ idx -= GGML_MAX_N_THREADS;
2667
+ }
2668
+ if (global_mask[idx]) {
2669
+ local_mask[idx] = 1;
2670
+ *iter = idx + 1;
2671
+ return;
2672
+ }
2673
+ }
2674
+ }
2675
+ }
2676
+
2677
+ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
2678
+ if (!threadpool) return;
2679
+
2680
+ const int n_threads = threadpool->n_threads;
2681
+
2682
+ #ifndef GGML_USE_OPENMP
2683
+ struct ggml_compute_state* workers = threadpool->workers;
2684
+
2685
+ ggml_mutex_lock(&threadpool->mutex);
2686
+
2687
+ threadpool->stop = true;
2688
+ threadpool->pause = false;
2689
+
2690
+ ggml_cond_broadcast(&threadpool->cond);
2691
+ ggml_mutex_unlock(&threadpool->mutex);
2692
+
2693
+ for (int j = 1; j < n_threads; j++) {
2694
+ int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
2695
+ GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
2696
+ UNUSED(rc);
2697
+ }
2698
+
2699
+ ggml_mutex_destroy(&threadpool->mutex);
2700
+ ggml_cond_destroy(&threadpool->cond);
2701
+ #endif // GGML_USE_OPENMP
2702
+
2703
+ const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
2704
+ ggml_aligned_free(threadpool->workers, workers_size);
2705
+ ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
2706
+ }
2707
+
2708
+ #ifndef GGML_USE_OPENMP
2709
+ // pause/resume must be called under mutex
2710
+ static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) {
2711
+ GGML_PRINT_DEBUG("Pausing threadpool\n");
2712
+ threadpool->pause = true;
2713
+ ggml_cond_broadcast(&threadpool->cond);
2714
+ }
2715
+
2716
+ static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) {
2717
+ GGML_PRINT_DEBUG("Resuming threadpool\n");
2718
+ threadpool->pause = false;
2719
+ ggml_cond_broadcast(&threadpool->cond);
2720
+ }
2721
+ #endif
2722
+
2723
+ void ggml_threadpool_pause(struct ggml_threadpool * threadpool) {
2724
+ #ifndef GGML_USE_OPENMP
2725
+ ggml_mutex_lock(&threadpool->mutex);
2726
+ if (!threadpool->pause) {
2727
+ ggml_threadpool_pause_locked(threadpool);
2728
+ }
2729
+ ggml_mutex_unlock(&threadpool->mutex);
2730
+ #else
2731
+ UNUSED(threadpool);
2732
+ #endif
2733
+ }
2734
+
2735
+ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
2736
+ #ifndef GGML_USE_OPENMP
2737
+ ggml_mutex_lock(&threadpool->mutex);
2738
+ if (threadpool->pause) {
2739
+ ggml_threadpool_resume_locked(threadpool);
2740
+ }
2741
+ ggml_mutex_unlock(&threadpool->mutex);
2742
+ #else
2743
+ UNUSED(threadpool);
2744
+ #endif
2745
+ }
2746
+
2747
+ struct ggml_cplan ggml_graph_plan(
2748
+ const struct ggml_cgraph * cgraph,
2749
+ int n_threads,
2750
+ struct ggml_threadpool * threadpool) {
2751
+
2752
+ if (threadpool == NULL) {
2753
+ //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
2754
+ }
2755
+ if (n_threads <= 0) {
2756
+ n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
2757
+ }
2758
+
2759
+ #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
2760
+ // Emscripten without pthreads support can only use a single thread
2761
+ n_threads = 1;
2762
+ #endif
2763
+
2764
+ size_t work_size = 0;
2765
+
2766
+ struct ggml_cplan cplan;
2767
+ memset(&cplan, 0, sizeof(struct ggml_cplan));
2768
+
2769
+ int max_tasks = 1;
2770
+
2771
+ // thread scheduling for the different operations + work buffer size estimation
2772
+ for (int i = 0; i < cgraph->n_nodes; i++) {
2773
+ struct ggml_tensor * node = cgraph->nodes[i];
2774
+
2775
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
2776
+
2777
+ max_tasks = MAX(max_tasks, n_tasks);
2778
+
2779
+ size_t cur = 0;
2780
+
2781
+ if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {
2782
+ switch (node->op) {
2783
+ case GGML_OP_CPY:
2784
+ case GGML_OP_DUP:
2785
+ {
2786
+ if (ggml_is_quantized(node->type) ||
2787
+ // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
2788
+ (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
2789
+ (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
2790
+ // conversion between F32 and I32
2791
+ (node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
2792
+ (node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
2793
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
2794
+ }
2795
+ } break;
2796
+ case GGML_OP_ADD:
2797
+ case GGML_OP_ADD_ID:
2798
+ case GGML_OP_ADD1:
2799
+ {
2800
+ if (ggml_is_quantized(node->src[0]->type)) {
2801
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
2802
+ }
2803
+ } break;
2804
+ case GGML_OP_ACC:
2805
+ {
2806
+ if (ggml_is_quantized(node->src[0]->type)) {
2807
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
2808
+ }
2809
+ } break;
2810
+ case GGML_OP_COUNT_EQUAL:
2811
+ {
2812
+ cur = ggml_type_size(node->type)*n_tasks;
2813
+ } break;
2814
+ case GGML_OP_MUL_MAT:
2815
+ {
2816
+ const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
2817
+
2818
+ if (node->src[1]->type != vec_dot_type) {
2819
+ cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
2820
+ }
2821
+ } break;
2822
+ case GGML_OP_MUL_MAT_ID:
2823
+ {
2824
+ cur = 0;
2825
+ const struct ggml_tensor * src0 = node->src[0];
2826
+ const struct ggml_tensor * src1 = node->src[1];
2827
+ const struct ggml_tensor * ids = node->src[2];
2828
+ const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
2829
+ const int n_as = src0->ne[2];
2830
+ // src1
2831
+ if (src1->type != vec_dot_type) {
2832
+ cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t);
2833
+ }
2834
+ // matrix_row_counts
2835
+ cur += n_as * sizeof(int64_t) + sizeof(int64_t);
2836
+ // matrix_rows
2837
+ cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
2838
+ // atomic_current_chunk
2839
+ cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
2840
+ } break;
2841
+ case GGML_OP_OUT_PROD:
2842
+ {
2843
+ if (ggml_is_quantized(node->src[0]->type)) {
2844
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
2845
+ }
2846
+ } break;
2847
+ case GGML_OP_SOFT_MAX:
2848
+ case GGML_OP_ROPE:
2849
+ case GGML_OP_ROPE_BACK:
2850
+ {
2851
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
2852
+ } break;
2853
+ case GGML_OP_CONV_TRANSPOSE_1D:
2854
+ {
2855
+ GGML_ASSERT(node->src[0]->ne[3] == 1);
2856
+ GGML_ASSERT(node->src[1]->ne[2] == 1);
2857
+ GGML_ASSERT(node->src[1]->ne[3] == 1);
2858
+
2859
+ const int64_t ne00 = node->src[0]->ne[0]; // K
2860
+ const int64_t ne01 = node->src[0]->ne[1]; // Cout
2861
+ const int64_t ne02 = node->src[0]->ne[2]; // Cin
2862
+ const int64_t ne10 = node->src[1]->ne[0]; // L
2863
+ const int64_t ne11 = node->src[1]->ne[1]; // Cin
2864
+
2865
+ if ((node->src[0]->type == GGML_TYPE_F16 ||
2866
+ node->src[0]->type == GGML_TYPE_BF16) &&
2867
+ node->src[1]->type == GGML_TYPE_F32) {
2868
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
2869
+ cur += sizeof(ggml_fp16_t)*ne10*ne11;
2870
+ } else if (node->src[0]->type == GGML_TYPE_F32 &&
2871
+ node->src[1]->type == GGML_TYPE_F32) {
2872
+ cur += sizeof(float)*ne00*ne01*ne02;
2873
+ cur += sizeof(float)*ne10*ne11;
2874
+ } else {
2875
+ GGML_ABORT("fatal error");
2876
+ }
2877
+ } break;
2878
+ case GGML_OP_CONV_2D:
2879
+ case GGML_OP_CONV_3D:
2880
+ {
2881
+ cur = GGML_IM2COL_WORK_SIZE;
2882
+ } break;
2883
+ case GGML_OP_CONV_TRANSPOSE_2D:
2884
+ {
2885
+ const int64_t ne00 = node->src[0]->ne[0]; // W
2886
+ const int64_t ne01 = node->src[0]->ne[1]; // H
2887
+ const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
2888
+ const int64_t ne03 = node->src[0]->ne[3]; // Channels In
2889
+
2890
+ const int64_t ne10 = node->src[1]->ne[0]; // W
2891
+ const int64_t ne11 = node->src[1]->ne[1]; // H
2892
+ const int64_t ne12 = node->src[1]->ne[2]; // Channels In
2893
+
2894
+ GGML_ASSERT(node->src[0]->type == GGML_TYPE_F16 || node->src[0]->type == GGML_TYPE_F32);
2895
+ GGML_ASSERT(node->src[1]->type == GGML_TYPE_F32);
2896
+
2897
+ cur += ggml_type_size(node->src[0]->type) * ne00 * ne01 * ne02 * ne03;
2898
+ cur += ggml_type_size(node->src[0]->type) * ne10 * ne11 * ne12;
2899
+
2900
+ } break;
2901
+ case GGML_OP_TOP_K:
2902
+ {
2903
+ cur += sizeof(int32_t)*node->src[0]->ne[0]*n_tasks;
2904
+ } break;
2905
+ case GGML_OP_FLASH_ATTN_EXT:
2906
+ {
2907
+ const int64_t neq2 = node->src[0]->ne[2]; // number of query heads
2908
+ const int64_t DK = node->src[1]->ne[0];
2909
+ const int64_t DV = node->src[2]->ne[0];
2910
+
2911
+ // Tiled flash attention scratch (tile sizes defined in common.h)
2912
+ // Per-thread: Q_q + KQ + mask + VKQ32 + V32 + K_f32 + padding
2913
+ size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV + GGML_FA_TILE_KV*DK)*n_tasks;
2914
+
2915
+ // Decode path: n_kv_chunks = n_tasks (one chunk per thread)
2916
+ // Per-thread: VKQ accmulator (DV), partial M, partial S + intra-thread scratch for V, Q and VKQ
2917
+ size_t n_chunks = n_tasks;
2918
+ size_t decode = sizeof(float)*(neq2*n_chunks*(2+DV) + n_tasks*(DK + 2*DV));
2919
+
2920
+ cur += MAX(prefill, decode);
2921
+ } break;
2922
+ case GGML_OP_FLASH_ATTN_BACK:
2923
+ {
2924
+ const int64_t D = node->src[0]->ne[0];
2925
+ const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
2926
+ const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
2927
+ if (node->src[1]->type == GGML_TYPE_F32) {
2928
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
2929
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
2930
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
2931
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
2932
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
2933
+ } else if (node->src[1]->type == GGML_TYPE_BF16) {
2934
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
2935
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
2936
+ }
2937
+ } break;
2938
+
2939
+ case GGML_OP_CROSS_ENTROPY_LOSS:
2940
+ {
2941
+ cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
2942
+ } break;
2943
+ case GGML_OP_GATED_DELTA_NET:
2944
+ {
2945
+ const int64_t S_v = node->src[2]->ne[0];
2946
+ const int64_t K = node->src[5]->ne[1]; // state is (D, K, n_seqs)
2947
+ const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0);
2948
+ cur = per_thread * sizeof(float) * n_tasks;
2949
+ } break;
2950
+ case GGML_OP_COUNT:
2951
+ {
2952
+ GGML_ABORT("fatal error");
2953
+ }
2954
+ default:
2955
+ break;
2956
+ }
2957
+ }
2958
+
2959
+ work_size = MAX(work_size, cur);
2960
+ }
2961
+
2962
+ if (work_size > 0) {
2963
+ work_size += CACHE_LINE_SIZE*(n_threads);
2964
+ }
2965
+
2966
+ cplan.threadpool = threadpool;
2967
+ cplan.n_threads = MIN(max_tasks, n_threads);
2968
+ cplan.work_size = work_size;
2969
+ cplan.work_data = NULL;
2970
+
2971
+ return cplan;
2972
+ }
2973
+
2974
+
2975
+ // Try to fuse the current node with subsequent nodes for better performance.
2976
+ // Returns the number of nodes skipped by fusion (>=1), or 0 if no fusion was applied.
2977
+ static bool ggml_cpu_disable_fusion = false; // initialized once in ggml_cpu_init(), read-only afterwards
2978
+
2979
+ static int ggml_cpu_try_fuse_ops(
2980
+ const struct ggml_cgraph * cgraph,
2981
+ const int node_n,
2982
+ const struct ggml_compute_params * params,
2983
+ const struct ggml_cplan * cplan) {
2984
+
2985
+ if (ggml_cpu_disable_fusion || cplan->use_ref) {
2986
+ return 0;
2987
+ }
2988
+
2989
+ struct ggml_tensor * node = cgraph->nodes[node_n];
2990
+
2991
+ if (node->op == GGML_OP_RMS_NORM) {
2992
+ // RMS_NORM + MUL fusion
2993
+ const enum ggml_op fuse_ops[] = { GGML_OP_RMS_NORM, GGML_OP_MUL };
2994
+ if (ggml_can_fuse(cgraph, node_n, fuse_ops, 2)) {
2995
+ struct ggml_tensor * mul_node = cgraph->nodes[node_n + 1];
2996
+ const struct ggml_tensor * mul_w = (mul_node->src[0] == node)
2997
+ ? mul_node->src[1] : mul_node->src[0];
2998
+ if (node->src[0]->type == GGML_TYPE_F32 &&
2999
+ mul_node->type == GGML_TYPE_F32 &&
3000
+ mul_w->type == GGML_TYPE_F32 &&
3001
+ mul_w->ne[0] == node->ne[0] &&
3002
+ mul_w->nb[0] == sizeof(float)) {
3003
+
3004
+ ggml_compute_forward_rms_norm_mul_fused(params, node, mul_node);
3005
+ return 1;
3006
+ }
3007
+ }
3008
+ }
3009
+
3010
+ return 0;
3011
+ }
3012
+
3013
+ static thread_ret_t ggml_graph_compute_thread(void * data) {
3014
+ struct ggml_compute_state * state = (struct ggml_compute_state *) data;
3015
+ struct ggml_threadpool * tp = state->threadpool;
3016
+
3017
+ const struct ggml_cgraph * cgraph = tp->cgraph;
3018
+ const struct ggml_cplan * cplan = tp->cplan;
3019
+
3020
+ #ifdef GGML_USE_CPU_RISCV64_SPACEMIT
3021
+ ggml_backend_cpu_riscv64_spacemit_set_numa_thread_affinity(state->ith);
3022
+ #else
3023
+ set_numa_thread_affinity(state->ith);
3024
+ #endif
3025
+
3026
+ struct ggml_compute_params params = {
3027
+ /*.ith =*/ state->ith,
3028
+ /*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
3029
+ /*.wsize =*/ cplan->work_size,
3030
+ /*.wdata =*/ cplan->work_data,
3031
+ /*.threadpool =*/ tp,
3032
+ /*.use_ref =*/ cplan->use_ref,
3033
+ };
3034
+
3035
+ #ifdef GGML_USE_OPENMP
3036
+ GGML_PRINT_DEBUG("thread #%d compute-start cplan %p\n", state->ith, (const void *)cplan);
3037
+ #else
3038
+ GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph);
3039
+ #endif
3040
+
3041
+ for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
3042
+ struct ggml_tensor * node = cgraph->nodes[node_n];
3043
+
3044
+ if (ggml_op_is_empty(node->op)) {
3045
+ // skip NOPs
3046
+ continue;
3047
+ }
3048
+
3049
+ if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
3050
+ continue;
3051
+ }
3052
+
3053
+ // TODO: move fused-op detection into ggml_graph_plan so fusion decisions are made once at planning time
3054
+ // Try fused ops, fall back to normal compute
3055
+ const int n_fused = ggml_cpu_try_fuse_ops(cgraph, node_n, &params, cplan);
3056
+ if (n_fused > 0) {
3057
+ node_n += n_fused;
3058
+ } else {
3059
+ ggml_compute_forward(&params, node);
3060
+ }
3061
+
3062
+ if (state->ith == 0 && cplan->abort_callback &&
3063
+ cplan->abort_callback(cplan->abort_callback_data)) {
3064
+ atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
3065
+ tp->ec = GGML_STATUS_ABORTED;
3066
+ }
3067
+
3068
+ if (node_n + 1 < cgraph->n_nodes) {
3069
+ ggml_barrier(state->threadpool);
3070
+ }
3071
+ }
3072
+
3073
+ #ifdef GGML_USE_OPENMP
3074
+ GGML_PRINT_DEBUG("thread #%d compute-done cplan %p\n", state->ith, (const void *)cplan);
3075
+ #else
3076
+ GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph);
3077
+ #endif
3078
+
3079
+ ggml_barrier(state->threadpool);
3080
+
3081
+ #ifdef GGML_USE_CPU_RISCV64_SPACEMIT
3082
+ ggml_backend_cpu_riscv64_spacemit_clear_numa_thread_affinity_threaded(state->ith);
3083
+ #endif
3084
+
3085
+ return 0;
3086
+ }
3087
+
3088
+ #ifndef GGML_USE_OPENMP
3089
+
3090
+ // check if thread is ready to proceed (exit from polling or sleeping)
3091
+ // returns true if loops should exit, sets state->pending to indicate new work
3092
+ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
3093
+ struct ggml_threadpool * threadpool = state->threadpool;
3094
+
3095
+ if (state->pending || threadpool->stop || threadpool->pause) { return true; }
3096
+
3097
+ // check for new graph/work
3098
+ int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
3099
+ int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
3100
+ if (n_graph != state->last_graph) {
3101
+ state->pending = (state->ith < n_threads);
3102
+ state->last_graph = n_graph;
3103
+ return true;
3104
+ }
3105
+
3106
+ return false;
3107
+ }
3108
+
3109
+ // sync thread state after polling
3110
// Issues a sequentially-consistent synchronization point after polling.
// Under TSAN a dummy read-modify-write on n_graph is used instead, because
// TSAN doesn't support standalone fences yet.
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
#ifndef GGML_TSAN_ENABLED
    atomic_thread_fence(memory_order_seq_cst);
#else
    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
#endif
    UNUSED(state);
}
3119
+
3120
+ static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
3121
+ struct ggml_threadpool * threadpool = state->threadpool;
3122
+
3123
+ // This seems to make 0 ... 100 a decent range for polling level across modern processors.
3124
+ // Perhaps, we can adjust it dynamically based on load and things.
3125
+ const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
3126
+
3127
+ for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
3128
+ // No new work. Keep polling.
3129
+ ggml_thread_cpu_relax();
3130
+ }
3131
+
3132
+ return state->pending;
3133
+ }
3134
+
3135
+ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
3136
+ struct ggml_threadpool * threadpool = state->threadpool;
3137
+
3138
+ if (ggml_graph_compute_poll_for_work(state)) {
3139
+ ggml_graph_compute_thread_sync(state);
3140
+ return state->pending;
3141
+ }
3142
+
3143
+ ggml_mutex_lock_shared(&threadpool->mutex);
3144
+ while (!ggml_graph_compute_thread_ready(state)) {
3145
+ // No new work. Wait for the signal.
3146
+ GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
3147
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
3148
+ }
3149
+ ggml_mutex_unlock_shared(&threadpool->mutex);
3150
+
3151
+ return state->pending;
3152
+ }
3153
+
3154
+ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
3155
+ struct ggml_compute_state * state = (struct ggml_compute_state *) data;
3156
+ struct ggml_threadpool * threadpool = state->threadpool;
3157
+
3158
+ ggml_thread_apply_priority(threadpool->prio);
3159
+ if (ggml_thread_cpumask_is_valid(state->cpumask)) {
3160
+ ggml_thread_apply_affinity(state->cpumask);
3161
+ }
3162
+
3163
+ while (true) {
3164
+ // Check if we need to sleep
3165
+ while (threadpool->pause) {
3166
+ GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
3167
+ ggml_mutex_lock_shared(&threadpool->mutex);
3168
+ if (threadpool->pause) {
3169
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
3170
+ }
3171
+ GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
3172
+ ggml_mutex_unlock_shared(&threadpool->mutex);
3173
+ }
3174
+
3175
+ // This needs to be checked for after the cond_wait
3176
+ if (threadpool->stop) break;
3177
+
3178
+ // Check if there is new work
3179
+ // The main thread is the only one that can dispatch new work
3180
+
3181
+ ggml_graph_compute_check_for_work(state);
3182
+ if (state->pending) {
3183
+ state->pending = false;
3184
+ ggml_graph_compute_thread(state);
3185
+ }
3186
+ }
3187
+
3188
+ return (thread_ret_t) 0;
3189
+ }
3190
+
3191
+ // Start processing new graph
3192
+ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
3193
+ {
3194
+ // Always take the mutex here because the worker threads are doing hybrid poll/wait
3195
+
3196
+ ggml_mutex_lock(&threadpool->mutex);
3197
+
3198
+ // Update the number of active threads and the graph count
3199
+ int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
3200
+ n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
3201
+
3202
+ GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
3203
+
3204
+ // Indicate the graph is ready to be processed
3205
+ // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
3206
+ atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
3207
+
3208
+ if (threadpool->pause) {
3209
+ // Update main thread prio and affinity to match the threadpool settings
3210
+ ggml_thread_apply_priority(threadpool->prio);
3211
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
3212
+ ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
3213
+ }
3214
+
3215
+ // resume does cond broadcast
3216
+ ggml_threadpool_resume_locked(threadpool);
3217
+ } else {
3218
+ ggml_cond_broadcast(&threadpool->cond);
3219
+ }
3220
+
3221
+ ggml_mutex_unlock(&threadpool->mutex);
3222
+ }
3223
+
3224
+ #endif // GGML_USE_OPENMP
3225
+
3226
+ static struct ggml_threadpool * ggml_threadpool_new_impl(
3227
+ struct ggml_threadpool_params * tpp,
3228
+ struct ggml_cgraph * cgraph,
3229
+ struct ggml_cplan * cplan) {
3230
+
3231
+ struct ggml_threadpool * threadpool =
3232
+ ggml_aligned_malloc(sizeof(struct ggml_threadpool));
3233
+ {
3234
+ threadpool->cgraph = cgraph;
3235
+ threadpool->cplan = cplan;
3236
+ threadpool->n_graph = 0;
3237
+ threadpool->n_barrier = 0;
3238
+ threadpool->n_barrier_passed = 0;
3239
+ threadpool->current_chunk = 0;
3240
+ threadpool->stop = false;
3241
+ threadpool->pause = tpp->paused;
3242
+ threadpool->abort = -1;
3243
+ threadpool->workers = NULL;
3244
+ threadpool->n_threads = tpp->n_threads;
3245
+ threadpool->poll = tpp->poll;
3246
+ threadpool->prio = tpp->prio;
3247
+ threadpool->ec = GGML_STATUS_SUCCESS;
3248
+ }
3249
+
3250
+ // Allocate and init workers state
3251
+ const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
3252
+ struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
3253
+
3254
+ memset(workers, 0, workers_size);
3255
+ for (int j = 0; j < tpp->n_threads; j++) {
3256
+ workers[j].threadpool = threadpool;
3257
+ workers[j].ith = j;
3258
+ }
3259
+
3260
+ threadpool->workers = workers;
3261
+
3262
+ #ifdef GGML_USE_OPENMP
3263
+ int32_t cpumask_iter = 0;
3264
+
3265
+ // Compute CPU masks for each thread
3266
+ for (int j = 0; j < tpp->n_threads; j++) {
3267
+ ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
3268
+ }
3269
+ #else // GGML_USE_OPENMP
3270
+ ggml_mutex_init(&threadpool->mutex);
3271
+ ggml_cond_init(&threadpool->cond);
3272
+
3273
+ // Spin the threads for all workers, and update CPU placements.
3274
+ // Place the main thread last (towards the higher numbered CPU cores).
3275
+
3276
+ int32_t cpumask_iter = 0;
3277
+
3278
+ for (int j = 1; j < tpp->n_threads; j++) {
3279
+ ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
3280
+
3281
+ int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]);
3282
+ GGML_ASSERT(rc == 0);
3283
+ }
3284
+
3285
+ ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);
3286
+
3287
+ if (!threadpool->pause) {
3288
+ // Update main thread prio and affinity at the start, otherwise we'll do it in resume
3289
+ ggml_thread_apply_priority(threadpool->prio);
3290
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
3291
+ ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
3292
+ }
3293
+ }
3294
+ #endif // GGML_USE_OPENMP
3295
+
3296
+ return threadpool;
3297
+ }
3298
+
3299
// Creates a threadpool from the given parameters, with no graph or plan
// attached yet.
struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
    struct ggml_threadpool * threadpool = ggml_threadpool_new_impl(tpp, NULL, NULL);
    return threadpool;
}
3302
+
3303
+ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
3304
+ ggml_cpu_init();
3305
+
3306
+ GGML_ASSERT(cplan);
3307
+ GGML_ASSERT(cplan->n_threads > 0);
3308
+ GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
3309
+
3310
+ int n_threads = cplan->n_threads;
3311
+ struct ggml_threadpool * threadpool = cplan->threadpool;
3312
+
3313
+ bool disposable_threadpool = false;
3314
+
3315
+ if (threadpool == NULL) {
3316
+ //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
3317
+ disposable_threadpool = true;
3318
+
3319
+ struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
3320
+ threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
3321
+ } else {
3322
+ // Reset some of the parameters that need resetting
3323
+ // No worker threads should be accessing the parameters below at this stage
3324
+ threadpool->cgraph = cgraph;
3325
+ threadpool->cplan = cplan;
3326
+ threadpool->current_chunk = 0;
3327
+ threadpool->abort = -1;
3328
+ threadpool->ec = GGML_STATUS_SUCCESS;
3329
+ }
3330
+
3331
+ #ifdef GGML_USE_OPENMP
3332
+ if (n_threads > 1) {
3333
+ #pragma omp parallel num_threads(n_threads)
3334
+ {
3335
+ #pragma omp single
3336
+ {
3337
+ // update the number of threads from the actual number of threads that we got from OpenMP
3338
+ n_threads = omp_get_num_threads();
3339
+ atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
3340
+ }
3341
+
3342
+ // Apply thread CPU mask and priority
3343
+ int ith = omp_get_thread_num();
3344
+
3345
+ ggml_thread_apply_priority(threadpool->prio);
3346
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
3347
+ ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
3348
+ }
3349
+ ggml_graph_compute_thread(&threadpool->workers[ith]);
3350
+ }
3351
+ } else {
3352
+ atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
3353
+ ggml_graph_compute_thread(&threadpool->workers[0]);
3354
+ }
3355
+ #else
3356
+ if (n_threads > threadpool->n_threads) {
3357
+ GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
3358
+ n_threads = threadpool->n_threads;
3359
+ }
3360
+
3361
+ // Kick all threads to start the new graph
3362
+ ggml_graph_compute_kickoff(threadpool, n_threads);
3363
+
3364
+ // This is a work thread too
3365
+ ggml_graph_compute_thread(&threadpool->workers[0]);
3366
+ #endif
3367
+
3368
+ // don't leave affinity set on the main thread
3369
+ clear_numa_thread_affinity();
3370
+
3371
+ enum ggml_status ret = threadpool->ec;
3372
+
3373
+ if (disposable_threadpool) {
3374
+ ggml_threadpool_free(threadpool);
3375
+ }
3376
+
3377
+ return ret;
3378
+ }
3379
+
3380
+ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
3381
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
3382
+
3383
+ cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
3384
+
3385
+ return ggml_graph_compute(cgraph, &cplan);
3386
+ }
3387
+
3388
// Copies n floats from x to y. The two ranges are copied with memcpy.
void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
    const size_t n_bytes = n * sizeof(float);
    memcpy(y, x, n_bytes);
}
3391
+
3392
+ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
3393
+ int64_t i = 0;
3394
+ #if defined(__F16C__)
3395
+ #if defined(__AVX512F__)
3396
+ for (; i + 15 < n; i += 16) {
3397
+ __m512 x_vec = _mm512_loadu_ps(x + i);
3398
+ __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
3399
+ _mm256_storeu_si256((__m256i *)(y + i), y_vec);
3400
+ }
3401
+ #endif
3402
+ for (; i + 7 < n; i += 8) {
3403
+ __m256 x_vec = _mm256_loadu_ps(x + i);
3404
+ __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
3405
+ _mm_storeu_si128((__m128i *)(y + i), y_vec);
3406
+ }
3407
+ for (; i + 3 < n; i += 4) {
3408
+ __m128 x_vec = _mm_loadu_ps(x + i);
3409
+ __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
3410
+ _mm_storel_epi64((__m128i *)(y + i), y_vec);
3411
+ }
3412
+ #elif defined(__riscv_zvfh)
3413
+ for (int vl; i < n; i += vl) {
3414
+ vl = __riscv_vsetvl_e32m2(n - i);
3415
+ vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
3416
+ vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
3417
+ __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
3418
+ }
3419
+ #endif
3420
+ for (; i < n; ++i) {
3421
+ y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
3422
+ }
3423
+ }
3424
+
3425
+ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
3426
+ int64_t i = 0;
3427
+ #if defined(__F16C__)
3428
+ #if defined(__AVX512F__)
3429
+ for (; i + 15 < n; i += 16) {
3430
+ __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
3431
+ __m512 y_vec = _mm512_cvtph_ps(x_vec);
3432
+ _mm512_storeu_ps(y + i, y_vec);
3433
+ }
3434
+ #endif
3435
+ for (; i + 7 < n; i += 8) {
3436
+ __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
3437
+ __m256 y_vec = _mm256_cvtph_ps(x_vec);
3438
+ _mm256_storeu_ps(y + i, y_vec);
3439
+ }
3440
+ for (; i + 3 < n; i += 4) {
3441
+ __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
3442
+ __m128 y_vec = _mm_cvtph_ps(x_vec);
3443
+ _mm_storeu_ps(y + i, y_vec);
3444
+ }
3445
+
3446
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
3447
+ // calculate step size
3448
+ const int epr = __riscv_vsetvlmax_e16m2();
3449
+ const int step = epr * 2;
3450
+ const int np = (n & ~(step - 1));
3451
+
3452
+ // unroll by 2
3453
+ for (; i < np; i += step) {
3454
+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
3455
+ vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
3456
+ __riscv_vse32_v_f32m4(y + i, ay0, epr);
3457
+
3458
+ vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
3459
+ vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
3460
+ __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
3461
+ }
3462
+
3463
+ // leftovers
3464
+ int vl;
3465
+ for (i = np; i < n; i += vl) {
3466
+ vl = __riscv_vsetvl_e16m2(n - i);
3467
+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
3468
+ vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
3469
+ __riscv_vse32_v_f32m4(y + i, ay0, vl);
3470
+ }
3471
+
3472
+ #endif
3473
+
3474
+ for (; i < n; ++i) {
3475
+ y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
3476
+ }
3477
+ }
3478
+
3479
+ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
3480
+ int64_t i = 0;
3481
+ for (; i < n; ++i) {
3482
+ y[i] = GGML_FP32_TO_BF16(x[i]);
3483
+ }
3484
+ }
3485
+
3486
// Converts n floats from x to 32-bit signed integers in y, truncating toward zero.
//
// A plain float -> int32_t conversion is undefined behaviour in C when the value is NaN or
// does not fit in the destination type, so those inputs are handled explicitly:
//   - NaN                      -> 0
//   - values >=  2^31          -> INT32_MAX
//   - values <= -2^31          -> INT32_MIN
// In-range inputs produce exactly the same result as the direct conversion.
void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        const float v = x[i];
        int32_t r;
        if (v != v) {
            // NaN is the only value that compares unequal to itself
            r = 0;
        } else if (v >= 2147483648.0f) {
            r = INT32_MAX;
        } else if (v <= -2147483648.0f) {
            r = INT32_MIN;
        } else {
            r = (int32_t) v;
        }
        y[i] = r;
    }
}
3492
+
3493
+ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
3494
+ int64_t i = 0;
3495
+ #if defined(__AVX2__)
3496
+ #if defined(__AVX512F__)
3497
+ for (; i + 15 < n; i += 16) {
3498
+ _mm512_storeu_ps(y + i,
3499
+ _mm512_castsi512_ps(
3500
+ _mm512_slli_epi32(
3501
+ _mm512_cvtepu16_epi32(
3502
+ _mm256_loadu_si256(
3503
+ (const __m256i *)(x + i))),
3504
+ 16)));
3505
+ }
3506
+ #endif
3507
+ for (; i + 7 < n; i += 8) {
3508
+ _mm256_storeu_ps(y + i,
3509
+ _mm256_castsi256_ps(
3510
+ _mm256_slli_epi32(
3511
+ _mm256_cvtepu16_epi32(
3512
+ _mm_loadu_si128(
3513
+ (const __m128i *)(x + i))),
3514
+ 16)));
3515
+ }
3516
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
3517
+ // calculate step size
3518
+ const int epr = __riscv_vsetvlmax_e16m2();
3519
+ const int step = epr * 2;
3520
+ const int np = (n & ~(step - 1));
3521
+
3522
+ // unroll by 2
3523
+ for (; i < np; i += step) {
3524
+ vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
3525
+ vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
3526
+ __riscv_vse32_v_f32m4(y + i, ay0, epr);
3527
+
3528
+ vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
3529
+ vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
3530
+ __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
3531
+ }
3532
+
3533
+ // leftovers
3534
+ int vl;
3535
+ for (i = np; i < n; i += vl) {
3536
+ vl = __riscv_vsetvl_e16m2(n - i);
3537
+ vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
3538
+ vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
3539
+ __riscv_vse32_v_f32m4(y + i, ay0, vl);
3540
+ }
3541
+ #endif
3542
+ for (; i < n; i++) {
3543
+ y[i] = GGML_BF16_TO_FP32(x[i]);
3544
+ }
3545
+ }
3546
+
3547
// Compile-time probe: returns 1 when this file was built with __AVX__ defined, otherwise 0.
int ggml_cpu_has_avx(void) {
    int enabled = 0;
#ifdef __AVX__
    enabled = 1;
#endif
    return enabled;
}
3554
+
3555
// Compile-time probe: returns 1 when this file was built with __AVXVNNI__ defined, otherwise 0.
int ggml_cpu_has_avx_vnni(void) {
    int enabled = 0;
#ifdef __AVXVNNI__
    enabled = 1;
#endif
    return enabled;
}
3562
+
3563
// Compile-time probe: returns 1 when this file was built with __AVX2__ defined, otherwise 0.
int ggml_cpu_has_avx2(void) {
    int enabled = 0;
#ifdef __AVX2__
    enabled = 1;
#endif
    return enabled;
}
3570
+
3571
// Compile-time probe: returns 1 when this file was built with __AVX512F__ defined, otherwise 0.
int ggml_cpu_has_avx512(void) {
    int enabled = 0;
#ifdef __AVX512F__
    enabled = 1;
#endif
    return enabled;
}
3578
+
3579
// Compile-time probe: returns 1 when this file was built with __AVX512VBMI__ defined, otherwise 0.
int ggml_cpu_has_avx512_vbmi(void) {
    int enabled = 0;
#ifdef __AVX512VBMI__
    enabled = 1;
#endif
    return enabled;
}
3586
+
3587
// Compile-time probe: returns 1 when this file was built with __AVX512VNNI__ defined, otherwise 0.
int ggml_cpu_has_avx512_vnni(void) {
    int enabled = 0;
#ifdef __AVX512VNNI__
    enabled = 1;
#endif
    return enabled;
}
3594
+
3595
// Compile-time probe: returns 1 when this file was built with __AVX512BF16__ defined, otherwise 0.
int ggml_cpu_has_avx512_bf16(void) {
    int enabled = 0;
#ifdef __AVX512BF16__
    enabled = 1;
#endif
    return enabled;
}
3602
+
3603
// Compile-time probe: returns 1 when this file was built with __AMX_INT8__ defined, otherwise 0.
int ggml_cpu_has_amx_int8(void) {
    int enabled = 0;
#ifdef __AMX_INT8__
    enabled = 1;
#endif
    return enabled;
}
3610
+
3611
// Compile-time probe: returns 1 when this file was built with __BMI2__ defined, otherwise 0.
int ggml_cpu_has_bmi2(void) {
    int enabled = 0;
#ifdef __BMI2__
    enabled = 1;
#endif
    return enabled;
}
3618
+
3619
// Compile-time probe: returns 1 when this file was built with __FMA__ defined, otherwise 0.
int ggml_cpu_has_fma(void) {
    int enabled = 0;
#ifdef __FMA__
    enabled = 1;
#endif
    return enabled;
}
3626
+
3627
// Compile-time probe: returns 1 when this file was built with __ARM_FEATURE_FMA defined, otherwise 0.
int ggml_cpu_has_arm_fma(void) {
    int enabled = 0;
#ifdef __ARM_FEATURE_FMA
    enabled = 1;
#endif
    return enabled;
}
3634
+
3635
// Compile-time probe: returns 1 when this file was built with __riscv_v_intrinsic defined, otherwise 0.
int ggml_cpu_has_riscv_v(void) {
    int enabled = 0;
#ifdef __riscv_v_intrinsic
    enabled = 1;
#endif
    return enabled;
}
3642
+
3643
// Returns ggml_riscv_arch_features.rvv_vlen when built for RISC-V with vector intrinsics
// (__riscv and __riscv_v_intrinsic both defined); returns 0 on every other build.
int ggml_cpu_get_rvv_vlen(void) {
#if !defined(__riscv) || !defined(__riscv_v_intrinsic)
    return 0;
#else
    return ggml_riscv_arch_features.rvv_vlen;
#endif
}
3650
+
3651
// Compile-time probe: returns 1 when this file was built with __F16C__ defined, otherwise 0.
int ggml_cpu_has_f16c(void) {
    int enabled = 0;
#ifdef __F16C__
    enabled = 1;
#endif
    return enabled;
}
3658
+
3659
// Compile-time probe: returns 1 when this file was built with
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC defined, otherwise 0.
int ggml_cpu_has_fp16_va(void) {
    int enabled = 0;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    enabled = 1;
#endif
    return enabled;
}
3666
+
3667
// Compile-time probe: returns 1 when this file was built with __wasm_simd128__ defined, otherwise 0.
int ggml_cpu_has_wasm_simd(void) {
    int enabled = 0;
#ifdef __wasm_simd128__
    enabled = 1;
#endif
    return enabled;
}
3674
+
3675
// Compile-time probe: returns 1 when this file was built with GGML_USE_LLAMAFILE defined, otherwise 0.
int ggml_cpu_has_llamafile(void) {
    int enabled = 0;
#ifdef GGML_USE_LLAMAFILE
    enabled = 1;
#endif
    return enabled;
}
3682
+
3683
// Compile-time probe: returns 1 when this file was built with __SSE3__ defined, otherwise 0.
int ggml_cpu_has_sse3(void) {
    int enabled = 0;
#ifdef __SSE3__
    enabled = 1;
#endif
    return enabled;
}
3690
+
3691
// Compile-time probe: returns 1 when this file was built with __SSSE3__ defined, otherwise 0.
int ggml_cpu_has_ssse3(void) {
    int enabled = 0;
#ifdef __SSSE3__
    enabled = 1;
#endif
    return enabled;
}
3698
+
3699
// Compile-time probe: returns 1 when this file was built with __POWER9_VECTOR__ defined, otherwise 0.
int ggml_cpu_has_vsx(void) {
    int enabled = 0;
#ifdef __POWER9_VECTOR__
    enabled = 1;
#endif
    return enabled;
}
3706
+
3707
// Compile-time probe: returns 1 when this file was built with either __VXE__ or __VXE2__
// defined, otherwise 0.
int ggml_cpu_has_vxe(void) {
    int enabled = 0;
#if defined(__VXE__) || defined(__VXE2__)
    enabled = 1;
#endif
    return enabled;
}
3714
+
3715
// Compile-time probe: returns 1 when this file was built with both __ARM_ARCH and __ARM_NEON
// defined, otherwise 0.
int ggml_cpu_has_neon(void) {
    int enabled = 0;
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
    enabled = 1;
#endif
    return enabled;
}
3722
+
3723
// Compile-time probe: returns 1 when this file was built with both __ARM_ARCH and
// __ARM_FEATURE_DOTPROD defined, otherwise 0.
int ggml_cpu_has_dotprod(void) {
    int enabled = 0;
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
    enabled = 1;
#endif
    return enabled;
}
3730
+
3731
// Compile-time probe: returns 1 when this file was built with both __ARM_ARCH and
// __ARM_FEATURE_SVE defined, otherwise 0.
int ggml_cpu_has_sve(void) {
    int enabled = 0;
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
    enabled = 1;
#endif
    return enabled;
}
3738
+
3739
// Compile-time probe: returns 1 when this file was built with both __ARM_ARCH and
// __ARM_FEATURE_MATMUL_INT8 defined, otherwise 0.
int ggml_cpu_has_matmul_int8(void) {
    int enabled = 0;
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
    enabled = 1;
#endif
    return enabled;
}
3746
+
3747
// Returns ggml_arm_arch_features.sve_cnt when built for ARM with SVE (__ARM_ARCH and
// __ARM_FEATURE_SVE both defined); returns 0 on every other build.
int ggml_cpu_get_sve_cnt(void) {
#if !defined(__ARM_ARCH) || !defined(__ARM_FEATURE_SVE)
    return 0;
#else
    return ggml_arm_arch_features.sve_cnt;
#endif
}
3754
+
3755
// Compile-time probe: returns 1 when this file was built with both __ARM_ARCH and
// __ARM_FEATURE_SME defined, otherwise 0.
int ggml_cpu_has_sme(void) {
    int enabled = 0;
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
    enabled = 1;
#endif
    return enabled;
}
3762
+
3763
+ void ggml_cpu_init(void) {
3764
+ // needed to initialize ggml_time
3765
+ {
3766
+ struct ggml_init_params params = { 0, NULL, false };
3767
+ struct ggml_context * ctx = ggml_init(params);
3768
+ ggml_free(ctx);
3769
+ }
3770
+
3771
+ ggml_critical_section_start();
3772
+
3773
+ static bool is_first_call = true;
3774
+
3775
+ if (is_first_call) {
3776
+ // initialize GELU, Quick GELU, SILU and EXP F32 tables
3777
+ {
3778
+ const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
3779
+
3780
+ for (int i = 0; i < (1 << 16); ++i) {
3781
+ union {
3782
+ uint16_t u16;
3783
+ ggml_fp16_t fp16;
3784
+ } u = {i};
3785
+ float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
3786
+ ggml_table_f32_f16[i] = f;
3787
+ ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
3788
+ ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
3789
+ }
3790
+
3791
+ // initialize E8M0 half table (256 entries)
3792
+ for (int i = 0; i < (1 << 8); ++i) {
3793
+ ggml_table_f32_e8m0_half[i] = GGML_E8M0_TO_FP32_HALF(i);
3794
+ }
3795
+
3796
+ const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
3797
+
3798
+ GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
3799
+
3800
+ #ifdef GGML_USE_OPENMP
3801
+ //if (!getenv("OMP_WAIT_POLICY")) {
3802
+ // // set the wait policy to active, so that OpenMP threads don't sleep
3803
+ // setenv("OMP_WAIT_POLICY", "active", 0)
3804
+ //}
3805
+
3806
+ if (!getenv("KMP_BLOCKTIME")) {
3807
+ // set the time to wait before sleeping a thread
3808
+ // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
3809
+ #ifdef _WIN32
3810
+ _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
3811
+ #else
3812
+ setenv("KMP_BLOCKTIME", "200", 0); // 200ms
3813
+ #endif
3814
+ }
3815
+ #endif
3816
+ }
3817
+
3818
+ #if defined(__ARM_ARCH)
3819
+ ggml_init_arm_arch_features();
3820
+ #endif
3821
+
3822
+ #if defined(__riscv)
3823
+ ggml_init_riscv_arch_features();
3824
+ #endif
3825
+
3826
+ {
3827
+ const char * env = getenv("GGML_CPU_DISABLE_FUSION");
3828
+ ggml_cpu_disable_fusion = (env != NULL && atoi(env) == 1);
3829
+ }
3830
+
3831
+ is_first_call = false;
3832
+ }
3833
+
3834
+ ggml_critical_section_end();
3835
+ }