toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,3774 @@
1
+ //
2
+ // MIT license
3
+ // Copyright (C) 2024 Intel Corporation
4
+ // SPDX-License-Identifier: MIT
5
+ //
6
+
7
+ //
8
+ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9
+ // See https://llvm.org/LICENSE.txt for license information.
10
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11
+ //
12
+
13
+ #ifndef GGML_SYCL_DPCT_HELPER_HPP
14
+ #define GGML_SYCL_DPCT_HELPER_HPP
15
+
16
+ #include <sycl/sycl.hpp>
17
+ #include <sycl/half_type.hpp>
18
+ #include <oneapi/mkl.hpp>
19
+
20
+ #include <map>
21
+
22
+ #include "ggml.h"
23
+
24
+ #if defined(__linux__)
25
+ #include <sys/mman.h>
26
+ #elif defined(_WIN64)
27
+ #ifndef NOMINMAX
28
+ #define NOMINMAX
29
+ #endif
30
+ #include <windows.h>
31
+ #else
32
+ #error "Only support Windows and Linux."
33
+ #endif
34
+
35
+ #if defined(__linux__)
36
+ #include <unistd.h>
37
+ #include <sys/syscall.h>
38
+ #endif
39
+ #if defined(_WIN64)
40
+ #ifndef NOMINMAX
41
+ #define NOMINMAX
42
+ #endif
43
+ #include <windows.h>
44
+ #endif
45
+
46
// Numeric compatibility-level constant (value 900).
#define DPCT_COMPATIBILITY_TEMP (900)

// Compiler-specific spellings for alignment and inlining control:
// MSVC uses __declspec / __forceinline, every other compiler the GNU
// attribute syntax.
#ifdef _MSC_VER
#  define __dpct_align__(n) __declspec(align(n))
#  define __dpct_inline__ __forceinline
#  define __dpct_noinline__ __declspec(noinline)
#else
#  define __dpct_align__(n) __attribute__((aligned(n)))
#  define __dpct_inline__ __inline__ __attribute__((always_inline))
#  define __dpct_noinline__ __attribute__((noinline))
#endif
61
+
62
+ inline std::string get_device_type_name(const sycl::device &Device) {
63
+ auto DeviceType = Device.get_info<sycl::info::device::device_type>();
64
+ switch (DeviceType) {
65
+ case sycl::info::device_type::cpu:
66
+ return "cpu";
67
+ case sycl::info::device_type::gpu:
68
+ return "gpu";
69
+ case sycl::info::device_type::host:
70
+ return "host";
71
+ case sycl::info::device_type::accelerator:
72
+ return "acc";
73
+ default:
74
+ return "unknown";
75
+ }
76
+ }
77
+
78
+ inline std::string get_device_backend_and_type(const sycl::device &device) {
79
+ std::stringstream device_type;
80
+ sycl::backend backend = device.get_backend();
81
+ device_type << backend << ":" << get_device_type_name(device);
82
+ return device_type.str();
83
+ }
84
+
85
+ template <typename Ts> struct matrix_info_t {
86
+ oneapi::mkl::transpose transpose_info[2];
87
+ Ts value_info[2];
88
+ std::int64_t size_info[3];
89
+ std::int64_t ld_info[3];
90
+ std::int64_t groupsize_info;
91
+ };
92
+
93
+ namespace dpct
94
+ {
95
+ typedef sycl::queue *queue_ptr;
96
+ typedef sycl::event *event_ptr;
97
+ typedef char *device_ptr;
98
+ typedef uint8_t byte_t;
99
+ typedef sycl::buffer<byte_t> buffer_t;
100
+
101
+ /// SYCL default exception handler
102
+ inline auto exception_handler = [](sycl::exception_list exceptions)
103
+ {
104
+ for (std::exception_ptr const &e : exceptions)
105
+ {
106
+ try
107
+ {
108
+ std::rethrow_exception(e);
109
+ }
110
+ catch (sycl::exception const &e)
111
+ {
112
+ std::cerr << "Caught asynchronous SYCL exception:" << std::endl
113
+ << e.what() << std::endl
114
+ << "Exception caught at file:" << __FILE__
115
+ << ", line:" << __LINE__ << std::endl;
116
+ }
117
+ }
118
+ };
119
+
120
/// Status codes used by the dpct helpers.
enum error_code
{
    success = 0,        // no error
    default_error = 999 // generic failure value
};
125
+
126
/// Direction of a memory copy; numeric values follow declaration order.
enum memcpy_direction
{
    host_to_host = 0,
    host_to_device = 1,
    device_to_host = 2,
    device_to_device = 3,
    automatic = 4
};
134
+
135
/// Kind of memory an allocation lives in.
enum memory_region
{
    global = 0,   // device global memory
    constant = 1, // device constant memory
    local = 2,    // device local memory
    shared = 3,   // memory which can be accessed by host and device
};
142
+
143
/// Element data types understood by the library wrappers. Enumerators are
/// numbered consecutively from 0; library_data_t_size is the trailing
/// sentinel and therefore equals the number of real entries.
enum class library_data_t : unsigned char
{
    // floating point
    real_float = 0, complex_float,
    real_double, complex_double,
    real_half, complex_half,
    real_bfloat16, complex_bfloat16,
    // 4-bit integers
    real_int4, complex_int4,
    real_uint4, complex_uint4,
    // 8-bit integers
    real_int8, complex_int8,
    real_uint8, complex_uint8,
    // 16-bit integers
    real_int16, complex_int16,
    real_uint16, complex_uint16,
    // 32-bit integers
    real_int32, complex_int32,
    real_uint32, complex_uint32,
    // 64-bit integers
    real_int64, complex_int64,
    real_uint64, complex_uint64,
    // packed 8-bit variants
    real_int8_4,
    real_int8_32,
    real_uint8_4,
    // sentinel: number of enumerators above
    library_data_t_size
};
178
+
179
+ template <typename T>
180
+ struct DataType
181
+ {
182
+ using T2 = T;
183
+ };
184
+ template <typename T>
185
+ struct DataType<sycl::vec<T, 2>>
186
+ {
187
+ using T2 = std::complex<T>;
188
+ };
189
+
190
+ static void destroy_event(event_ptr event)
191
+ {
192
+ delete event;
193
+ }
194
+
195
/// Returns the OS identifier of the calling thread: the gettid system call
/// on Linux, GetCurrentThreadId() on 64-bit Windows. Other platforms are
/// rejected at compile time.
static inline unsigned int get_tid()
{
#if defined(__linux__)
    // syscall() yields a long; narrow it to the declared return type.
    return static_cast<unsigned int>(syscall(SYS_gettid));
#elif defined(_WIN64)
    return static_cast<unsigned int>(GetCurrentThreadId());
#else
#error "Only support Windows and Linux."
#endif
}
205
+
206
+ namespace detail
207
+ {
208
+ static void get_version(const sycl::device &dev, int &major, int &minor)
209
+ {
210
+ // Version string has the following format:
211
+ // a. OpenCL<space><major.minor><space><vendor-specific-information>
212
+ // b. <major.minor>
213
+ // c. <AmdGcnArchName> e.g gfx1030
214
+ std::string ver;
215
+ ver = dev.get_info<sycl::info::device::version>();
216
+ std::string::size_type i = 0;
217
+ while (i < ver.size()) {
218
+ if (isdigit(ver[i]))
219
+ break;
220
+ i++;
221
+ }
222
+ major = std::stoi(&(ver[i]));
223
+ while (i < ver.size()) {
224
+ if (ver[i] == '.')
225
+ break;
226
+ i++;
227
+ }
228
+ if (i < ver.size()) {
229
+ // a. and b.
230
+ i++;
231
+ minor = std::stoi(&(ver[i]));
232
+ } else {
233
+ // c.
234
+ minor = 0;
235
+ }
236
+ }
237
+
238
/// Thin strongly-typed wrapper around an error value of type T.
/// \tparam tag Distinguishes otherwise identical wrappers from each other;
///             it is not used inside the class.
/// \tparam T   Underlying value type; the wrapper converts implicitly from
///             and to T.
template <typename tag, typename T>
class generic_error_type
{
public:
    /// Default-constructed wrappers hold a value-initialised T (0 for
    /// arithmetic types) instead of an indeterminate value.
    generic_error_type() = default;
    /// Implicit on purpose so a plain T can be used where the wrapper is
    /// expected.
    generic_error_type(T value) : value{value} {}
    /// Gives back the wrapped value.
    operator T() const { return value; }

private:
    T value{};
};
249
+
250
+ } // namespace detail
251
+
252
+ // COPY from DPCT head files
253
+ /// dim3 is used to store 3 component dimensions.
254
+ class dim3 {
255
+ public:
256
+ unsigned x, y, z;
257
+
258
+ constexpr dim3(unsigned x = 1, unsigned y = 1, unsigned z = 1)
259
+ : x(x), y(y), z(z) {}
260
+
261
+ dim3(const sycl::id<3> &r) : dim3(r[2], r[1], r[0]) {}
262
+
263
+ operator sycl::range<3>() const { return sycl::range<3>(z, y, x); }
264
+ }; // namespace dim3
265
+
266
+ inline dim3 operator*(const dim3 &a, const dim3 &b) {
267
+ return dim3{a.x * b.x, a.y * b.y, a.z * b.z};
268
+ }
269
+ // COPY from DPCT head files
270
+
271
+
272
/// Pitched 2D/3D memory data.
///
/// Plain value holder for a data pointer plus three size_t fields (pitch, x
/// and y). The class neither allocates nor frees the pointed-to memory.
/// NOTE(review): pitch is presumably the row stride in bytes and x/y the
/// extent -- confirm against the allocation helpers that fill this in.
class pitched_data
{
public:
    /// Creates an empty descriptor: null pointer, all sizes 0.
    pitched_data() = default;
    pitched_data(void *data, size_t pitch, size_t x, size_t y)
        : _data(data), _pitch(pitch), _x(x), _y(y) {}

    // Getters are const so that they can be used on const objects and
    // through const references; they only return copies of the fields.
    void *get_data_ptr() const { return _data; }
    void set_data_ptr(void *data) { _data = data; }

    size_t get_pitch() const { return _pitch; }
    void set_pitch(size_t pitch) { _pitch = pitch; }

    size_t get_x() const { return _x; }
    void set_x(size_t x) { _x = x; }

    size_t get_y() const { return _y; }
    void set_y(size_t y) { _y = y; }

private:
    void *_data = nullptr;
    size_t _pitch = 0;
    size_t _x = 0;
    size_t _y = 0;
};
296
+
297
+ class device_info
298
+ {
299
+ public:
300
+ // get interface
301
+ const char *get_name() const { return _name; }
302
+ char *get_name() { return _name; }
303
+ template <typename WorkItemSizesTy = sycl::range<3>,
304
+ std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
305
+ std::is_same_v<WorkItemSizesTy, int *>,
306
+ int> = 0>
307
+ auto get_max_work_item_sizes() const
308
+ {
309
+ if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
310
+ return sycl::range<3>(_max_work_item_sizes_i[0],
311
+ _max_work_item_sizes_i[1],
312
+ _max_work_item_sizes_i[2]);
313
+ else
314
+ {
315
+ return _max_work_item_sizes_i;
316
+ }
317
+ }
318
+ template <typename WorkItemSizesTy = sycl::range<3>,
319
+ std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
320
+ std::is_same_v<WorkItemSizesTy, int *>,
321
+ int> = 0>
322
+ auto get_max_work_item_sizes()
323
+ {
324
+ if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
325
+ return sycl::range<3>(_max_work_item_sizes_i[0],
326
+ _max_work_item_sizes_i[1],
327
+ _max_work_item_sizes_i[2]);
328
+ else
329
+ {
330
+ return _max_work_item_sizes_i;
331
+ }
332
+ }
333
+ bool get_host_unified_memory() const { return _host_unified_memory; }
334
+ int get_major_version() const { return _major; }
335
+ int get_minor_version() const { return _minor; }
336
+ int get_integrated() const { return _integrated; }
337
+ int get_max_clock_frequency() const { return _frequency; }
338
+ int get_max_compute_units() const { return _max_compute_units; }
339
+ int get_max_work_group_size() const { return _max_work_group_size; }
340
+ int get_max_sub_group_size() const { return _max_sub_group_size; }
341
+ int get_max_work_items_per_compute_unit() const
342
+ {
343
+ return _max_work_items_per_compute_unit;
344
+ }
345
+ int get_max_register_size_per_work_group() const
346
+ {
347
+ return _max_register_size_per_work_group;
348
+ }
349
+ template <typename NDRangeSizeTy = size_t *,
350
+ std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
351
+ std::is_same_v<NDRangeSizeTy, int *>,
352
+ int> = 0>
353
+ auto get_max_nd_range_size() const
354
+ {
355
+ if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
356
+ return _max_nd_range_size;
357
+ else
358
+ return _max_nd_range_size_i;
359
+ }
360
+ template <typename NDRangeSizeTy = size_t *,
361
+ std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
362
+ std::is_same_v<NDRangeSizeTy, int *>,
363
+ int> = 0>
364
+ auto get_max_nd_range_size()
365
+ {
366
+ if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
367
+ return _max_nd_range_size;
368
+ else
369
+ return _max_nd_range_size_i;
370
+ }
371
+ size_t get_global_mem_size() const { return _global_mem_size; }
372
+ size_t get_local_mem_size() const { return _local_mem_size; }
373
+ size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; }
374
+ /// Returns the maximum clock rate of device's global memory in kHz. If
375
+ /// compiler does not support this API then returns default value 3200000 kHz.
376
+ unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
377
+ /// Returns the maximum bus width between device and memory in bits. If
378
+ /// compiler does not support this API then returns default value 64 bits.
379
+ unsigned int get_memory_bus_width() const { return _memory_bus_width; }
380
+ uint32_t get_device_id() const { return _device_id; }
381
+ std::array<unsigned char, 16> get_uuid() const { return _uuid; }
382
+ /// Returns global memory cache size in bytes.
383
+ unsigned int get_global_mem_cache_size() const
384
+ {
385
+ return _global_mem_cache_size;
386
+ }
387
+
388
+ // set interface
389
+ void set_name(const char *name)
390
+ {
391
+ size_t length = strlen(name);
392
+ if (length < 256)
393
+ {
394
+ std::memcpy(_name, name, length + 1);
395
+ }
396
+ else
397
+ {
398
+ std::memcpy(_name, name, 255);
399
+ _name[255] = '\0';
400
+ }
401
+ }
402
+ void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes)
403
+ {
404
+ for (int i = 0; i < 3; ++i)
405
+ _max_work_item_sizes_i[i] = max_work_item_sizes[i];
406
+ }
407
+ [[deprecated]] void
408
+ set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
409
+ {
410
+ for (int i = 0; i < 3; ++i)
411
+ {
412
+ _max_work_item_sizes_i[i] = max_work_item_sizes[i];
413
+ }
414
+ }
415
+ void set_host_unified_memory(bool host_unified_memory)
416
+ {
417
+ _host_unified_memory = host_unified_memory;
418
+ }
419
+ void set_major_version(int major) { _major = major; }
420
+ void set_minor_version(int minor) { _minor = minor; }
421
+ void set_integrated(int integrated) { _integrated = integrated; }
422
+ void set_max_clock_frequency(int frequency) { _frequency = frequency; }
423
+ void set_max_compute_units(int max_compute_units)
424
+ {
425
+ _max_compute_units = max_compute_units;
426
+ }
427
+ void set_global_mem_size(size_t global_mem_size)
428
+ {
429
+ _global_mem_size = global_mem_size;
430
+ }
431
+ void set_local_mem_size(size_t local_mem_size)
432
+ {
433
+ _local_mem_size = local_mem_size;
434
+ }
435
+ void set_max_mem_alloc_size(size_t max_mem_alloc_size)
436
+ {
437
+ _max_mem_alloc_size = max_mem_alloc_size;
438
+ }
439
+ void set_max_work_group_size(int max_work_group_size)
440
+ {
441
+ _max_work_group_size = max_work_group_size;
442
+ }
443
+ void set_max_sub_group_size(int max_sub_group_size)
444
+ {
445
+ _max_sub_group_size = max_sub_group_size;
446
+ }
447
+ void
448
+ set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
449
+ {
450
+ _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
451
+ }
452
+ void set_max_nd_range_size(int max_nd_range_size[])
453
+ {
454
+ for (int i = 0; i < 3; i++)
455
+ {
456
+ _max_nd_range_size[i] = max_nd_range_size[i];
457
+ _max_nd_range_size_i[i] = max_nd_range_size[i];
458
+ }
459
+ }
460
+ void set_memory_clock_rate(unsigned int memory_clock_rate)
461
+ {
462
+ _memory_clock_rate = memory_clock_rate;
463
+ }
464
+ void set_memory_bus_width(unsigned int memory_bus_width)
465
+ {
466
+ _memory_bus_width = memory_bus_width;
467
+ }
468
+ void
469
+ set_max_register_size_per_work_group(int max_register_size_per_work_group)
470
+ {
471
+ _max_register_size_per_work_group = max_register_size_per_work_group;
472
+ }
473
+ void set_device_id(uint32_t device_id)
474
+ {
475
+ _device_id = device_id;
476
+ }
477
+ void set_uuid(std::array<unsigned char, 16> uuid)
478
+ {
479
+ _uuid = std::move(uuid);
480
+ }
481
+ void set_global_mem_cache_size(unsigned int global_mem_cache_size)
482
+ {
483
+ _global_mem_cache_size = global_mem_cache_size;
484
+ }
485
+
486
+ private:
487
+ char _name[256];
488
+ int _max_work_item_sizes_i[3];
489
+ bool _host_unified_memory = false;
490
+ int _major;
491
+ int _minor;
492
+ int _integrated = 0;
493
+ int _frequency;
494
+ // Set estimated value 3200000 kHz as default value.
495
+ unsigned int _memory_clock_rate = 3200000;
496
+ // Set estimated value 64 bits as default value.
497
+ unsigned int _memory_bus_width = 64;
498
+ unsigned int _global_mem_cache_size;
499
+ int _max_compute_units;
500
+ int _max_work_group_size;
501
+ int _max_sub_group_size;
502
+ int _max_work_items_per_compute_unit;
503
+ int _max_register_size_per_work_group;
504
+ size_t _global_mem_size;
505
+ size_t _local_mem_size;
506
+ size_t _max_mem_alloc_size;
507
+ size_t _max_nd_range_size[3];
508
+ int _max_nd_range_size_i[3];
509
+ uint32_t _device_id;
510
+ std::array<unsigned char, 16> _uuid;
511
+ };
512
+
513
+ static int get_major_version(const sycl::device &dev)
514
+ {
515
+ int major, minor;
516
+ detail::get_version(dev, major, minor);
517
+ return major;
518
+ }
519
+
520
+ static int get_minor_version(const sycl::device &dev)
521
+ {
522
+ int major, minor;
523
+ detail::get_version(dev, major, minor);
524
+ return minor;
525
+ }
526
+
527
+ static void get_device_info(device_info &out, const sycl::device &dev)
528
+ {
529
+ device_info prop;
530
+ prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
531
+
532
+ int major, minor;
533
+ detail::get_version(dev, major, minor);
534
+ prop.set_major_version(major);
535
+ prop.set_minor_version(minor);
536
+
537
+ prop.set_max_work_item_sizes(
538
+ #if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
539
+ // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
540
+ // is an enum class element
541
+ dev.get_info<sycl::info::device::max_work_item_sizes>());
542
+ #else
543
+ // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
544
+ // an int
545
+ dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
546
+ #endif
547
+ prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
548
+
549
+ prop.set_max_clock_frequency(
550
+ dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
551
+
552
+ prop.set_max_compute_units(
553
+ dev.get_info<sycl::info::device::max_compute_units>());
554
+ prop.set_max_work_group_size(
555
+ dev.get_info<sycl::info::device::max_work_group_size>());
556
+ prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
557
+ prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
558
+ prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>());
559
+
560
+ #if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
561
+ if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
562
+ {
563
+ unsigned int tmp =
564
+ dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
565
+ if (tmp != 0)
566
+ prop.set_memory_clock_rate(1000 * tmp);
567
+ }
568
+ if (dev.has(sycl::aspect::ext_intel_memory_bus_width))
569
+ {
570
+ prop.set_memory_bus_width(
571
+ dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
572
+ }
573
+ if (dev.has(sycl::aspect::ext_intel_device_id))
574
+ {
575
+ prop.set_device_id(
576
+ dev.get_info<sycl::ext::intel::info::device::device_id>());
577
+ }
578
+ if (dev.has(sycl::aspect::ext_intel_device_info_uuid))
579
+ {
580
+ prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
581
+ }
582
+ #elif defined(_MSC_VER) && !defined(__clang__)
583
+ #pragma message("get_device_info: querying memory_clock_rate and \
584
+ memory_bus_width are not supported by the compiler used. \
585
+ Use 3200000 kHz as memory_clock_rate default value. \
586
+ Use 64 bits as memory_bus_width default value.")
587
+ #else
588
+ #warning "get_device_info: querying memory_clock_rate and \
589
+ memory_bus_width are not supported by the compiler used. \
590
+ Use 3200000 kHz as memory_clock_rate default value. \
591
+ Use 64 bits as memory_bus_width default value."
592
+ #endif
593
+
594
+ size_t max_sub_group_size = 1;
595
+ std::vector<size_t> sub_group_sizes =
596
+ dev.get_info<sycl::info::device::sub_group_sizes>();
597
+
598
+ for (const auto &sub_group_size : sub_group_sizes)
599
+ {
600
+ if (max_sub_group_size < sub_group_size)
601
+ max_sub_group_size = sub_group_size;
602
+ }
603
+
604
+ prop.set_max_sub_group_size(max_sub_group_size);
605
+
606
+ prop.set_max_work_items_per_compute_unit(
607
+ dev.get_info<sycl::info::device::max_work_group_size>());
608
+ int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
609
+ prop.set_max_nd_range_size(max_nd_range_size);
610
+
611
+ // Estimates max register size per work group, feel free to update the value
612
+ // according to device properties.
613
+ prop.set_max_register_size_per_work_group(65536);
614
+
615
+ prop.set_global_mem_cache_size(
616
+ dev.get_info<sycl::info::device::global_mem_cache_size>());
617
+ out = prop;
618
+ }
619
+
620
+ /// dpct device extension
621
+ class device_ext : public sycl::device {
622
+ typedef std::mutex mutex_type;
623
+
624
+ public:
625
+ device_ext() : sycl::device() {}
626
+ ~device_ext() {
627
+ std::lock_guard<mutex_type> lock(m_mutex);
628
+ clear_queues();
629
+ }
630
+ device_ext(const sycl::device &base) : sycl::device(base) {
631
+ std::lock_guard<mutex_type> lock(m_mutex);
632
+ init_queues();
633
+ }
634
+
635
+ int is_native_atomic_supported() { return 0; }
636
+ int get_major_version() const { return dpct::get_major_version(*this); }
637
+
638
+ int get_minor_version() const { return dpct::get_minor_version(*this); }
639
+
640
+ int get_max_compute_units() const {
641
+ return get_device_info().get_max_compute_units();
642
+ }
643
+
644
+ /// Return the maximum clock frequency of this device in KHz.
645
+ int get_max_clock_frequency() const {
646
+ return get_device_info().get_max_clock_frequency();
647
+ }
648
+
649
+ int get_integrated() const { return get_device_info().get_integrated(); }
650
+
651
+ int get_max_sub_group_size() const {
652
+ return get_device_info().get_max_sub_group_size();
653
+ }
654
+
655
+ int get_max_register_size_per_work_group() const {
656
+ return get_device_info().get_max_register_size_per_work_group();
657
+ }
658
+
659
+ int get_max_work_group_size() const {
660
+ return get_device_info().get_max_work_group_size();
661
+ }
662
+
663
+ int get_mem_base_addr_align() const {
664
+ return get_info<sycl::info::device::mem_base_addr_align>();
665
+ }
666
+
667
+ size_t get_global_mem_size() const {
668
+ return get_device_info().get_global_mem_size();
669
+ }
670
+
671
+ size_t get_max_mem_alloc_size() const {
672
+ return get_device_info().get_max_mem_alloc_size();
673
+ }
674
+
675
+ /// Get the number of bytes of free and total memory on the SYCL device.
676
+ /// \param [out] free_memory The number of bytes of free memory on the
677
+ /// SYCL device. \param [out] total_memory The number of bytes of total
678
+ /// memory on the SYCL device.
679
+ void get_memory_info(size_t &free_memory, size_t &total_memory) {
680
+ total_memory = get_device_info().get_global_mem_size();
681
+ const char *warning_info =
682
+ "get_memory_info: [warning] ext_intel_free_memory is not "
683
+ "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
684
+ "use total memory as free memory";
685
+ #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
686
+ if (!has(sycl::aspect::ext_intel_free_memory)) {
687
+ std::cerr << warning_info << std::endl;
688
+ free_memory = total_memory;
689
+ } else {
690
+ free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
691
+ }
692
+ #else
693
+ std::cerr << warning_info << std::endl;
694
+ free_memory = total_memory;
695
+ #if defined(_MSC_VER) && !defined(__clang__)
696
+ #pragma message("Querying the number of bytes of free memory is not supported")
697
+ #else
698
+ #warning "Querying the number of bytes of free memory is not supported"
699
+ #endif
700
+ #endif
701
+ }
702
+
703
+ void get_device_info(device_info &out) const {
704
+ dpct::get_device_info(out, *this);
705
+ }
706
+
707
+ device_info get_device_info() const {
708
+ device_info prop;
709
+ dpct::get_device_info(prop, *this);
710
+ return prop;
711
+ }
712
+
713
+ void reset() {
714
+ std::lock_guard<mutex_type> lock(m_mutex);
715
+ clear_queues();
716
+ init_queues();
717
+ }
718
+
719
+ sycl::queue &in_order_queue() { return _q_in_order; }
720
+
721
+ sycl::queue &out_of_order_queue() { return _q_out_of_order; }
722
+
723
+ sycl::queue &default_queue() { return in_order_queue(); }
724
+
725
+ void queues_wait_and_throw() {
726
+ std::unique_lock<mutex_type> lock(m_mutex);
727
+ lock.unlock();
728
+ for (auto &q : _queues) {
729
+ q.wait_and_throw();
730
+ }
731
+ // Guard the destruct of current_queues to make sure the ref count is
732
+ // safe.
733
+ lock.lock();
734
+ }
735
+
736
+ sycl::queue create_queue(bool enable_exception_handler = false) {
737
+ return create_in_order_queue(enable_exception_handler);
738
+ }
739
+
740
+ sycl::queue create_queue(sycl::device device,
741
+ bool enable_exception_handler = false) {
742
+ return create_in_order_queue(device, enable_exception_handler);
743
+ }
744
+
745
+ sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
746
+ std::lock_guard<mutex_type> lock(m_mutex);
747
+ return create_queue_impl(enable_exception_handler,
748
+ sycl::property::queue::in_order());
749
+ }
750
+
751
+ sycl::queue create_in_order_queue(sycl::device device,
752
+ bool enable_exception_handler = false) {
753
+ std::lock_guard<mutex_type> lock(m_mutex);
754
+ return create_queue_impl(device, enable_exception_handler,
755
+ sycl::property::queue::in_order());
756
+ }
757
+
758
+ sycl::queue create_out_of_order_queue(
759
+ bool enable_exception_handler = false) {
760
+ std::lock_guard<mutex_type> lock(m_mutex);
761
+ return create_queue_impl(enable_exception_handler);
762
+ }
763
+
764
+ void destroy_queue(sycl::queue queue) {
765
+ std::lock_guard<mutex_type> lock(m_mutex);
766
+ _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
767
+ [=](const sycl::queue &q) -> bool
768
+ {
769
+ return q == queue;
770
+ }),
771
+ _queues.end());
772
+ }
773
+ void set_saved_queue(sycl::queue q) {
774
+ std::lock_guard<mutex_type> lock(m_mutex);
775
+ _saved_queue = q;
776
+ }
777
+ sycl::queue get_saved_queue() const {
778
+ std::lock_guard<mutex_type> lock(m_mutex);
779
+ return _saved_queue;
780
+ }
781
+
782
+ private:
783
+ void clear_queues() { _queues.clear(); }
784
+
785
+ void init_queues() {
786
+ _q_in_order =
787
+ create_queue_impl(true, sycl::property::queue::in_order());
788
+ _q_out_of_order = create_queue_impl(true);
789
+ _saved_queue = default_queue();
790
+ }
791
+
792
+ /// Caller should acquire resource \p m_mutex before calling this
793
+ /// function.
794
+ template <class... Properties>
795
+ sycl::queue create_queue_impl(bool enable_exception_handler,
796
+ Properties... properties) {
797
+ sycl::async_handler eh = {};
798
+ if (enable_exception_handler) {
799
+ eh = exception_handler;
800
+ }
801
+ _queues.push_back(sycl::queue(
802
+ *this, eh,
803
+ sycl::property_list(
804
+ #ifdef DPCT_PROFILING_ENABLED
805
+ sycl::property::queue::enable_profiling(),
806
+ #endif
807
+ properties...)));
808
+
809
+ return _queues.back();
810
+ }
811
+
812
+ template <class... Properties>
813
+ sycl::queue create_queue_impl(sycl::device device,
814
+ bool enable_exception_handler,
815
+ Properties... properties) {
816
+ sycl::async_handler eh = {};
817
+ if (enable_exception_handler) {
818
+ eh = exception_handler;
819
+ }
820
+ _queues.push_back(sycl::queue(
821
+ device, eh,
822
+ sycl::property_list(
823
+ #ifdef DPCT_PROFILING_ENABLED
824
+ sycl::property::queue::enable_profiling(),
825
+ #endif
826
+ properties...)));
827
+
828
+ return _queues.back();
829
+ }
830
+
831
+ void get_version(int &major, int &minor) const {
832
+ detail::get_version(*this, major, minor);
833
+ }
834
+ sycl::queue _q_in_order, _q_out_of_order;
835
+ sycl::queue _saved_queue;
836
+ std::vector<sycl::queue> _queues;
837
+ mutable mutex_type m_mutex;
838
+ };
839
+
840
+
841
+ /// device manager
842
+ class dev_mgr
843
+ {
844
+ public:
845
+ device_ext &current_device()
846
+ {
847
+ unsigned int dev_id = current_device_id();
848
+ check_id(dev_id);
849
+ return *_devs[dev_id];
850
+ }
851
+ device_ext &cpu_device() const
852
+ {
853
+ std::lock_guard<std::recursive_mutex> lock(m_mutex);
854
+ if (_cpu_device == -1)
855
+ {
856
+ throw std::runtime_error("no valid cpu device");
857
+ }
858
+ else
859
+ {
860
+ return *_devs[_cpu_device];
861
+ }
862
+ }
863
+ device_ext &get_device(unsigned int id) const
864
+ {
865
+ std::lock_guard<std::recursive_mutex> lock(m_mutex);
866
+ check_id(id);
867
+ return *_devs[id];
868
+ }
869
+ unsigned int current_device_id() const
870
+ {
871
+ std::lock_guard<std::recursive_mutex> lock(m_mutex);
872
+ auto it = _thread2dev_map.find(get_tid());
873
+ if (it != _thread2dev_map.end())
874
+ return it->second;
875
+ return DEFAULT_DEVICE_ID;
876
+ }
877
+
878
+ /// Select device with a device ID.
879
+ /// \param [in] id The id of the device which can
880
+ /// be obtained through get_device_id(const sycl::device).
881
+ void select_device(unsigned int id)
882
+ {
883
+ std::lock_guard<std::recursive_mutex> lock(m_mutex);
884
+ check_id(id);
885
+ _thread2dev_map[get_tid()] = id;
886
+ }
887
+ unsigned int device_count() { return _devs.size(); }
888
+
889
+ unsigned int get_device_id(const sycl::device &dev)
890
+ {
891
+ unsigned int id = 0;
892
+ for (auto &dev_item : _devs)
893
+ {
894
+ if (*dev_item == dev)
895
+ {
896
+ return id;
897
+ }
898
+ id++;
899
+ }
900
+ return -1;
901
+ }
902
+
903
+ inline std::string get_preferred_gpu_platform_name() {
904
+ std::string result;
905
+
906
+ std::string filter = "";
907
+ char* env = getenv("ONEAPI_DEVICE_SELECTOR");
908
+ if (env) {
909
+ if (std::strstr(env, "level_zero")) {
910
+ filter = "level-zero";
911
+ }
912
+ else if (std::strstr(env, "opencl")) {
913
+ filter = "opencl";
914
+ }
915
+ else if (std::strstr(env, "cuda")) {
916
+ filter = "cuda";
917
+ }
918
+ else if (std::strstr(env, "hip")) {
919
+ filter = "hip";
920
+ }
921
+ else {
922
+ throw std::runtime_error("invalid device filter: " + std::string(env));
923
+ }
924
+ } else {
925
+ auto default_device = sycl::device(sycl::default_selector_v);
926
+ auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
927
+
928
+ if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
929
+ filter = "level-zero";
930
+ }
931
+ else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
932
+ filter = "cuda";
933
+ }
934
+ else if (std::strstr(default_platform_name.c_str(), "HIP")) {
935
+ filter = "hip";
936
+ }
937
+ }
938
+
939
+ auto platform_list = sycl::platform::get_platforms();
940
+
941
+ for (const auto& platform : platform_list) {
942
+ auto devices = platform.get_devices();
943
+ auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
944
+ return d.is_gpu();
945
+ });
946
+
947
+ if (gpu_dev == devices.end()) {
948
+ // cout << "platform [" << platform_name
949
+ // << "] does not contain GPU devices, skipping\n";
950
+ continue;
951
+ }
952
+
953
+ auto platform_name = platform.get_info<sycl::info::platform::name>();
954
+ std::string platform_name_low_case;
955
+ platform_name_low_case.resize(platform_name.size());
956
+
957
+ std::transform(
958
+ platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
959
+
960
+ if (platform_name_low_case.find(filter) == std::string::npos) {
961
+ // cout << "platform [" << platform_name
962
+ // << "] does not match with requested "
963
+ // << filter << ", skipping\n";
964
+ continue;
965
+ }
966
+
967
+ result = platform_name;
968
+ }
969
+
970
+ if (result.empty())
971
+ throw std::runtime_error("can not find preferred GPU platform");
972
+
973
+ return result;
974
+ }
975
+
976
+ template <class DeviceSelector>
977
+ std::enable_if_t<
978
+ std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
979
+ select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
980
+ {
981
+ sycl::device selected_device = sycl::device(selector);
982
+ unsigned int selected_device_id = get_device_id(selected_device);
983
+ select_device(selected_device_id);
984
+ }
985
+
986
+ /// Returns the instance of device manager singleton.
987
+ static dev_mgr &instance()
988
+ {
989
+ static dev_mgr d_m;
990
+ return d_m;
991
+ }
992
+ dev_mgr(const dev_mgr &) = delete;
993
+ dev_mgr &operator=(const dev_mgr &) = delete;
994
+ dev_mgr(dev_mgr &&) = delete;
995
+ dev_mgr &operator=(dev_mgr &&) = delete;
996
+
997
+ private:
998
+ mutable std::recursive_mutex m_mutex;
999
+ static bool compare_dev(sycl::device &device1, sycl::device &device2)
1000
+ {
1001
+ sycl::backend backend1 = device1.get_backend();
1002
+ sycl::backend backend2 = device2.get_backend();
1003
+ // levelzero backends always come first
1004
+ if(backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true;
1005
+ if(backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false;
1006
+ dpct::device_info prop1;
1007
+ dpct::get_device_info(prop1, device1);
1008
+ dpct::device_info prop2;
1009
+ dpct::get_device_info(prop2, device2);
1010
+ return prop1.get_max_compute_units() > prop2.get_max_compute_units();
1011
+ }
1012
+ static int convert_backend_index(std::string & backend) {
1013
+ if (backend == "ext_oneapi_level_zero:gpu") return 0;
1014
+ if (backend == "opencl:gpu") return 1;
1015
+ if (backend == "ext_oneapi_cuda:gpu") return 2;
1016
+ if (backend == "ext_oneapi_hip:gpu") return 3;
1017
+ if (backend == "opencl:cpu") return 4;
1018
+ if (backend == "opencl:acc") return 5;
1019
+ printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
1020
+ GGML_ABORT("fatal error");
1021
+ }
1022
+ static bool compare_backend(std::string &backend1, std::string &backend2) {
1023
+ return convert_backend_index(backend1) < convert_backend_index(backend2);
1024
+ }
1025
+ dev_mgr()
1026
+ {
1027
+ sycl::device default_device =
1028
+ sycl::device(sycl::default_selector_v);
1029
+ _devs.push_back(std::make_shared<device_ext>(default_device));
1030
+
1031
+ std::vector<sycl::device> sycl_all_devs;
1032
+ // Collect other devices except for the default device.
1033
+ if (default_device.is_cpu())
1034
+ _cpu_device = 0;
1035
+
1036
+ auto Platforms = sycl::platform::get_platforms();
1037
+ // Keep track of the number of devices per backend
1038
+ std::map<sycl::backend, size_t> DeviceNums;
1039
+ std::map<std::string, std::vector<sycl::device>> backend_devices;
1040
+ auto preferred_platform_name = get_preferred_gpu_platform_name();
1041
+
1042
+ while (!Platforms.empty()) {
1043
+ auto Platform = Platforms.back();
1044
+ Platforms.pop_back();
1045
+ auto platform_name = Platform.get_info<sycl::info::platform::name>();
1046
+ if (platform_name.compare(preferred_platform_name) != 0) {
1047
+ continue;
1048
+ }
1049
+ auto devices = Platform.get_devices();
1050
+ std::string backend_type = get_device_backend_and_type(devices[0]);
1051
+ for (const auto &device : devices) {
1052
+ backend_devices[backend_type].push_back(device);
1053
+ }
1054
+ }
1055
+
1056
+ std::vector<std::string> keys;
1057
+ for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
1058
+ keys.push_back(it->first);
1059
+ }
1060
+ std::sort(keys.begin(), keys.end(), compare_backend);
1061
+
1062
+ for (auto &key : keys) {
1063
+ std::vector<sycl::device> devs = backend_devices[key];
1064
+ std::sort(devs.begin(), devs.end(), compare_dev);
1065
+ for (const auto &dev : devs) {
1066
+ sycl_all_devs.push_back(dev);
1067
+ }
1068
+ }
1069
+
1070
+ for (auto &dev : sycl_all_devs)
1071
+ {
1072
+ if (dev == default_device)
1073
+ {
1074
+ continue;
1075
+ }
1076
+ _devs.push_back(std::make_shared<device_ext>(dev));
1077
+ if (_cpu_device == -1 && dev.is_cpu())
1078
+ {
1079
+ _cpu_device = _devs.size() - 1;
1080
+ }
1081
+ }
1082
+ }
1083
+ void check_id(unsigned int id) const
1084
+ {
1085
+ if (id >= _devs.size())
1086
+ {
1087
+ throw std::runtime_error("invalid device id");
1088
+ }
1089
+ }
1090
+ std::vector<std::shared_ptr<device_ext>> _devs;
1091
+ /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
1092
+ /// thread id in _thread2dev_map, which means default device should be used
1093
+ /// for the current thread.
1094
+ const unsigned int DEFAULT_DEVICE_ID = 0;
1095
+ /// thread-id to device-id map.
1096
+ std::map<unsigned int, unsigned int> _thread2dev_map;
1097
+ int _cpu_device = -1;
1098
+ };
1099
+
1100
+ static inline sycl::queue &get_default_queue()
1101
+ {
1102
+ return dev_mgr::instance().current_device().default_queue();
1103
+ }
1104
+
1105
+ namespace detail
1106
+ {
1107
/// Accessibility class of a pointer. The numeric values are used as array
/// indices, and `end` is the number of real attributes.
enum class pointer_access_attribute
{
    host_only = 0,
    device_only = 1,
    host_device = 2,
    end = 3
};
1114
+
1115
+ static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
1116
+ const void *ptr)
1117
+ {
1118
+ switch (sycl::get_pointer_type(ptr, q.get_context()))
1119
+ {
1120
+ case sycl::usm::alloc::unknown:
1121
+ return pointer_access_attribute::host_only;
1122
+ case sycl::usm::alloc::device:
1123
+ return pointer_access_attribute::device_only;
1124
+ case sycl::usm::alloc::shared:
1125
+ case sycl::usm::alloc::host:
1126
+ return pointer_access_attribute::host_device;
1127
+ }
1128
+ }
1129
+
1130
+ template <typename ArgT>
1131
+ inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
1132
+ {
1133
+ static_assert((unsigned char)library_data_t::library_data_t_size <=
1134
+ std::numeric_limits<unsigned char>::max() &&
1135
+ "library_data_t size exceeds limit.");
1136
+ static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
1137
+ return (std::uint64_t)Val;
1138
+ }
1139
+
1140
+ template <typename FirstT, typename... RestT>
1141
+ inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
1142
+ RestT... RestVal)
1143
+ {
1144
+ static_assert((std::uint8_t)library_data_t::library_data_t_size <=
1145
+ std::numeric_limits<unsigned char>::max() &&
1146
+ "library_data_t size exceeds limit.");
1147
+ static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
1148
+ static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
1149
+ return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
1150
+ }
1151
+
1152
+ class mem_mgr
1153
+ {
1154
+ mem_mgr()
1155
+ {
1156
+ // Reserved address space, no real memory allocation happens here.
1157
+ #if defined(__linux__)
1158
+ mapped_address_space =
1159
+ (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
1160
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1161
+ #elif defined(_WIN64)
1162
+ mapped_address_space = (byte_t *)VirtualAlloc(
1163
+ NULL, // NULL specified as the base address parameter
1164
+ mapped_region_size, // Size of allocation
1165
+ MEM_RESERVE, // Allocate reserved pages
1166
+ PAGE_NOACCESS); // Protection = no access
1167
+ #else
1168
+ #error "Only support Windows and Linux."
1169
+ #endif
1170
+ next_free = mapped_address_space;
1171
+ }
1172
+
1173
+ public:
1174
+ using buffer_id_t = int;
1175
+
1176
+ struct allocation
1177
+ {
1178
+ buffer_t buffer;
1179
+ byte_t *alloc_ptr;
1180
+ size_t size;
1181
+ };
1182
+
1183
+ ~mem_mgr()
1184
+ {
1185
+ #if defined(__linux__)
1186
+ munmap(mapped_address_space, mapped_region_size);
1187
+ #elif defined(_WIN64)
1188
+ VirtualFree(mapped_address_space, 0, MEM_RELEASE);
1189
+ #else
1190
+ #error "Only support Windows and Linux."
1191
+ #endif
1192
+ }
1193
+
1194
+ mem_mgr(const mem_mgr &) = delete;
1195
+ mem_mgr &operator=(const mem_mgr &) = delete;
1196
+ mem_mgr(mem_mgr &&) = delete;
1197
+ mem_mgr &operator=(mem_mgr &&) = delete;
1198
+
1199
+ /// Allocate
1200
+ void *mem_alloc(size_t size)
1201
+ {
1202
+ if (!size)
1203
+ return nullptr;
1204
+ std::lock_guard<std::mutex> lock(m_mutex);
1205
+ if (next_free + size > mapped_address_space + mapped_region_size)
1206
+ {
1207
+ throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
1208
+ }
1209
+ // Allocation
1210
+ sycl::range<1> r(size);
1211
+ buffer_t buf(r);
1212
+ allocation A{buf, next_free, size};
1213
+ // Map allocation to device pointer
1214
+ void *result = next_free;
1215
+ m_map.emplace(next_free + size, A);
1216
+ // Update pointer to the next free space.
1217
+ next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
1218
+
1219
+ return result;
1220
+ }
1221
+
1222
+ /// Deallocate
1223
+ void mem_free(const void *ptr)
1224
+ {
1225
+ if (!ptr)
1226
+ return;
1227
+ std::lock_guard<std::mutex> lock(m_mutex);
1228
+ auto it = get_map_iterator(ptr);
1229
+ m_map.erase(it);
1230
+ }
1231
+
1232
+ /// map: device pointer -> allocation(buffer, alloc_ptr, size)
1233
+ allocation translate_ptr(const void *ptr)
1234
+ {
1235
+ std::lock_guard<std::mutex> lock(m_mutex);
1236
+ auto it = get_map_iterator(ptr);
1237
+ return it->second;
1238
+ }
1239
+
1240
+ /// Check if the pointer represents device pointer or not.
1241
+ bool is_device_ptr(const void *ptr) const
1242
+ {
1243
+ std::lock_guard<std::mutex> lock(m_mutex);
1244
+ return (mapped_address_space <= ptr) &&
1245
+ (ptr < mapped_address_space + mapped_region_size);
1246
+ }
1247
+
1248
+ /// Returns the instance of memory manager singleton.
1249
+ static mem_mgr &instance()
1250
+ {
1251
+ static mem_mgr m;
1252
+ return m;
1253
+ }
1254
+
1255
+ private:
1256
+ std::map<byte_t *, allocation> m_map;
1257
+ mutable std::mutex m_mutex;
1258
+ byte_t *mapped_address_space;
1259
+ byte_t *next_free;
1260
+ const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
1261
+ const size_t alignment = 256;
1262
+ /// This padding may be defined to some positive value to debug
1263
+ /// out of bound accesses.
1264
+ const size_t extra_padding = 0;
1265
+
1266
+ std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
1267
+ {
1268
+ auto it = m_map.upper_bound(const_cast<byte_t *>(reinterpret_cast<const byte_t *>(ptr)));
1269
+ if (it == m_map.end())
1270
+ {
1271
+ // Not a virtual pointer.
1272
+ throw std::runtime_error("can not get buffer from non-virtual pointer");
1273
+ }
1274
+ const allocation &alloc = it->second;
1275
+ if (ptr < alloc.alloc_ptr)
1276
+ {
1277
+ // Out of bound.
1278
+ // This may happen if there's a gap between allocations due to alignment
1279
+ // or extra padding and pointer points to this gap.
1280
+ throw std::runtime_error("invalid virtual pointer");
1281
+ }
1282
+ return it;
1283
+ }
1284
+ };
1285
+
1286
+ template <class T, memory_region Memory, size_t Dimension>
1287
+ class accessor;
1288
+ template <memory_region Memory, class T = byte_t>
1289
+ class memory_traits
1290
+ {
1291
+ public:
1292
+ static constexpr sycl::access::target target =
1293
+ sycl::access::target::device;
1294
+ static constexpr sycl::access_mode mode =
1295
+ (Memory == constant) ? sycl::access_mode::read
1296
+ : sycl::access_mode::read_write;
1297
+ static constexpr size_t type_size = sizeof(T);
1298
+ using element_t =
1299
+ typename std::conditional<Memory == constant, const T, T>::type;
1300
+ using value_t = typename std::remove_cv<T>::type;
1301
+ template <size_t Dimension = 1>
1302
+ using accessor_t = typename std::conditional<
1303
+ Memory == local, sycl::local_accessor<value_t, Dimension>,
1304
+ sycl::accessor<T, Dimension, mode, target>>::type;
1305
+ using pointer_t = T *;
1306
+ };
1307
+
1308
+ static inline void *dpct_malloc(size_t size, sycl::queue &q)
1309
+ {
1310
+ return sycl::malloc_device(size, q.get_device(), q.get_context());
1311
+ }
1312
+
1313
+ #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
1314
+ static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
1315
+ sycl::queue &q)
1316
+ {
1317
+ pitch = PITCH_DEFAULT_ALIGN(x);
1318
+ return dpct_malloc(pitch * y * z, q);
1319
+ }
1320
+
1321
+ /**
1322
+ * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
1323
+ * @tparam valueT The type of the element to be set.
1324
+ * @param [in] q The queue in which the operation is done.
1325
+ * @param [in] dev_ptr Pointer to the virtual device memory address.
1326
+ * @param [in] value The value to be set.
1327
+ * @param [in] size Number of elements to be set to the value.
1328
+ * @return An event representing the memset operation.
1329
+ */
1330
+ template <typename valueT>
1331
+ static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
1332
+ valueT value, size_t size)
1333
+ {
1334
+ return q.fill(dev_ptr, value, size);
1335
+ }
1336
+
1337
+ /**
1338
+ * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
1339
+ * @tparam valueT The type of the element to be set.
1340
+ * @param [in] q The queue in which the operation is done.
1341
+ * @param [in] data Pointer to the pitched device memory region.
1342
+ * @param [in] value The value to be set.
1343
+ * @param [in] size 3D memory region by number of elements.
1344
+ * @return An event list representing the memset operations.
1345
+ */
1346
+ template <typename valueT>
1347
+ static inline std::vector<sycl::event>
1348
+ dpct_memset(sycl::queue &q, pitched_data data, valueT value,
1349
+ sycl::range<3> size)
1350
+ {
1351
+ std::vector<sycl::event> event_list;
1352
+ size_t slice = data.get_pitch() * data.get_y();
1353
+ unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
1354
+ for (size_t z = 0; z < size.get(2); ++z)
1355
+ {
1356
+ unsigned char *data_ptr = data_surface;
1357
+ for (size_t y = 0; y < size.get(1); ++y)
1358
+ {
1359
+ event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
1360
+ data_ptr += data.get_pitch();
1361
+ }
1362
+ data_surface += slice;
1363
+ }
1364
+ return event_list;
1365
+ }
1366
+
1367
+ /**
1368
+ * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
1369
+ * @tparam valueT The type of the element to be set.
1370
+ * @param [in] q The queue in which the operation is done.
1371
+ * @param [in] ptr Pointer to the virtual device memory.
1372
+ * @param [in] pitch The pitch size by number of elements, including padding.
1373
+ * @param [in] val The value to be set.
1374
+ * @param [in] x The width of memory region by number of elements.
1375
+ * @param [in] y The height of memory region by number of elements.
1376
+ * @return An event list representing the memset operations.
1377
+ */
1378
+ template <typename valueT>
1379
+ static inline std::vector<sycl::event>
1380
+ dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
1381
+ size_t y)
1382
+ {
1383
+ return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
1384
+ sycl::range<3>(x, y, 1));
1385
+ }
1386
+
1387
+ static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
1388
+ const void *from_ptr,
1389
+ memcpy_direction dir)
1390
+ {
1391
+ switch (dir)
1392
+ {
1393
+ case memcpy_direction::host_to_host:
1394
+ case memcpy_direction::host_to_device:
1395
+ case memcpy_direction::device_to_host:
1396
+ case memcpy_direction::device_to_device:
1397
+ return dir;
1398
+ case memcpy_direction::automatic:
1399
+ {
1400
+ // table[to_attribute][from_attribute]
1401
+ static const memcpy_direction
1402
+ direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
1403
+ [static_cast<unsigned>(pointer_access_attribute::end)] =
1404
+ {{memcpy_direction::host_to_host,
1405
+ memcpy_direction::device_to_host,
1406
+ memcpy_direction::host_to_host},
1407
+ {memcpy_direction::host_to_device,
1408
+ memcpy_direction::device_to_device,
1409
+ memcpy_direction::device_to_device},
1410
+ {memcpy_direction::host_to_host,
1411
+ memcpy_direction::device_to_device,
1412
+ memcpy_direction::device_to_device}};
1413
+ return direction_table[static_cast<unsigned>(get_pointer_attribute(
1414
+ q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
1415
+ }
1416
+ default:
1417
+ throw std::runtime_error("dpct_memcpy: invalid direction value");
1418
+ }
1419
+ }
1420
+
1421
+ static sycl::event
1422
+ dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
1423
+ memcpy_direction direction,
1424
+ const std::vector<sycl::event> &dep_events = {})
1425
+ {
1426
+ if (!size)
1427
+ return sycl::event{};
1428
+ return q.memcpy(to_ptr, from_ptr, size, dep_events);
1429
+ GGML_UNUSED(direction);
1430
+ }
1431
+
1432
+ // Get actual copy range and make sure it will not exceed range.
1433
+ static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
1434
+ size_t pitch)
1435
+ {
1436
+ return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
1437
+ }
1438
+
1439
+ static inline size_t get_offset(sycl::id<3> id, size_t slice,
1440
+ size_t pitch)
1441
+ {
1442
+ return slice * id.get(2) + pitch * id.get(1) + id.get(0);
1443
+ }
1444
+
1445
+ /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
1446
+ /// and \p from_range to another specified by \p to_ptr and \p to_range.
1447
+ static inline std::vector<sycl::event>
1448
+ dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
1449
+ sycl::range<3> to_range, sycl::range<3> from_range,
1450
+ sycl::id<3> to_id, sycl::id<3> from_id,
1451
+ sycl::range<3> size, memcpy_direction direction,
1452
+ const std::vector<sycl::event> &dep_events = {})
1453
+ {
1454
+ // RAII for host pointer
1455
+ class host_buffer
1456
+ {
1457
+ void *_buf;
1458
+ size_t _size;
1459
+ sycl::queue &_q;
1460
+ const std::vector<sycl::event> &_deps; // free operation depends
1461
+
1462
+ public:
1463
+ host_buffer(size_t size, sycl::queue &q,
1464
+ const std::vector<sycl::event> &deps)
1465
+ : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
1466
+ void *get_ptr() const { return _buf; }
1467
+ size_t get_size() const { return _size; }
1468
+ ~host_buffer()
1469
+ {
1470
+ if (_buf)
1471
+ {
1472
+ _q.submit([&](sycl::handler &cgh)
1473
+ {
1474
+ cgh.depends_on(_deps);
1475
+ cgh.host_task([buf = _buf] { std::free(buf); }); });
1476
+ }
1477
+ }
1478
+ };
1479
+ std::vector<sycl::event> event_list;
1480
+
1481
+ size_t to_slice = to_range.get(1) * to_range.get(0),
1482
+ from_slice = from_range.get(1) * from_range.get(0);
1483
+ unsigned char *to_surface =
1484
+ (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
1485
+ const unsigned char *from_surface =
1486
+ (const unsigned char *)from_ptr +
1487
+ get_offset(from_id, from_slice, from_range.get(0));
1488
+
1489
+ if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
1490
+ {
1491
+ return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
1492
+ direction, dep_events)};
1493
+ }
1494
+ direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
1495
+ size_t size_slice = size.get(1) * size.get(0);
1496
+ switch (direction)
1497
+ {
1498
+ case host_to_host:
1499
+ for (size_t z = 0; z < size.get(2); ++z)
1500
+ {
1501
+ unsigned char *to_ptr = to_surface;
1502
+ const unsigned char *from_ptr = from_surface;
1503
+ if (to_range.get(0) == from_range.get(0) &&
1504
+ to_range.get(0) == size.get(0))
1505
+ {
1506
+ event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
1507
+ direction, dep_events));
1508
+ }
1509
+ else
1510
+ {
1511
+ for (size_t y = 0; y < size.get(1); ++y)
1512
+ {
1513
+ event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
1514
+ direction, dep_events));
1515
+ to_ptr += to_range.get(0);
1516
+ from_ptr += from_range.get(0);
1517
+ }
1518
+ }
1519
+ to_surface += to_slice;
1520
+ from_surface += from_slice;
1521
+ }
1522
+ break;
1523
+ case host_to_device:
1524
+ {
1525
+ host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
1526
+ event_list);
1527
+ std::vector<sycl::event> host_events;
1528
+ if (to_slice == size_slice)
1529
+ {
1530
+ // Copy host data to a temp host buffer with the shape of target.
1531
+ host_events =
1532
+ dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
1533
+ sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
1534
+ host_to_host, dep_events);
1535
+ }
1536
+ else
1537
+ {
1538
+ // Copy host data to a temp host buffer with the shape of target.
1539
+ host_events = dpct_memcpy(
1540
+ q, buf.get_ptr(), from_surface, to_range, from_range,
1541
+ sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
1542
+ // If has padding data, not sure whether it is useless. So fill temp
1543
+ // buffer with it.
1544
+ std::vector<sycl::event>{
1545
+ dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
1546
+ device_to_host, dep_events)});
1547
+ }
1548
+ // Copy from temp host buffer to device with only one submit.
1549
+ event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
1550
+ buf.get_size(), host_to_device,
1551
+ host_events));
1552
+ break;
1553
+ }
1554
+ case device_to_host:
1555
+ {
1556
+ host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
1557
+ event_list);
1558
+ // Copy from host temp buffer to host target with reshaping.
1559
+ event_list = dpct_memcpy(
1560
+ q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
1561
+ sycl::id<3>(0, 0, 0), size, host_to_host,
1562
+ // Copy from device to temp host buffer with only one submit.
1563
+ std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
1564
+ buf.get_size(),
1565
+ device_to_host, dep_events)});
1566
+ break;
1567
+ }
1568
+ case device_to_device:
1569
+ event_list.push_back(q.submit([&](sycl::handler &cgh){
1570
+ cgh.depends_on(dep_events);
1571
+ cgh.parallel_for<class dpct_memcpy_3d_detail>(
1572
+ size,
1573
+ [=](sycl::id<3> id) {
1574
+ to_surface[get_offset(id, to_slice, to_range.get(0))] =
1575
+ from_surface[get_offset(id, from_slice, from_range.get(0))];
1576
+ }); }));
1577
+ break;
1578
+ default:
1579
+ throw std::runtime_error("dpct_memcpy: invalid direction value");
1580
+ }
1581
+ return event_list;
1582
+ }
1583
+
1584
+ /// memcpy 2D/3D matrix specified by pitched_data.
1585
+ static inline std::vector<sycl::event>
1586
+ dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
1587
+ pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
1588
+ memcpy_direction direction = automatic)
1589
+ {
1590
+ return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
1591
+ sycl::range<3>(to.get_pitch(), to.get_y(), 1),
1592
+ sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
1593
+ size, direction);
1594
+ }
1595
+
1596
+ /// memcpy 2D matrix with pitch.
1597
+ static inline std::vector<sycl::event>
1598
+ dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
1599
+ size_t to_pitch, size_t from_pitch, size_t x, size_t y,
1600
+ memcpy_direction direction = automatic)
1601
+ {
1602
+ return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
1603
+ sycl::range<3>(from_pitch, y, 1),
1604
+ sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
1605
+ sycl::range<3>(x, y, 1), direction);
1606
+ }
1607
+
1608
+ namespace deprecated
1609
+ {
1610
+
1611
+ template <typename T, sycl::usm::alloc AllocKind>
1612
+ class usm_allocator
1613
+ {
1614
+ private:
1615
+ using Alloc = sycl::usm_allocator<T, AllocKind>;
1616
+ Alloc _impl;
1617
+
1618
+ public:
1619
+ using value_type = typename std::allocator_traits<Alloc>::value_type;
1620
+ using pointer = typename std::allocator_traits<Alloc>::pointer;
1621
+ using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
1622
+ using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
1623
+ using const_void_pointer =
1624
+ typename std::allocator_traits<Alloc>::const_void_pointer;
1625
+ using reference = typename std::allocator_traits<Alloc>::value_type &;
1626
+ using const_reference =
1627
+ const typename std::allocator_traits<Alloc>::value_type &;
1628
+ using difference_type =
1629
+ typename std::allocator_traits<Alloc>::difference_type;
1630
+ using size_type = typename std::allocator_traits<Alloc>::size_type;
1631
+ using propagate_on_container_copy_assignment = typename std::allocator_traits<
1632
+ Alloc>::propagate_on_container_copy_assignment;
1633
+ using propagate_on_container_move_assignment = typename std::allocator_traits<
1634
+ Alloc>::propagate_on_container_move_assignment;
1635
+ using propagate_on_container_swap =
1636
+ typename std::allocator_traits<Alloc>::propagate_on_container_swap;
1637
+ using is_always_equal =
1638
+ typename std::allocator_traits<Alloc>::is_always_equal;
1639
+
1640
+ template <typename U>
1641
+ struct rebind
1642
+ {
1643
+ typedef usm_allocator<U, AllocKind> other;
1644
+ };
1645
+
1646
+ usm_allocator() : _impl(dpct::get_default_queue()) {}
1647
+ ~usm_allocator() {}
1648
+ usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
1649
+ usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
1650
+ pointer address(reference r) { return &r; }
1651
+ const_pointer address(const_reference r) { return &r; }
1652
+ pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
1653
+ {
1654
+ return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
1655
+ }
1656
+ void deallocate(pointer p, size_type cnt)
1657
+ {
1658
+ std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
1659
+ }
1660
+ size_type max_size() const
1661
+ {
1662
+ return std::allocator_traits<Alloc>::max_size(_impl);
1663
+ }
1664
+ bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
1665
+ bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
1666
+ };
1667
+
1668
+ } // namespace deprecated
1669
+
1670
+ inline void dpct_free(void *ptr,
1671
+ const sycl::queue &q)
1672
+ {
1673
+ if (ptr)
1674
+ {
1675
+ sycl::free(ptr, q.get_context());
1676
+ }
1677
+ }
1678
+
1679
/// Reinterprets an untyped const pointer as `T *`.
/// The const qualifier of \p x is cast away; constness of the result is
/// whatever \p T itself specifies.
template <typename T>
inline auto get_memory(const void *x)
{
    return reinterpret_cast<T *>(const_cast<void *>(x));
}
1685
+
1686
+ template <typename T>
1687
+ inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
1688
+ {
1689
+ using Ty = typename DataType<T>::T2;
1690
+ Ty s_h;
1691
+ if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
1692
+ detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
1693
+ .wait();
1694
+ else
1695
+ s_h = *reinterpret_cast<const Ty *>(s);
1696
+ return s_h;
1697
+ }
1698
+
1699
+ } // namespace detail
1700
+
1701
+ template <typename T>
1702
+ inline auto get_value(const T *s, sycl::queue &q)
1703
+ {
1704
+ return detail::get_value(s, q);
1705
+ }
1706
+
1707
+ namespace detail
1708
+ {
1709
+ template <class Ta, class Tb, class Tc, class Ts>
1710
+ inline void gemm_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
1711
+ int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb,
1712
+ const void * beta, void * c, int ldc) {
1713
+ Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
1714
+ Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
1715
+ auto data_a = get_memory<const Ta>(a);
1716
+ auto data_b = get_memory<const Tb>(b);
1717
+ auto data_c = get_memory<Tc>(c);
1718
+ oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a,
1719
+ lda, data_b, ldb, beta_value, data_c, ldc);
1720
+ }
1721
+
1722
/// Generic case: applies \p binary_op to each pair of elements of the two
/// vectors and collects the results into a new vector.
template <typename VecT, class BinaryOperation, class = void>
class vectorized_binary
{
public:
    inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
    {
        VecT result;
        for (size_t lane = 0; lane < result.size(); ++lane)
            result[lane] = binary_op(a[lane], b[lane]);
        return result;
    }
};
1736
+
1737
+ template <typename VecT, class BinaryOperation>
1738
+ class vectorized_binary<
1739
+ VecT, BinaryOperation,
1740
+ std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>>
1741
+ {
1742
+ public:
1743
+ inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
1744
+ {
1745
+ return binary_op(a, b).template as<VecT>();
1746
+ }
1747
+ };
1748
+
1749
+ template <class Ta, class Tb, class Tc, class Ts>
1750
+ inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
1751
+ int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
1752
+ int ldb, const void * beta, void ** c, int ldc, int batch_size,
1753
+ matrix_info_t<float> * matrix_info) {
1754
+ Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
1755
+ Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
1756
+
1757
+ matrix_info->transpose_info[0] = a_trans;
1758
+ matrix_info->transpose_info[1] = b_trans;
1759
+ matrix_info->value_info[0] = alpha_value;
1760
+ matrix_info->value_info[1] = beta_value;
1761
+ matrix_info->size_info[0] = m;
1762
+ matrix_info->size_info[1] = n;
1763
+ matrix_info->size_info[2] = k;
1764
+ matrix_info->ld_info[0] = lda;
1765
+ matrix_info->ld_info[1] = ldb;
1766
+ matrix_info->ld_info[2] = ldc;
1767
+ matrix_info->groupsize_info = batch_size;
1768
+
1769
+ sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
1770
+ q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
1771
+ matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2,
1772
+ reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
1773
+ reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
1774
+ reinterpret_cast<Ts *>(matrix_info->value_info + 1), reinterpret_cast<Tc **>(c),
1775
+ matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
1776
+ }
1777
+
1778
+ template <class Ta, class Tb, class Tc, class Ts>
1779
+ inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
1780
+ int m, int n, int k, const void * alpha, const void * a, int lda,
1781
+ long long int stride_a, const void * b, int ldb, long long int stride_b,
1782
+ const void * beta, void * c, int ldc, long long int stride_c, int batch_size) {
1783
+ Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
1784
+ Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
1785
+ auto data_a = get_memory<const Ta>(a);
1786
+ auto data_b = get_memory<const Tb>(b);
1787
+ auto data_c = get_memory<Tc>(c);
1788
+ oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value,
1789
+ data_a, lda, stride_a, data_b, ldb, stride_b, beta_value,
1790
+ data_c, ldc, stride_c, batch_size);
1791
+ }
1792
+
1793
+ } // namespace detail
1794
+
1795
+ template <typename VecT, class BinaryOperation>
1796
+ inline unsigned vectorized_binary(unsigned a, unsigned b,
1797
+ const BinaryOperation binary_op)
1798
+ {
1799
+ sycl::vec<unsigned, 1> v0{a}, v1{b};
1800
+ auto v2 = v0.as<VecT>();
1801
+ auto v3 = v1.as<VecT>();
1802
+ auto v4 =
1803
+ detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
1804
+ v0 = v4.template as<sycl::vec<unsigned, 1>>();
1805
+ return v0;
1806
+ }
1807
+
1808
+ static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
1809
+ memcpy_direction direction = automatic,
1810
+ sycl::queue &q = dpct::get_default_queue())
1811
+ {
1812
+ detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
1813
+ }
1814
+
1815
+ static inline unsigned int select_device(unsigned int id)
1816
+ {
1817
+ dev_mgr::instance().select_device(id);
1818
+ return id;
1819
+ }
1820
+
1821
+ template <typename T>
1822
+ T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
1823
+ unsigned int logical_sub_group_size = 32)
1824
+ {
1825
+ unsigned int id = g.get_local_linear_id();
1826
+ unsigned int start_index =
1827
+ id / logical_sub_group_size * logical_sub_group_size;
1828
+ unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
1829
+ return sycl::select_from_group(g, x,
1830
+ target_offset < logical_sub_group_size
1831
+ ? start_index + target_offset
1832
+ : id);
1833
+ }
1834
+
1835
/// Accumulator type for a dot product of T1 and T2 operands: uint32_t when
/// both operand types are unsigned, int32_t otherwise.
template <typename T1, typename T2>
using dot_product_acc_t =
    typename std::conditional<std::is_unsigned<T1>::value && std::is_unsigned<T2>::value,
                              uint32_t, int32_t>::type;
1840
+
1841
+ template <typename T>
1842
+ sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
1843
+ return sycl::vec<T, 1>(val)
1844
+ .template as<sycl::vec<
1845
+ std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>,
1846
+ 4>>()
1847
+ .template convert<T>();
1848
+ }
1849
+
1850
+ template <typename T1, typename T2, typename T3>
1851
+ inline auto dp4a(T1 a, T2 b, T3 c) {
1852
+ dot_product_acc_t<T1, T2> res = c;
1853
+ auto va = extract_and_sign_or_zero_extend4(a);
1854
+ auto vb = extract_and_sign_or_zero_extend4(b);
1855
+ res += va[0] * vb[0];
1856
+ res += va[1] * vb[1];
1857
+ res += va[2] * vb[2];
1858
+ res += va[3] * vb[3];
1859
+ return res;
1860
+ }
1861
+
1862
+ struct sub_sat
1863
+ {
1864
+ template <typename T>
1865
+ auto operator()(const T x, const T y) const
1866
+ {
1867
+ return sycl::sub_sat(x, y);
1868
+ }
1869
+ };
1870
+
1871
+ template <typename S, typename T>
1872
+ inline T vectorized_min(T a, T b)
1873
+ {
1874
+ sycl::vec<T, 1> v0{a}, v1{b};
1875
+ auto v2 = v0.template as<S>();
1876
+ auto v3 = v1.template as<S>();
1877
+ auto v4 = sycl::min(v2, v3);
1878
+ v0 = v4.template as<sycl::vec<T, 1>>();
1879
+ return v0;
1880
+ }
1881
+
1882
+ inline float pow(const float a, const int b) { return sycl::pown(a, b); }
1883
+ inline double pow(const double a, const int b) { return sycl::pown(a, b); }
1884
+ inline float pow(const float a, const float b) { return sycl::pow(a, b); }
1885
+ inline double pow(const double a, const double b) { return sycl::pow(a, b); }
1886
+ template <typename T, typename U>
1887
+ inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
1888
+ pow(const T a, const U b)
1889
+ {
1890
+ return sycl::pow(a, static_cast<T>(b));
1891
+ }
1892
+ template <typename T, typename U>
1893
+ inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
1894
+ pow(const T a, const U b)
1895
+ {
1896
+ return sycl::pow(static_cast<double>(a), static_cast<double>(b));
1897
+ }
1898
+
1899
+ inline double min(const double a, const float b)
1900
+ {
1901
+ return sycl::fmin(a, static_cast<double>(b));
1902
+ }
1903
+ inline double min(const float a, const double b)
1904
+ {
1905
+ return sycl::fmin(static_cast<double>(a), b);
1906
+ }
1907
+ inline float min(const float a, const float b) { return sycl::fmin(a, b); }
1908
+ inline double min(const double a, const double b) { return sycl::fmin(a, b); }
1909
+ inline std::uint32_t min(const std::uint32_t a, const std::int32_t b)
1910
+ {
1911
+ return sycl::min(a, static_cast<std::uint32_t>(b));
1912
+ }
1913
+ inline std::uint32_t min(const std::int32_t a, const std::uint32_t b)
1914
+ {
1915
+ return sycl::min(static_cast<std::uint32_t>(a), b);
1916
+ }
1917
+ inline std::int32_t min(const std::int32_t a, const std::int32_t b)
1918
+ {
1919
+ return sycl::min(a, b);
1920
+ }
1921
+ inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b)
1922
+ {
1923
+ return sycl::min(a, b);
1924
+ }
1925
+ inline std::uint64_t min(const std::uint64_t a, const std::int64_t b)
1926
+ {
1927
+ return sycl::min(a, static_cast<std::uint64_t>(b));
1928
+ }
1929
+ inline std::uint64_t min(const std::int64_t a, const std::uint64_t b)
1930
+ {
1931
+ return sycl::min(static_cast<std::uint64_t>(a), b);
1932
+ }
1933
+ inline std::int64_t min(const std::int64_t a, const std::int64_t b)
1934
+ {
1935
+ return sycl::min(a, b);
1936
+ }
1937
+ inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b)
1938
+ {
1939
+ return sycl::min(a, b);
1940
+ }
1941
+ inline std::uint64_t min(const std::uint64_t a, const std::int32_t b)
1942
+ {
1943
+ return sycl::min(a, static_cast<std::uint64_t>(b));
1944
+ }
1945
+ inline std::uint64_t min(const std::int32_t a, const std::uint64_t b)
1946
+ {
1947
+ return sycl::min(static_cast<std::uint64_t>(a), b);
1948
+ }
1949
+ inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b)
1950
+ {
1951
+ return sycl::min(a, static_cast<std::uint64_t>(b));
1952
+ }
1953
+ inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b)
1954
+ {
1955
+ return sycl::min(static_cast<std::uint64_t>(a), b);
1956
+ }
1957
+ // max function overloads.
1958
+ // For floating-point types, `float` or `double` arguments are acceptable.
1959
+ // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
1960
+ // `std::int64_t` type arguments are acceptable.
1961
+ inline double max(const double a, const float b)
1962
+ {
1963
+ return sycl::fmax(a, static_cast<double>(b));
1964
+ }
1965
+ inline double max(const float a, const double b)
1966
+ {
1967
+ return sycl::fmax(static_cast<double>(a), b);
1968
+ }
1969
+ inline float max(const float a, const float b) { return sycl::fmax(a, b); }
1970
+ inline double max(const double a, const double b) { return sycl::fmax(a, b); }
1971
+ inline std::uint32_t max(const std::uint32_t a, const std::int32_t b)
1972
+ {
1973
+ return sycl::max(a, static_cast<std::uint32_t>(b));
1974
+ }
1975
+ inline std::uint32_t max(const std::int32_t a, const std::uint32_t b)
1976
+ {
1977
+ return sycl::max(static_cast<std::uint32_t>(a), b);
1978
+ }
1979
+ inline std::int32_t max(const std::int32_t a, const std::int32_t b)
1980
+ {
1981
+ return sycl::max(a, b);
1982
+ }
1983
+ inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b)
1984
+ {
1985
+ return sycl::max(a, b);
1986
+ }
1987
+ inline std::uint64_t max(const std::uint64_t a, const std::int64_t b)
1988
+ {
1989
+ return sycl::max(a, static_cast<std::uint64_t>(b));
1990
+ }
1991
+ inline std::uint64_t max(const std::int64_t a, const std::uint64_t b)
1992
+ {
1993
+ return sycl::max(static_cast<std::uint64_t>(a), b);
1994
+ }
1995
+ inline std::int64_t max(const std::int64_t a, const std::int64_t b)
1996
+ {
1997
+ return sycl::max(a, b);
1998
+ }
1999
+ inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b)
2000
+ {
2001
+ return sycl::max(a, b);
2002
+ }
2003
+ inline std::uint64_t max(const std::uint64_t a, const std::int32_t b)
2004
+ {
2005
+ return sycl::max(a, static_cast<std::uint64_t>(b));
2006
+ }
2007
+ inline std::uint64_t max(const std::int32_t a, const std::uint64_t b)
2008
+ {
2009
+ return sycl::max(static_cast<std::uint64_t>(a), b);
2010
+ }
2011
+ inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b)
2012
+ {
2013
+ return sycl::max(a, static_cast<std::uint64_t>(b));
2014
+ }
2015
+ inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b)
2016
+ {
2017
+ return sycl::max(static_cast<std::uint64_t>(a), b);
2018
+ }
2019
+
2020
+ inline void
2021
+ has_capability_or_fail(const sycl::device &dev,
2022
+ const std::initializer_list<sycl::aspect> &props)
2023
+ {
2024
+ for (const auto &it : props)
2025
+ {
2026
+ if (dev.has(it))
2027
+ continue;
2028
+ switch (it)
2029
+ {
2030
+ case sycl::aspect::fp64:
2031
+ throw std::runtime_error("'double' is not supported in '" +
2032
+ dev.get_info<sycl::info::device::name>() +
2033
+ "' device");
2034
+ break;
2035
+ case sycl::aspect::fp16:
2036
+ throw std::runtime_error("'half' is not supported in '" +
2037
+ dev.get_info<sycl::info::device::name>() +
2038
+ "' device");
2039
+ break;
2040
+ default:
2041
+ #define __SYCL_ASPECT(ASPECT, ID) \
2042
+ case sycl::aspect::ASPECT: \
2043
+ return #ASPECT;
2044
+ #define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
2045
+ #define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
2046
+ auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
2047
+ {
2048
+ switch (AspectNum)
2049
+ {
2050
+ #include <sycl/info/aspects.def>
2051
+ #include <sycl/info/aspects_deprecated.def>
2052
+ default:
2053
+ return "unknown aspect";
2054
+ }
2055
+ };
2056
+ #undef __SYCL_ASPECT_DEPRECATED_ALIAS
2057
+ #undef __SYCL_ASPECT_DEPRECATED
2058
+ #undef __SYCL_ASPECT
2059
+ throw std::runtime_error(
2060
+ "'" + getAspectNameStr(it) + "' is not supported in '" +
2061
+ dev.get_info<sycl::info::device::name>() + "' device");
2062
+ }
2063
+ break;
2064
+ }
2065
+ }
2066
+
2067
+ static inline unsigned int get_current_device_id()
2068
+ {
2069
+ return dev_mgr::instance().current_device_id();
2070
+ }
2071
+
2072
+ static inline device_ext &get_current_device()
2073
+ {
2074
+ return dev_mgr::instance().current_device();
2075
+ }
2076
+
2077
+ static inline device_ext &get_device(unsigned int id)
2078
+ {
2079
+ return dev_mgr::instance().get_device(id);
2080
+ }
2081
+
2082
+ static inline sycl::queue &get_in_order_queue()
2083
+ {
2084
+ return dev_mgr::instance().current_device().in_order_queue();
2085
+ }
2086
+
2087
+ static sycl::event
2088
+ dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
2089
+ memcpy_direction direction,
2090
+ const std::vector<sycl::event> &dep_events = {})
2091
+ {
2092
+ if (!size)
2093
+ return sycl::event{};
2094
+ return q.memcpy(to_ptr, from_ptr, size, dep_events);
2095
+ GGML_UNUSED(direction);
2096
+ }
2097
+
2098
+ // Get actual copy range and make sure it will not exceed range.
2099
+ static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
2100
+ size_t pitch)
2101
+ {
2102
+ return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
2103
+ }
2104
+
2105
+ static inline size_t get_offset(sycl::id<3> id, size_t slice,
2106
+ size_t pitch)
2107
+ {
2108
+ return slice * id.get(2) + pitch * id.get(1) + id.get(0);
2109
+ }
2110
+
2111
+ /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
2112
+ /// and \p from_range to another specified by \p to_ptr and \p to_range.
2113
+ static inline std::vector<sycl::event>
2114
+ dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
2115
+ sycl::range<3> to_range, sycl::range<3> from_range,
2116
+ sycl::id<3> to_id, sycl::id<3> from_id,
2117
+ sycl::range<3> size, memcpy_direction direction,
2118
+ const std::vector<sycl::event> &dep_events = {})
2119
+ {
2120
+ // RAII for host pointer
2121
+ class host_buffer
2122
+ {
2123
+ void *_buf;
2124
+ size_t _size;
2125
+ sycl::queue &_q;
2126
+ const std::vector<sycl::event> &_deps; // free operation depends
2127
+
2128
+ public:
2129
+ host_buffer(size_t size, sycl::queue &q,
2130
+ const std::vector<sycl::event> &deps)
2131
+ : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
2132
+ void *get_ptr() const { return _buf; }
2133
+ size_t get_size() const { return _size; }
2134
+ ~host_buffer()
2135
+ {
2136
+ if (_buf)
2137
+ {
2138
+ _q.submit([&](sycl::handler &cgh)
2139
+ {
2140
+ cgh.depends_on(_deps);
2141
+ cgh.host_task([buf = _buf] { std::free(buf); }); });
2142
+ }
2143
+ }
2144
+ };
2145
+ std::vector<sycl::event> event_list;
2146
+
2147
+ size_t to_slice = to_range.get(1) * to_range.get(0),
2148
+ from_slice = from_range.get(1) * from_range.get(0);
2149
+ unsigned char *to_surface =
2150
+ (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
2151
+ const unsigned char *from_surface =
2152
+ (const unsigned char *)from_ptr +
2153
+ get_offset(from_id, from_slice, from_range.get(0));
2154
+
2155
+ if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
2156
+ {
2157
+ return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
2158
+ direction, dep_events)};
2159
+ }
2160
+ direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
2161
+ size_t size_slice = size.get(1) * size.get(0);
2162
+ switch (direction)
2163
+ {
2164
+ case host_to_host:
2165
+ for (size_t z = 0; z < size.get(2); ++z)
2166
+ {
2167
+ unsigned char *to_ptr = to_surface;
2168
+ const unsigned char *from_ptr = from_surface;
2169
+ if (to_range.get(0) == from_range.get(0) &&
2170
+ to_range.get(0) == size.get(0))
2171
+ {
2172
+ event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
2173
+ direction, dep_events));
2174
+ }
2175
+ else
2176
+ {
2177
+ for (size_t y = 0; y < size.get(1); ++y)
2178
+ {
2179
+ event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
2180
+ direction, dep_events));
2181
+ to_ptr += to_range.get(0);
2182
+ from_ptr += from_range.get(0);
2183
+ }
2184
+ }
2185
+ to_surface += to_slice;
2186
+ from_surface += from_slice;
2187
+ }
2188
+ break;
2189
+ case host_to_device:
2190
+ {
2191
+ host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
2192
+ event_list);
2193
+ std::vector<sycl::event> host_events;
2194
+ if (to_slice == size_slice)
2195
+ {
2196
+ // Copy host data to a temp host buffer with the shape of target.
2197
+ host_events =
2198
+ dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
2199
+ sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
2200
+ host_to_host, dep_events);
2201
+ }
2202
+ else
2203
+ {
2204
+ // Copy host data to a temp host buffer with the shape of target.
2205
+ host_events = dpct_memcpy(
2206
+ q, buf.get_ptr(), from_surface, to_range, from_range,
2207
+ sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
2208
+ // If has padding data, not sure whether it is useless. So fill temp
2209
+ // buffer with it.
2210
+ std::vector<sycl::event>{
2211
+ dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
2212
+ device_to_host, dep_events)});
2213
+ }
2214
+ // Copy from temp host buffer to device with only one submit.
2215
+ event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
2216
+ buf.get_size(), host_to_device,
2217
+ host_events));
2218
+ break;
2219
+ }
2220
+ case device_to_host:
2221
+ {
2222
+ host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
2223
+ event_list);
2224
+ // Copy from host temp buffer to host target with reshaping.
2225
+ event_list = dpct_memcpy(
2226
+ q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
2227
+ sycl::id<3>(0, 0, 0), size, host_to_host,
2228
+ // Copy from device to temp host buffer with only one submit.
2229
+ std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
2230
+ buf.get_size(),
2231
+ device_to_host, dep_events)});
2232
+ break;
2233
+ }
2234
+ case device_to_device:
2235
+ event_list.push_back(q.submit([&](sycl::handler &cgh)
2236
+ {
2237
+ cgh.depends_on(dep_events);
2238
+ cgh.parallel_for<class dpct_memcpy_3d_detail>(
2239
+ size,
2240
+ [=](sycl::id<3> id) {
2241
+ to_surface[get_offset(id, to_slice, to_range.get(0))] =
2242
+ from_surface[get_offset(id, from_slice, from_range.get(0))];
2243
+ }); }));
2244
+ break;
2245
+ default:
2246
+ throw std::runtime_error("dpct_memcpy: invalid direction value");
2247
+ }
2248
+ return event_list;
2249
+ }
2250
+
2251
+ /// memcpy 2D/3D matrix specified by pitched_data.
2252
+ static inline std::vector<sycl::event>
2253
+ dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
2254
+ pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
2255
+ memcpy_direction direction = automatic)
2256
+ {
2257
+ return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
2258
+ sycl::range<3>(to.get_pitch(), to.get_y(), 1),
2259
+ sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
2260
+ size, direction);
2261
+ }
2262
+
2263
+ /// memcpy 2D matrix with pitch.
2264
+ static inline std::vector<sycl::event>
2265
+ dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
2266
+ size_t to_pitch, size_t from_pitch, size_t x, size_t y,
2267
+ memcpy_direction direction = automatic)
2268
+ {
2269
+ return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
2270
+ sycl::range<3>(from_pitch, y, 1),
2271
+ sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
2272
+ sycl::range<3>(x, y, 1), direction);
2273
+ }
2274
+
2275
+ inline void gemm(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n,
2276
+ int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b,
2277
+ library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc,
2278
+ library_data_t scaling_type) {
2279
+ if (scaling_type == library_data_t::real_float &&
2280
+ c_type == library_data_t::complex_float)
2281
+ {
2282
+ scaling_type = library_data_t::complex_float;
2283
+ }
2284
+ else if (scaling_type == library_data_t::real_double &&
2285
+ c_type == library_data_t::complex_double)
2286
+ {
2287
+ scaling_type = library_data_t::complex_double;
2288
+ }
2289
+
2290
+ std::uint64_t key =
2291
+ detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
2292
+ switch (key)
2293
+ {
2294
+ case detail::get_type_combination_id(
2295
+ library_data_t::real_float, library_data_t::real_float,
2296
+ library_data_t::real_float, library_data_t::real_float):
2297
+ {
2298
+ detail::gemm_impl<float, float, float, float>(
2299
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
2300
+ break;
2301
+ }
2302
+ case detail::get_type_combination_id(
2303
+ library_data_t::real_double, library_data_t::real_double,
2304
+ library_data_t::real_double, library_data_t::real_double):
2305
+ {
2306
+ detail::gemm_impl<double, double, double, double>(
2307
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
2308
+ break;
2309
+ }
2310
+ case detail::get_type_combination_id(
2311
+ library_data_t::complex_float, library_data_t::complex_float,
2312
+ library_data_t::complex_float, library_data_t::complex_float):
2313
+ {
2314
+ detail::gemm_impl<std::complex<float>, std::complex<float>,
2315
+ std::complex<float>, std::complex<float>>(
2316
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
2317
+ break;
2318
+ }
2319
+ case detail::get_type_combination_id(
2320
+ library_data_t::complex_double, library_data_t::complex_double,
2321
+ library_data_t::complex_double, library_data_t::complex_double):
2322
+ {
2323
+ detail::gemm_impl<std::complex<double>, std::complex<double>,
2324
+ std::complex<double>, std::complex<double>>(
2325
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
2326
+ break;
2327
+ }
2328
+ case detail::get_type_combination_id(
2329
+ library_data_t::real_half, library_data_t::real_half,
2330
+ library_data_t::real_half, library_data_t::real_half):
2331
+ {
2332
+ detail::gemm_impl<sycl::half, sycl::half, sycl::half,
2333
+ sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
2334
+ lda, b, ldb, beta, c, ldc);
2335
+ break;
2336
+ }
2337
+ #ifdef __INTEL_MKL__
2338
+ case detail::get_type_combination_id(
2339
+ library_data_t::real_bfloat16, library_data_t::real_bfloat16,
2340
+ library_data_t::real_float, library_data_t::real_float):
2341
+ {
2342
+ detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
2343
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
2344
+ break;
2345
+ }
2346
+ case detail::get_type_combination_id(
2347
+ library_data_t::real_half, library_data_t::real_half,
2348
+ library_data_t::real_float, library_data_t::real_float):
2349
+ {
2350
+ detail::gemm_impl<sycl::half, sycl::half, float, float>(
2351
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
2352
+ break;
2353
+ }
2354
+ case detail::get_type_combination_id(
2355
+ library_data_t::real_half, library_data_t::real_half,
2356
+ library_data_t::real_half, library_data_t::real_float):
2357
+ {
2358
+ float alpha_value =
2359
+ dpct::get_value(reinterpret_cast<const float *>(alpha), q);
2360
+ float beta_value =
2361
+ dpct::get_value(reinterpret_cast<const float *>(beta), q);
2362
+ sycl::half alpha_half(alpha_value);
2363
+ sycl::half beta_half(beta_value);
2364
+ detail::gemm_impl<sycl::half, sycl::half, sycl::half,
2365
+ sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
2366
+ a, lda, b, ldb, &beta_half, c, ldc);
2367
+ break;
2368
+ }
2369
+ case detail::get_type_combination_id(
2370
+ library_data_t::real_int8, library_data_t::real_int8,
2371
+ library_data_t::real_float, library_data_t::real_float):
2372
+ {
2373
+ detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
2374
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
2375
+ break;
2376
+ }
2377
+ case detail::get_type_combination_id(
2378
+ library_data_t::real_bfloat16, library_data_t::real_bfloat16,
2379
+ library_data_t::real_bfloat16, library_data_t::real_float):
2380
+ {
2381
+ detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
2382
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
2383
+ break;
2384
+ }
2385
+ case detail::get_type_combination_id(
2386
+ library_data_t::real_int8, library_data_t::real_int8,
2387
+ library_data_t::real_int32, library_data_t::real_int32):
2388
+ {
2389
+ float alpha_float =
2390
+ dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
2391
+ float beta_float =
2392
+ dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
2393
+ detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
2394
+ q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
2395
+ break;
2396
+ }
2397
+ #endif // __INTEL_MKL__
2398
+ default:
2399
+ throw std::runtime_error("the combination of data type is unsupported");
2400
+ }
2401
+ } // gemm()
2402
+
2403
+ /// Computes a batch of matrix-matrix product with general matrices.
2404
+ /// \param [in] q The queue where the routine should be executed.
2405
+ /// \param [in] a_trans Specifies the operation applied to A.
2406
+ /// \param [in] b_trans Specifies the operation applied to B.
2407
+ /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
2408
+ /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
2409
+ /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
2410
+ /// \param [in] alpha Scaling factor for the matrix-matrix product.
2411
+ /// \param [in] a Input matrix A.
2412
+ /// \param [in] a_type Data type of the matrix A.
2413
+ /// \param [in] lda Leading dimension of A.
2414
+ /// \param [in] b Input matrix B.
2415
+ /// \param [in] b_type Data type of the matrix B.
2416
+ /// \param [in] ldb Leading dimension of B.
2417
+ /// \param [in] beta Scaling factor for matrix C.
2418
+ /// \param [in, out] c Input/Output matrix C.
2419
+ /// \param [in] c_type Data type of the matrix C.
2420
+ /// \param [in] ldc Leading dimension of C.
2421
+ /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
2422
+ /// \param [in] scaling_type Data type of the scaling factors.
2423
+ inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
2424
+ int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
2425
+ const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
2426
+ library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
2427
+ matrix_info_t<float> * matrix_info) {
2428
+ std::uint64_t key =
2429
+ detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
2430
+ switch (key)
2431
+ {
2432
+ case detail::get_type_combination_id(
2433
+ library_data_t::real_float, library_data_t::real_float,
2434
+ library_data_t::real_float, library_data_t::real_float):
2435
+ {
2436
+ detail::gemm_batch_impl<float, float, float, float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
2437
+ beta, c, ldc, batch_size, matrix_info);
2438
+ break;
2439
+ }
2440
+ case detail::get_type_combination_id(
2441
+ library_data_t::real_double, library_data_t::real_double,
2442
+ library_data_t::real_double, library_data_t::real_double):
2443
+ {
2444
+ detail::gemm_batch_impl<double, double, double, double>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
2445
+ beta, c, ldc, batch_size, matrix_info);
2446
+ break;
2447
+ }
2448
+ case detail::get_type_combination_id(
2449
+ library_data_t::real_half, library_data_t::real_half,
2450
+ library_data_t::real_half, library_data_t::real_half):
2451
+ {
2452
+ detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
2453
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
2454
+ break;
2455
+ }
2456
+ #ifdef __INTEL_MKL__
2457
+ case detail::get_type_combination_id(
2458
+ library_data_t::real_bfloat16, library_data_t::real_bfloat16,
2459
+ library_data_t::real_bfloat16, library_data_t::real_float):
2460
+ {
2461
+ detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
2462
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
2463
+ break;
2464
+ }
2465
+ case detail::get_type_combination_id(
2466
+ library_data_t::real_bfloat16, library_data_t::real_bfloat16,
2467
+ library_data_t::real_float, library_data_t::real_float):
2468
+ {
2469
+ detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
2470
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
2471
+ break;
2472
+ }
2473
+ #endif
2474
+ case detail::get_type_combination_id(
2475
+ library_data_t::real_int8, library_data_t::real_int8,
2476
+ library_data_t::real_int32, library_data_t::real_int32):
2477
+ {
2478
+ float alpha_float =
2479
+ dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
2480
+ float beta_float =
2481
+ dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
2482
+ detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, float>(
2483
+ q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size,
2484
+ matrix_info);
2485
+ break;
2486
+ }
2487
+ case detail::get_type_combination_id(
2488
+ library_data_t::real_int8, library_data_t::real_int8,
2489
+ library_data_t::real_float, library_data_t::real_float):
2490
+ {
2491
+ detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
2492
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
2493
+ break;
2494
+ }
2495
+ case detail::get_type_combination_id(
2496
+ library_data_t::real_half, library_data_t::real_half,
2497
+ library_data_t::real_float, library_data_t::real_float):
2498
+ {
2499
+ detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
2500
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
2501
+ break;
2502
+ }
2503
+ case detail::get_type_combination_id(
2504
+ library_data_t::real_half, library_data_t::real_half,
2505
+ library_data_t::real_half, library_data_t::real_float):
2506
+ {
2507
+ float alpha_value =
2508
+ dpct::get_value(reinterpret_cast<const float *>(alpha), q);
2509
+ float beta_value =
2510
+ dpct::get_value(reinterpret_cast<const float *>(beta), q);
2511
+ sycl::half alpha_half(alpha_value);
2512
+ sycl::half beta_half(beta_value);
2513
+ detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
2514
+ q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info);
2515
+ break;
2516
+ }
2517
+ default:
2518
+ throw std::runtime_error("the combination of data type is unsupported");
2519
+ }
2520
+ }
2521
+
2522
+ /// Computes a batch of matrix-matrix product with general matrices.
2523
+ /// \param [in] q The queue where the routine should be executed.
2524
+ /// \param [in] a_trans Specifies the operation applied to A.
2525
+ /// \param [in] b_trans Specifies the operation applied to B.
2526
+ /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
2527
+ /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
2528
+ /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
2529
+ /// \param [in] alpha Scaling factor for the matrix-matrix product.
2530
+ /// \param [in] a Input matrix A.
2531
+ /// \param [in] a_type Data type of the matrix A.
2532
+ /// \param [in] lda Leading dimension of A.
2533
+ /// \param [in] stride_a Stride between the different A matrices.
2534
+ /// \param [in] b Input matrix B.
2535
+ /// \param [in] b_type Data type of the matrix B.
2536
+ /// \param [in] ldb Leading dimension of B.
2537
+ /// \param [in] stride_b Stride between the different B matrices.
2538
+ /// \param [in] beta Scaling factor for matrix C.
2539
+ /// \param [in, out] c Input/Output matrix C.
2540
+ /// \param [in] c_type Data type of the matrix C.
2541
+ /// \param [in] ldc Leading dimension of C.
2542
+ /// \param [in] stride_c Stride between the different C matrices.
2543
+ /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
2544
+ /// \param [in] scaling_type Data type of the scaling factors.
2545
+ inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
2546
+ int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda,
2547
+ long long int stride_a, const void * b, library_data_t b_type, int ldb,
2548
+ long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc,
2549
+ long long int stride_c, int batch_size, library_data_t scaling_type) {
2550
+ if (scaling_type == library_data_t::real_float &&
2551
+ c_type == library_data_t::complex_float)
2552
+ {
2553
+ scaling_type = library_data_t::complex_float;
2554
+ }
2555
+ else if (scaling_type == library_data_t::real_double &&
2556
+ c_type == library_data_t::complex_double)
2557
+ {
2558
+ scaling_type = library_data_t::complex_double;
2559
+ }
2560
+
2561
+ std::uint64_t key =
2562
+ detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
2563
+ switch (key)
2564
+ {
2565
+ case detail::get_type_combination_id(
2566
+ library_data_t::real_float, library_data_t::real_float,
2567
+ library_data_t::real_float, library_data_t::real_float):
2568
+ {
2569
+ detail::gemm_batch_impl<float, float, float, float>(
2570
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
2571
+ beta, c, ldc, stride_c, batch_size);
2572
+ break;
2573
+ }
2574
+ case detail::get_type_combination_id(
2575
+ library_data_t::real_double, library_data_t::real_double,
2576
+ library_data_t::real_double, library_data_t::real_double):
2577
+ {
2578
+ detail::gemm_batch_impl<double, double, double, double>(
2579
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
2580
+ beta, c, ldc, stride_c, batch_size);
2581
+ break;
2582
+ }
2583
+ case detail::get_type_combination_id(
2584
+ library_data_t::complex_float, library_data_t::complex_float,
2585
+ library_data_t::complex_float, library_data_t::complex_float):
2586
+ {
2587
+ detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
2588
+ std::complex<float>, std::complex<float>>(
2589
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
2590
+ beta, c, ldc, stride_c, batch_size);
2591
+ break;
2592
+ }
2593
+ case detail::get_type_combination_id(
2594
+ library_data_t::complex_double, library_data_t::complex_double,
2595
+ library_data_t::complex_double, library_data_t::complex_double):
2596
+ {
2597
+ detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
2598
+ std::complex<double>, std::complex<double>>(
2599
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
2600
+ beta, c, ldc, stride_c, batch_size);
2601
+ break;
2602
+ }
2603
+ case detail::get_type_combination_id(
2604
+ library_data_t::real_half, library_data_t::real_half,
2605
+ library_data_t::real_half, library_data_t::real_half):
2606
+ {
2607
+ detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
2608
+ sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
2609
+ a, lda, stride_a, b, ldb, stride_b,
2610
+ beta, c, ldc, stride_c, batch_size);
2611
+ break;
2612
+ }
2613
+ #ifdef __INTEL_MKL__
2614
+ case detail::get_type_combination_id(
2615
+ library_data_t::real_bfloat16, library_data_t::real_bfloat16,
2616
+ library_data_t::real_bfloat16, library_data_t::real_float):
2617
+ {
2618
+ detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
2619
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
2620
+ batch_size);
2621
+ break;
2622
+ }
2623
+ case detail::get_type_combination_id(
2624
+ library_data_t::real_bfloat16, library_data_t::real_bfloat16,
2625
+ library_data_t::real_float, library_data_t::real_float):
2626
+ {
2627
+ detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
2628
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
2629
+ batch_size);
2630
+ break;
2631
+ }
2632
+ #endif
2633
+ case detail::get_type_combination_id(
2634
+ library_data_t::real_int8, library_data_t::real_int8,
2635
+ library_data_t::real_int32, library_data_t::real_int32):
2636
+ {
2637
+ detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
2638
+ std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
2639
+ a, lda, stride_a, b, ldb, stride_b,
2640
+ beta, c, ldc, stride_c, batch_size);
2641
+ break;
2642
+ }
2643
+ case detail::get_type_combination_id(
2644
+ library_data_t::real_int8, library_data_t::real_int8,
2645
+ library_data_t::real_float, library_data_t::real_float):
2646
+ {
2647
+ detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
2648
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
2649
+ beta, c, ldc, stride_c, batch_size);
2650
+ break;
2651
+ }
2652
+ case detail::get_type_combination_id(
2653
+ library_data_t::real_half, library_data_t::real_half,
2654
+ library_data_t::real_float, library_data_t::real_float):
2655
+ {
2656
+ detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
2657
+ q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
2658
+ beta, c, ldc, stride_c, batch_size);
2659
+ break;
2660
+ }
2661
+ case detail::get_type_combination_id(
2662
+ library_data_t::real_half, library_data_t::real_half,
2663
+ library_data_t::real_half, library_data_t::real_float):
2664
+ {
2665
+ float alpha_value =
2666
+ dpct::get_value(reinterpret_cast<const float *>(alpha), q);
2667
+ float beta_value =
2668
+ dpct::get_value(reinterpret_cast<const float *>(beta), q);
2669
+ sycl::half alpha_half(alpha_value);
2670
+ sycl::half beta_half(beta_value);
2671
+ detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
2672
+ q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
2673
+ &beta_half, c, ldc, stride_c, batch_size);
2674
+ break;
2675
+ }
2676
+ default:
2677
+ throw std::runtime_error("the combination of data type is unsupported");
2678
+ }
2679
+ }
2680
+
2681
+ static inline void
2682
+ async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
2683
+ size_t from_pitch, size_t x, size_t y,
2684
+ memcpy_direction direction = automatic,
2685
+ sycl::queue &q = get_default_queue())
2686
+ {
2687
+ detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
2688
+ direction);
2689
+ }
2690
+
2691
// Two int-based error types. Each instantiates detail::generic_error_type
// with its own tag struct, so err0 and err1 are distinct types.
using err0 = detail::generic_error_type<struct err0_tag, int>;
using err1 = detail::generic_error_type<struct err1_tag, int>;
2693
+
2694
+ static inline void dpct_free(void *ptr, sycl::queue &q = get_default_queue()) {
2695
+ detail::dpct_free(ptr, q);
2696
+ }
2697
+
2698
+ /// dpct accessor used as device function parameter.
2699
+ template <class T, memory_region Memory, size_t Dimension> class accessor;
2700
+ template <class T, memory_region Memory> class accessor<T, Memory, 3> {
2701
+ public:
2702
+ using memory_t = detail::memory_traits<Memory, T>;
2703
+ using element_t = typename memory_t::element_t;
2704
+ using pointer_t = typename memory_t::pointer_t;
2705
+ using accessor_t = typename memory_t::template accessor_t<3>;
2706
+ accessor(pointer_t data, const sycl::range<3> &in_range)
2707
+ : _data(data), _range(in_range) {}
2708
+ template <memory_region M = Memory>
2709
+ accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
2710
+ : accessor(acc, acc.get_range()) {}
2711
+ accessor(const accessor_t &acc, const sycl::range<3> &in_range)
2712
+ : accessor(acc.get_pointer(), in_range) {}
2713
+ accessor<T, Memory, 2> operator[](size_t index) const {
2714
+ sycl::range<2> sub(_range.get(1), _range.get(2));
2715
+ return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
2716
+ }
2717
+
2718
+ pointer_t get_ptr() const { return _data; }
2719
+
2720
+ private:
2721
+ pointer_t _data;
2722
+ sycl::range<3> _range;
2723
+ };
2724
+ template <class T, memory_region Memory> class accessor<T, Memory, 2> {
2725
+ public:
2726
+ using memory_t = detail::memory_traits<Memory, T>;
2727
+ using element_t = typename memory_t::element_t;
2728
+ using pointer_t = typename memory_t::pointer_t;
2729
+ using accessor_t = typename memory_t::template accessor_t<2>;
2730
+ accessor(pointer_t data, const sycl::range<2> &in_range)
2731
+ : _data(data), _range(in_range) {}
2732
+ template <memory_region M = Memory>
2733
+ accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
2734
+ : accessor(acc, acc.get_range()) {}
2735
+ accessor(const accessor_t &acc, const sycl::range<2> &in_range)
2736
+ : accessor(acc.get_pointer(), in_range) {}
2737
+
2738
+ pointer_t operator[](size_t index) const {
2739
+ return _data + _range.get(1) * index;
2740
+ }
2741
+
2742
+ pointer_t get_ptr() const { return _data; }
2743
+
2744
+ private:
2745
+ pointer_t _data;
2746
+ sycl::range<2> _range;
2747
+ };
2748
+
2749
+ namespace detail {
2750
+ /// Device variable with address space of shared, global or constant.
2751
+ template <class T, memory_region Memory, size_t Dimension> class device_memory {
2752
+ public:
2753
+ using accessor_t =
2754
+ typename detail::memory_traits<Memory,
2755
+ T>::template accessor_t<Dimension>;
2756
+ using value_t = typename detail::memory_traits<Memory, T>::value_t;
2757
+ using dpct_accessor_t = dpct::accessor<T, Memory, Dimension>;
2758
+
2759
+ device_memory() : device_memory(sycl::range<Dimension>(1)) {}
2760
+
2761
+ /// Constructor of 1-D array with initializer list
2762
+ device_memory(const sycl::range<Dimension> &in_range,
2763
+ std::initializer_list<value_t> &&init_list)
2764
+ : device_memory(in_range) {
2765
+ assert(init_list.size() <= in_range.size());
2766
+ _host_ptr = (value_t *)std::malloc(_size);
2767
+ std::memset(_host_ptr, 0, _size);
2768
+ std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
2769
+ }
2770
+
2771
+ /// Constructor of 2-D array with initializer list
2772
+ template <size_t D = Dimension>
2773
+ device_memory(
2774
+ const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
2775
+ std::initializer_list<std::initializer_list<value_t>> &&init_list)
2776
+ : device_memory(in_range) {
2777
+ assert(init_list.size() <= in_range[0]);
2778
+ _host_ptr = (value_t *)std::malloc(_size);
2779
+ std::memset(_host_ptr, 0, _size);
2780
+ auto tmp_data = _host_ptr;
2781
+ for (auto sub_list : init_list) {
2782
+ assert(sub_list.size() <= in_range[1]);
2783
+ std::memcpy(tmp_data, sub_list.begin(),
2784
+ sub_list.size() * sizeof(T));
2785
+ tmp_data += in_range[1];
2786
+ }
2787
+ }
2788
+
2789
+ /// Constructor with range
2790
+ device_memory(const sycl::range<Dimension> &range_in)
2791
+ : _size(range_in.size() * sizeof(T)), _range(range_in),
2792
+ _reference(false), _host_ptr(nullptr), _device_ptr(nullptr) {
2793
+ static_assert(
2794
+ (Memory == global) || (Memory == constant) || (Memory == shared),
2795
+ "device memory region should be global, constant or shared");
2796
+ // Make sure that singleton class mem_mgr and dev_mgr will destruct
2797
+ // later than this.
2798
+ detail::mem_mgr::instance();
2799
+ dev_mgr::instance();
2800
+ }
2801
+
2802
+ /// Constructor with range
2803
+ template <class... Args>
2804
+ device_memory(Args... Arguments)
2805
+ : device_memory(sycl::range<Dimension>(Arguments...)) {}
2806
+
2807
+ ~device_memory() {
2808
+ if (_device_ptr && !_reference)
2809
+ dpct::dpct_free(_device_ptr);
2810
+ if (_host_ptr)
2811
+ std::free(_host_ptr);
2812
+ }
2813
+
2814
+ /// Allocate memory with default queue, and init memory if has initial
2815
+ /// value.
2816
+ void init() { init(dpct::get_default_queue()); }
2817
+ /// Allocate memory with specified queue, and init memory if has initial
2818
+ /// value.
2819
+ void init(sycl::queue &q) {
2820
+ if (_device_ptr)
2821
+ return;
2822
+ if (!_size)
2823
+ return;
2824
+ allocate_device(q);
2825
+ if (_host_ptr)
2826
+ detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size,
2827
+ host_to_device);
2828
+ }
2829
+
2830
+ /// The variable is assigned to a device pointer.
2831
+ void assign(value_t *src, size_t size) {
2832
+ this->~device_memory();
2833
+ new (this) device_memory(src, size);
2834
+ }
2835
+
2836
+ /// Get memory pointer of the memory object, which is virtual pointer when
2837
+ /// usm is not used, and device pointer when usm is used.
2838
+ value_t *get_ptr() { return get_ptr(get_default_queue()); }
2839
+ /// Get memory pointer of the memory object, which is virtual pointer when
2840
+ /// usm is not used, and device pointer when usm is used.
2841
+ value_t *get_ptr(sycl::queue &q) {
2842
+ init(q);
2843
+ return _device_ptr;
2844
+ }
2845
+
2846
+ /// Get the device memory object size in bytes.
2847
+ size_t get_size() { return _size; }
2848
+
2849
+ template <size_t D = Dimension>
2850
+ typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
2851
+ init();
2852
+ return _device_ptr[index];
2853
+ }
2854
+
2855
+ /// Get dpct::accessor with dimension info for the device memory object
2856
+ /// when usm is used and dimension is greater than 1.
2857
+ template <size_t D = Dimension>
2858
+ typename std::enable_if<D != 1, dpct_accessor_t>::type
2859
+ get_access([[maybe_unused]] sycl::handler &cgh) {
2860
+ return dpct_accessor_t((T *)_device_ptr, _range);
2861
+ }
2862
+
2863
+ private:
2864
+ device_memory(value_t *memory_ptr, size_t size)
2865
+ : _size(size), _range(size / sizeof(T)), _reference(true),
2866
+ _device_ptr(memory_ptr) {}
2867
+
2868
+ void allocate_device(sycl::queue &q) {
2869
+ #ifndef DPCT_USM_LEVEL_NONE
2870
+ if (Memory == shared) {
2871
+ _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
2872
+ q.get_context());
2873
+ return;
2874
+ }
2875
+ #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
2876
+ if (Memory == constant) {
2877
+ _device_ptr = (value_t *)sycl::malloc_device(
2878
+ _size, q.get_device(), q.get_context(),
2879
+ sycl::ext::oneapi::property::usm::device_read_only());
2880
+ return;
2881
+ }
2882
+ #endif
2883
+ #endif
2884
+ _device_ptr = (value_t *)detail::dpct_malloc(_size, q);
2885
+ }
2886
+
2887
+ size_t _size;
2888
+ sycl::range<Dimension> _range;
2889
+ bool _reference;
2890
+ value_t *_host_ptr;
2891
+ value_t *_device_ptr;
2892
+ };
2893
+ template <class T, memory_region Memory>
2894
+ class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
2895
+ public:
2896
+ using base = device_memory<T, Memory, 1>;
2897
+ using value_t = typename base::value_t;
2898
+ using accessor_t =
2899
+ typename detail::memory_traits<Memory, T>::template accessor_t<0>;
2900
+
2901
+ /// Constructor with initial value.
2902
+ device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
2903
+
2904
+ /// Default constructor
2905
+ device_memory() : base(1) {}
2906
+ };
2907
+ } // namespace detail
2908
+
2909
// Convenience aliases that fix the memory region of detail::device_memory
// to global, constant and shared respectively.
template <class T, size_t Dimension>
using global_memory = detail::device_memory<T, global, Dimension>;
template <class T, size_t Dimension>
using constant_memory = detail::device_memory<T, constant, Dimension>;
template <class T, size_t Dimension>
using shared_memory = detail::device_memory<T, shared, Dimension>;
2915
+
2916
+
2917
+ template <typename T,
2918
+ sycl::access::address_space addressSpace =
2919
+ sycl::access::address_space::global_space,
2920
+ sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
2921
+ sycl::memory_scope memoryScope = sycl::memory_scope::device>
2922
+ inline T atomic_fetch_add(T *addr, T operand) {
2923
+ auto atm =
2924
+ sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
2925
+ return atm.fetch_add(operand);
2926
+ }
2927
+
2928
+ template <sycl::access::address_space addressSpace =
2929
+ sycl::access::address_space::global_space,
2930
+ sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
2931
+ sycl::memory_scope memoryScope = sycl::memory_scope::device,
2932
+ typename T1, typename T2>
2933
+ inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
2934
+ auto atm =
2935
+ sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
2936
+ return atm.fetch_add(operand);
2937
+ }
2938
+
2939
+ template <typename T, sycl::access::address_space addressSpace =
2940
+ sycl::access::address_space::global_space>
2941
+ inline T atomic_fetch_add(T *addr, T operand,
2942
+ sycl::memory_order memoryOrder) {
2943
+ switch (memoryOrder) {
2944
+ case sycl::memory_order::relaxed:
2945
+ return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
2946
+ sycl::memory_scope::device>(addr, operand);
2947
+ case sycl::memory_order::acq_rel:
2948
+ return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
2949
+ sycl::memory_scope::device>(addr, operand);
2950
+ case sycl::memory_order::seq_cst:
2951
+ return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
2952
+ sycl::memory_scope::device>(addr, operand);
2953
+ default:
2954
+ assert(false && "Invalid memory_order for atomics. Valid memory_order for "
2955
+ "atomics are: sycl::memory_order::relaxed, "
2956
+ "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
2957
+ }
2958
+ }
2959
+
2960
+ template <sycl::access::address_space addressSpace =
2961
+ sycl::access::address_space::global_space,
2962
+ typename T1, typename T2>
2963
+ inline T1 atomic_fetch_add(T1 *addr, T2 operand,
2964
+ sycl::memory_order memoryOrder) {
2965
+ atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
2966
+ }
2967
+
2968
/// Builds a 32-bit value from four bytes picked out of the 64-bit quantity
/// {b:a} (b is the high word, a the low word).
///
/// Output byte i is chosen by bits [4*i, 4*i+2] of \p s, which index one of
/// the eight source bytes; the top bit of each selector nibble is ignored.
/// \param [in] a Low 32 bits of the source.
/// \param [in] b High 32 bits of the source.
/// \param [in] s Selector; one nibble per output byte.
/// \returns The permuted 32-bit value.
inline unsigned int byte_level_permute(unsigned int a, unsigned int b, unsigned int s) {
    const std::uint64_t packed = (static_cast<std::uint64_t>(b) << 32) | a;

    unsigned int result = 0;
    for (unsigned int out_byte = 0; out_byte < 4; ++out_byte) {
        const unsigned int src_byte = (s >> (4 * out_byte)) & 0x7;
        const unsigned int picked   = static_cast<unsigned int>((packed >> (src_byte * 8)) & 0xff);
        result |= picked << (8 * out_byte);
    }
    return result;
}
2980
+
2981
+ inline uint32_t byte_level_permute_custom(
2982
+ uint32_t low32, uint32_t high32, uint32_t sel, int mode = 0) {
2983
+ constexpr uint16_t lookup[6][4] = {
2984
+ {0x3210, 0x4321, 0x5432, 0x6543}, // Forward 4-byte extract
2985
+ {0x5670, 0x6701, 0x7012, 0x0123}, // Backward 4-byte extract
2986
+ {0x0000, 0x1111, 0x2222, 0x3333}, // Replicate 8-bit values
2987
+ {0x3210, 0x3211, 0x3222, 0x3333}, // Edge clamp left
2988
+ {0x0000, 0x1110, 0x2210, 0x3210}, // Edge clamp right
2989
+ {0x1010, 0x3232, 0x1010, 0x3232} // Replicate 16-bit values
2990
+ };
2991
+
2992
+ if (mode >= 1 && mode <= 6) {
2993
+ return byte_level_permute(low32, high32, lookup[mode - 1][sel & 0x3]);
2994
+ } else if (!mode) {
2995
+ return byte_level_permute(low32, high32, sel);
2996
+ }
2997
+ return 0;
2998
+ }
2999
+
3000
+ template <int n_nondefault_params, int n_default_params, typename T>
3001
+ class args_selector;
3002
+
3003
+ /// args_selector is a helper class for extracting arguments from an
3004
+ /// array of pointers to arguments or buffer of arguments to pass to a
3005
+ /// kernel function.
3006
+ ///
3007
+ /// \param R(Ts...) The type of the kernel
3008
+ /// \param n_nondefault_params The number of nondefault parameters of the
3009
+ /// kernel (excluding parameters that like sycl::nd_item, etc.) \param
3010
+ /// n_default_params The number of default parameters of the kernel
3011
+ ///
3012
+ /// Example usage:
3013
+ /// With the following kernel:
3014
+ /// void foo(sycl::float2 *x, int n, sycl::nd_item<3> item_ct1, float
3015
+ /// f=.1) {}
3016
+ /// and with the declaration:
3017
+ /// args_selector<2, 1, decltype(foo)> selector(kernelParams, extra);
3018
+ /// we have:
3019
+ /// selector.get<0>() returns a reference to sycl::float*,
3020
+ /// selector.get<1>() returns a reference to int,
3021
+ /// selector.get<2>() returns a reference to float
3022
+ template <int n_nondefault_params, int n_default_params, typename R,
3023
+ typename... Ts>
3024
+ class args_selector<n_nondefault_params, n_default_params, R(Ts...)> {
3025
+ private:
3026
+ void **kernel_params;
3027
+ char *args_buffer;
3028
+
3029
+ template <int i> static constexpr int account_for_default_params() {
3030
+ constexpr int n_total_params = sizeof...(Ts);
3031
+ if constexpr (i >= n_nondefault_params) {
3032
+ return n_total_params - n_default_params +
3033
+ (i - n_nondefault_params);
3034
+ } else {
3035
+ return i;
3036
+ }
3037
+ }
3038
+
3039
+ public:
3040
+ /// Get the type of the ith argument of R(Ts...)
3041
+ /// \param [in] i Index of parameter to get
3042
+ /// \returns Type of ith parameter
3043
+ template <int i>
3044
+ using arg_type = std::tuple_element_t<account_for_default_params<i>(),
3045
+ std::tuple<Ts...>>;
3046
+ static constexpr int params_num = sizeof...(Ts);
3047
+
3048
+ private:
3049
+ template <int i> static constexpr int get_offset() {
3050
+ if constexpr (i == 0) {
3051
+ // we can assume args_buffer is properly aligned to the
3052
+ // first argument
3053
+ return 0;
3054
+ } else {
3055
+ constexpr int prev_off = get_offset<i - 1>();
3056
+ constexpr int prev_past_end =
3057
+ prev_off + sizeof(arg_type<i - 1>);
3058
+ using T = arg_type<i>;
3059
+ // is the past-the-end of the i-1st element properly aligned
3060
+ // with the ith element's alignment?
3061
+ if constexpr (prev_past_end % alignof(T) == 0) {
3062
+ return prev_past_end;
3063
+ }
3064
+ // otherwise bump prev_past_end to match alignment
3065
+ else {
3066
+ return prev_past_end +
3067
+ (alignof(T) - (prev_past_end % alignof(T)));
3068
+ }
3069
+ }
3070
+ }
3071
+
3072
+ static char *get_args_buffer(void **extra) {
3073
+ if (!extra)
3074
+ return nullptr;
3075
+ for (; (std::size_t)*extra != 0; ++extra) {
3076
+ if ((std::size_t)*extra == 1) {
3077
+ return static_cast<char *>(*(extra + 1));
3078
+ }
3079
+ }
3080
+ return nullptr;
3081
+ }
3082
+
3083
+ public:
3084
+ /// If kernel_params is nonnull, then args_selector will
3085
+ /// extract arguments from kernel_params. Otherwise, it
3086
+ /// will extract them from the argument buffer found in extra.
3087
+ /// \param [in] kernel_params Array of pointers to arguments,
3088
+ /// or a null pointer.
3089
+ /// \param [in] extra Array containing a pointer to the argument buffer; may be null.
3090
+ args_selector(void **kernel_params, void **extra)
3091
+ : kernel_params(kernel_params),
3092
+ args_buffer(get_args_buffer(extra)) {} // args_buffer stays nullptr when extra holds no buffer entry
3093
+
3094
+ /// Get a reference to the ith argument, read from kernel_params when that
3095
+ /// is nonnull and from the packed argument buffer otherwise.
3096
+ /// \tparam i Index of argument to get
3097
+ /// \returns Reference to the ith argument
3098
+ template <int i> arg_type<i> &get() {
3099
+ if (kernel_params) {
3100
+ return *static_cast<arg_type<i> *>(kernel_params[i]); // kernel_params[i] points at the ith argument
3101
+ } else {
3102
+ return *reinterpret_cast<arg_type<i> *>(args_buffer + // NOTE(review): args_buffer is not checked for null here
3103
+ get_offset<i>());
3104
+ }
3105
+ }
3106
+ }; // COPY from DPCT head file
3107
+ // /opt/intel/oneapi/dpcpp-ct/latest/include/dpct/util.hpp
3108
+
3109
+ /// Utility class for launching SYCL kernels through a kernel
3110
+ /// function wrapper.
3111
+ /// For example:
3112
+ /// A SYCL kernel function:
3113
+ /// void kernel_func(int *ptr, sycl::nd_item<3> item);
3114
+ /// Kernel function wrapper:
3115
+ /// void kernel_func_wrapper(int *ptr) {
3116
+ /// sycl::queue queue = *dpct::kernel_launcher::_que;
3117
+ /// unsigned int localMemSize = dpct::kernel_launcher::_local_mem_size;
3118
+ /// sycl::nd_range<3> nr = dpct::kernel_launcher::_nr;
3119
+ /// queue.parallel_for(
3120
+ /// nr,
3121
+ /// [=](sycl::nd_item<3> item_ct1) {
3122
+ /// kernel_func(ptr, item_ct1);
3123
+ /// });
3124
+ /// }
3125
+ /// Then launch the kernel through the wrapper like:
3126
+ /// typedef void(*fpt)(int *);
3127
+ /// fpt fp = kernel_func_wrapper;
3128
+ /// dpct::kernel_launcher::launch(fp, dpct::dim3(1), dpct::dim3(1), 0, 0,
3129
+ /// device_ptr);
3130
+ /// If the original function type is erased, the wrapper must be registered first:
3131
+ /// void *fp = (void *)wrapper_register(&kernel_func_wrapper).get();
3132
+ /// dpct::kernel_launcher::launch(fp, dpct::dim3(1), dpct::dim3(1), args,
3133
+ /// 0, 0);
3134
+ class kernel_launcher {
3135
+ template <typename FuncT, typename ArgSelector, std::size_t... Index>
3136
+ static void launch_helper(FuncT &&func, ArgSelector &selector,
3137
+ std::index_sequence<Index...>) {
3138
+ func(selector.template get<Index>()...); // calls func with selector.get<0>(), get<1>(), ... as its arguments
3139
+ }
3140
+ static void set_execution_config(dim3 group_range, dim3 local_range,
3141
+ unsigned int local_mem_size,
3142
+ queue_ptr que) {
3143
+ if (que) {
3144
+ _que = que;
3145
+ } else {
3146
+ _que = &get_default_queue(); // no queue supplied: use the default queue
3147
+ }
3148
+ _nr = sycl::nd_range<3>(
3149
+ static_cast<sycl::range<3>>(group_range * local_range), // global range
3150
+ static_cast<sycl::range<3>>(local_range)); // local range
3151
+ _local_mem_size = local_mem_size;
3152
+
3153
+
3154
+ };
3155
+ static inline std::mutex kernel_function_ptr_map_mutex; // locked by register_kernel_ptr() and the type-erased launch()
3156
+
3157
+ public:
3158
+ /// Per-thread execution configuration, written by set_execution_config() on every launch().
3159
+ static inline thread_local sycl::queue *_que = nullptr;
3160
+ static inline thread_local sycl::nd_range<3> _nr = sycl::nd_range<3>();
3161
+ static inline thread_local unsigned int _local_mem_size = 0;
3162
+ /// Map for retrieving the launchable functor registered for a raw pointer.
3163
+ static inline std::map<
3164
+ const void *,
3165
+ std::function<void(dim3, dim3, void **, unsigned int, queue_ptr)>>
3166
+ kernel_function_ptr_map = {};
3167
+
3168
+ /// Registers a kernel function pointer with a corresponding launchable
3169
+ /// functor, replacing any functor already stored for that pointer.
3170
+ /// \param [in] func Pointer to the kernel function.
3171
+ /// \param [in] launcher Functor to handle kernel invocation.
3172
+ static void register_kernel_ptr(
3173
+ const void *func,
3174
+ std::function<void(dim3, dim3, void **, unsigned int, queue_ptr)>
3175
+ launcher) {
3176
+ std::lock_guard<std::mutex> lock(kernel_function_ptr_map_mutex);
3177
+ kernel_function_ptr_map[func] = std::move(launcher);
3178
+ }
3179
+ /// Launches a kernel function with arguments provided directly through
3180
+ /// kernel function wrapper.
3181
+ /// \tparam FuncT Type of the kernel function wrapper.
3182
+ /// \tparam ArgsT Types of kernel arguments.
3183
+ /// \param [in] func Pointer to the kernel function wrapper.
3184
+ /// \param [in] group_range SYCL group range.
3185
+ /// \param [in] local_range SYCL local range.
3186
+ /// \param [in] local_mem_size The size of local memory required by the kernel function.
3187
+ /// \param [in] que SYCL queue used to execute kernel; the default queue is used when it is null.
3188
+ /// \param [in] args Kernel arguments.
3189
+ template <typename FuncT, typename... ArgsT>
3190
+ static std::enable_if_t<std::is_invocable_v<FuncT *, ArgsT...>, void>
3191
+ launch(FuncT *func, dim3 group_range, dim3 local_range,
3192
+ unsigned int local_mem_size, queue_ptr que, ArgsT... args) {
3193
+ set_execution_config(group_range, local_range, local_mem_size, que); // must run before func so the wrapper sees this launch's configuration
3194
+ func(args...);
3195
+ }
3196
+ /// Launches a kernel function through a registered kernel function wrapper.
3197
+ /// \param [in] func Pointer to the registered kernel function wrapper.
3198
+ /// \param [in] group_range SYCL group range.
3199
+ /// \param [in] local_range SYCL local range.
3200
+ /// \param [in] args Array of pointers to kernel arguments.
3201
+ /// \param [in] local_mem_size The size of local memory required by the kernel function.
3202
+ /// \param [in] que SYCL queue used to execute kernel. \throws std::runtime_error if func has no registered wrapper.
3203
+ static void launch(const void *func, dim3 group_range, dim3 local_range,
3204
+ void **args, unsigned int local_mem_size,
3205
+ queue_ptr que) {
3206
+ std::lock_guard<std::mutex> lock(kernel_function_ptr_map_mutex); // NOTE(review): the lock stays held while the functor below runs
3207
+ auto Iter = kernel_function_ptr_map.find(func);
3208
+ if (Iter == kernel_function_ptr_map.end()) {
3209
+ throw std::runtime_error("dpct::launch() : no registered "
3210
+ "kernel function wrapper found.");
3211
+ }
3212
+ (Iter->second)(group_range, local_range, args, local_mem_size, que);
3213
+ }
3214
+ /// Launches a kernel function with packed arguments through kernel
3215
+ /// function wrapper.
3216
+ /// \tparam FuncT Type of the kernel function wrapper.
3217
+ /// \param [in] func Pointer to the kernel function wrapper.
3218
+ /// \param [in] group_range SYCL group range.
3219
+ /// \param [in] local_range SYCL local range.
3220
+ /// \param [in] args Array of pointers to kernel arguments.
3221
+ /// \param [in] local_mem_size The size of local memory required by the kernel function.
3222
+ /// \param [in] que SYCL queue used to execute kernel; the default queue is used when it is null.
3223
+ template <typename FuncT>
3224
+ static std::enable_if_t<std::is_function_v<FuncT>, void>
3225
+ launch(FuncT *func, dim3 group_range, dim3 local_range, void **args,
3226
+ unsigned int local_mem_size, queue_ptr que) {
3227
+ constexpr size_t p_num = args_selector<0, 0, FuncT>::params_num; // number of parameters in FuncT's signature
3228
+ set_execution_config(group_range, local_range, local_mem_size, que);
3229
+ args_selector<p_num, p_num, FuncT> selector(args, nullptr); // read every argument from the args pointer array (no extra buffer)
3230
+ launch_helper(func, selector, std::make_index_sequence<p_num>{});
3231
+ }
3232
+ }; // COPY from DPCT header file
3233
+ // /opt/intel/oneapi/dpcpp-ct/latest/include/dpct/kernel.hpp
3234
+
3235
+ // Copied from /opt/intel/oneapi/dpcpp-ct/latest/include/dpct/util.hpp. Returns the x held by another work item of the caller's logical sub-group.
3236
+ template <typename T>
3237
+ T select_from_sub_group(
3238
+ sycl::sub_group g,
3239
+ T x,
3240
+ int remote_local_id,
3241
+ int logical_sub_group_size = 32) {
3242
+ unsigned int start_index = g.get_local_linear_id() / // caller's id rounded down to a multiple of
3243
+ logical_sub_group_size * // logical_sub_group_size, i.e. the first id
3244
+ logical_sub_group_size; // of the caller's logical sub-group
3245
+ return sycl::select_from_group(
3246
+ g, x, start_index + remote_local_id % logical_sub_group_size); // remote_local_id wraps within the logical sub-group
3247
+ }
3248
+
3249
+ // Copied from /opt/intel/oneapi/dpcpp-ct/latest/include/dpct/math.hpp. Loads one value into *m from an address supplied by another lane; mat offsets the supplying lanes by mat * 8.
3250
+ template <typename T>
3251
+ void ldmatrix(uintptr_t addr, T* m, bool trans = false, unsigned mat = 0) {
3252
+ auto sg = sycl::ext::oneapi::this_work_item::get_sub_group();
3253
+ int lane = sg.get_local_linear_id();
3254
+
3255
+ int lane_group8_row = lane / 8; // which group of 8 lanes this lane is in
3256
+ int lane_group8_col = lane % 8; // position inside that group of 8
3257
+
3258
+ if (!trans) {
3259
+ // Source lane: two per group of 8 lanes, the second one serving positions 4..7
3260
+ int src_lane = 2 * lane_group8_row;
3261
+ if (lane_group8_col >= 4)
3262
+ src_lane += 1;
3263
+
3264
+ // Fetch the address held by the source lane (shifted by mat * 8)
3265
+ auto recv_addr_uintp =
3266
+ dpct::select_from_sub_group(sg, addr, mat * 8 + src_lane);
3267
+
3268
+ // Cast the received address from uintptr_t to the type of 'm'
3269
+ auto recv_addr = reinterpret_cast<T*>(recv_addr_uintp);
3270
+
3271
+ // Non-transposed load: element (lane_group8_col % 4) of the received row
3272
+ *m = recv_addr[lane_group8_col % 4];
3273
+ } else {
3274
+ // Source lanes: the pair (lane % 4) * 2 and (lane % 4) * 2 + 1
3275
+ int src_lane = (lane % 4) * 2;
3276
+
3277
+ // Fetch the addresses held by both source lanes (shifted by mat * 8)
3278
+ auto recv_addr_uintp_1 =
3279
+ dpct::select_from_sub_group(sg, addr, mat * 8 + src_lane);
3280
+ auto recv_addr_uintp_2 =
3281
+ dpct::select_from_sub_group(sg, addr, mat * 8 + src_lane + 1);
3282
+
3283
+ // Cast the received addresses from uintptr_t to 'half *'. NOTE(review): this path reads sycl::half regardless of T - confirm T is always a 32-bit type here
3284
+ auto recv_addr_1 = reinterpret_cast<sycl::half*>(recv_addr_uintp_1);
3285
+ auto recv_addr_2 = reinterpret_cast<sycl::half*>(recv_addr_uintp_2);
3286
+
3287
+ // Transposed load: element (lane / 4) from each of the two received rows
3288
+ int index = lane / 4;
3289
+ sycl::half val0 = recv_addr_1[index];
3290
+ sycl::half val1 = recv_addr_2[index];
3291
+
3292
+ // Pack the two 16-bit values into one half2 and store its bits through m
3293
+ sycl::half2 val = sycl::half2(val0, val1);
3294
+ *m = *reinterpret_cast<T*>(&val);
3295
+ }
3296
+ }
3297
+
3298
+ template <typename T>
3299
+ void ldmatrix(uintptr_t addr, T* m1, T* m2, bool trans = false) { // two-matrix form: forwards to the single-matrix overload with mat = 0 and 1
3300
+ // Load 1st matrix (mat = 0) into m1
3301
+ ldmatrix(addr, m1, trans, 0);
3302
+ // Load 2nd matrix (mat = 1) into m2
3303
+ ldmatrix(addr, m2, trans, 1);
3304
+ }
3305
+
3306
+ template <typename T>
3307
+ void ldmatrix(
3308
+ uintptr_t addr, T* m1, T* m2, T* m3, T* m4, bool trans = false) { // four-matrix form: forwards to the single-matrix overload with mat = 0..3
3309
+ // Load 1st matrix (mat = 0) into m1
3310
+ ldmatrix(addr, m1, trans, 0);
3311
+ // Load 2nd matrix (mat = 1) into m2
3312
+ ldmatrix(addr, m2, trans, 1);
3313
+ // Load 3rd matrix (mat = 2) into m3
3314
+ ldmatrix(addr, m3, trans, 2);
3315
+ // Load 4th matrix (mat = 3) into m4
3316
+ ldmatrix(addr, m4, trans, 3);
3317
+ }
3318
+
3319
+ // /opt/intel/oneapi/dpcpp-ct/latest/include/dpct/math.hpp
3320
+
3321
+ /// A helper struct that defines the pack type for the input matrix
3322
+ /// fragments of the mma() function, based on the element type of those
3323
+ /// fragments.
3324
+ /// This primary template uses a 32-bit pack (uint32_t) for every T.
3325
+ /// NOTE(review): no per-type specializations (f16, bf16, s8) are visible in
3326
+ /// this section - confirm whether any exist. \tparam T The type of the input matrix fragments
3327
+ template <typename T>
3328
+ struct MMAType {
3329
+ using PackType = uint32_t;
3330
+ };
3331
+
3332
+ /// Each work item of a sub-group (limited to size 32) calling this function
3333
+ /// calculates a subset fragment for the output matrix D using MAD operation
3334
+ /// on A, B & C matrix fragments (D = A * B + C). Currently supported shapes &
3335
+ /// types:
3336
+ /// - m8n8k4 (f32.f16.f16.f32)
3337
+ /// - m8n8k16 (s32.s8.s8.s32)
3338
+ /// - m16n8k8 (f32.f16.f16.f32 & f32.bf16.bf16.f32)
3339
+ /// - m16n8k16 (f32.f16.f16.f32 & s32.s8.s8.s32)
3340
+ /// - m16n8k32 (s32.s8.s8.s32)
3341
+ /// Here, m, n & k define the shapes of the A, B & C matrices
3342
+ /// (A = [m x k], B = [k x n], C = [m x n]).
3343
+ /// \tparam M The rows of A, C & D matrices
3344
+ /// \tparam N The columns of B, C & D matrices
3345
+ /// \tparam K The columns & rows of A & B matrices respectively
3346
+ /// \tparam ABType The type of the input matrix (A & B) fragment
3347
+ /// \tparam CDType The type of the output matrix (C & D) fragment
3348
+ /// \param [out] d_mat_frag Array of pointers to the elements of the output
3349
+ /// matrix D fragment; each pointee receives its share of A * B + C
3350
+ /// \param [in] a_mat_frag The fragment of the input matrix A to be multiplied with the B matrix fragment
3351
+ /// \param [in] b_mat_frag The fragment of the input matrix B to be multiplied with the A matrix fragment
3352
+ /// \param [in] c_mat_frag The fragment of the input matrix C to be added to the
3353
+ /// result of A * B fragments
3354
+ /// \note A supported shape whose ABType/CDType fails that branch's type check compiles to an empty body.
3355
+ template <int M, int N, int K, typename ABType, typename CDType>
3356
+ void mma(
3357
+ volatile void** d_mat_frag,
3358
+ void* a_mat_frag,
3359
+ void* b_mat_frag,
3360
+ void* c_mat_frag) {
3361
+ auto d = reinterpret_cast<volatile CDType**>(d_mat_frag);
3362
+ auto a =
3363
+ reinterpret_cast<typename MMAType<ABType>::PackType*>(a_mat_frag);
3364
+ auto b =
3365
+ reinterpret_cast<typename MMAType<ABType>::PackType*>(b_mat_frag);
3366
+ auto c = reinterpret_cast<CDType*>(c_mat_frag);
3367
+
3368
+ auto sg = sycl::ext::oneapi::this_work_item::get_sub_group();
3369
+ int lane = sg.get_local_linear_id();
3370
+
3371
+ static_assert(
3372
+ (M == 8 && N == 8 && K == 4) || (M == 8 && N == 8 && K == 16) ||
3373
+ (M == 16 && N == 8 && K == 8) || (M == 16 && N == 8 && K == 16) ||
3374
+ (M == 16 && N == 8 && K == 32),
3375
+ "Unsupported MMA shape!");
3376
+
3377
+ short row_load_offset = 4 * (lane >> 2); // base lane for gathering A: first lane of this lane's group of 4
3378
+ short col_load_offset = 8 * (lane % 4); // base lane for gathering B (overridden in the m8n8k4 branch)
3379
+
3380
+ if constexpr (M == 8 && N == 8 && K == 4) {
3381
+ if constexpr (std::is_floating_point_v<CDType>) {
3382
+ col_load_offset = row_load_offset % 16;
3383
+
3384
+ // Init D matrix with fragments of C matrix
3385
+ *d[0] = c[0];
3386
+ *d[1] = c[1];
3387
+ *d[2] = c[2];
3388
+ *d[3] = c[3];
3389
+ *d[4] = c[4];
3390
+ *d[5] = c[5];
3391
+ *d[6] = c[6];
3392
+ *d[7] = c[7];
3393
+
3394
+ // Calculate the row and col offset indices to iterate through the row
3395
+ // & col fragments of A & B matrices
3396
+ int r_ind = (lane % 2) ? 1 : 0;
3397
+ int c_ind = ((lane % 4) / 2) ? 2 : 0;
3398
+
3399
+ // Each sub-group is responsible for computing a fragment size of 8*8
3400
+ // elements of matrix D for each of 4 MMA computations.
3401
+ // Each work item computes 8 elements of matrix D by gathering
3402
+ // their corresponding col & row matrix fragments of length k (4)
3403
+ // from A & B matrices respectively using below mapping logic:
3404
+ // row0 = (i % 4) if (lane < 16) else (i % 4) + 4
3405
+ // col0 = (lane % 4)
3406
+ // As each row & col fragment of A & B matrices is distributed across
3407
+ // 4 work items, each iteration of below loop loads a partial fragment
3408
+ // of matrix A (row) and matrix B (col) using the row & col offsets.
3409
+ typename MMAType<ABType>::PackType recv_a[2], recv_b[2];
3410
+
3411
+ for (int i = 0; i < 4; i++) {
3412
+ // Load partial fragment from col0 of matrix A ({a0, a1})
3413
+ recv_a[0] =
3414
+ dpct::select_from_sub_group(sg, a[0], row_load_offset + i);
3415
+ // Load partial fragment from col0 of matrix A ({a2, a3})
3416
+ recv_a[1] =
3417
+ dpct::select_from_sub_group(sg, a[1], row_load_offset + i);
3418
+
3419
+ // Load partial fragment from row0 of matrix B ({b0, b1})
3420
+ recv_b[0] =
3421
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i);
3422
+ // Load partial fragment from row0 of matrix B ({b2, b3})
3423
+ recv_b[1] =
3424
+ dpct::select_from_sub_group(sg, b[1], col_load_offset + i);
3425
+
3426
+ auto ra = reinterpret_cast<ABType*>(recv_a);
3427
+ auto rb = reinterpret_cast<ABType*>(recv_b);
3428
+
3429
+ // Each work item calculates a partial product of A & B matrix
3430
+ // fragments and adds it to the corresponding D matrix fragment.
3431
+ // Even work items: d0 += col0{ a0 } * row0{ b0 }, d1 += col0{ a0 } * row0{ b1 },
3432
+ //                  d2 += col1{ a2 } * row0{ b0 }, d3 += col1{ a2 } * row0{ b1 }
3433
+ // Odd work items:  d0 += col0{ a1 } * row0{ b2 }, d1 += col0{ a1 } * row0{ b3 },
3434
+ //                  d2 += col1{ a3 } * row0{ b2 }, d3 += col1{ a3 } * row0{ b3 }
3435
+ // NOTE(review): in the code c_ind depends on (lane % 4) / 2, not on lane parity - confirm the b indices listed above
3436
+ *d[0] +=
3437
+ static_cast<float>(ra[r_ind]) * static_cast<float>(rb[c_ind]);
3438
+ *d[1] += static_cast<float>(ra[r_ind]) *
3439
+ static_cast<float>(rb[c_ind + 1]);
3440
+ *d[2] += static_cast<float>(ra[r_ind + 2]) *
3441
+ static_cast<float>(rb[c_ind]);
3442
+ *d[3] += static_cast<float>(ra[r_ind + 2]) *
3443
+ static_cast<float>(rb[c_ind + 1]);
3444
+
3445
+ // Load partial fragment from row1 of matrix B ({b0, b1})
3446
+ recv_b[0] =
3447
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i + 16);
3448
+ // Load partial fragment from row1 of matrix B ({b2, b3})
3449
+ recv_b[1] =
3450
+ dpct::select_from_sub_group(sg, b[1], col_load_offset + i + 16);
3451
+
3452
+ // (for even work item indices)
3453
+ // d4 += col0{ a0 } * row1{ b0 }
3454
+ // d5 += col0{ a0 } * row1{ b1 }
3455
+ // d6 += col1{ a2 } * row1{ b0 }
3456
+ // d7 += col1{ a2 } * row1{ b1 }
3457
+ // (for odd work item indices)
3458
+ // d4 += col0{ a1 } * row1{ b2 }
3459
+ // d5 += col0{ a1 } * row1{ b3 }
3460
+ // d6 += col1{ a3 } * row1{ b2 }
3461
+ // d7 += col1{ a3 } * row1{ b3 }
3462
+ *d[4] +=
3463
+ static_cast<float>(ra[r_ind]) * static_cast<float>(rb[c_ind]);
3464
+ *d[5] += static_cast<float>(ra[r_ind]) *
3465
+ static_cast<float>(rb[c_ind + 1]);
3466
+ *d[6] += static_cast<float>(ra[r_ind + 2]) *
3467
+ static_cast<float>(rb[c_ind]);
3468
+ *d[7] += static_cast<float>(ra[r_ind + 2]) *
3469
+ static_cast<float>(rb[c_ind + 1]);
3470
+ }
3471
+ }
3472
+ } else if constexpr (M == 8 && N == 8 && K == 16) {
3473
+ if constexpr (std::is_integral_v<ABType>) {
3474
+ // Init D matrix with fragments of C matrix
3475
+ *d[0] = c[0];
3476
+ *d[1] = c[1];
3477
+
3478
+ // Each sub-group is responsible for computing a fragment size of 8*8
3479
+ // elements of matrix D.
3480
+ // Each work item computes 2 elements of matrix D by gathering
3481
+ // their corresponding row & col matrix fragments of length k (16)
3482
+ // from A & B matrices respectively using below mapping logic:
3483
+ // row0 = ((lane % 4) * 4) + i
3484
+ // col0 = (lane >> 2)
3485
+ // As each row & col fragment of A & B matrices is distributed across
3486
+ // 4 work items, each iteration of below loop loads a partial fragment
3487
+ // of matrix A (row) and matrix B (col) using the row & col offsets.
3488
+ for (int i = 0; i < 4; i++) {
3489
+ typename MMAType<ABType>::PackType recv_a, recv_b[2];
3490
+
3491
+ // Load partial fragment from row0 of matrix A ({a0, a1, a2, a3})
3492
+ recv_a = dpct::select_from_sub_group(sg, a[0], row_load_offset + i);
3493
+ // Load partial fragment from col0 of matrix B ({b0, b1, b2, b3})
3494
+ recv_b[0] =
3495
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i);
3496
+ // Load partial fragment from col1 of matrix B ({b0, b1, b2, b3})
3497
+ recv_b[1] =
3498
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i + 4);
3499
+
3500
+ auto a = reinterpret_cast<ABType*>(&recv_a); // shadows the outer a; the loads above still read the outer one
3501
+ auto b = reinterpret_cast<ABType*>(recv_b); // shadows the outer b likewise
3502
+
3503
+ // Each work item calculates a partial product of A & B matrix
3504
+ // fragments and adds it to the corresponding D matrix fragment:
3505
+ //   d0 += row0{ a0, a1, a2, a3 } * col0{ b0, b1, b2, b3 }
3506
+ //   d1 += row0{ a0, a1, a2, a3 } * col1{ b0, b1, b2, b3 }
3507
+ // Only d0 and d1 are produced for this shape; b[j + 4] reads the
3508
+ // second packed word (recv_b[1]).
3509
+ for (int j = 0; j < 4; j++) {
3510
+ *d[0] += a[j] * b[j];
3511
+ *d[1] += a[j] * b[j + 4];
3512
+ }
3513
+ }
3514
+ }
3515
+ } else if constexpr (M == 16 && N == 8 && K == 8) {
3516
+ if constexpr (std::is_floating_point_v<CDType>) {
3517
+ // Init D matrix fragment with C matrix fragment
3518
+ *d[0] = c[0];
3519
+ *d[1] = c[1];
3520
+ *d[2] = c[2];
3521
+ *d[3] = c[3];
3522
+
3523
+ // Each sub-group is responsible for computing a fragment size of 16*8
3524
+ // elements of matrix D.
3525
+ // Each work item computes 4 elements of matrix D by gathering
3526
+ // their corresponding row & col matrix fragments of length k (8)
3527
+ // from A & B matrices respectively using below mapping logic:
3528
+ // row0 = (lane >> 2) & row1 = (lane >> 2) + 8
3529
+ // col0 = (lane % 4) * 2 + (i & 0x1)
3530
+ // As each row & col fragment of A & B matrices is distributed across
3531
+ // 4 work items, each iteration of below loop loads a partial fragment
3532
+ // of matrix A (row) and matrix B (col) using the row & col offsets.
3533
+ for (int i = 0; i < 4; i++) {
3534
+ typename MMAType<ABType>::PackType recv_a[2], recv_b[2];
3535
+
3536
+ // Load partial fragment from row0 of matrix A ({a0, a1})
3537
+ recv_a[0] =
3538
+ dpct::select_from_sub_group(sg, a[0], row_load_offset + i);
3539
+ // Load partial fragment from row1 of matrix A ({a2, a3})
3540
+ recv_a[1] =
3541
+ dpct::select_from_sub_group(sg, a[1], row_load_offset + i);
3542
+ // Load partial fragment from col0 of matrix B ({b0, b1})
3543
+ recv_b[0] =
3544
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i);
3545
+ // Load partial fragment from col1 of matrix B ({b0, b1})
3546
+ recv_b[1] =
3547
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i + 4);
3548
+
3549
+ auto ra = reinterpret_cast<ABType*>(recv_a);
3550
+ auto rb = reinterpret_cast<ABType*>(recv_b);
3551
+
3552
+ // Each work item calculates a partial product of A & B matrix
3553
+ // fragments and adds it to the corresponding D matrix fragment:
3554
+ //   d0 += row0{ a0, a1 } * col0{ b0, b1 }, d1 += row0{ a0, a1 } * col1{ b0, b1 }
3555
+ //   d2 += row1{ a2, a3 } * col0{ b0, b1 }, d3 += row1{ a2, a3 } * col1{ b0, b1 }
3556
+ // (j walks the two elements of each pair)
3557
+ for (int j = 0; j < 2; j++) {
3558
+ *d[0] += static_cast<float>(ra[j]) * static_cast<float>(rb[j]);
3559
+ *d[1] +=
3560
+ static_cast<float>(ra[j]) * static_cast<float>(rb[j + 2]);
3561
+ *d[2] +=
3562
+ static_cast<float>(ra[j + 2]) * static_cast<float>(rb[j]);
3563
+ *d[3] +=
3564
+ static_cast<float>(ra[j + 2]) * static_cast<float>(rb[j + 2]);
3565
+ }
3566
+ }
3567
+ }
3568
+ } else if constexpr (M == 16 && N == 8 && K == 16) {
3569
+ if constexpr (std::is_floating_point_v<CDType>) {
3570
+ // Init D matrix fragment with C matrix fragment
3571
+ *d[0] = c[0];
3572
+ *d[1] = c[1];
3573
+ *d[2] = c[2];
3574
+ *d[3] = c[3];
3575
+
3576
+ // Each sub-group is responsible for computing a fragment size of 16*8
3577
+ // elements of matrix D.
3578
+ // Each work item computes 4 elements of matrix D by gathering
3579
+ // their corresponding row & col matrix fragments of length k (16)
3580
+ // from A & B matrices respectively using below mapping logic:
3581
+ // row0 = (lane >> 2) & row1 = (lane >> 2) + 8
3582
+ // col0 = (lane % 4) * 2 & col1 = (lane % 4) * 2 + 1
3583
+ // As each row & col fragment of A & B matrices is distributed across
3584
+ // 4 work items, each iteration of below loop loads a partial fragment
3585
+ // of matrix A (row) and matrix B (col) using the row & col offsets.
3586
+ for (int i = 0; i < 4; i++) {
3587
+ typename MMAType<ABType>::PackType recv_a[4], recv_b[4];
3588
+
3589
+ // Load partial fragment from row0 of matrix A ({a0, a1})
3590
+ recv_a[0] =
3591
+ dpct::select_from_sub_group(sg, a[0], row_load_offset + i);
3592
+ // Load partial fragment from row0 of matrix A ({a2, a3})
3593
+ recv_a[1] =
3594
+ dpct::select_from_sub_group(sg, a[2], row_load_offset + i);
3595
+ // Load partial fragment from row1 of matrix A ({a0, a1})
3596
+ recv_a[2] =
3597
+ dpct::select_from_sub_group(sg, a[1], row_load_offset + i);
3598
+ // Load partial fragment from row1 of matrix A ({a2, a3})
3599
+ recv_a[3] =
3600
+ dpct::select_from_sub_group(sg, a[3], row_load_offset + i);
3601
+
3602
+ // Load partial fragment from col0 of matrix B ({b0, b1})
3603
+ recv_b[0] =
3604
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i);
3605
+ // Load partial fragment from col0 of matrix B ({b2, b3})
3606
+ recv_b[1] =
3607
+ dpct::select_from_sub_group(sg, b[1], col_load_offset + i);
3608
+ // Load partial fragment from col1 of matrix B ({b0, b1})
3609
+ recv_b[2] =
3610
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + 4 + i);
3611
+ // Load partial fragment from col1 of matrix B ({b2, b3})
3612
+ recv_b[3] =
3613
+ dpct::select_from_sub_group(sg, b[1], col_load_offset + 4 + i);
3614
+
3615
+ auto ra = reinterpret_cast<ABType*>(recv_a);
3616
+ auto rb = reinterpret_cast<ABType*>(recv_b);
3617
+
3618
+ // Each work item calculates a partial product of A & B matrix
3619
+ // fragments and adds it to the corresponding D matrix fragment:
3620
+ //   d0 += row0{ a0, a1, a2, a3 } * col0{ b0, b1, b2, b3 }
3621
+ //   d1 += row0{ a0, a1, a2, a3 } * col1{ b0, b1, b2, b3 }
3622
+ //   d2 += row1{ a0, a1, a2, a3 } * col0{ b0, b1, b2, b3 }
3623
+ //   d3 += row1{ a0, a1, a2, a3 } * col1{ b0, b1, b2, b3 }
3624
+ for (int j = 0; j < 4; j++) {
3625
+ *d[0] += static_cast<CDType>(ra[j]) * static_cast<CDType>(rb[j]);
3626
+ *d[1] +=
3627
+ static_cast<CDType>(ra[j]) * static_cast<CDType>(rb[j + 4]);
3628
+ *d[2] +=
3629
+ static_cast<CDType>(ra[j + 4]) * static_cast<CDType>(rb[j]);
3630
+ *d[3] += static_cast<CDType>(ra[j + 4]) *
3631
+ static_cast<CDType>(rb[j + 4]);
3632
+ }
3633
+ }
3634
+ } else if constexpr (std::is_integral_v<ABType>) {
3635
+ // Init D matrix with fragments of C matrix
3636
+ *d[0] = c[0];
3637
+ *d[1] = c[1];
3638
+ *d[2] = c[2];
3639
+ *d[3] = c[3];
3640
+
3641
+ // Each sub-group is responsible for computing a fragment size of 16*8
3642
+ // elements of matrix D.
3643
+ // Each work item computes 4 elements of matrix D by gathering
3644
+ // their corresponding row & col matrix fragments of length k (16)
3645
+ // from A & B matrices respectively using below mapping logic:
3646
+ // row0 = (lane >> 2) & row1 = (lane >> 2) + 8
3647
+ // col0 = (lane % 4) * 2 & col1 = (lane % 4) * 2 + 1
3648
+ // As each row & col fragment of A & B matrices is distributed across
3649
+ // 4 work items, each iteration of below loop loads a partial fragment
3650
+ // of matrix A (row) and matrix B (col) using the row & col offsets.
3651
+ for (int i = 0; i < 4; i++) {
3652
+ typename MMAType<ABType>::PackType recv_a[2], recv_b[2];
3653
+
3654
+ // Load partial fragment from row0 of matrix A ({a0, a1, a2, a3})
3655
+ recv_a[0] =
3656
+ dpct::select_from_sub_group(sg, a[0], row_load_offset + i);
3657
+ // Load partial fragment from row1 of matrix A ({a4, a5, a6, a7})
3658
+ recv_a[1] =
3659
+ dpct::select_from_sub_group(sg, a[1], row_load_offset + i);
3660
+ // Load partial fragment from col0 of matrix B ({b0, b1, b2, b3})
3661
+ recv_b[0] =
3662
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i);
3663
+ // Load partial fragment from col1 of matrix B ({b4, b5, b6, b7})
3664
+ recv_b[1] =
3665
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i + 4);
3666
+
3667
+ auto ra = reinterpret_cast<ABType*>(recv_a);
3668
+ auto rb = reinterpret_cast<ABType*>(recv_b);
3669
+
3670
+ // Each work item calculates a partial product of A & B matrix
3671
+ // fragments and adds it to the corresponding D matrix fragment:
3672
+ //   d0 += row0{ a0, a1, a2, a3 } * col0{ b0, b1, b2, b3 }
3673
+ //   d1 += row0{ a0, a1, a2, a3 } * col1{ b4, b5, b6, b7 }
3674
+ //   d2 += row1{ a4, a5, a6, a7 } * col0{ b0, b1, b2, b3 }
3675
+ //   d3 += row1{ a4, a5, a6, a7 } * col1{ b4, b5, b6, b7 }
3676
+ for (int i = 0; i < 4; i++) { // note: this i shadows the outer loop's i
3677
+ *d[0] += ra[i] * rb[i];
3678
+ *d[1] += ra[i] * rb[i + 4];
3679
+ *d[2] += ra[i + 4] * rb[i];
3680
+ *d[3] += ra[i + 4] * rb[i + 4];
3681
+ }
3682
+ }
3683
+ }
3684
+ } else if constexpr (M == 16 && N == 8 && K == 32) {
3685
+ if constexpr (std::is_integral_v<ABType>) {
3686
+ // Init D matrix with fragments of C matrix
3687
+ *d[0] = c[0];
3688
+ *d[1] = c[1];
3689
+ *d[2] = c[2];
3690
+ *d[3] = c[3];
3691
+
3692
+ // Each sub-group is responsible for computing a fragment size of 16*8
3693
+ // elements of matrix D.
3694
+ // Each work item computes 4 elements of matrix D by gathering
3695
+ // their corresponding row & col matrix fragments of length k (32)
3696
+ // from A & B matrices respectively using below mapping logic:
3697
+ // row0 = (lane >> 2) & row1 = (lane >> 2) + 8
3698
+ // col0 = ((lane % 4) * 4) + (i & 0x3) & col1 = ((lane % 4) * 4) + (i & 0x3)
3699
+ // As each row & col fragment of A & B matrices is distributed across
3700
+ // 4 work items, each iteration of below loop loads a partial fragment
3701
+ // of matrix A (row) and matrix B (col) using the row & col offsets.
3702
+ // The first loop consumes a[0], a[1] & b[0]; the second a[2], a[3] & b[1].
3703
+ for (int i = 0; i < 4; i++) {
3704
+ typename MMAType<ABType>::PackType recv_a[2], recv_b[2];
3705
+
3706
+ // Load partial fragment from row0 of matrix A ({a0, a1, a2, a3})
3707
+ recv_a[0] =
3708
+ dpct::select_from_sub_group(sg, a[0], row_load_offset + i);
3709
+ // Load partial fragment from row1 of matrix A ({a4, a5, a6, a7})
3710
+ recv_a[1] =
3711
+ dpct::select_from_sub_group(sg, a[1], row_load_offset + i);
3712
+ // Load partial fragment from col0 of matrix B ({b0, b1, b2, b3})
3713
+ recv_b[0] =
3714
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i);
3715
+ // Load partial fragment from col1 of matrix B ({b0, b1, b2, b3})
3716
+ recv_b[1] =
3717
+ dpct::select_from_sub_group(sg, b[0], col_load_offset + i + 4);
3718
+
3719
+ auto a = reinterpret_cast<ABType*>(recv_a); // shadows the outer a; the loads above still read the outer one
3720
+ auto b = reinterpret_cast<ABType*>(recv_b); // shadows the outer b likewise
3721
+
3722
+ // Each work item calculates a partial product of A & B matrix
3723
+ // fragments and adds it to the corresponding D matrix fragment:
3724
+ //   d0 += row0{ a0, a1, a2, a3 } * col0{ b0, b1, b2, b3 }
3725
+ //   d1 += row0{ a0, a1, a2, a3 } * col1{ b0, b1, b2, b3 }
3726
+ //   d2 += row1{ a4, a5, a6, a7 } * col0{ b0, b1, b2, b3 }
3727
+ //   d3 += row1{ a4, a5, a6, a7 } * col1{ b0, b1, b2, b3 }
3728
+ for (int j = 0; j < 4; j++) {
3729
+ *d[0] += a[j] * b[j];
3730
+ *d[1] += a[j] * b[j + 4];
3731
+ *d[2] += a[j + 4] * b[j];
3732
+ *d[3] += a[j + 4] * b[j + 4];
3733
+ }
3734
+ }
3735
+
3736
+ for (int i = 0; i < 4; i++) {
3737
+ typename MMAType<ABType>::PackType recv_a[2], recv_b[2];
3738
+
3739
+ // Load partial fragment from row0 of matrix A ({a8, a9, a10, a11})
3740
+ recv_a[0] =
3741
+ dpct::select_from_sub_group(sg, a[2], row_load_offset + i);
3742
+ // Load partial fragment from row1 of matrix A ({a12, a13, a14,
3743
+ // a15})
3744
+ recv_a[1] =
3745
+ dpct::select_from_sub_group(sg, a[3], row_load_offset + i);
3746
+ // Load partial fragment from col0 of matrix B ({b4, b5, b6, b7})
3747
+ recv_b[0] =
3748
+ dpct::select_from_sub_group(sg, b[1], col_load_offset + i);
3749
+ // Load partial fragment from col1 of matrix B ({b4, b5, b6, b7})
3750
+ recv_b[1] =
3751
+ dpct::select_from_sub_group(sg, b[1], col_load_offset + i + 4);
3752
+
3753
+ auto a = reinterpret_cast<ABType*>(recv_a); // shadows the outer a; the loads above still read the outer one
3754
+ auto b = reinterpret_cast<ABType*>(recv_b); // shadows the outer b likewise
3755
+
3756
+ // Each work item calculates a partial product of A & B matrix
3757
+ // fragments and adds it to the corresponding D matrix fragment:
3758
+ //   d0 += row0{ a8, a9, a10, a11 } * col0{ b4, b5, b6, b7 }
3759
+ //   d1 += row0{ a8, a9, a10, a11 } * col1{ b4, b5, b6, b7 }
3760
+ //   d2 += row1{ a12, a13, a14, a15 } * col0{ b4, b5, b6, b7 }
3761
+ //   d3 += row1{ a12, a13, a14, a15 } * col1{ b4, b5, b6, b7 }
3762
+ for (int j = 0; j < 4; j++) {
3763
+ *d[0] += a[j] * b[j];
3764
+ *d[1] += a[j] * b[j + 4];
3765
+ *d[2] += a[j + 4] * b[j];
3766
+ *d[3] += a[j + 4] * b[j + 4];
3767
+ }
3768
+ }
3769
+ }
3770
+ }
3771
+ }
3772
+ } // COPY from DPCT head files
3773
+
3774
+ #endif // GGML_SYCL_DPCT_HELPER_HPP