toy 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2107) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1124 -0
  3. data/LICENSE +21 -0
  4. data/Makefile +2022 -0
  5. data/README.md +154 -0
  6. data/bin/toy +10 -0
  7. data/lib/toy/compute.rb +135 -0
  8. data/lib/toy/compute_cuda.rb +104 -0
  9. data/lib/toy/compute_metal.rb +97 -0
  10. data/lib/toy/core/cli/describe.rb +188 -0
  11. data/lib/toy/core/cli/eval.rb +385 -0
  12. data/lib/toy/core/cli/exit_codes.rb +15 -0
  13. data/lib/toy/core/cli/fetch.rb +238 -0
  14. data/lib/toy/core/cli/infer.rb +268 -0
  15. data/lib/toy/core/cli/install.rb +228 -0
  16. data/lib/toy/core/cli/list.rb +86 -0
  17. data/lib/toy/core/cli/manifest.rb +49 -0
  18. data/lib/toy/core/cli/new.rb +594 -0
  19. data/lib/toy/core/cli/serve.rb +237 -0
  20. data/lib/toy/core/cli/train.rb +471 -0
  21. data/lib/toy/core/cli.rb +165 -0
  22. data/lib/toy/core/config.rb +64 -0
  23. data/lib/toy/core/gguf_meta.rb +161 -0
  24. data/lib/toy/core/model_scan.rb +221 -0
  25. data/lib/toy/core/run_log.rb +94 -0
  26. data/lib/toy/core/toy_root.rb +95 -0
  27. data/lib/toy/dev/toy_card.rb +299 -0
  28. data/lib/toy/dev/toy_describe_flow.rb +412 -0
  29. data/lib/toy/dev/toy_logprobs.rb +86 -0
  30. data/lib/toy/dev/toy_tap.rb +183 -0
  31. data/lib/toy/dev/toy_token_drift.rb +121 -0
  32. data/lib/toy/ffi/tinynn.rb +1491 -0
  33. data/lib/toy/ffi/tinynn_cuda.rb +1124 -0
  34. data/lib/toy/ffi/tinynn_metal.rb +359 -0
  35. data/lib/toy/ffi_manifest.rb +84 -0
  36. data/lib/toy/io/bpe.rb +325 -0
  37. data/lib/toy/io/gguf_kv.rb +35 -0
  38. data/lib/toy/io/gguf_load.rb +331 -0
  39. data/lib/toy/io/loaders/toy_gpt2_loader.rb +70 -0
  40. data/lib/toy/io/loaders/toy_smollm2_loader.rb +754 -0
  41. data/lib/toy/io/model_index.rb +206 -0
  42. data/lib/toy/io/run_bundle.rb +280 -0
  43. data/lib/toy/io/tokenizer.rb +613 -0
  44. data/lib/toy/io/toy_corpus_loader.rb +52 -0
  45. data/lib/toy/io/toy_events.rb +56 -0
  46. data/lib/toy/io/toy_image_loader.rb +48 -0
  47. data/lib/toy/llm/adamw.rb +169 -0
  48. data/lib/toy/llm/archs/llama_arch.rb +233 -0
  49. data/lib/toy/llm/archs/llama_arch_cuda.rb +237 -0
  50. data/lib/toy/llm/archs/llama_arch_metal.rb +237 -0
  51. data/lib/toy/llm/blocks/transformer_block.rb +876 -0
  52. data/lib/toy/llm/blocks/transformer_block_cuda.rb +880 -0
  53. data/lib/toy/llm/blocks/transformer_block_metal.rb +880 -0
  54. data/lib/toy/llm/classify_batch.rb +88 -0
  55. data/lib/toy/llm/engine/gpt2_fwd_engine.rb +360 -0
  56. data/lib/toy/llm/engine/gpt2_fwd_engine_cuda.rb +362 -0
  57. data/lib/toy/llm/engine/gpt2_fwd_engine_metal.rb +362 -0
  58. data/lib/toy/llm/engine/gpt2_kv_engine.rb +346 -0
  59. data/lib/toy/llm/engine/gpt2_kv_engine_cuda.rb +348 -0
  60. data/lib/toy/llm/engine/gpt2_kv_engine_metal.rb +348 -0
  61. data/lib/toy/llm/engine/gpt2_seq_engine.rb +289 -0
  62. data/lib/toy/llm/engine/gpt2_seq_engine_cuda.rb +293 -0
  63. data/lib/toy/llm/engine/gpt2_seq_engine_metal.rb +293 -0
  64. data/lib/toy/llm/engine/llama_kv_engine.rb +1593 -0
  65. data/lib/toy/llm/engine/llama_kv_engine_cuda.rb +1526 -0
  66. data/lib/toy/llm/engine/llama_kv_engine_metal.rb +1526 -0
  67. data/lib/toy/llm/engine/llama_seq_engine.rb +1233 -0
  68. data/lib/toy/llm/engine/llama_seq_engine_cuda.rb +1238 -0
  69. data/lib/toy/llm/engine/llama_seq_engine_metal.rb +1238 -0
  70. data/lib/toy/llm/engine/vit_tiny_engine.rb +467 -0
  71. data/lib/toy/llm/labels.rb +142 -0
  72. data/lib/toy/llm/primitives/gqa.rb +62 -0
  73. data/lib/toy/llm/primitives/gqa_cuda.rb +66 -0
  74. data/lib/toy/llm/primitives/gqa_metal.rb +66 -0
  75. data/lib/toy/llm/primitives/rms_norm.rb +39 -0
  76. data/lib/toy/llm/primitives/rms_norm_cuda.rb +43 -0
  77. data/lib/toy/llm/primitives/rms_norm_metal.rb +43 -0
  78. data/lib/toy/llm/primitives/rope.rb +68 -0
  79. data/lib/toy/llm/primitives/rope_cuda.rb +72 -0
  80. data/lib/toy/llm/primitives/rope_metal.rb +72 -0
  81. data/lib/toy/llm/primitives/swiglu.rb +41 -0
  82. data/lib/toy/llm/primitives/swiglu_cuda.rb +45 -0
  83. data/lib/toy/llm/primitives/swiglu_metal.rb +45 -0
  84. data/lib/toy/llm/recipe_options.rb +71 -0
  85. data/lib/toy/llm/recipes/from_scratch.rb +105 -0
  86. data/lib/toy/llm/recipes/from_scratch_cuda.rb +109 -0
  87. data/lib/toy/llm/recipes/from_scratch_metal.rb +109 -0
  88. data/lib/toy/llm/recipes/lora.rb +110 -0
  89. data/lib/toy/llm/recipes/lora_cuda.rb +114 -0
  90. data/lib/toy/llm/recipes/lora_metal.rb +114 -0
  91. data/lib/toy/llm/recipes/vit_tiny.rb +75 -0
  92. data/lib/toy/llm/recipes/warm_start.rb +235 -0
  93. data/lib/toy/llm/recipes/warm_start_cuda.rb +239 -0
  94. data/lib/toy/llm/recipes/warm_start_metal.rb +239 -0
  95. data/lib/toy/llm/training_batch.rb +133 -0
  96. data/lib/toy/models/arch.rb +253 -0
  97. data/lib/toy/models/gpt2.rb +311 -0
  98. data/lib/toy/models/toy_gpt2.rb +177 -0
  99. data/lib/toy/models/toy_smollm2.rb +393 -0
  100. data/lib/toy/models/toy_vit.rb +83 -0
  101. data/lib/toy/models/transformer.rb +1494 -0
  102. data/lib/toy/models/transformer_lm.rb +298 -0
  103. data/lib/toy/models/transformer_lm_cuda.rb +159 -0
  104. data/lib/toy/models/transformer_lm_metal.rb +142 -0
  105. data/lib/toy/mri.rb +300 -0
  106. data/lib/toy/run/eval.rb +76 -0
  107. data/lib/toy/run/eval_cuda.rb +66 -0
  108. data/lib/toy/run/eval_lmc.rb +334 -0
  109. data/lib/toy/run/eval_metal.rb +67 -0
  110. data/lib/toy/run/infer.rb +130 -0
  111. data/lib/toy/run/infer_cuda.rb +118 -0
  112. data/lib/toy/run/infer_metal.rb +119 -0
  113. data/lib/toy/run/infer_trace.rb +37 -0
  114. data/lib/toy/run/serve.rb +144 -0
  115. data/lib/toy/run/train.rb +404 -0
  116. data/lib/toy/run/train_cuda.rb +397 -0
  117. data/lib/toy/run/train_gpt2.rb +103 -0
  118. data/lib/toy/run/train_gpt2_cuda.rb +85 -0
  119. data/lib/toy/run/train_gpt2_metal.rb +85 -0
  120. data/lib/toy/run/train_lora.rb +207 -0
  121. data/lib/toy/run/train_lora_cuda.rb +219 -0
  122. data/lib/toy/run/train_metal.rb +227 -0
  123. data/lib/toy/run/train_vit.rb +251 -0
  124. data/lib/toy/serve/openai/embeddings_handler.rb +92 -0
  125. data/lib/toy/serve/openai/handlers.rb +143 -0
  126. data/lib/toy/serve/openai/server.rb +159 -0
  127. data/lib/toy/train/sampler.rb +314 -0
  128. data/lib/toy/train/toy_chat_template.rb +179 -0
  129. data/lib/toy/train/toy_drift_grad.rb +176 -0
  130. data/lib/toy/train/toy_gguf_fuse.rb +428 -0
  131. data/lib/toy/train/toy_gguf_writer.rb +100 -0
  132. data/lib/toy/train/toy_lr_schedule.rb +39 -0
  133. data/lib/toy/train/toy_sample.rb +125 -0
  134. data/lib/toy/train/toy_trainer.rb +86 -0
  135. data/lib/toy/train/training.rb +160 -0
  136. data/lib/toy/version.rb +11 -0
  137. data/lib/toy.rb +902 -0
  138. data/prep/progress +118 -0
  139. data/prep/quietly +64 -0
  140. data/sig/toy.rbs +397 -0
  141. data/sig/toy_compute.rbs +450 -0
  142. data/spinel-ext.json +122 -0
  143. data/tinynn/Makefile +71 -0
  144. data/tinynn/tinynn_backend_cuda.c +99 -0
  145. data/tinynn/tinynn_backend_metal.m +75 -0
  146. data/tinynn/tinynn_events.c +122 -0
  147. data/tinynn/tinynn_events.h +83 -0
  148. data/tinynn/tinynn_ggml.c +2460 -0
  149. data/tinynn/tinynn_ggml.h +545 -0
  150. data/tinynn/tinynn_gguf.c +783 -0
  151. data/tinynn/tinynn_gguf.h +167 -0
  152. data/tinynn/tinynn_trace.c +180 -0
  153. data/tinynn/tinynn_trace.h +85 -0
  154. data/vendor/ggml/AUTHORS +335 -0
  155. data/vendor/ggml/CMakeLists.txt +505 -0
  156. data/vendor/ggml/CONTRIBUTING.md +3 -0
  157. data/vendor/ggml/LICENSE +21 -0
  158. data/vendor/ggml/README.md +50 -0
  159. data/vendor/ggml/ci/run.sh +395 -0
  160. data/vendor/ggml/cmake/FindNCCL.cmake +36 -0
  161. data/vendor/ggml/cmake/GitVars.cmake +22 -0
  162. data/vendor/ggml/cmake/common.cmake +50 -0
  163. data/vendor/ggml/cmake/ggml-config.cmake.in +191 -0
  164. data/vendor/ggml/docs/gguf.md +828 -0
  165. data/vendor/ggml/examples/CMakeLists.txt +34 -0
  166. data/vendor/ggml/examples/common-ggml.cpp +244 -0
  167. data/vendor/ggml/examples/common-ggml.h +18 -0
  168. data/vendor/ggml/examples/common.cpp +675 -0
  169. data/vendor/ggml/examples/common.h +322 -0
  170. data/vendor/ggml/examples/gpt-2/CMakeLists.txt +32 -0
  171. data/vendor/ggml/examples/gpt-2/README.md +225 -0
  172. data/vendor/ggml/examples/gpt-2/convert-cerebras-to-ggml.py +183 -0
  173. data/vendor/ggml/examples/gpt-2/convert-ckpt-to-ggml.py +159 -0
  174. data/vendor/ggml/examples/gpt-2/convert-h5-to-ggml.py +195 -0
  175. data/vendor/ggml/examples/gpt-2/download-ggml-model.sh +69 -0
  176. data/vendor/ggml/examples/gpt-2/download-model.sh +48 -0
  177. data/vendor/ggml/examples/gpt-2/main-alloc.cpp +880 -0
  178. data/vendor/ggml/examples/gpt-2/main-backend.cpp +946 -0
  179. data/vendor/ggml/examples/gpt-2/main-batched.cpp +1210 -0
  180. data/vendor/ggml/examples/gpt-2/main-ctx.cpp +840 -0
  181. data/vendor/ggml/examples/gpt-2/main-sched.cpp +1079 -0
  182. data/vendor/ggml/examples/gpt-2/quantize.cpp +184 -0
  183. data/vendor/ggml/examples/gpt-j/CMakeLists.txt +13 -0
  184. data/vendor/ggml/examples/gpt-j/README.md +239 -0
  185. data/vendor/ggml/examples/gpt-j/convert-h5-to-ggml.py +173 -0
  186. data/vendor/ggml/examples/gpt-j/download-ggml-model.sh +69 -0
  187. data/vendor/ggml/examples/gpt-j/download-model.sh +11 -0
  188. data/vendor/ggml/examples/gpt-j/main.cpp +755 -0
  189. data/vendor/ggml/examples/gpt-j/quantize.cpp +182 -0
  190. data/vendor/ggml/examples/magika/CMakeLists.txt +17 -0
  191. data/vendor/ggml/examples/magika/README.md +23 -0
  192. data/vendor/ggml/examples/magika/convert.py +32 -0
  193. data/vendor/ggml/examples/magika/main.cpp +374 -0
  194. data/vendor/ggml/examples/mnist/CMakeLists.txt +58 -0
  195. data/vendor/ggml/examples/mnist/README.md +206 -0
  196. data/vendor/ggml/examples/mnist/mnist-common.cpp +496 -0
  197. data/vendor/ggml/examples/mnist/mnist-common.h +166 -0
  198. data/vendor/ggml/examples/mnist/mnist-eval.cpp +67 -0
  199. data/vendor/ggml/examples/mnist/mnist-train-cnn.py +91 -0
  200. data/vendor/ggml/examples/mnist/mnist-train-fc.py +131 -0
  201. data/vendor/ggml/examples/mnist/mnist-train.cpp +39 -0
  202. data/vendor/ggml/examples/mnist/server.py +36 -0
  203. data/vendor/ggml/examples/mnist/web/index.html +178 -0
  204. data/vendor/ggml/examples/perf-metal/CMakeLists.txt +7 -0
  205. data/vendor/ggml/examples/perf-metal/perf-metal.cpp +152 -0
  206. data/vendor/ggml/examples/prompts/dolly-v2.txt +100 -0
  207. data/vendor/ggml/examples/prompts/gpt-2-chinese.txt +1 -0
  208. data/vendor/ggml/examples/prompts/gpt-2.txt +100 -0
  209. data/vendor/ggml/examples/prompts/gpt-j.txt +100 -0
  210. data/vendor/ggml/examples/prompts/gpt-neox-japanese.txt +1 -0
  211. data/vendor/ggml/examples/prompts/gpt-neox.txt +100 -0
  212. data/vendor/ggml/examples/prompts/polyglot-ko.txt +3 -0
  213. data/vendor/ggml/examples/prompts/replit.txt +100 -0
  214. data/vendor/ggml/examples/prompts/starcoder.txt +100 -0
  215. data/vendor/ggml/examples/prompts/test-cases.txt +110 -0
  216. data/vendor/ggml/examples/prompts/tokenize_huggingface.py +65 -0
  217. data/vendor/ggml/examples/prompts/whisper.txt +100 -0
  218. data/vendor/ggml/examples/python/README.md +115 -0
  219. data/vendor/ggml/examples/python/api.h +14 -0
  220. data/vendor/ggml/examples/python/example_add_quant.py +25 -0
  221. data/vendor/ggml/examples/python/example_test_all_quants.py +68 -0
  222. data/vendor/ggml/examples/python/ggml/__init__.py +58 -0
  223. data/vendor/ggml/examples/python/ggml/__init__.pyi +2406 -0
  224. data/vendor/ggml/examples/python/ggml/cffi.py +11 -0
  225. data/vendor/ggml/examples/python/ggml/ffi/__init__.pyi +7 -0
  226. data/vendor/ggml/examples/python/ggml/utils.py +182 -0
  227. data/vendor/ggml/examples/python/regenerate.py +42 -0
  228. data/vendor/ggml/examples/python/stubs.py +128 -0
  229. data/vendor/ggml/examples/python/test_tensor.py +258 -0
  230. data/vendor/ggml/examples/sam/CMakeLists.txt +13 -0
  231. data/vendor/ggml/examples/sam/README.md +95 -0
  232. data/vendor/ggml/examples/sam/convert-pth-to-ggml.py +147 -0
  233. data/vendor/ggml/examples/sam/example.jpg +0 -0
  234. data/vendor/ggml/examples/sam/sam.cpp +2370 -0
  235. data/vendor/ggml/examples/simple/CMakeLists.txt +21 -0
  236. data/vendor/ggml/examples/simple/README.md +61 -0
  237. data/vendor/ggml/examples/simple/simple-backend.cpp +153 -0
  238. data/vendor/ggml/examples/simple/simple-ctx.cpp +127 -0
  239. data/vendor/ggml/examples/stb_image.h +7987 -0
  240. data/vendor/ggml/examples/stb_image_write.h +1724 -0
  241. data/vendor/ggml/examples/test-cmake/CMakeLists.txt +10 -0
  242. data/vendor/ggml/examples/test-cmake/README.md +3 -0
  243. data/vendor/ggml/examples/test-cmake/test-cmake.cpp +6 -0
  244. data/vendor/ggml/examples/yolo/CMakeLists.txt +6 -0
  245. data/vendor/ggml/examples/yolo/README.md +59 -0
  246. data/vendor/ggml/examples/yolo/convert-yolov3-tiny.py +53 -0
  247. data/vendor/ggml/examples/yolo/data/coco.names +80 -0
  248. data/vendor/ggml/examples/yolo/data/labels/100_0.png +0 -0
  249. data/vendor/ggml/examples/yolo/data/labels/100_1.png +0 -0
  250. data/vendor/ggml/examples/yolo/data/labels/100_2.png +0 -0
  251. data/vendor/ggml/examples/yolo/data/labels/100_3.png +0 -0
  252. data/vendor/ggml/examples/yolo/data/labels/100_4.png +0 -0
  253. data/vendor/ggml/examples/yolo/data/labels/100_5.png +0 -0
  254. data/vendor/ggml/examples/yolo/data/labels/100_6.png +0 -0
  255. data/vendor/ggml/examples/yolo/data/labels/100_7.png +0 -0
  256. data/vendor/ggml/examples/yolo/data/labels/101_0.png +0 -0
  257. data/vendor/ggml/examples/yolo/data/labels/101_1.png +0 -0
  258. data/vendor/ggml/examples/yolo/data/labels/101_2.png +0 -0
  259. data/vendor/ggml/examples/yolo/data/labels/101_3.png +0 -0
  260. data/vendor/ggml/examples/yolo/data/labels/101_4.png +0 -0
  261. data/vendor/ggml/examples/yolo/data/labels/101_5.png +0 -0
  262. data/vendor/ggml/examples/yolo/data/labels/101_6.png +0 -0
  263. data/vendor/ggml/examples/yolo/data/labels/101_7.png +0 -0
  264. data/vendor/ggml/examples/yolo/data/labels/102_0.png +0 -0
  265. data/vendor/ggml/examples/yolo/data/labels/102_1.png +0 -0
  266. data/vendor/ggml/examples/yolo/data/labels/102_2.png +0 -0
  267. data/vendor/ggml/examples/yolo/data/labels/102_3.png +0 -0
  268. data/vendor/ggml/examples/yolo/data/labels/102_4.png +0 -0
  269. data/vendor/ggml/examples/yolo/data/labels/102_5.png +0 -0
  270. data/vendor/ggml/examples/yolo/data/labels/102_6.png +0 -0
  271. data/vendor/ggml/examples/yolo/data/labels/102_7.png +0 -0
  272. data/vendor/ggml/examples/yolo/data/labels/103_0.png +0 -0
  273. data/vendor/ggml/examples/yolo/data/labels/103_1.png +0 -0
  274. data/vendor/ggml/examples/yolo/data/labels/103_2.png +0 -0
  275. data/vendor/ggml/examples/yolo/data/labels/103_3.png +0 -0
  276. data/vendor/ggml/examples/yolo/data/labels/103_4.png +0 -0
  277. data/vendor/ggml/examples/yolo/data/labels/103_5.png +0 -0
  278. data/vendor/ggml/examples/yolo/data/labels/103_6.png +0 -0
  279. data/vendor/ggml/examples/yolo/data/labels/103_7.png +0 -0
  280. data/vendor/ggml/examples/yolo/data/labels/104_0.png +0 -0
  281. data/vendor/ggml/examples/yolo/data/labels/104_1.png +0 -0
  282. data/vendor/ggml/examples/yolo/data/labels/104_2.png +0 -0
  283. data/vendor/ggml/examples/yolo/data/labels/104_3.png +0 -0
  284. data/vendor/ggml/examples/yolo/data/labels/104_4.png +0 -0
  285. data/vendor/ggml/examples/yolo/data/labels/104_5.png +0 -0
  286. data/vendor/ggml/examples/yolo/data/labels/104_6.png +0 -0
  287. data/vendor/ggml/examples/yolo/data/labels/104_7.png +0 -0
  288. data/vendor/ggml/examples/yolo/data/labels/105_0.png +0 -0
  289. data/vendor/ggml/examples/yolo/data/labels/105_1.png +0 -0
  290. data/vendor/ggml/examples/yolo/data/labels/105_2.png +0 -0
  291. data/vendor/ggml/examples/yolo/data/labels/105_3.png +0 -0
  292. data/vendor/ggml/examples/yolo/data/labels/105_4.png +0 -0
  293. data/vendor/ggml/examples/yolo/data/labels/105_5.png +0 -0
  294. data/vendor/ggml/examples/yolo/data/labels/105_6.png +0 -0
  295. data/vendor/ggml/examples/yolo/data/labels/105_7.png +0 -0
  296. data/vendor/ggml/examples/yolo/data/labels/106_0.png +0 -0
  297. data/vendor/ggml/examples/yolo/data/labels/106_1.png +0 -0
  298. data/vendor/ggml/examples/yolo/data/labels/106_2.png +0 -0
  299. data/vendor/ggml/examples/yolo/data/labels/106_3.png +0 -0
  300. data/vendor/ggml/examples/yolo/data/labels/106_4.png +0 -0
  301. data/vendor/ggml/examples/yolo/data/labels/106_5.png +0 -0
  302. data/vendor/ggml/examples/yolo/data/labels/106_6.png +0 -0
  303. data/vendor/ggml/examples/yolo/data/labels/106_7.png +0 -0
  304. data/vendor/ggml/examples/yolo/data/labels/107_0.png +0 -0
  305. data/vendor/ggml/examples/yolo/data/labels/107_1.png +0 -0
  306. data/vendor/ggml/examples/yolo/data/labels/107_2.png +0 -0
  307. data/vendor/ggml/examples/yolo/data/labels/107_3.png +0 -0
  308. data/vendor/ggml/examples/yolo/data/labels/107_4.png +0 -0
  309. data/vendor/ggml/examples/yolo/data/labels/107_5.png +0 -0
  310. data/vendor/ggml/examples/yolo/data/labels/107_6.png +0 -0
  311. data/vendor/ggml/examples/yolo/data/labels/107_7.png +0 -0
  312. data/vendor/ggml/examples/yolo/data/labels/108_0.png +0 -0
  313. data/vendor/ggml/examples/yolo/data/labels/108_1.png +0 -0
  314. data/vendor/ggml/examples/yolo/data/labels/108_2.png +0 -0
  315. data/vendor/ggml/examples/yolo/data/labels/108_3.png +0 -0
  316. data/vendor/ggml/examples/yolo/data/labels/108_4.png +0 -0
  317. data/vendor/ggml/examples/yolo/data/labels/108_5.png +0 -0
  318. data/vendor/ggml/examples/yolo/data/labels/108_6.png +0 -0
  319. data/vendor/ggml/examples/yolo/data/labels/108_7.png +0 -0
  320. data/vendor/ggml/examples/yolo/data/labels/109_0.png +0 -0
  321. data/vendor/ggml/examples/yolo/data/labels/109_1.png +0 -0
  322. data/vendor/ggml/examples/yolo/data/labels/109_2.png +0 -0
  323. data/vendor/ggml/examples/yolo/data/labels/109_3.png +0 -0
  324. data/vendor/ggml/examples/yolo/data/labels/109_4.png +0 -0
  325. data/vendor/ggml/examples/yolo/data/labels/109_5.png +0 -0
  326. data/vendor/ggml/examples/yolo/data/labels/109_6.png +0 -0
  327. data/vendor/ggml/examples/yolo/data/labels/109_7.png +0 -0
  328. data/vendor/ggml/examples/yolo/data/labels/110_0.png +0 -0
  329. data/vendor/ggml/examples/yolo/data/labels/110_1.png +0 -0
  330. data/vendor/ggml/examples/yolo/data/labels/110_2.png +0 -0
  331. data/vendor/ggml/examples/yolo/data/labels/110_3.png +0 -0
  332. data/vendor/ggml/examples/yolo/data/labels/110_4.png +0 -0
  333. data/vendor/ggml/examples/yolo/data/labels/110_5.png +0 -0
  334. data/vendor/ggml/examples/yolo/data/labels/110_6.png +0 -0
  335. data/vendor/ggml/examples/yolo/data/labels/110_7.png +0 -0
  336. data/vendor/ggml/examples/yolo/data/labels/111_0.png +0 -0
  337. data/vendor/ggml/examples/yolo/data/labels/111_1.png +0 -0
  338. data/vendor/ggml/examples/yolo/data/labels/111_2.png +0 -0
  339. data/vendor/ggml/examples/yolo/data/labels/111_3.png +0 -0
  340. data/vendor/ggml/examples/yolo/data/labels/111_4.png +0 -0
  341. data/vendor/ggml/examples/yolo/data/labels/111_5.png +0 -0
  342. data/vendor/ggml/examples/yolo/data/labels/111_6.png +0 -0
  343. data/vendor/ggml/examples/yolo/data/labels/111_7.png +0 -0
  344. data/vendor/ggml/examples/yolo/data/labels/112_0.png +0 -0
  345. data/vendor/ggml/examples/yolo/data/labels/112_1.png +0 -0
  346. data/vendor/ggml/examples/yolo/data/labels/112_2.png +0 -0
  347. data/vendor/ggml/examples/yolo/data/labels/112_3.png +0 -0
  348. data/vendor/ggml/examples/yolo/data/labels/112_4.png +0 -0
  349. data/vendor/ggml/examples/yolo/data/labels/112_5.png +0 -0
  350. data/vendor/ggml/examples/yolo/data/labels/112_6.png +0 -0
  351. data/vendor/ggml/examples/yolo/data/labels/112_7.png +0 -0
  352. data/vendor/ggml/examples/yolo/data/labels/113_0.png +0 -0
  353. data/vendor/ggml/examples/yolo/data/labels/113_1.png +0 -0
  354. data/vendor/ggml/examples/yolo/data/labels/113_2.png +0 -0
  355. data/vendor/ggml/examples/yolo/data/labels/113_3.png +0 -0
  356. data/vendor/ggml/examples/yolo/data/labels/113_4.png +0 -0
  357. data/vendor/ggml/examples/yolo/data/labels/113_5.png +0 -0
  358. data/vendor/ggml/examples/yolo/data/labels/113_6.png +0 -0
  359. data/vendor/ggml/examples/yolo/data/labels/113_7.png +0 -0
  360. data/vendor/ggml/examples/yolo/data/labels/114_0.png +0 -0
  361. data/vendor/ggml/examples/yolo/data/labels/114_1.png +0 -0
  362. data/vendor/ggml/examples/yolo/data/labels/114_2.png +0 -0
  363. data/vendor/ggml/examples/yolo/data/labels/114_3.png +0 -0
  364. data/vendor/ggml/examples/yolo/data/labels/114_4.png +0 -0
  365. data/vendor/ggml/examples/yolo/data/labels/114_5.png +0 -0
  366. data/vendor/ggml/examples/yolo/data/labels/114_6.png +0 -0
  367. data/vendor/ggml/examples/yolo/data/labels/114_7.png +0 -0
  368. data/vendor/ggml/examples/yolo/data/labels/115_0.png +0 -0
  369. data/vendor/ggml/examples/yolo/data/labels/115_1.png +0 -0
  370. data/vendor/ggml/examples/yolo/data/labels/115_2.png +0 -0
  371. data/vendor/ggml/examples/yolo/data/labels/115_3.png +0 -0
  372. data/vendor/ggml/examples/yolo/data/labels/115_4.png +0 -0
  373. data/vendor/ggml/examples/yolo/data/labels/115_5.png +0 -0
  374. data/vendor/ggml/examples/yolo/data/labels/115_6.png +0 -0
  375. data/vendor/ggml/examples/yolo/data/labels/115_7.png +0 -0
  376. data/vendor/ggml/examples/yolo/data/labels/116_0.png +0 -0
  377. data/vendor/ggml/examples/yolo/data/labels/116_1.png +0 -0
  378. data/vendor/ggml/examples/yolo/data/labels/116_2.png +0 -0
  379. data/vendor/ggml/examples/yolo/data/labels/116_3.png +0 -0
  380. data/vendor/ggml/examples/yolo/data/labels/116_4.png +0 -0
  381. data/vendor/ggml/examples/yolo/data/labels/116_5.png +0 -0
  382. data/vendor/ggml/examples/yolo/data/labels/116_6.png +0 -0
  383. data/vendor/ggml/examples/yolo/data/labels/116_7.png +0 -0
  384. data/vendor/ggml/examples/yolo/data/labels/117_0.png +0 -0
  385. data/vendor/ggml/examples/yolo/data/labels/117_1.png +0 -0
  386. data/vendor/ggml/examples/yolo/data/labels/117_2.png +0 -0
  387. data/vendor/ggml/examples/yolo/data/labels/117_3.png +0 -0
  388. data/vendor/ggml/examples/yolo/data/labels/117_4.png +0 -0
  389. data/vendor/ggml/examples/yolo/data/labels/117_5.png +0 -0
  390. data/vendor/ggml/examples/yolo/data/labels/117_6.png +0 -0
  391. data/vendor/ggml/examples/yolo/data/labels/117_7.png +0 -0
  392. data/vendor/ggml/examples/yolo/data/labels/118_0.png +0 -0
  393. data/vendor/ggml/examples/yolo/data/labels/118_1.png +0 -0
  394. data/vendor/ggml/examples/yolo/data/labels/118_2.png +0 -0
  395. data/vendor/ggml/examples/yolo/data/labels/118_3.png +0 -0
  396. data/vendor/ggml/examples/yolo/data/labels/118_4.png +0 -0
  397. data/vendor/ggml/examples/yolo/data/labels/118_5.png +0 -0
  398. data/vendor/ggml/examples/yolo/data/labels/118_6.png +0 -0
  399. data/vendor/ggml/examples/yolo/data/labels/118_7.png +0 -0
  400. data/vendor/ggml/examples/yolo/data/labels/119_0.png +0 -0
  401. data/vendor/ggml/examples/yolo/data/labels/119_1.png +0 -0
  402. data/vendor/ggml/examples/yolo/data/labels/119_2.png +0 -0
  403. data/vendor/ggml/examples/yolo/data/labels/119_3.png +0 -0
  404. data/vendor/ggml/examples/yolo/data/labels/119_4.png +0 -0
  405. data/vendor/ggml/examples/yolo/data/labels/119_5.png +0 -0
  406. data/vendor/ggml/examples/yolo/data/labels/119_6.png +0 -0
  407. data/vendor/ggml/examples/yolo/data/labels/119_7.png +0 -0
  408. data/vendor/ggml/examples/yolo/data/labels/120_0.png +0 -0
  409. data/vendor/ggml/examples/yolo/data/labels/120_1.png +0 -0
  410. data/vendor/ggml/examples/yolo/data/labels/120_2.png +0 -0
  411. data/vendor/ggml/examples/yolo/data/labels/120_3.png +0 -0
  412. data/vendor/ggml/examples/yolo/data/labels/120_4.png +0 -0
  413. data/vendor/ggml/examples/yolo/data/labels/120_5.png +0 -0
  414. data/vendor/ggml/examples/yolo/data/labels/120_6.png +0 -0
  415. data/vendor/ggml/examples/yolo/data/labels/120_7.png +0 -0
  416. data/vendor/ggml/examples/yolo/data/labels/121_0.png +0 -0
  417. data/vendor/ggml/examples/yolo/data/labels/121_1.png +0 -0
  418. data/vendor/ggml/examples/yolo/data/labels/121_2.png +0 -0
  419. data/vendor/ggml/examples/yolo/data/labels/121_3.png +0 -0
  420. data/vendor/ggml/examples/yolo/data/labels/121_4.png +0 -0
  421. data/vendor/ggml/examples/yolo/data/labels/121_5.png +0 -0
  422. data/vendor/ggml/examples/yolo/data/labels/121_6.png +0 -0
  423. data/vendor/ggml/examples/yolo/data/labels/121_7.png +0 -0
  424. data/vendor/ggml/examples/yolo/data/labels/122_0.png +0 -0
  425. data/vendor/ggml/examples/yolo/data/labels/122_1.png +0 -0
  426. data/vendor/ggml/examples/yolo/data/labels/122_2.png +0 -0
  427. data/vendor/ggml/examples/yolo/data/labels/122_3.png +0 -0
  428. data/vendor/ggml/examples/yolo/data/labels/122_4.png +0 -0
  429. data/vendor/ggml/examples/yolo/data/labels/122_5.png +0 -0
  430. data/vendor/ggml/examples/yolo/data/labels/122_6.png +0 -0
  431. data/vendor/ggml/examples/yolo/data/labels/122_7.png +0 -0
  432. data/vendor/ggml/examples/yolo/data/labels/123_0.png +0 -0
  433. data/vendor/ggml/examples/yolo/data/labels/123_1.png +0 -0
  434. data/vendor/ggml/examples/yolo/data/labels/123_2.png +0 -0
  435. data/vendor/ggml/examples/yolo/data/labels/123_3.png +0 -0
  436. data/vendor/ggml/examples/yolo/data/labels/123_4.png +0 -0
  437. data/vendor/ggml/examples/yolo/data/labels/123_5.png +0 -0
  438. data/vendor/ggml/examples/yolo/data/labels/123_6.png +0 -0
  439. data/vendor/ggml/examples/yolo/data/labels/123_7.png +0 -0
  440. data/vendor/ggml/examples/yolo/data/labels/124_0.png +0 -0
  441. data/vendor/ggml/examples/yolo/data/labels/124_1.png +0 -0
  442. data/vendor/ggml/examples/yolo/data/labels/124_2.png +0 -0
  443. data/vendor/ggml/examples/yolo/data/labels/124_3.png +0 -0
  444. data/vendor/ggml/examples/yolo/data/labels/124_4.png +0 -0
  445. data/vendor/ggml/examples/yolo/data/labels/124_5.png +0 -0
  446. data/vendor/ggml/examples/yolo/data/labels/124_6.png +0 -0
  447. data/vendor/ggml/examples/yolo/data/labels/124_7.png +0 -0
  448. data/vendor/ggml/examples/yolo/data/labels/125_0.png +0 -0
  449. data/vendor/ggml/examples/yolo/data/labels/125_1.png +0 -0
  450. data/vendor/ggml/examples/yolo/data/labels/125_2.png +0 -0
  451. data/vendor/ggml/examples/yolo/data/labels/125_3.png +0 -0
  452. data/vendor/ggml/examples/yolo/data/labels/125_4.png +0 -0
  453. data/vendor/ggml/examples/yolo/data/labels/125_5.png +0 -0
  454. data/vendor/ggml/examples/yolo/data/labels/125_6.png +0 -0
  455. data/vendor/ggml/examples/yolo/data/labels/125_7.png +0 -0
  456. data/vendor/ggml/examples/yolo/data/labels/126_0.png +0 -0
  457. data/vendor/ggml/examples/yolo/data/labels/126_1.png +0 -0
  458. data/vendor/ggml/examples/yolo/data/labels/126_2.png +0 -0
  459. data/vendor/ggml/examples/yolo/data/labels/126_3.png +0 -0
  460. data/vendor/ggml/examples/yolo/data/labels/126_4.png +0 -0
  461. data/vendor/ggml/examples/yolo/data/labels/126_5.png +0 -0
  462. data/vendor/ggml/examples/yolo/data/labels/126_6.png +0 -0
  463. data/vendor/ggml/examples/yolo/data/labels/126_7.png +0 -0
  464. data/vendor/ggml/examples/yolo/data/labels/32_0.png +0 -0
  465. data/vendor/ggml/examples/yolo/data/labels/32_1.png +0 -0
  466. data/vendor/ggml/examples/yolo/data/labels/32_2.png +0 -0
  467. data/vendor/ggml/examples/yolo/data/labels/32_3.png +0 -0
  468. data/vendor/ggml/examples/yolo/data/labels/32_4.png +0 -0
  469. data/vendor/ggml/examples/yolo/data/labels/32_5.png +0 -0
  470. data/vendor/ggml/examples/yolo/data/labels/32_6.png +0 -0
  471. data/vendor/ggml/examples/yolo/data/labels/32_7.png +0 -0
  472. data/vendor/ggml/examples/yolo/data/labels/33_0.png +0 -0
  473. data/vendor/ggml/examples/yolo/data/labels/33_1.png +0 -0
  474. data/vendor/ggml/examples/yolo/data/labels/33_2.png +0 -0
  475. data/vendor/ggml/examples/yolo/data/labels/33_3.png +0 -0
  476. data/vendor/ggml/examples/yolo/data/labels/33_4.png +0 -0
  477. data/vendor/ggml/examples/yolo/data/labels/33_5.png +0 -0
  478. data/vendor/ggml/examples/yolo/data/labels/33_6.png +0 -0
  479. data/vendor/ggml/examples/yolo/data/labels/33_7.png +0 -0
  480. data/vendor/ggml/examples/yolo/data/labels/34_0.png +0 -0
  481. data/vendor/ggml/examples/yolo/data/labels/34_1.png +0 -0
  482. data/vendor/ggml/examples/yolo/data/labels/34_2.png +0 -0
  483. data/vendor/ggml/examples/yolo/data/labels/34_3.png +0 -0
  484. data/vendor/ggml/examples/yolo/data/labels/34_4.png +0 -0
  485. data/vendor/ggml/examples/yolo/data/labels/34_5.png +0 -0
  486. data/vendor/ggml/examples/yolo/data/labels/34_6.png +0 -0
  487. data/vendor/ggml/examples/yolo/data/labels/34_7.png +0 -0
  488. data/vendor/ggml/examples/yolo/data/labels/35_0.png +0 -0
  489. data/vendor/ggml/examples/yolo/data/labels/35_1.png +0 -0
  490. data/vendor/ggml/examples/yolo/data/labels/35_2.png +0 -0
  491. data/vendor/ggml/examples/yolo/data/labels/35_3.png +0 -0
  492. data/vendor/ggml/examples/yolo/data/labels/35_4.png +0 -0
  493. data/vendor/ggml/examples/yolo/data/labels/35_5.png +0 -0
  494. data/vendor/ggml/examples/yolo/data/labels/35_6.png +0 -0
  495. data/vendor/ggml/examples/yolo/data/labels/35_7.png +0 -0
  496. data/vendor/ggml/examples/yolo/data/labels/36_0.png +0 -0
  497. data/vendor/ggml/examples/yolo/data/labels/36_1.png +0 -0
  498. data/vendor/ggml/examples/yolo/data/labels/36_2.png +0 -0
  499. data/vendor/ggml/examples/yolo/data/labels/36_3.png +0 -0
  500. data/vendor/ggml/examples/yolo/data/labels/36_4.png +0 -0
  501. data/vendor/ggml/examples/yolo/data/labels/36_5.png +0 -0
  502. data/vendor/ggml/examples/yolo/data/labels/36_6.png +0 -0
  503. data/vendor/ggml/examples/yolo/data/labels/36_7.png +0 -0
  504. data/vendor/ggml/examples/yolo/data/labels/37_0.png +0 -0
  505. data/vendor/ggml/examples/yolo/data/labels/37_1.png +0 -0
  506. data/vendor/ggml/examples/yolo/data/labels/37_2.png +0 -0
  507. data/vendor/ggml/examples/yolo/data/labels/37_3.png +0 -0
  508. data/vendor/ggml/examples/yolo/data/labels/37_4.png +0 -0
  509. data/vendor/ggml/examples/yolo/data/labels/37_5.png +0 -0
  510. data/vendor/ggml/examples/yolo/data/labels/37_6.png +0 -0
  511. data/vendor/ggml/examples/yolo/data/labels/37_7.png +0 -0
  512. data/vendor/ggml/examples/yolo/data/labels/38_0.png +0 -0
  513. data/vendor/ggml/examples/yolo/data/labels/38_1.png +0 -0
  514. data/vendor/ggml/examples/yolo/data/labels/38_2.png +0 -0
  515. data/vendor/ggml/examples/yolo/data/labels/38_3.png +0 -0
  516. data/vendor/ggml/examples/yolo/data/labels/38_4.png +0 -0
  517. data/vendor/ggml/examples/yolo/data/labels/38_5.png +0 -0
  518. data/vendor/ggml/examples/yolo/data/labels/38_6.png +0 -0
  519. data/vendor/ggml/examples/yolo/data/labels/38_7.png +0 -0
  520. data/vendor/ggml/examples/yolo/data/labels/39_0.png +0 -0
  521. data/vendor/ggml/examples/yolo/data/labels/39_1.png +0 -0
  522. data/vendor/ggml/examples/yolo/data/labels/39_2.png +0 -0
  523. data/vendor/ggml/examples/yolo/data/labels/39_3.png +0 -0
  524. data/vendor/ggml/examples/yolo/data/labels/39_4.png +0 -0
  525. data/vendor/ggml/examples/yolo/data/labels/39_5.png +0 -0
  526. data/vendor/ggml/examples/yolo/data/labels/39_6.png +0 -0
  527. data/vendor/ggml/examples/yolo/data/labels/39_7.png +0 -0
  528. data/vendor/ggml/examples/yolo/data/labels/40_0.png +0 -0
  529. data/vendor/ggml/examples/yolo/data/labels/40_1.png +0 -0
  530. data/vendor/ggml/examples/yolo/data/labels/40_2.png +0 -0
  531. data/vendor/ggml/examples/yolo/data/labels/40_3.png +0 -0
  532. data/vendor/ggml/examples/yolo/data/labels/40_4.png +0 -0
  533. data/vendor/ggml/examples/yolo/data/labels/40_5.png +0 -0
  534. data/vendor/ggml/examples/yolo/data/labels/40_6.png +0 -0
  535. data/vendor/ggml/examples/yolo/data/labels/40_7.png +0 -0
  536. data/vendor/ggml/examples/yolo/data/labels/41_0.png +0 -0
  537. data/vendor/ggml/examples/yolo/data/labels/41_1.png +0 -0
  538. data/vendor/ggml/examples/yolo/data/labels/41_2.png +0 -0
  539. data/vendor/ggml/examples/yolo/data/labels/41_3.png +0 -0
  540. data/vendor/ggml/examples/yolo/data/labels/41_4.png +0 -0
  541. data/vendor/ggml/examples/yolo/data/labels/41_5.png +0 -0
  542. data/vendor/ggml/examples/yolo/data/labels/41_6.png +0 -0
  543. data/vendor/ggml/examples/yolo/data/labels/41_7.png +0 -0
  544. data/vendor/ggml/examples/yolo/data/labels/42_0.png +0 -0
  545. data/vendor/ggml/examples/yolo/data/labels/42_1.png +0 -0
  546. data/vendor/ggml/examples/yolo/data/labels/42_2.png +0 -0
  547. data/vendor/ggml/examples/yolo/data/labels/42_3.png +0 -0
  548. data/vendor/ggml/examples/yolo/data/labels/42_4.png +0 -0
  549. data/vendor/ggml/examples/yolo/data/labels/42_5.png +0 -0
  550. data/vendor/ggml/examples/yolo/data/labels/42_6.png +0 -0
  551. data/vendor/ggml/examples/yolo/data/labels/42_7.png +0 -0
  552. data/vendor/ggml/examples/yolo/data/labels/43_0.png +0 -0
  553. data/vendor/ggml/examples/yolo/data/labels/43_1.png +0 -0
  554. data/vendor/ggml/examples/yolo/data/labels/43_2.png +0 -0
  555. data/vendor/ggml/examples/yolo/data/labels/43_3.png +0 -0
  556. data/vendor/ggml/examples/yolo/data/labels/43_4.png +0 -0
  557. data/vendor/ggml/examples/yolo/data/labels/43_5.png +0 -0
  558. data/vendor/ggml/examples/yolo/data/labels/43_6.png +0 -0
  559. data/vendor/ggml/examples/yolo/data/labels/43_7.png +0 -0
  560. data/vendor/ggml/examples/yolo/data/labels/44_0.png +0 -0
  561. data/vendor/ggml/examples/yolo/data/labels/44_1.png +0 -0
  562. data/vendor/ggml/examples/yolo/data/labels/44_2.png +0 -0
  563. data/vendor/ggml/examples/yolo/data/labels/44_3.png +0 -0
  564. data/vendor/ggml/examples/yolo/data/labels/44_4.png +0 -0
  565. data/vendor/ggml/examples/yolo/data/labels/44_5.png +0 -0
  566. data/vendor/ggml/examples/yolo/data/labels/44_6.png +0 -0
  567. data/vendor/ggml/examples/yolo/data/labels/44_7.png +0 -0
  568. data/vendor/ggml/examples/yolo/data/labels/45_0.png +0 -0
  569. data/vendor/ggml/examples/yolo/data/labels/45_1.png +0 -0
  570. data/vendor/ggml/examples/yolo/data/labels/45_2.png +0 -0
  571. data/vendor/ggml/examples/yolo/data/labels/45_3.png +0 -0
  572. data/vendor/ggml/examples/yolo/data/labels/45_4.png +0 -0
  573. data/vendor/ggml/examples/yolo/data/labels/45_5.png +0 -0
  574. data/vendor/ggml/examples/yolo/data/labels/45_6.png +0 -0
  575. data/vendor/ggml/examples/yolo/data/labels/45_7.png +0 -0
  576. data/vendor/ggml/examples/yolo/data/labels/46_0.png +0 -0
  577. data/vendor/ggml/examples/yolo/data/labels/46_1.png +0 -0
  578. data/vendor/ggml/examples/yolo/data/labels/46_2.png +0 -0
  579. data/vendor/ggml/examples/yolo/data/labels/46_3.png +0 -0
  580. data/vendor/ggml/examples/yolo/data/labels/46_4.png +0 -0
  581. data/vendor/ggml/examples/yolo/data/labels/46_5.png +0 -0
  582. data/vendor/ggml/examples/yolo/data/labels/46_6.png +0 -0
  583. data/vendor/ggml/examples/yolo/data/labels/46_7.png +0 -0
  584. data/vendor/ggml/examples/yolo/data/labels/47_0.png +0 -0
  585. data/vendor/ggml/examples/yolo/data/labels/47_1.png +0 -0
  586. data/vendor/ggml/examples/yolo/data/labels/47_2.png +0 -0
  587. data/vendor/ggml/examples/yolo/data/labels/47_3.png +0 -0
  588. data/vendor/ggml/examples/yolo/data/labels/47_4.png +0 -0
  589. data/vendor/ggml/examples/yolo/data/labels/47_5.png +0 -0
  590. data/vendor/ggml/examples/yolo/data/labels/47_6.png +0 -0
  591. data/vendor/ggml/examples/yolo/data/labels/47_7.png +0 -0
  592. data/vendor/ggml/examples/yolo/data/labels/48_0.png +0 -0
  593. data/vendor/ggml/examples/yolo/data/labels/48_1.png +0 -0
  594. data/vendor/ggml/examples/yolo/data/labels/48_2.png +0 -0
  595. data/vendor/ggml/examples/yolo/data/labels/48_3.png +0 -0
  596. data/vendor/ggml/examples/yolo/data/labels/48_4.png +0 -0
  597. data/vendor/ggml/examples/yolo/data/labels/48_5.png +0 -0
  598. data/vendor/ggml/examples/yolo/data/labels/48_6.png +0 -0
  599. data/vendor/ggml/examples/yolo/data/labels/48_7.png +0 -0
  600. data/vendor/ggml/examples/yolo/data/labels/49_0.png +0 -0
  601. data/vendor/ggml/examples/yolo/data/labels/49_1.png +0 -0
  602. data/vendor/ggml/examples/yolo/data/labels/49_2.png +0 -0
  603. data/vendor/ggml/examples/yolo/data/labels/49_3.png +0 -0
  604. data/vendor/ggml/examples/yolo/data/labels/49_4.png +0 -0
  605. data/vendor/ggml/examples/yolo/data/labels/49_5.png +0 -0
  606. data/vendor/ggml/examples/yolo/data/labels/49_6.png +0 -0
  607. data/vendor/ggml/examples/yolo/data/labels/49_7.png +0 -0
  608. data/vendor/ggml/examples/yolo/data/labels/50_0.png +0 -0
  609. data/vendor/ggml/examples/yolo/data/labels/50_1.png +0 -0
  610. data/vendor/ggml/examples/yolo/data/labels/50_2.png +0 -0
  611. data/vendor/ggml/examples/yolo/data/labels/50_3.png +0 -0
  612. data/vendor/ggml/examples/yolo/data/labels/50_4.png +0 -0
  613. data/vendor/ggml/examples/yolo/data/labels/50_5.png +0 -0
  614. data/vendor/ggml/examples/yolo/data/labels/50_6.png +0 -0
  615. data/vendor/ggml/examples/yolo/data/labels/50_7.png +0 -0
  616. data/vendor/ggml/examples/yolo/data/labels/51_0.png +0 -0
  617. data/vendor/ggml/examples/yolo/data/labels/51_1.png +0 -0
  618. data/vendor/ggml/examples/yolo/data/labels/51_2.png +0 -0
  619. data/vendor/ggml/examples/yolo/data/labels/51_3.png +0 -0
  620. data/vendor/ggml/examples/yolo/data/labels/51_4.png +0 -0
  621. data/vendor/ggml/examples/yolo/data/labels/51_5.png +0 -0
  622. data/vendor/ggml/examples/yolo/data/labels/51_6.png +0 -0
  623. data/vendor/ggml/examples/yolo/data/labels/51_7.png +0 -0
  624. data/vendor/ggml/examples/yolo/data/labels/52_0.png +0 -0
  625. data/vendor/ggml/examples/yolo/data/labels/52_1.png +0 -0
  626. data/vendor/ggml/examples/yolo/data/labels/52_2.png +0 -0
  627. data/vendor/ggml/examples/yolo/data/labels/52_3.png +0 -0
  628. data/vendor/ggml/examples/yolo/data/labels/52_4.png +0 -0
  629. data/vendor/ggml/examples/yolo/data/labels/52_5.png +0 -0
  630. data/vendor/ggml/examples/yolo/data/labels/52_6.png +0 -0
  631. data/vendor/ggml/examples/yolo/data/labels/52_7.png +0 -0
  632. data/vendor/ggml/examples/yolo/data/labels/53_0.png +0 -0
  633. data/vendor/ggml/examples/yolo/data/labels/53_1.png +0 -0
  634. data/vendor/ggml/examples/yolo/data/labels/53_2.png +0 -0
  635. data/vendor/ggml/examples/yolo/data/labels/53_3.png +0 -0
  636. data/vendor/ggml/examples/yolo/data/labels/53_4.png +0 -0
  637. data/vendor/ggml/examples/yolo/data/labels/53_5.png +0 -0
  638. data/vendor/ggml/examples/yolo/data/labels/53_6.png +0 -0
  639. data/vendor/ggml/examples/yolo/data/labels/53_7.png +0 -0
  640. data/vendor/ggml/examples/yolo/data/labels/54_0.png +0 -0
  641. data/vendor/ggml/examples/yolo/data/labels/54_1.png +0 -0
  642. data/vendor/ggml/examples/yolo/data/labels/54_2.png +0 -0
  643. data/vendor/ggml/examples/yolo/data/labels/54_3.png +0 -0
  644. data/vendor/ggml/examples/yolo/data/labels/54_4.png +0 -0
  645. data/vendor/ggml/examples/yolo/data/labels/54_5.png +0 -0
  646. data/vendor/ggml/examples/yolo/data/labels/54_6.png +0 -0
  647. data/vendor/ggml/examples/yolo/data/labels/54_7.png +0 -0
  648. data/vendor/ggml/examples/yolo/data/labels/55_0.png +0 -0
  649. data/vendor/ggml/examples/yolo/data/labels/55_1.png +0 -0
  650. data/vendor/ggml/examples/yolo/data/labels/55_2.png +0 -0
  651. data/vendor/ggml/examples/yolo/data/labels/55_3.png +0 -0
  652. data/vendor/ggml/examples/yolo/data/labels/55_4.png +0 -0
  653. data/vendor/ggml/examples/yolo/data/labels/55_5.png +0 -0
  654. data/vendor/ggml/examples/yolo/data/labels/55_6.png +0 -0
  655. data/vendor/ggml/examples/yolo/data/labels/55_7.png +0 -0
  656. data/vendor/ggml/examples/yolo/data/labels/56_0.png +0 -0
  657. data/vendor/ggml/examples/yolo/data/labels/56_1.png +0 -0
  658. data/vendor/ggml/examples/yolo/data/labels/56_2.png +0 -0
  659. data/vendor/ggml/examples/yolo/data/labels/56_3.png +0 -0
  660. data/vendor/ggml/examples/yolo/data/labels/56_4.png +0 -0
  661. data/vendor/ggml/examples/yolo/data/labels/56_5.png +0 -0
  662. data/vendor/ggml/examples/yolo/data/labels/56_6.png +0 -0
  663. data/vendor/ggml/examples/yolo/data/labels/56_7.png +0 -0
  664. data/vendor/ggml/examples/yolo/data/labels/57_0.png +0 -0
  665. data/vendor/ggml/examples/yolo/data/labels/57_1.png +0 -0
  666. data/vendor/ggml/examples/yolo/data/labels/57_2.png +0 -0
  667. data/vendor/ggml/examples/yolo/data/labels/57_3.png +0 -0
  668. data/vendor/ggml/examples/yolo/data/labels/57_4.png +0 -0
  669. data/vendor/ggml/examples/yolo/data/labels/57_5.png +0 -0
  670. data/vendor/ggml/examples/yolo/data/labels/57_6.png +0 -0
  671. data/vendor/ggml/examples/yolo/data/labels/57_7.png +0 -0
  672. data/vendor/ggml/examples/yolo/data/labels/58_0.png +0 -0
  673. data/vendor/ggml/examples/yolo/data/labels/58_1.png +0 -0
  674. data/vendor/ggml/examples/yolo/data/labels/58_2.png +0 -0
  675. data/vendor/ggml/examples/yolo/data/labels/58_3.png +0 -0
  676. data/vendor/ggml/examples/yolo/data/labels/58_4.png +0 -0
  677. data/vendor/ggml/examples/yolo/data/labels/58_5.png +0 -0
  678. data/vendor/ggml/examples/yolo/data/labels/58_6.png +0 -0
  679. data/vendor/ggml/examples/yolo/data/labels/58_7.png +0 -0
  680. data/vendor/ggml/examples/yolo/data/labels/59_0.png +0 -0
  681. data/vendor/ggml/examples/yolo/data/labels/59_1.png +0 -0
  682. data/vendor/ggml/examples/yolo/data/labels/59_2.png +0 -0
  683. data/vendor/ggml/examples/yolo/data/labels/59_3.png +0 -0
  684. data/vendor/ggml/examples/yolo/data/labels/59_4.png +0 -0
  685. data/vendor/ggml/examples/yolo/data/labels/59_5.png +0 -0
  686. data/vendor/ggml/examples/yolo/data/labels/59_6.png +0 -0
  687. data/vendor/ggml/examples/yolo/data/labels/59_7.png +0 -0
  688. data/vendor/ggml/examples/yolo/data/labels/60_0.png +0 -0
  689. data/vendor/ggml/examples/yolo/data/labels/60_1.png +0 -0
  690. data/vendor/ggml/examples/yolo/data/labels/60_2.png +0 -0
  691. data/vendor/ggml/examples/yolo/data/labels/60_3.png +0 -0
  692. data/vendor/ggml/examples/yolo/data/labels/60_4.png +0 -0
  693. data/vendor/ggml/examples/yolo/data/labels/60_5.png +0 -0
  694. data/vendor/ggml/examples/yolo/data/labels/60_6.png +0 -0
  695. data/vendor/ggml/examples/yolo/data/labels/60_7.png +0 -0
  696. data/vendor/ggml/examples/yolo/data/labels/61_0.png +0 -0
  697. data/vendor/ggml/examples/yolo/data/labels/61_1.png +0 -0
  698. data/vendor/ggml/examples/yolo/data/labels/61_2.png +0 -0
  699. data/vendor/ggml/examples/yolo/data/labels/61_3.png +0 -0
  700. data/vendor/ggml/examples/yolo/data/labels/61_4.png +0 -0
  701. data/vendor/ggml/examples/yolo/data/labels/61_5.png +0 -0
  702. data/vendor/ggml/examples/yolo/data/labels/61_6.png +0 -0
  703. data/vendor/ggml/examples/yolo/data/labels/61_7.png +0 -0
  704. data/vendor/ggml/examples/yolo/data/labels/62_0.png +0 -0
  705. data/vendor/ggml/examples/yolo/data/labels/62_1.png +0 -0
  706. data/vendor/ggml/examples/yolo/data/labels/62_2.png +0 -0
  707. data/vendor/ggml/examples/yolo/data/labels/62_3.png +0 -0
  708. data/vendor/ggml/examples/yolo/data/labels/62_4.png +0 -0
  709. data/vendor/ggml/examples/yolo/data/labels/62_5.png +0 -0
  710. data/vendor/ggml/examples/yolo/data/labels/62_6.png +0 -0
  711. data/vendor/ggml/examples/yolo/data/labels/62_7.png +0 -0
  712. data/vendor/ggml/examples/yolo/data/labels/63_0.png +0 -0
  713. data/vendor/ggml/examples/yolo/data/labels/63_1.png +0 -0
  714. data/vendor/ggml/examples/yolo/data/labels/63_2.png +0 -0
  715. data/vendor/ggml/examples/yolo/data/labels/63_3.png +0 -0
  716. data/vendor/ggml/examples/yolo/data/labels/63_4.png +0 -0
  717. data/vendor/ggml/examples/yolo/data/labels/63_5.png +0 -0
  718. data/vendor/ggml/examples/yolo/data/labels/63_6.png +0 -0
  719. data/vendor/ggml/examples/yolo/data/labels/63_7.png +0 -0
  720. data/vendor/ggml/examples/yolo/data/labels/64_0.png +0 -0
  721. data/vendor/ggml/examples/yolo/data/labels/64_1.png +0 -0
  722. data/vendor/ggml/examples/yolo/data/labels/64_2.png +0 -0
  723. data/vendor/ggml/examples/yolo/data/labels/64_3.png +0 -0
  724. data/vendor/ggml/examples/yolo/data/labels/64_4.png +0 -0
  725. data/vendor/ggml/examples/yolo/data/labels/64_5.png +0 -0
  726. data/vendor/ggml/examples/yolo/data/labels/64_6.png +0 -0
  727. data/vendor/ggml/examples/yolo/data/labels/64_7.png +0 -0
  728. data/vendor/ggml/examples/yolo/data/labels/65_0.png +0 -0
  729. data/vendor/ggml/examples/yolo/data/labels/65_1.png +0 -0
  730. data/vendor/ggml/examples/yolo/data/labels/65_2.png +0 -0
  731. data/vendor/ggml/examples/yolo/data/labels/65_3.png +0 -0
  732. data/vendor/ggml/examples/yolo/data/labels/65_4.png +0 -0
  733. data/vendor/ggml/examples/yolo/data/labels/65_5.png +0 -0
  734. data/vendor/ggml/examples/yolo/data/labels/65_6.png +0 -0
  735. data/vendor/ggml/examples/yolo/data/labels/65_7.png +0 -0
  736. data/vendor/ggml/examples/yolo/data/labels/66_0.png +0 -0
  737. data/vendor/ggml/examples/yolo/data/labels/66_1.png +0 -0
  738. data/vendor/ggml/examples/yolo/data/labels/66_2.png +0 -0
  739. data/vendor/ggml/examples/yolo/data/labels/66_3.png +0 -0
  740. data/vendor/ggml/examples/yolo/data/labels/66_4.png +0 -0
  741. data/vendor/ggml/examples/yolo/data/labels/66_5.png +0 -0
  742. data/vendor/ggml/examples/yolo/data/labels/66_6.png +0 -0
  743. data/vendor/ggml/examples/yolo/data/labels/66_7.png +0 -0
  744. data/vendor/ggml/examples/yolo/data/labels/67_0.png +0 -0
  745. data/vendor/ggml/examples/yolo/data/labels/67_1.png +0 -0
  746. data/vendor/ggml/examples/yolo/data/labels/67_2.png +0 -0
  747. data/vendor/ggml/examples/yolo/data/labels/67_3.png +0 -0
  748. data/vendor/ggml/examples/yolo/data/labels/67_4.png +0 -0
  749. data/vendor/ggml/examples/yolo/data/labels/67_5.png +0 -0
  750. data/vendor/ggml/examples/yolo/data/labels/67_6.png +0 -0
  751. data/vendor/ggml/examples/yolo/data/labels/67_7.png +0 -0
  752. data/vendor/ggml/examples/yolo/data/labels/68_0.png +0 -0
  753. data/vendor/ggml/examples/yolo/data/labels/68_1.png +0 -0
  754. data/vendor/ggml/examples/yolo/data/labels/68_2.png +0 -0
  755. data/vendor/ggml/examples/yolo/data/labels/68_3.png +0 -0
  756. data/vendor/ggml/examples/yolo/data/labels/68_4.png +0 -0
  757. data/vendor/ggml/examples/yolo/data/labels/68_5.png +0 -0
  758. data/vendor/ggml/examples/yolo/data/labels/68_6.png +0 -0
  759. data/vendor/ggml/examples/yolo/data/labels/68_7.png +0 -0
  760. data/vendor/ggml/examples/yolo/data/labels/69_0.png +0 -0
  761. data/vendor/ggml/examples/yolo/data/labels/69_1.png +0 -0
  762. data/vendor/ggml/examples/yolo/data/labels/69_2.png +0 -0
  763. data/vendor/ggml/examples/yolo/data/labels/69_3.png +0 -0
  764. data/vendor/ggml/examples/yolo/data/labels/69_4.png +0 -0
  765. data/vendor/ggml/examples/yolo/data/labels/69_5.png +0 -0
  766. data/vendor/ggml/examples/yolo/data/labels/69_6.png +0 -0
  767. data/vendor/ggml/examples/yolo/data/labels/69_7.png +0 -0
  768. data/vendor/ggml/examples/yolo/data/labels/70_0.png +0 -0
  769. data/vendor/ggml/examples/yolo/data/labels/70_1.png +0 -0
  770. data/vendor/ggml/examples/yolo/data/labels/70_2.png +0 -0
  771. data/vendor/ggml/examples/yolo/data/labels/70_3.png +0 -0
  772. data/vendor/ggml/examples/yolo/data/labels/70_4.png +0 -0
  773. data/vendor/ggml/examples/yolo/data/labels/70_5.png +0 -0
  774. data/vendor/ggml/examples/yolo/data/labels/70_6.png +0 -0
  775. data/vendor/ggml/examples/yolo/data/labels/70_7.png +0 -0
  776. data/vendor/ggml/examples/yolo/data/labels/71_0.png +0 -0
  777. data/vendor/ggml/examples/yolo/data/labels/71_1.png +0 -0
  778. data/vendor/ggml/examples/yolo/data/labels/71_2.png +0 -0
  779. data/vendor/ggml/examples/yolo/data/labels/71_3.png +0 -0
  780. data/vendor/ggml/examples/yolo/data/labels/71_4.png +0 -0
  781. data/vendor/ggml/examples/yolo/data/labels/71_5.png +0 -0
  782. data/vendor/ggml/examples/yolo/data/labels/71_6.png +0 -0
  783. data/vendor/ggml/examples/yolo/data/labels/71_7.png +0 -0
  784. data/vendor/ggml/examples/yolo/data/labels/72_0.png +0 -0
  785. data/vendor/ggml/examples/yolo/data/labels/72_1.png +0 -0
  786. data/vendor/ggml/examples/yolo/data/labels/72_2.png +0 -0
  787. data/vendor/ggml/examples/yolo/data/labels/72_3.png +0 -0
  788. data/vendor/ggml/examples/yolo/data/labels/72_4.png +0 -0
  789. data/vendor/ggml/examples/yolo/data/labels/72_5.png +0 -0
  790. data/vendor/ggml/examples/yolo/data/labels/72_6.png +0 -0
  791. data/vendor/ggml/examples/yolo/data/labels/72_7.png +0 -0
  792. data/vendor/ggml/examples/yolo/data/labels/73_0.png +0 -0
  793. data/vendor/ggml/examples/yolo/data/labels/73_1.png +0 -0
  794. data/vendor/ggml/examples/yolo/data/labels/73_2.png +0 -0
  795. data/vendor/ggml/examples/yolo/data/labels/73_3.png +0 -0
  796. data/vendor/ggml/examples/yolo/data/labels/73_4.png +0 -0
  797. data/vendor/ggml/examples/yolo/data/labels/73_5.png +0 -0
  798. data/vendor/ggml/examples/yolo/data/labels/73_6.png +0 -0
  799. data/vendor/ggml/examples/yolo/data/labels/73_7.png +0 -0
  800. data/vendor/ggml/examples/yolo/data/labels/74_0.png +0 -0
  801. data/vendor/ggml/examples/yolo/data/labels/74_1.png +0 -0
  802. data/vendor/ggml/examples/yolo/data/labels/74_2.png +0 -0
  803. data/vendor/ggml/examples/yolo/data/labels/74_3.png +0 -0
  804. data/vendor/ggml/examples/yolo/data/labels/74_4.png +0 -0
  805. data/vendor/ggml/examples/yolo/data/labels/74_5.png +0 -0
  806. data/vendor/ggml/examples/yolo/data/labels/74_6.png +0 -0
  807. data/vendor/ggml/examples/yolo/data/labels/74_7.png +0 -0
  808. data/vendor/ggml/examples/yolo/data/labels/75_0.png +0 -0
  809. data/vendor/ggml/examples/yolo/data/labels/75_1.png +0 -0
  810. data/vendor/ggml/examples/yolo/data/labels/75_2.png +0 -0
  811. data/vendor/ggml/examples/yolo/data/labels/75_3.png +0 -0
  812. data/vendor/ggml/examples/yolo/data/labels/75_4.png +0 -0
  813. data/vendor/ggml/examples/yolo/data/labels/75_5.png +0 -0
  814. data/vendor/ggml/examples/yolo/data/labels/75_6.png +0 -0
  815. data/vendor/ggml/examples/yolo/data/labels/75_7.png +0 -0
  816. data/vendor/ggml/examples/yolo/data/labels/76_0.png +0 -0
  817. data/vendor/ggml/examples/yolo/data/labels/76_1.png +0 -0
  818. data/vendor/ggml/examples/yolo/data/labels/76_2.png +0 -0
  819. data/vendor/ggml/examples/yolo/data/labels/76_3.png +0 -0
  820. data/vendor/ggml/examples/yolo/data/labels/76_4.png +0 -0
  821. data/vendor/ggml/examples/yolo/data/labels/76_5.png +0 -0
  822. data/vendor/ggml/examples/yolo/data/labels/76_6.png +0 -0
  823. data/vendor/ggml/examples/yolo/data/labels/76_7.png +0 -0
  824. data/vendor/ggml/examples/yolo/data/labels/77_0.png +0 -0
  825. data/vendor/ggml/examples/yolo/data/labels/77_1.png +0 -0
  826. data/vendor/ggml/examples/yolo/data/labels/77_2.png +0 -0
  827. data/vendor/ggml/examples/yolo/data/labels/77_3.png +0 -0
  828. data/vendor/ggml/examples/yolo/data/labels/77_4.png +0 -0
  829. data/vendor/ggml/examples/yolo/data/labels/77_5.png +0 -0
  830. data/vendor/ggml/examples/yolo/data/labels/77_6.png +0 -0
  831. data/vendor/ggml/examples/yolo/data/labels/77_7.png +0 -0
  832. data/vendor/ggml/examples/yolo/data/labels/78_0.png +0 -0
  833. data/vendor/ggml/examples/yolo/data/labels/78_1.png +0 -0
  834. data/vendor/ggml/examples/yolo/data/labels/78_2.png +0 -0
  835. data/vendor/ggml/examples/yolo/data/labels/78_3.png +0 -0
  836. data/vendor/ggml/examples/yolo/data/labels/78_4.png +0 -0
  837. data/vendor/ggml/examples/yolo/data/labels/78_5.png +0 -0
  838. data/vendor/ggml/examples/yolo/data/labels/78_6.png +0 -0
  839. data/vendor/ggml/examples/yolo/data/labels/78_7.png +0 -0
  840. data/vendor/ggml/examples/yolo/data/labels/79_0.png +0 -0
  841. data/vendor/ggml/examples/yolo/data/labels/79_1.png +0 -0
  842. data/vendor/ggml/examples/yolo/data/labels/79_2.png +0 -0
  843. data/vendor/ggml/examples/yolo/data/labels/79_3.png +0 -0
  844. data/vendor/ggml/examples/yolo/data/labels/79_4.png +0 -0
  845. data/vendor/ggml/examples/yolo/data/labels/79_5.png +0 -0
  846. data/vendor/ggml/examples/yolo/data/labels/79_6.png +0 -0
  847. data/vendor/ggml/examples/yolo/data/labels/79_7.png +0 -0
  848. data/vendor/ggml/examples/yolo/data/labels/80_0.png +0 -0
  849. data/vendor/ggml/examples/yolo/data/labels/80_1.png +0 -0
  850. data/vendor/ggml/examples/yolo/data/labels/80_2.png +0 -0
  851. data/vendor/ggml/examples/yolo/data/labels/80_3.png +0 -0
  852. data/vendor/ggml/examples/yolo/data/labels/80_4.png +0 -0
  853. data/vendor/ggml/examples/yolo/data/labels/80_5.png +0 -0
  854. data/vendor/ggml/examples/yolo/data/labels/80_6.png +0 -0
  855. data/vendor/ggml/examples/yolo/data/labels/80_7.png +0 -0
  856. data/vendor/ggml/examples/yolo/data/labels/81_0.png +0 -0
  857. data/vendor/ggml/examples/yolo/data/labels/81_1.png +0 -0
  858. data/vendor/ggml/examples/yolo/data/labels/81_2.png +0 -0
  859. data/vendor/ggml/examples/yolo/data/labels/81_3.png +0 -0
  860. data/vendor/ggml/examples/yolo/data/labels/81_4.png +0 -0
  861. data/vendor/ggml/examples/yolo/data/labels/81_5.png +0 -0
  862. data/vendor/ggml/examples/yolo/data/labels/81_6.png +0 -0
  863. data/vendor/ggml/examples/yolo/data/labels/81_7.png +0 -0
  864. data/vendor/ggml/examples/yolo/data/labels/82_0.png +0 -0
  865. data/vendor/ggml/examples/yolo/data/labels/82_1.png +0 -0
  866. data/vendor/ggml/examples/yolo/data/labels/82_2.png +0 -0
  867. data/vendor/ggml/examples/yolo/data/labels/82_3.png +0 -0
  868. data/vendor/ggml/examples/yolo/data/labels/82_4.png +0 -0
  869. data/vendor/ggml/examples/yolo/data/labels/82_5.png +0 -0
  870. data/vendor/ggml/examples/yolo/data/labels/82_6.png +0 -0
  871. data/vendor/ggml/examples/yolo/data/labels/82_7.png +0 -0
  872. data/vendor/ggml/examples/yolo/data/labels/83_0.png +0 -0
  873. data/vendor/ggml/examples/yolo/data/labels/83_1.png +0 -0
  874. data/vendor/ggml/examples/yolo/data/labels/83_2.png +0 -0
  875. data/vendor/ggml/examples/yolo/data/labels/83_3.png +0 -0
  876. data/vendor/ggml/examples/yolo/data/labels/83_4.png +0 -0
  877. data/vendor/ggml/examples/yolo/data/labels/83_5.png +0 -0
  878. data/vendor/ggml/examples/yolo/data/labels/83_6.png +0 -0
  879. data/vendor/ggml/examples/yolo/data/labels/83_7.png +0 -0
  880. data/vendor/ggml/examples/yolo/data/labels/84_0.png +0 -0
  881. data/vendor/ggml/examples/yolo/data/labels/84_1.png +0 -0
  882. data/vendor/ggml/examples/yolo/data/labels/84_2.png +0 -0
  883. data/vendor/ggml/examples/yolo/data/labels/84_3.png +0 -0
  884. data/vendor/ggml/examples/yolo/data/labels/84_4.png +0 -0
  885. data/vendor/ggml/examples/yolo/data/labels/84_5.png +0 -0
  886. data/vendor/ggml/examples/yolo/data/labels/84_6.png +0 -0
  887. data/vendor/ggml/examples/yolo/data/labels/84_7.png +0 -0
  888. data/vendor/ggml/examples/yolo/data/labels/85_0.png +0 -0
  889. data/vendor/ggml/examples/yolo/data/labels/85_1.png +0 -0
  890. data/vendor/ggml/examples/yolo/data/labels/85_2.png +0 -0
  891. data/vendor/ggml/examples/yolo/data/labels/85_3.png +0 -0
  892. data/vendor/ggml/examples/yolo/data/labels/85_4.png +0 -0
  893. data/vendor/ggml/examples/yolo/data/labels/85_5.png +0 -0
  894. data/vendor/ggml/examples/yolo/data/labels/85_6.png +0 -0
  895. data/vendor/ggml/examples/yolo/data/labels/85_7.png +0 -0
  896. data/vendor/ggml/examples/yolo/data/labels/86_0.png +0 -0
  897. data/vendor/ggml/examples/yolo/data/labels/86_1.png +0 -0
  898. data/vendor/ggml/examples/yolo/data/labels/86_2.png +0 -0
  899. data/vendor/ggml/examples/yolo/data/labels/86_3.png +0 -0
  900. data/vendor/ggml/examples/yolo/data/labels/86_4.png +0 -0
  901. data/vendor/ggml/examples/yolo/data/labels/86_5.png +0 -0
  902. data/vendor/ggml/examples/yolo/data/labels/86_6.png +0 -0
  903. data/vendor/ggml/examples/yolo/data/labels/86_7.png +0 -0
  904. data/vendor/ggml/examples/yolo/data/labels/87_0.png +0 -0
  905. data/vendor/ggml/examples/yolo/data/labels/87_1.png +0 -0
  906. data/vendor/ggml/examples/yolo/data/labels/87_2.png +0 -0
  907. data/vendor/ggml/examples/yolo/data/labels/87_3.png +0 -0
  908. data/vendor/ggml/examples/yolo/data/labels/87_4.png +0 -0
  909. data/vendor/ggml/examples/yolo/data/labels/87_5.png +0 -0
  910. data/vendor/ggml/examples/yolo/data/labels/87_6.png +0 -0
  911. data/vendor/ggml/examples/yolo/data/labels/87_7.png +0 -0
  912. data/vendor/ggml/examples/yolo/data/labels/88_0.png +0 -0
  913. data/vendor/ggml/examples/yolo/data/labels/88_1.png +0 -0
  914. data/vendor/ggml/examples/yolo/data/labels/88_2.png +0 -0
  915. data/vendor/ggml/examples/yolo/data/labels/88_3.png +0 -0
  916. data/vendor/ggml/examples/yolo/data/labels/88_4.png +0 -0
  917. data/vendor/ggml/examples/yolo/data/labels/88_5.png +0 -0
  918. data/vendor/ggml/examples/yolo/data/labels/88_6.png +0 -0
  919. data/vendor/ggml/examples/yolo/data/labels/88_7.png +0 -0
  920. data/vendor/ggml/examples/yolo/data/labels/89_0.png +0 -0
  921. data/vendor/ggml/examples/yolo/data/labels/89_1.png +0 -0
  922. data/vendor/ggml/examples/yolo/data/labels/89_2.png +0 -0
  923. data/vendor/ggml/examples/yolo/data/labels/89_3.png +0 -0
  924. data/vendor/ggml/examples/yolo/data/labels/89_4.png +0 -0
  925. data/vendor/ggml/examples/yolo/data/labels/89_5.png +0 -0
  926. data/vendor/ggml/examples/yolo/data/labels/89_6.png +0 -0
  927. data/vendor/ggml/examples/yolo/data/labels/89_7.png +0 -0
  928. data/vendor/ggml/examples/yolo/data/labels/90_0.png +0 -0
  929. data/vendor/ggml/examples/yolo/data/labels/90_1.png +0 -0
  930. data/vendor/ggml/examples/yolo/data/labels/90_2.png +0 -0
  931. data/vendor/ggml/examples/yolo/data/labels/90_3.png +0 -0
  932. data/vendor/ggml/examples/yolo/data/labels/90_4.png +0 -0
  933. data/vendor/ggml/examples/yolo/data/labels/90_5.png +0 -0
  934. data/vendor/ggml/examples/yolo/data/labels/90_6.png +0 -0
  935. data/vendor/ggml/examples/yolo/data/labels/90_7.png +0 -0
  936. data/vendor/ggml/examples/yolo/data/labels/91_0.png +0 -0
  937. data/vendor/ggml/examples/yolo/data/labels/91_1.png +0 -0
  938. data/vendor/ggml/examples/yolo/data/labels/91_2.png +0 -0
  939. data/vendor/ggml/examples/yolo/data/labels/91_3.png +0 -0
  940. data/vendor/ggml/examples/yolo/data/labels/91_4.png +0 -0
  941. data/vendor/ggml/examples/yolo/data/labels/91_5.png +0 -0
  942. data/vendor/ggml/examples/yolo/data/labels/91_6.png +0 -0
  943. data/vendor/ggml/examples/yolo/data/labels/91_7.png +0 -0
  944. data/vendor/ggml/examples/yolo/data/labels/92_0.png +0 -0
  945. data/vendor/ggml/examples/yolo/data/labels/92_1.png +0 -0
  946. data/vendor/ggml/examples/yolo/data/labels/92_2.png +0 -0
  947. data/vendor/ggml/examples/yolo/data/labels/92_3.png +0 -0
  948. data/vendor/ggml/examples/yolo/data/labels/92_4.png +0 -0
  949. data/vendor/ggml/examples/yolo/data/labels/92_5.png +0 -0
  950. data/vendor/ggml/examples/yolo/data/labels/92_6.png +0 -0
  951. data/vendor/ggml/examples/yolo/data/labels/92_7.png +0 -0
  952. data/vendor/ggml/examples/yolo/data/labels/93_0.png +0 -0
  953. data/vendor/ggml/examples/yolo/data/labels/93_1.png +0 -0
  954. data/vendor/ggml/examples/yolo/data/labels/93_2.png +0 -0
  955. data/vendor/ggml/examples/yolo/data/labels/93_3.png +0 -0
  956. data/vendor/ggml/examples/yolo/data/labels/93_4.png +0 -0
  957. data/vendor/ggml/examples/yolo/data/labels/93_5.png +0 -0
  958. data/vendor/ggml/examples/yolo/data/labels/93_6.png +0 -0
  959. data/vendor/ggml/examples/yolo/data/labels/93_7.png +0 -0
  960. data/vendor/ggml/examples/yolo/data/labels/94_0.png +0 -0
  961. data/vendor/ggml/examples/yolo/data/labels/94_1.png +0 -0
  962. data/vendor/ggml/examples/yolo/data/labels/94_2.png +0 -0
  963. data/vendor/ggml/examples/yolo/data/labels/94_3.png +0 -0
  964. data/vendor/ggml/examples/yolo/data/labels/94_4.png +0 -0
  965. data/vendor/ggml/examples/yolo/data/labels/94_5.png +0 -0
  966. data/vendor/ggml/examples/yolo/data/labels/94_6.png +0 -0
  967. data/vendor/ggml/examples/yolo/data/labels/94_7.png +0 -0
  968. data/vendor/ggml/examples/yolo/data/labels/95_0.png +0 -0
  969. data/vendor/ggml/examples/yolo/data/labels/95_1.png +0 -0
  970. data/vendor/ggml/examples/yolo/data/labels/95_2.png +0 -0
  971. data/vendor/ggml/examples/yolo/data/labels/95_3.png +0 -0
  972. data/vendor/ggml/examples/yolo/data/labels/95_4.png +0 -0
  973. data/vendor/ggml/examples/yolo/data/labels/95_5.png +0 -0
  974. data/vendor/ggml/examples/yolo/data/labels/95_6.png +0 -0
  975. data/vendor/ggml/examples/yolo/data/labels/95_7.png +0 -0
  976. data/vendor/ggml/examples/yolo/data/labels/96_0.png +0 -0
  977. data/vendor/ggml/examples/yolo/data/labels/96_1.png +0 -0
  978. data/vendor/ggml/examples/yolo/data/labels/96_2.png +0 -0
  979. data/vendor/ggml/examples/yolo/data/labels/96_3.png +0 -0
  980. data/vendor/ggml/examples/yolo/data/labels/96_4.png +0 -0
  981. data/vendor/ggml/examples/yolo/data/labels/96_5.png +0 -0
  982. data/vendor/ggml/examples/yolo/data/labels/96_6.png +0 -0
  983. data/vendor/ggml/examples/yolo/data/labels/96_7.png +0 -0
  984. data/vendor/ggml/examples/yolo/data/labels/97_0.png +0 -0
  985. data/vendor/ggml/examples/yolo/data/labels/97_1.png +0 -0
  986. data/vendor/ggml/examples/yolo/data/labels/97_2.png +0 -0
  987. data/vendor/ggml/examples/yolo/data/labels/97_3.png +0 -0
  988. data/vendor/ggml/examples/yolo/data/labels/97_4.png +0 -0
  989. data/vendor/ggml/examples/yolo/data/labels/97_5.png +0 -0
  990. data/vendor/ggml/examples/yolo/data/labels/97_6.png +0 -0
  991. data/vendor/ggml/examples/yolo/data/labels/97_7.png +0 -0
  992. data/vendor/ggml/examples/yolo/data/labels/98_0.png +0 -0
  993. data/vendor/ggml/examples/yolo/data/labels/98_1.png +0 -0
  994. data/vendor/ggml/examples/yolo/data/labels/98_2.png +0 -0
  995. data/vendor/ggml/examples/yolo/data/labels/98_3.png +0 -0
  996. data/vendor/ggml/examples/yolo/data/labels/98_4.png +0 -0
  997. data/vendor/ggml/examples/yolo/data/labels/98_5.png +0 -0
  998. data/vendor/ggml/examples/yolo/data/labels/98_6.png +0 -0
  999. data/vendor/ggml/examples/yolo/data/labels/98_7.png +0 -0
  1000. data/vendor/ggml/examples/yolo/data/labels/99_0.png +0 -0
  1001. data/vendor/ggml/examples/yolo/data/labels/99_1.png +0 -0
  1002. data/vendor/ggml/examples/yolo/data/labels/99_2.png +0 -0
  1003. data/vendor/ggml/examples/yolo/data/labels/99_3.png +0 -0
  1004. data/vendor/ggml/examples/yolo/data/labels/99_4.png +0 -0
  1005. data/vendor/ggml/examples/yolo/data/labels/99_5.png +0 -0
  1006. data/vendor/ggml/examples/yolo/data/labels/99_6.png +0 -0
  1007. data/vendor/ggml/examples/yolo/data/labels/99_7.png +0 -0
  1008. data/vendor/ggml/examples/yolo/yolo-image.cpp +210 -0
  1009. data/vendor/ggml/examples/yolo/yolo-image.h +39 -0
  1010. data/vendor/ggml/examples/yolo/yolov3-tiny.cpp +661 -0
  1011. data/vendor/ggml/ggml.pc.in +10 -0
  1012. data/vendor/ggml/include/ggml-alloc.h +85 -0
  1013. data/vendor/ggml/include/ggml-backend.h +431 -0
  1014. data/vendor/ggml/include/ggml-blas.h +25 -0
  1015. data/vendor/ggml/include/ggml-cann.h +123 -0
  1016. data/vendor/ggml/include/ggml-cpp.h +39 -0
  1017. data/vendor/ggml/include/ggml-cpu.h +151 -0
  1018. data/vendor/ggml/include/ggml-cuda.h +50 -0
  1019. data/vendor/ggml/include/ggml-hexagon.h +19 -0
  1020. data/vendor/ggml/include/ggml-metal.h +61 -0
  1021. data/vendor/ggml/include/ggml-opencl.h +26 -0
  1022. data/vendor/ggml/include/ggml-openvino.h +37 -0
  1023. data/vendor/ggml/include/ggml-opt.h +256 -0
  1024. data/vendor/ggml/include/ggml-rpc.h +35 -0
  1025. data/vendor/ggml/include/ggml-sycl.h +49 -0
  1026. data/vendor/ggml/include/ggml-virtgpu.h +14 -0
  1027. data/vendor/ggml/include/ggml-vulkan.h +29 -0
  1028. data/vendor/ggml/include/ggml-webgpu.h +19 -0
  1029. data/vendor/ggml/include/ggml-zdnn.h +17 -0
  1030. data/vendor/ggml/include/ggml-zendnn.h +22 -0
  1031. data/vendor/ggml/include/ggml.h +2845 -0
  1032. data/vendor/ggml/include/gguf.h +204 -0
  1033. data/vendor/ggml/requirements.txt +12 -0
  1034. data/vendor/ggml/scripts/gen-authors.sh +9 -0
  1035. data/vendor/ggml/scripts/release.sh +296 -0
  1036. data/vendor/ggml/scripts/sync-llama-am.sh +167 -0
  1037. data/vendor/ggml/scripts/sync-llama.last +1 -0
  1038. data/vendor/ggml/scripts/sync-llama.sh +21 -0
  1039. data/vendor/ggml/scripts/sync-whisper-am.sh +138 -0
  1040. data/vendor/ggml/scripts/sync-whisper.last +1 -0
  1041. data/vendor/ggml/scripts/sync-whisper.sh +17 -0
  1042. data/vendor/ggml/src/CMakeLists.txt +493 -0
  1043. data/vendor/ggml/src/ggml-alloc.c +1248 -0
  1044. data/vendor/ggml/src/ggml-backend-dl.cpp +48 -0
  1045. data/vendor/ggml/src/ggml-backend-dl.h +45 -0
  1046. data/vendor/ggml/src/ggml-backend-impl.h +275 -0
  1047. data/vendor/ggml/src/ggml-backend-meta.cpp +2144 -0
  1048. data/vendor/ggml/src/ggml-backend-reg.cpp +586 -0
  1049. data/vendor/ggml/src/ggml-backend.cpp +2371 -0
  1050. data/vendor/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  1051. data/vendor/ggml/src/ggml-blas/ggml-blas.cpp +522 -0
  1052. data/vendor/ggml/src/ggml-cann/CMakeLists.txt +89 -0
  1053. data/vendor/ggml/src/ggml-cann/acl_tensor.cpp +195 -0
  1054. data/vendor/ggml/src/ggml-cann/acl_tensor.h +349 -0
  1055. data/vendor/ggml/src/ggml-cann/aclnn_ops.cpp +4436 -0
  1056. data/vendor/ggml/src/ggml-cann/aclnn_ops.h +1190 -0
  1057. data/vendor/ggml/src/ggml-cann/common.h +651 -0
  1058. data/vendor/ggml/src/ggml-cann/ggml-cann.cpp +3062 -0
  1059. data/vendor/ggml/src/ggml-common.h +1900 -0
  1060. data/vendor/ggml/src/ggml-cpu/CMakeLists.txt +731 -0
  1061. data/vendor/ggml/src/ggml-cpu/amx/amx.cpp +249 -0
  1062. data/vendor/ggml/src/ggml-cpu/amx/amx.h +8 -0
  1063. data/vendor/ggml/src/ggml-cpu/amx/common.h +115 -0
  1064. data/vendor/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  1065. data/vendor/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  1066. data/vendor/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  1067. data/vendor/ggml/src/ggml-cpu/arch/arm/quants.c +4245 -0
  1068. data/vendor/ggml/src/ggml-cpu/arch/arm/repack.cpp +5156 -0
  1069. data/vendor/ggml/src/ggml-cpu/arch/loongarch/quants.c +2158 -0
  1070. data/vendor/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  1071. data/vendor/ggml/src/ggml-cpu/arch/powerpc/quants.c +2304 -0
  1072. data/vendor/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  1073. data/vendor/ggml/src/ggml-cpu/arch/riscv/quants.c +4553 -0
  1074. data/vendor/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1703 -0
  1075. data/vendor/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  1076. data/vendor/ggml/src/ggml-cpu/arch/s390/quants.c +1465 -0
  1077. data/vendor/ggml/src/ggml-cpu/arch/wasm/quants.c +1220 -0
  1078. data/vendor/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  1079. data/vendor/ggml/src/ggml-cpu/arch/x86/quants.c +3970 -0
  1080. data/vendor/ggml/src/ggml-cpu/arch/x86/repack.cpp +6407 -0
  1081. data/vendor/ggml/src/ggml-cpu/arch-fallback.h +348 -0
  1082. data/vendor/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  1083. data/vendor/ggml/src/ggml-cpu/binary-ops.h +16 -0
  1084. data/vendor/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  1085. data/vendor/ggml/src/ggml-cpu/cmake/FindSMTIME.cmake +32 -0
  1086. data/vendor/ggml/src/ggml-cpu/common.h +95 -0
  1087. data/vendor/ggml/src/ggml-cpu/ggml-cpu-impl.h +539 -0
  1088. data/vendor/ggml/src/ggml-cpu/ggml-cpu.c +3835 -0
  1089. data/vendor/ggml/src/ggml-cpu/ggml-cpu.cpp +703 -0
  1090. data/vendor/ggml/src/ggml-cpu/hbm.cpp +55 -0
  1091. data/vendor/ggml/src/ggml-cpu/hbm.h +8 -0
  1092. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.cpp +939 -0
  1093. data/vendor/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  1094. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1513 -0
  1095. data/vendor/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  1096. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4051 -0
  1097. data/vendor/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  1098. data/vendor/ggml/src/ggml-cpu/ops.cpp +11373 -0
  1099. data/vendor/ggml/src/ggml-cpu/ops.h +119 -0
  1100. data/vendor/ggml/src/ggml-cpu/quants.c +1288 -0
  1101. data/vendor/ggml/src/ggml-cpu/quants.h +103 -0
  1102. data/vendor/ggml/src/ggml-cpu/repack.cpp +4836 -0
  1103. data/vendor/ggml/src/ggml-cpu/repack.h +245 -0
  1104. data/vendor/ggml/src/ggml-cpu/simd-gemm.h +226 -0
  1105. data/vendor/ggml/src/ggml-cpu/simd-mappings.h +1319 -0
  1106. data/vendor/ggml/src/ggml-cpu/spacemit/ime.cpp +1740 -0
  1107. data/vendor/ggml/src/ggml-cpu/spacemit/ime.h +21 -0
  1108. data/vendor/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +1027 -0
  1109. data/vendor/ggml/src/ggml-cpu/spacemit/ime2_kernels.cpp +5768 -0
  1110. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.cpp +320 -0
  1111. data/vendor/ggml/src/ggml-cpu/spacemit/ime_env.h +55 -0
  1112. data/vendor/ggml/src/ggml-cpu/spacemit/ime_kernels.h +189 -0
  1113. data/vendor/ggml/src/ggml-cpu/spacemit/repack.cpp +1795 -0
  1114. data/vendor/ggml/src/ggml-cpu/spacemit/repack.h +14 -0
  1115. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.cpp +3178 -0
  1116. data/vendor/ggml/src/ggml-cpu/spacemit/rvv_kernels.h +95 -0
  1117. data/vendor/ggml/src/ggml-cpu/spacemit/spine_barrier.h +34 -0
  1118. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.cpp +760 -0
  1119. data/vendor/ggml/src/ggml-cpu/spacemit/spine_mem_pool.h +32 -0
  1120. data/vendor/ggml/src/ggml-cpu/spacemit/spine_tcm.h +409 -0
  1121. data/vendor/ggml/src/ggml-cpu/traits.cpp +36 -0
  1122. data/vendor/ggml/src/ggml-cpu/traits.h +38 -0
  1123. data/vendor/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  1124. data/vendor/ggml/src/ggml-cpu/unary-ops.h +35 -0
  1125. data/vendor/ggml/src/ggml-cpu/vec.cpp +629 -0
  1126. data/vendor/ggml/src/ggml-cpu/vec.h +1588 -0
  1127. data/vendor/ggml/src/ggml-cuda/CMakeLists.txt +268 -0
  1128. data/vendor/ggml/src/ggml-cuda/acc.cu +61 -0
  1129. data/vendor/ggml/src/ggml-cuda/acc.cuh +5 -0
  1130. data/vendor/ggml/src/ggml-cuda/add-id.cu +58 -0
  1131. data/vendor/ggml/src/ggml-cuda/add-id.cuh +3 -0
  1132. data/vendor/ggml/src/ggml-cuda/allreduce.cu +971 -0
  1133. data/vendor/ggml/src/ggml-cuda/allreduce.cuh +29 -0
  1134. data/vendor/ggml/src/ggml-cuda/arange.cu +34 -0
  1135. data/vendor/ggml/src/ggml-cuda/arange.cuh +5 -0
  1136. data/vendor/ggml/src/ggml-cuda/argmax.cu +91 -0
  1137. data/vendor/ggml/src/ggml-cuda/argmax.cuh +3 -0
  1138. data/vendor/ggml/src/ggml-cuda/argsort.cu +266 -0
  1139. data/vendor/ggml/src/ggml-cuda/argsort.cuh +19 -0
  1140. data/vendor/ggml/src/ggml-cuda/binbcast.cu +534 -0
  1141. data/vendor/ggml/src/ggml-cuda/binbcast.cuh +12 -0
  1142. data/vendor/ggml/src/ggml-cuda/clamp.cu +45 -0
  1143. data/vendor/ggml/src/ggml-cuda/clamp.cuh +5 -0
  1144. data/vendor/ggml/src/ggml-cuda/common.cuh +1489 -0
  1145. data/vendor/ggml/src/ggml-cuda/concat.cu +204 -0
  1146. data/vendor/ggml/src/ggml-cuda/concat.cuh +5 -0
  1147. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cu +86 -0
  1148. data/vendor/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
  1149. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  1150. data/vendor/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  1151. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cu +115 -0
  1152. data/vendor/ggml/src/ggml-cuda/conv2d-transpose.cuh +5 -0
  1153. data/vendor/ggml/src/ggml-cuda/conv2d.cu +166 -0
  1154. data/vendor/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  1155. data/vendor/ggml/src/ggml-cuda/convert.cu +892 -0
  1156. data/vendor/ggml/src/ggml-cuda/convert.cuh +66 -0
  1157. data/vendor/ggml/src/ggml-cuda/count-equal.cu +64 -0
  1158. data/vendor/ggml/src/ggml-cuda/count-equal.cuh +5 -0
  1159. data/vendor/ggml/src/ggml-cuda/cp-async.cuh +57 -0
  1160. data/vendor/ggml/src/ggml-cuda/cpy-utils.cuh +217 -0
  1161. data/vendor/ggml/src/ggml-cuda/cpy.cu +558 -0
  1162. data/vendor/ggml/src/ggml-cuda/cpy.cuh +7 -0
  1163. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cu +177 -0
  1164. data/vendor/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
  1165. data/vendor/ggml/src/ggml-cuda/cumsum.cu +307 -0
  1166. data/vendor/ggml/src/ggml-cuda/cumsum.cuh +5 -0
  1167. data/vendor/ggml/src/ggml-cuda/dequantize.cuh +99 -0
  1168. data/vendor/ggml/src/ggml-cuda/diag.cu +77 -0
  1169. data/vendor/ggml/src/ggml-cuda/diag.cuh +5 -0
  1170. data/vendor/ggml/src/ggml-cuda/diagmask.cu +40 -0
  1171. data/vendor/ggml/src/ggml-cuda/diagmask.cuh +5 -0
  1172. data/vendor/ggml/src/ggml-cuda/fattn-common.cuh +1212 -0
  1173. data/vendor/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2020 -0
  1174. data/vendor/ggml/src/ggml-cuda/fattn-tile.cu +61 -0
  1175. data/vendor/ggml/src/ggml-cuda/fattn-tile.cuh +1347 -0
  1176. data/vendor/ggml/src/ggml-cuda/fattn-vec.cuh +600 -0
  1177. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cu +696 -0
  1178. data/vendor/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +51 -0
  1179. data/vendor/ggml/src/ggml-cuda/fattn.cu +562 -0
  1180. data/vendor/ggml/src/ggml-cuda/fattn.cuh +5 -0
  1181. data/vendor/ggml/src/ggml-cuda/fill.cu +37 -0
  1182. data/vendor/ggml/src/ggml-cuda/fill.cuh +3 -0
  1183. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cu +311 -0
  1184. data/vendor/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  1185. data/vendor/ggml/src/ggml-cuda/getrows.cu +300 -0
  1186. data/vendor/ggml/src/ggml-cuda/getrows.cuh +15 -0
  1187. data/vendor/ggml/src/ggml-cuda/ggml-cuda.cu +5684 -0
  1188. data/vendor/ggml/src/ggml-cuda/gla.cu +93 -0
  1189. data/vendor/ggml/src/ggml-cuda/gla.cuh +3 -0
  1190. data/vendor/ggml/src/ggml-cuda/im2col.cu +267 -0
  1191. data/vendor/ggml/src/ggml-cuda/im2col.cuh +6 -0
  1192. data/vendor/ggml/src/ggml-cuda/mean.cu +75 -0
  1193. data/vendor/ggml/src/ggml-cuda/mean.cuh +3 -0
  1194. data/vendor/ggml/src/ggml-cuda/mma.cuh +1456 -0
  1195. data/vendor/ggml/src/ggml-cuda/mmf.cu +191 -0
  1196. data/vendor/ggml/src/ggml-cuda/mmf.cuh +908 -0
  1197. data/vendor/ggml/src/ggml-cuda/mmid.cu +164 -0
  1198. data/vendor/ggml/src/ggml-cuda/mmid.cuh +5 -0
  1199. data/vendor/ggml/src/ggml-cuda/mmq.cu +372 -0
  1200. data/vendor/ggml/src/ggml-cuda/mmq.cuh +4176 -0
  1201. data/vendor/ggml/src/ggml-cuda/mmvf.cu +862 -0
  1202. data/vendor/ggml/src/ggml-cuda/mmvf.cuh +14 -0
  1203. data/vendor/ggml/src/ggml-cuda/mmvq.cu +1161 -0
  1204. data/vendor/ggml/src/ggml-cuda/mmvq.cuh +16 -0
  1205. data/vendor/ggml/src/ggml-cuda/norm.cu +672 -0
  1206. data/vendor/ggml/src/ggml-cuda/norm.cuh +18 -0
  1207. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
  1208. data/vendor/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
  1209. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  1210. data/vendor/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  1211. data/vendor/ggml/src/ggml-cuda/out-prod.cu +84 -0
  1212. data/vendor/ggml/src/ggml-cuda/out-prod.cuh +3 -0
  1213. data/vendor/ggml/src/ggml-cuda/pad.cu +106 -0
  1214. data/vendor/ggml/src/ggml-cuda/pad.cuh +5 -0
  1215. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cu +91 -0
  1216. data/vendor/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  1217. data/vendor/ggml/src/ggml-cuda/pool2d.cu +94 -0
  1218. data/vendor/ggml/src/ggml-cuda/pool2d.cuh +5 -0
  1219. data/vendor/ggml/src/ggml-cuda/quantize.cu +443 -0
  1220. data/vendor/ggml/src/ggml-cuda/quantize.cuh +41 -0
  1221. data/vendor/ggml/src/ggml-cuda/reduce_rows.cuh +39 -0
  1222. data/vendor/ggml/src/ggml-cuda/roll.cu +67 -0
  1223. data/vendor/ggml/src/ggml-cuda/roll.cuh +5 -0
  1224. data/vendor/ggml/src/ggml-cuda/rope.cu +665 -0
  1225. data/vendor/ggml/src/ggml-cuda/rope.cuh +9 -0
  1226. data/vendor/ggml/src/ggml-cuda/scale.cu +34 -0
  1227. data/vendor/ggml/src/ggml-cuda/scale.cuh +5 -0
  1228. data/vendor/ggml/src/ggml-cuda/set-rows.cu +330 -0
  1229. data/vendor/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  1230. data/vendor/ggml/src/ggml-cuda/set.cu +39 -0
  1231. data/vendor/ggml/src/ggml-cuda/set.cuh +7 -0
  1232. data/vendor/ggml/src/ggml-cuda/snake.cu +72 -0
  1233. data/vendor/ggml/src/ggml-cuda/snake.cuh +8 -0
  1234. data/vendor/ggml/src/ggml-cuda/softcap.cu +34 -0
  1235. data/vendor/ggml/src/ggml-cuda/softcap.cuh +5 -0
  1236. data/vendor/ggml/src/ggml-cuda/softmax.cu +472 -0
  1237. data/vendor/ggml/src/ggml-cuda/softmax.cuh +7 -0
  1238. data/vendor/ggml/src/ggml-cuda/solve_tri.cu +275 -0
  1239. data/vendor/ggml/src/ggml-cuda/solve_tri.cuh +3 -0
  1240. data/vendor/ggml/src/ggml-cuda/ssm-conv.cu +197 -0
  1241. data/vendor/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
  1242. data/vendor/ggml/src/ggml-cuda/ssm-scan.cu +342 -0
  1243. data/vendor/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
  1244. data/vendor/ggml/src/ggml-cuda/sum.cu +41 -0
  1245. data/vendor/ggml/src/ggml-cuda/sum.cuh +5 -0
  1246. data/vendor/ggml/src/ggml-cuda/sumrows.cu +43 -0
  1247. data/vendor/ggml/src/ggml-cuda/sumrows.cuh +4 -0
  1248. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +6 -0
  1249. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +6 -0
  1250. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +12 -0
  1251. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
  1252. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
  1253. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +12 -0
  1254. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +6 -0
  1255. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +6 -0
  1256. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +12 -0
  1257. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +12 -0
  1258. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
  1259. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
  1260. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +6 -0
  1261. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
  1262. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +12 -0
  1263. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +12 -0
  1264. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
  1265. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
  1266. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
  1267. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +12 -0
  1268. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +12 -0
  1269. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  1270. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  1271. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq192-dv128.cu +5 -0
  1272. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  1273. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq320-dv256.cu +5 -0
  1274. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  1275. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq512-dv512.cu +5 -0
  1276. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  1277. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  1278. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  1279. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  1280. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  1281. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu +7 -0
  1282. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu +7 -0
  1283. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu +7 -0
  1284. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu +7 -0
  1285. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu +7 -0
  1286. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu +7 -0
  1287. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu +7 -0
  1288. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu +7 -0
  1289. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  1290. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  1291. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  1292. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  1293. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  1294. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  1295. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu +7 -0
  1296. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  1297. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  1298. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  1299. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  1300. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  1301. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  1302. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu +7 -0
  1303. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  1304. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  1305. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  1306. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  1307. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  1308. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  1309. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu +7 -0
  1310. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  1311. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  1312. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  1313. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  1314. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  1315. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  1316. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu +7 -0
  1317. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  1318. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  1319. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  1320. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  1321. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  1322. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  1323. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu +7 -0
  1324. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  1325. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  1326. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  1327. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  1328. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  1329. data/vendor/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  1330. data/vendor/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +110 -0
  1331. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  1332. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  1333. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  1334. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  1335. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  1336. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  1337. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  1338. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  1339. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  1340. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  1341. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  1342. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  1343. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  1344. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  1345. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  1346. data/vendor/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  1347. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
  1348. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
  1349. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
  1350. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
  1351. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
  1352. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
  1353. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
  1354. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
  1355. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  1356. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-nvfp4.cu +5 -0
  1357. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q1_0.cu +5 -0
  1358. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
  1359. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
  1360. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
  1361. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
  1362. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
  1363. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
  1364. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
  1365. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
  1366. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
  1367. data/vendor/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
  1368. data/vendor/ggml/src/ggml-cuda/top-k.cu +95 -0
  1369. data/vendor/ggml/src/ggml-cuda/top-k.cuh +3 -0
  1370. data/vendor/ggml/src/ggml-cuda/topk-moe.cu +415 -0
  1371. data/vendor/ggml/src/ggml-cuda/topk-moe.cuh +27 -0
  1372. data/vendor/ggml/src/ggml-cuda/tri.cu +136 -0
  1373. data/vendor/ggml/src/ggml-cuda/tri.cuh +5 -0
  1374. data/vendor/ggml/src/ggml-cuda/tsembd.cu +47 -0
  1375. data/vendor/ggml/src/ggml-cuda/tsembd.cuh +5 -0
  1376. data/vendor/ggml/src/ggml-cuda/unary.cu +640 -0
  1377. data/vendor/ggml/src/ggml-cuda/unary.cuh +114 -0
  1378. data/vendor/ggml/src/ggml-cuda/upscale.cu +293 -0
  1379. data/vendor/ggml/src/ggml-cuda/upscale.cuh +5 -0
  1380. data/vendor/ggml/src/ggml-cuda/vecdotq.cuh +1317 -0
  1381. data/vendor/ggml/src/ggml-cuda/vendors/cuda.h +28 -0
  1382. data/vendor/ggml/src/ggml-cuda/vendors/hip.h +304 -0
  1383. data/vendor/ggml/src/ggml-cuda/vendors/musa.h +150 -0
  1384. data/vendor/ggml/src/ggml-cuda/wkv.cu +199 -0
  1385. data/vendor/ggml/src/ggml-cuda/wkv.cuh +7 -0
  1386. data/vendor/ggml/src/ggml-hexagon/CMakeLists.txt +118 -0
  1387. data/vendor/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3680 -0
  1388. data/vendor/ggml/src/ggml-hexagon/htp/CMakeLists.txt +78 -0
  1389. data/vendor/ggml/src/ggml-hexagon/htp/act-ops.c +782 -0
  1390. data/vendor/ggml/src/ggml-hexagon/htp/argsort-ops.c +293 -0
  1391. data/vendor/ggml/src/ggml-hexagon/htp/binary-ops.c +872 -0
  1392. data/vendor/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  1393. data/vendor/ggml/src/ggml-hexagon/htp/cpy-ops.c +275 -0
  1394. data/vendor/ggml/src/ggml-hexagon/htp/cumsum-ops.c +270 -0
  1395. data/vendor/ggml/src/ggml-hexagon/htp/diag-ops.c +216 -0
  1396. data/vendor/ggml/src/ggml-hexagon/htp/fill-ops.c +123 -0
  1397. data/vendor/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +727 -0
  1398. data/vendor/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +955 -0
  1399. data/vendor/ggml/src/ggml-hexagon/htp/get-rows-ops.c +124 -0
  1400. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  1401. data/vendor/ggml/src/ggml-hexagon/htp/hex-dma.h +372 -0
  1402. data/vendor/ggml/src/ggml-hexagon/htp/hex-dump.h +86 -0
  1403. data/vendor/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  1404. data/vendor/ggml/src/ggml-hexagon/htp/hex-utils.h +137 -0
  1405. data/vendor/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +1841 -0
  1406. data/vendor/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +1785 -0
  1407. data/vendor/ggml/src/ggml-hexagon/htp/hmx-ops.h +71 -0
  1408. data/vendor/ggml/src/ggml-hexagon/htp/hmx-profile.h +34 -0
  1409. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.c +158 -0
  1410. data/vendor/ggml/src/ggml-hexagon/htp/hmx-queue.h +134 -0
  1411. data/vendor/ggml/src/ggml-hexagon/htp/hmx-utils.h +200 -0
  1412. data/vendor/ggml/src/ggml-hexagon/htp/htp-ctx.h +111 -0
  1413. data/vendor/ggml/src/ggml-hexagon/htp/htp-ops.h +181 -0
  1414. data/vendor/ggml/src/ggml-hexagon/htp/htp_iface.idl +22 -0
  1415. data/vendor/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  1416. data/vendor/ggml/src/ggml-hexagon/htp/hvx-base.h +308 -0
  1417. data/vendor/ggml/src/ggml-hexagon/htp/hvx-copy.h +262 -0
  1418. data/vendor/ggml/src/ggml-hexagon/htp/hvx-div.h +291 -0
  1419. data/vendor/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  1420. data/vendor/ggml/src/ggml-hexagon/htp/hvx-exp.h +216 -0
  1421. data/vendor/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  1422. data/vendor/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  1423. data/vendor/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  1424. data/vendor/ggml/src/ggml-hexagon/htp/hvx-repl.h +74 -0
  1425. data/vendor/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  1426. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +142 -0
  1427. data/vendor/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  1428. data/vendor/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  1429. data/vendor/ggml/src/ggml-hexagon/htp/hvx-utils.h +19 -0
  1430. data/vendor/ggml/src/ggml-hexagon/htp/main.c +880 -0
  1431. data/vendor/ggml/src/ggml-hexagon/htp/matmul-ops.c +3173 -0
  1432. data/vendor/ggml/src/ggml-hexagon/htp/repeat-ops.c +148 -0
  1433. data/vendor/ggml/src/ggml-hexagon/htp/rope-ops.c +494 -0
  1434. data/vendor/ggml/src/ggml-hexagon/htp/set-rows-ops.c +184 -0
  1435. data/vendor/ggml/src/ggml-hexagon/htp/softmax-ops.c +407 -0
  1436. data/vendor/ggml/src/ggml-hexagon/htp/solve-tri-ops.c +267 -0
  1437. data/vendor/ggml/src/ggml-hexagon/htp/ssm-conv.c +340 -0
  1438. data/vendor/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  1439. data/vendor/ggml/src/ggml-hexagon/htp/unary-ops.c +657 -0
  1440. data/vendor/ggml/src/ggml-hexagon/htp/vtcm-utils.h +16 -0
  1441. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  1442. data/vendor/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  1443. data/vendor/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  1444. data/vendor/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  1445. data/vendor/ggml/src/ggml-hexagon/libdl.h +79 -0
  1446. data/vendor/ggml/src/ggml-hexagon/libggml-htp.inf +40 -0
  1447. data/vendor/ggml/src/ggml-hexagon/op-desc.h +153 -0
  1448. data/vendor/ggml/src/ggml-hip/CMakeLists.txt +157 -0
  1449. data/vendor/ggml/src/ggml-impl.h +783 -0
  1450. data/vendor/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  1451. data/vendor/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  1452. data/vendor/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  1453. data/vendor/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  1454. data/vendor/ggml/src/ggml-metal/ggml-metal-context.m +739 -0
  1455. data/vendor/ggml/src/ggml-metal/ggml-metal-device.cpp +2053 -0
  1456. data/vendor/ggml/src/ggml-metal/ggml-metal-device.h +296 -0
  1457. data/vendor/ggml/src/ggml-metal/ggml-metal-device.m +1829 -0
  1458. data/vendor/ggml/src/ggml-metal/ggml-metal-impl.h +1175 -0
  1459. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.cpp +4606 -0
  1460. data/vendor/ggml/src/ggml-metal/ggml-metal-ops.h +97 -0
  1461. data/vendor/ggml/src/ggml-metal/ggml-metal.cpp +950 -0
  1462. data/vendor/ggml/src/ggml-metal/ggml-metal.metal +10679 -0
  1463. data/vendor/ggml/src/ggml-musa/CMakeLists.txt +124 -0
  1464. data/vendor/ggml/src/ggml-musa/mudnn.cu +112 -0
  1465. data/vendor/ggml/src/ggml-musa/mudnn.cuh +12 -0
  1466. data/vendor/ggml/src/ggml-opencl/CMakeLists.txt +189 -0
  1467. data/vendor/ggml/src/ggml-opencl/ggml-opencl.cpp +16374 -0
  1468. data/vendor/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  1469. data/vendor/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  1470. data/vendor/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  1471. data/vendor/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  1472. data/vendor/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  1473. data/vendor/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  1474. data/vendor/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  1475. data/vendor/ggml/src/ggml-opencl/kernels/cpy.cl +229 -0
  1476. data/vendor/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  1477. data/vendor/ggml/src/ggml-opencl/kernels/cvt.cl +1471 -0
  1478. data/vendor/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  1479. data/vendor/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  1480. data/vendor/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  1481. data/vendor/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  1482. data/vendor/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  1483. data/vendor/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  1484. data/vendor/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  1485. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  1486. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  1487. data/vendor/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  1488. data/vendor/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  1489. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  1490. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +302 -0
  1491. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +252 -0
  1492. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +254 -0
  1493. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +256 -0
  1494. data/vendor/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +258 -0
  1495. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_iq4_nl_f32.cl +150 -0
  1496. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_0_f32.cl +139 -0
  1497. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  1498. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_k_f32.cl +172 -0
  1499. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl +176 -0
  1500. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q6_k_f32.cl +140 -0
  1501. data/vendor/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q8_0_f32.cl +129 -0
  1502. data/vendor/ggml/src/ggml-opencl/kernels/gemm_xmem_f16_f32_os8.cl +233 -0
  1503. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  1504. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +161 -0
  1505. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +116 -0
  1506. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +119 -0
  1507. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +119 -0
  1508. data/vendor/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +121 -0
  1509. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_iq4_nl_f32.cl +302 -0
  1510. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32.cl +274 -0
  1511. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_0_f32_spec.cl +268 -0
  1512. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  1513. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_k_f32.cl +318 -0
  1514. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl +326 -0
  1515. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q6_k_f32.cl +293 -0
  1516. data/vendor/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +195 -0
  1517. data/vendor/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  1518. data/vendor/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  1519. data/vendor/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  1520. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  1521. data/vendor/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  1522. data/vendor/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  1523. data/vendor/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  1524. data/vendor/ggml/src/ggml-opencl/kernels/moe_reorder_b.cl +30 -0
  1525. data/vendor/ggml/src/ggml-opencl/kernels/moe_sort_by_expert.cl +82 -0
  1526. data/vendor/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  1527. data/vendor/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  1528. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  1529. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  1530. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  1531. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_iq4_nl_f32_l4_lm.cl +171 -0
  1532. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  1533. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  1534. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl +179 -0
  1535. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q5_k_f32_l4_lm.cl +192 -0
  1536. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  1537. data/vendor/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  1538. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  1539. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  1540. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  1541. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  1542. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  1543. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  1544. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  1545. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  1546. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  1547. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  1548. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32.cl +164 -0
  1549. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_iq4_nl_f32_flat.cl +202 -0
  1550. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  1551. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  1552. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  1553. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  1554. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  1555. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  1556. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  1557. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  1558. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  1559. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  1560. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl +196 -0
  1561. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32.cl +187 -0
  1562. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q5_k_f32_flat.cl +203 -0
  1563. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  1564. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  1565. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  1566. data/vendor/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  1567. data/vendor/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  1568. data/vendor/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  1569. data/vendor/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  1570. data/vendor/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  1571. data/vendor/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  1572. data/vendor/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  1573. data/vendor/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  1574. data/vendor/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  1575. data/vendor/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  1576. data/vendor/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  1577. data/vendor/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  1578. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  1579. data/vendor/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  1580. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  1581. data/vendor/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  1582. data/vendor/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  1583. data/vendor/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  1584. data/vendor/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  1585. data/vendor/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  1586. data/vendor/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  1587. data/vendor/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  1588. data/vendor/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  1589. data/vendor/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  1590. data/vendor/ggml/src/ggml-opencl/kernels/transpose.cl +143 -0
  1591. data/vendor/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  1592. data/vendor/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  1593. data/vendor/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  1594. data/vendor/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  1595. data/vendor/ggml/src/ggml-openvino/ggml-decoder.cpp +985 -0
  1596. data/vendor/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  1597. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +380 -0
  1598. data/vendor/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  1599. data/vendor/ggml/src/ggml-openvino/ggml-openvino.cpp +1132 -0
  1600. data/vendor/ggml/src/ggml-openvino/ggml-quants.cpp +956 -0
  1601. data/vendor/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  1602. data/vendor/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  1603. data/vendor/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  1604. data/vendor/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  1605. data/vendor/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  1606. data/vendor/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  1607. data/vendor/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  1608. data/vendor/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  1609. data/vendor/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  1610. data/vendor/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  1611. data/vendor/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  1612. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  1613. data/vendor/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  1614. data/vendor/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  1615. data/vendor/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  1616. data/vendor/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  1617. data/vendor/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  1618. data/vendor/ggml/src/ggml-openvino/openvino/op/rope.cpp +149 -0
  1619. data/vendor/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  1620. data/vendor/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  1621. data/vendor/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  1622. data/vendor/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  1623. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +25 -0
  1624. data/vendor/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  1625. data/vendor/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  1626. data/vendor/ggml/src/ggml-openvino/openvino/op_table.cpp +47 -0
  1627. data/vendor/ggml/src/ggml-openvino/openvino/op_table.h +40 -0
  1628. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  1629. data/vendor/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  1630. data/vendor/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  1631. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  1632. data/vendor/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  1633. data/vendor/ggml/src/ggml-openvino/openvino/rt_info/weightless_caching_attributes.hpp +41 -0
  1634. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.cpp +317 -0
  1635. data/vendor/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  1636. data/vendor/ggml/src/ggml-openvino/openvino/utils.cpp +257 -0
  1637. data/vendor/ggml/src/ggml-openvino/openvino/utils.h +86 -0
  1638. data/vendor/ggml/src/ggml-openvino/utils.cpp +880 -0
  1639. data/vendor/ggml/src/ggml-openvino/utils.h +143 -0
  1640. data/vendor/ggml/src/ggml-opt.cpp +1094 -0
  1641. data/vendor/ggml/src/ggml-quants.c +5491 -0
  1642. data/vendor/ggml/src/ggml-quants.h +112 -0
  1643. data/vendor/ggml/src/ggml-rpc/CMakeLists.txt +33 -0
  1644. data/vendor/ggml/src/ggml-rpc/ggml-rpc.cpp +1974 -0
  1645. data/vendor/ggml/src/ggml-rpc/transport.cpp +683 -0
  1646. data/vendor/ggml/src/ggml-rpc/transport.h +34 -0
  1647. data/vendor/ggml/src/ggml-sycl/CMakeLists.txt +207 -0
  1648. data/vendor/ggml/src/ggml-sycl/add-id.cpp +81 -0
  1649. data/vendor/ggml/src/ggml-sycl/add-id.hpp +8 -0
  1650. data/vendor/ggml/src/ggml-sycl/backend.hpp +48 -0
  1651. data/vendor/ggml/src/ggml-sycl/binbcast.cpp +346 -0
  1652. data/vendor/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  1653. data/vendor/ggml/src/ggml-sycl/common.cpp +155 -0
  1654. data/vendor/ggml/src/ggml-sycl/common.hpp +1002 -0
  1655. data/vendor/ggml/src/ggml-sycl/concat.cpp +202 -0
  1656. data/vendor/ggml/src/ggml-sycl/concat.hpp +20 -0
  1657. data/vendor/ggml/src/ggml-sycl/conv.cpp +101 -0
  1658. data/vendor/ggml/src/ggml-sycl/conv.hpp +20 -0
  1659. data/vendor/ggml/src/ggml-sycl/convert.cpp +825 -0
  1660. data/vendor/ggml/src/ggml-sycl/convert.hpp +64 -0
  1661. data/vendor/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  1662. data/vendor/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  1663. data/vendor/ggml/src/ggml-sycl/cpy.cpp +602 -0
  1664. data/vendor/ggml/src/ggml-sycl/cpy.hpp +223 -0
  1665. data/vendor/ggml/src/ggml-sycl/cumsum.cpp +148 -0
  1666. data/vendor/ggml/src/ggml-sycl/cumsum.hpp +5 -0
  1667. data/vendor/ggml/src/ggml-sycl/dequantize.hpp +975 -0
  1668. data/vendor/ggml/src/ggml-sycl/diag.cpp +67 -0
  1669. data/vendor/ggml/src/ggml-sycl/diag.hpp +5 -0
  1670. data/vendor/ggml/src/ggml-sycl/dmmv.cpp +1579 -0
  1671. data/vendor/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  1672. data/vendor/ggml/src/ggml-sycl/dpct/helper.hpp +3774 -0
  1673. data/vendor/ggml/src/ggml-sycl/element_wise.cpp +1124 -0
  1674. data/vendor/ggml/src/ggml-sycl/element_wise.hpp +94 -0
  1675. data/vendor/ggml/src/ggml-sycl/fattn-buffers.cpp +56 -0
  1676. data/vendor/ggml/src/ggml-sycl/fattn-buffers.hpp +63 -0
  1677. data/vendor/ggml/src/ggml-sycl/fattn-common.hpp +1181 -0
  1678. data/vendor/ggml/src/ggml-sycl/fattn-tile.cpp +59 -0
  1679. data/vendor/ggml/src/ggml-sycl/fattn-tile.hpp +1246 -0
  1680. data/vendor/ggml/src/ggml-sycl/fattn-vec.hpp +674 -0
  1681. data/vendor/ggml/src/ggml-sycl/fattn.cpp +227 -0
  1682. data/vendor/ggml/src/ggml-sycl/fattn.hpp +22 -0
  1683. data/vendor/ggml/src/ggml-sycl/fill.cpp +55 -0
  1684. data/vendor/ggml/src/ggml-sycl/fill.hpp +5 -0
  1685. data/vendor/ggml/src/ggml-sycl/gated_delta_net.cpp +307 -0
  1686. data/vendor/ggml/src/ggml-sycl/gated_delta_net.hpp +9 -0
  1687. data/vendor/ggml/src/ggml-sycl/gemm.hpp +93 -0
  1688. data/vendor/ggml/src/ggml-sycl/getrows.cpp +219 -0
  1689. data/vendor/ggml/src/ggml-sycl/getrows.hpp +20 -0
  1690. data/vendor/ggml/src/ggml-sycl/ggml-sycl.cpp +5520 -0
  1691. data/vendor/ggml/src/ggml-sycl/gla.cpp +106 -0
  1692. data/vendor/ggml/src/ggml-sycl/gla.hpp +8 -0
  1693. data/vendor/ggml/src/ggml-sycl/im2col.cpp +400 -0
  1694. data/vendor/ggml/src/ggml-sycl/im2col.hpp +23 -0
  1695. data/vendor/ggml/src/ggml-sycl/mmq.cpp +3030 -0
  1696. data/vendor/ggml/src/ggml-sycl/mmq.hpp +33 -0
  1697. data/vendor/ggml/src/ggml-sycl/mmvq.cpp +1380 -0
  1698. data/vendor/ggml/src/ggml-sycl/mmvq.hpp +43 -0
  1699. data/vendor/ggml/src/ggml-sycl/norm.cpp +656 -0
  1700. data/vendor/ggml/src/ggml-sycl/norm.hpp +28 -0
  1701. data/vendor/ggml/src/ggml-sycl/outprod.cpp +47 -0
  1702. data/vendor/ggml/src/ggml-sycl/outprod.hpp +10 -0
  1703. data/vendor/ggml/src/ggml-sycl/pad.cpp +97 -0
  1704. data/vendor/ggml/src/ggml-sycl/pad.hpp +24 -0
  1705. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.cpp +100 -0
  1706. data/vendor/ggml/src/ggml-sycl/pad_reflect_1d.hpp +10 -0
  1707. data/vendor/ggml/src/ggml-sycl/presets.hpp +79 -0
  1708. data/vendor/ggml/src/ggml-sycl/quantize.hpp +133 -0
  1709. data/vendor/ggml/src/ggml-sycl/quants.hpp +156 -0
  1710. data/vendor/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  1711. data/vendor/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  1712. data/vendor/ggml/src/ggml-sycl/roll.cpp +122 -0
  1713. data/vendor/ggml/src/ggml-sycl/roll.hpp +20 -0
  1714. data/vendor/ggml/src/ggml-sycl/rope.cpp +641 -0
  1715. data/vendor/ggml/src/ggml-sycl/rope.hpp +26 -0
  1716. data/vendor/ggml/src/ggml-sycl/set.cpp +73 -0
  1717. data/vendor/ggml/src/ggml-sycl/set.hpp +5 -0
  1718. data/vendor/ggml/src/ggml-sycl/set_rows.cpp +240 -0
  1719. data/vendor/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  1720. data/vendor/ggml/src/ggml-sycl/softmax.cpp +426 -0
  1721. data/vendor/ggml/src/ggml-sycl/softmax.hpp +24 -0
  1722. data/vendor/ggml/src/ggml-sycl/solve_tri.cpp +172 -0
  1723. data/vendor/ggml/src/ggml-sycl/solve_tri.hpp +8 -0
  1724. data/vendor/ggml/src/ggml-sycl/ssm_conv.cpp +132 -0
  1725. data/vendor/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  1726. data/vendor/ggml/src/ggml-sycl/ssm_scan.cpp +156 -0
  1727. data/vendor/ggml/src/ggml-sycl/ssm_scan.hpp +5 -0
  1728. data/vendor/ggml/src/ggml-sycl/sycl_hw.cpp +67 -0
  1729. data/vendor/ggml/src/ggml-sycl/sycl_hw.hpp +38 -0
  1730. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  1731. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  1732. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  1733. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  1734. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq512-dv512.cpp +6 -0
  1735. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  1736. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  1737. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  1738. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  1739. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  1740. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +8 -0
  1741. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +8 -0
  1742. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +8 -0
  1743. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +8 -0
  1744. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +8 -0
  1745. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +8 -0
  1746. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +8 -0
  1747. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +8 -0
  1748. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +8 -0
  1749. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +8 -0
  1750. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +8 -0
  1751. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +8 -0
  1752. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +8 -0
  1753. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +8 -0
  1754. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +8 -0
  1755. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +8 -0
  1756. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +8 -0
  1757. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +8 -0
  1758. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +8 -0
  1759. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +8 -0
  1760. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +8 -0
  1761. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +8 -0
  1762. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +8 -0
  1763. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +8 -0
  1764. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +8 -0
  1765. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +8 -0
  1766. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +8 -0
  1767. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +8 -0
  1768. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +8 -0
  1769. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +8 -0
  1770. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +8 -0
  1771. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +8 -0
  1772. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +8 -0
  1773. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +8 -0
  1774. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +8 -0
  1775. data/vendor/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +8 -0
  1776. data/vendor/ggml/src/ggml-sycl/tsembd.cpp +73 -0
  1777. data/vendor/ggml/src/ggml-sycl/tsembd.hpp +20 -0
  1778. data/vendor/ggml/src/ggml-sycl/type.hpp +112 -0
  1779. data/vendor/ggml/src/ggml-sycl/upscale.cpp +410 -0
  1780. data/vendor/ggml/src/ggml-sycl/upscale.hpp +9 -0
  1781. data/vendor/ggml/src/ggml-sycl/vecdotq.hpp +1508 -0
  1782. data/vendor/ggml/src/ggml-sycl/wkv.cpp +293 -0
  1783. data/vendor/ggml/src/ggml-sycl/wkv.hpp +10 -0
  1784. data/vendor/ggml/src/ggml-threading.cpp +12 -0
  1785. data/vendor/ggml/src/ggml-threading.h +14 -0
  1786. data/vendor/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  1787. data/vendor/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  1788. data/vendor/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  1789. data/vendor/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  1790. data/vendor/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  1791. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  1792. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  1793. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  1794. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  1795. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  1796. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  1797. data/vendor/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  1798. data/vendor/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  1799. data/vendor/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  1800. data/vendor/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  1801. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  1802. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  1803. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  1804. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  1805. data/vendor/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  1806. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  1807. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +123 -0
  1808. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +160 -0
  1809. data/vendor/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  1810. data/vendor/ggml/src/ggml-virtgpu/ggml-backend.cpp +71 -0
  1811. data/vendor/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  1812. data/vendor/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  1813. data/vendor/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  1814. data/vendor/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  1815. data/vendor/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  1816. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  1817. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  1818. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  1819. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  1820. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  1821. data/vendor/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  1822. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +99 -0
  1823. data/vendor/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  1824. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  1825. data/vendor/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  1826. data/vendor/ggml/src/ggml-virtgpu/virtgpu.cpp +545 -0
  1827. data/vendor/ggml/src/ggml-virtgpu/virtgpu.h +115 -0
  1828. data/vendor/ggml/src/ggml-vulkan/CMakeLists.txt +220 -0
  1829. data/vendor/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
  1830. data/vendor/ggml/src/ggml-vulkan/ggml-vulkan.cpp +17208 -0
  1831. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +31 -0
  1832. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  1833. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +37 -0
  1834. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +69 -0
  1835. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  1836. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  1837. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  1838. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +60 -0
  1839. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +86 -0
  1840. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  1841. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  1842. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
  1843. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
  1844. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
  1845. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
  1846. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +347 -0
  1847. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  1848. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
  1849. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
  1850. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +320 -0
  1851. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  1852. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
  1853. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
  1854. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  1855. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +83 -0
  1856. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  1857. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  1858. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
  1859. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +653 -0
  1860. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +768 -0
  1861. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.glsl +13 -0
  1862. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
  1863. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
  1864. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
  1865. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
  1866. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +49 -0
  1867. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +40 -0
  1868. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +51 -0
  1869. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
  1870. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
  1871. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  1872. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_nvfp4.comp +32 -0
  1873. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q1_0.comp +29 -0
  1874. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
  1875. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
  1876. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
  1877. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
  1878. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
  1879. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
  1880. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
  1881. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
  1882. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
  1883. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
  1884. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +28 -0
  1885. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
  1886. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
  1887. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  1888. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  1889. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/bfloat16.comp +7 -0
  1890. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat.comp +7 -0
  1891. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2.comp +7 -0
  1892. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/integer_dot.comp +7 -0
  1893. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  1894. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +756 -0
  1895. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +255 -0
  1896. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +626 -0
  1897. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +427 -0
  1898. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +123 -0
  1899. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  1900. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mmq_funcs.glsl +203 -0
  1901. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +121 -0
  1902. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  1903. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +190 -0
  1904. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  1905. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  1906. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  1907. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
  1908. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  1909. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
  1910. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +65 -0
  1911. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +11 -0
  1912. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +83 -0
  1913. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +42 -0
  1914. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +51 -0
  1915. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +28 -0
  1916. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +39 -0
  1917. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
  1918. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  1919. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  1920. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +93 -0
  1921. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +124 -0
  1922. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +44 -0
  1923. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
  1924. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +17 -0
  1925. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
  1926. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
  1927. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
  1928. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +230 -0
  1929. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +35 -0
  1930. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +132 -0
  1931. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +95 -0
  1932. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
  1933. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +105 -0
  1934. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
  1935. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
  1936. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
  1937. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +124 -0
  1938. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +156 -0
  1939. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +128 -0
  1940. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
  1941. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +134 -0
  1942. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +165 -0
  1943. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
  1944. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +143 -0
  1945. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +503 -0
  1946. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +464 -0
  1947. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +624 -0
  1948. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +600 -0
  1949. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +74 -0
  1950. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +311 -0
  1951. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +454 -0
  1952. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +93 -0
  1953. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +194 -0
  1954. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  1955. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
  1956. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
  1957. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  1958. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +64 -0
  1959. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
  1960. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +127 -0
  1961. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  1962. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
  1963. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
  1964. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
  1965. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +150 -0
  1966. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
  1967. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  1968. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  1969. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +207 -0
  1970. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +19 -0
  1971. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +17 -0
  1972. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +17 -0
  1973. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +17 -0
  1974. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +31 -0
  1975. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +17 -0
  1976. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  1977. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
  1978. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  1979. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
  1980. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
  1981. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
  1982. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
  1983. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +195 -0
  1984. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +54 -0
  1985. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp +62 -0
  1986. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp +79 -0
  1987. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp +65 -0
  1988. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl +53 -0
  1989. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  1990. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/solve_tri.comp +81 -0
  1991. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  1992. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
  1993. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +50 -0
  1994. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +124 -0
  1995. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  1996. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
  1997. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +47 -0
  1998. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.glsl +25 -0
  1999. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  2000. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  2001. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
  2002. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +42 -0
  2003. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp +118 -0
  2004. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +213 -0
  2005. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp +246 -0
  2006. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +42 -0
  2007. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  2008. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +1846 -0
  2009. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +178 -0
  2010. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/utils.glsl +25 -0
  2011. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +1183 -0
  2012. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
  2013. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
  2014. data/vendor/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  2015. data/vendor/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  2016. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +3231 -0
  2017. data/vendor/ggml/src/ggml-webgpu/ggml-webgpu.cpp +4461 -0
  2018. data/vendor/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  2019. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/add_id.wgsl +64 -0
  2020. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  2021. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  2022. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  2023. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +139 -0
  2024. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +905 -0
  2025. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  2026. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/conv2d.wgsl +165 -0
  2027. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +81 -0
  2028. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  2029. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +89 -0
  2030. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +706 -0
  2031. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +351 -0
  2032. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +101 -0
  2033. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_reduce.wgsl +84 -0
  2034. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +720 -0
  2035. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +132 -0
  2036. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +773 -0
  2037. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/glu.wgsl +155 -0
  2038. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl +101 -0
  2039. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  2040. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +747 -0
  2041. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +1210 -0
  2042. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id.wgsl +195 -0
  2043. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +55 -0
  2044. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +154 -0
  2045. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +149 -0
  2046. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +200 -0
  2047. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +133 -0
  2048. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +1433 -0
  2049. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  2050. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  2051. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl +152 -0
  2052. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl +224 -0
  2053. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl +153 -0
  2054. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  2055. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set.wgsl +109 -0
  2056. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  2057. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl +245 -0
  2058. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/solve_tri.wgsl +121 -0
  2059. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_conv.wgsl +65 -0
  2060. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/ssm_scan.wgsl +193 -0
  2061. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  2062. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +210 -0
  2063. data/vendor/ggml/src/ggml-webgpu/wgsl-shaders/upscale.wgsl +240 -0
  2064. data/vendor/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  2065. data/vendor/ggml/src/ggml-zdnn/common.hpp +59 -0
  2066. data/vendor/ggml/src/ggml-zdnn/ggml-zdnn.cpp +637 -0
  2067. data/vendor/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  2068. data/vendor/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  2069. data/vendor/ggml/src/ggml-zdnn/utils.cpp +79 -0
  2070. data/vendor/ggml/src/ggml-zdnn/utils.hpp +19 -0
  2071. data/vendor/ggml/src/ggml-zendnn/CMakeLists.txt +91 -0
  2072. data/vendor/ggml/src/ggml-zendnn/ggml-zendnn.cpp +669 -0
  2073. data/vendor/ggml/src/ggml.c +7777 -0
  2074. data/vendor/ggml/src/ggml.cpp +26 -0
  2075. data/vendor/ggml/src/gguf.cpp +1556 -0
  2076. data/vendor/ggml/tests/CMakeLists.txt +356 -0
  2077. data/vendor/ggml/tests/test-arange.cpp +100 -0
  2078. data/vendor/ggml/tests/test-backend-ops.cpp +9786 -0
  2079. data/vendor/ggml/tests/test-cont.c +170 -0
  2080. data/vendor/ggml/tests/test-conv-transpose-1d.cpp +691 -0
  2081. data/vendor/ggml/tests/test-conv-transpose.c +248 -0
  2082. data/vendor/ggml/tests/test-conv1d-dw-c1.cpp +243 -0
  2083. data/vendor/ggml/tests/test-conv1d-dw-c2.cpp +243 -0
  2084. data/vendor/ggml/tests/test-conv1d.cpp +289 -0
  2085. data/vendor/ggml/tests/test-conv2d-dw.cpp +153 -0
  2086. data/vendor/ggml/tests/test-conv2d.cpp +391 -0
  2087. data/vendor/ggml/tests/test-customop.c +300 -0
  2088. data/vendor/ggml/tests/test-dup.c +111 -0
  2089. data/vendor/ggml/tests/test-interpolate.cpp +166 -0
  2090. data/vendor/ggml/tests/test-opt.cpp +1003 -0
  2091. data/vendor/ggml/tests/test-pad-reflect-1d.cpp +213 -0
  2092. data/vendor/ggml/tests/test-pool.c +274 -0
  2093. data/vendor/ggml/tests/test-quantize-fns.cpp +196 -0
  2094. data/vendor/ggml/tests/test-quantize-perf.cpp +356 -0
  2095. data/vendor/ggml/tests/test-rel-pos.c +87 -0
  2096. data/vendor/ggml/tests/test-roll.cpp +128 -0
  2097. data/vendor/ggml/tests/test-timestep_embedding.cpp +180 -0
  2098. data/vendor-patches/0001-cuda-buffer_from_ptr.patch +253 -0
  2099. data/vendor-patches/0002-cuda-buffer_from_ptr-reuse-iface.patch +117 -0
  2100. data/vendor-patches/0003-cuda-buffer_from_ptr-copy-mode.patch +128 -0
  2101. data/vendor-patches/0004-cuda-cpy-strided.patch +61 -0
  2102. data/vendor-patches/0005-concat-backward.patch +36 -0
  2103. data/vendor-patches/0006-getrows-back-large-vocab.patch +69 -0
  2104. data/vendor-patches/0007-gpt2-backward-kernels.patch +438 -0
  2105. data/vendor-patches/0008-mul-mat-backward-mixed-precision.patch +50 -0
  2106. data/vendor-patches/0009-sched-unsupported-node-diagnostic.patch +26 -0
  2107. metadata +2161 -0
@@ -0,0 +1,2460 @@
1
+ #include "tinynn_ggml.h"
2
+ #include "tinynn_trace.h"
3
+ #include "ggml.h"
4
+ #include "ggml-backend.h"
5
+ #include "ggml-cpu.h"
6
+ #include "gguf.h"
7
+
8
+ #include <math.h>
9
+ #include <stdio.h>
10
+ #include <stdlib.h>
11
+ #include <string.h>
12
+ #include <errno.h>
13
+ #include <sys/stat.h>
14
+ #include <unistd.h>
15
+
16
+ /* CUDA backend init lives in tinynn_backend_cuda.c (only present when
17
+ * linking against libtinynn_ggml_cuda.a). Weak DEFINITION here returns
18
+ * NULL — strong override in the CUDA archive provides the real impl.
19
+ * Lets a single tinynn_ggml.o serve both CPU-only and CUDA programs
20
+ * without symbol duplication, on both clang and gcc.
21
+ */
22
+ __attribute__((weak)) ggml_backend_t tnn_backend_cuda_init_internal(void) {
23
+ return NULL;
24
+ }
25
+
26
+ /* Metal backend init: same weak-default / strong-override pattern as
27
+ * CUDA. The strong definition lives in tinynn/tinynn_backend_metal.m,
28
+ * compiled only into libtinynn_ggml_metal.a (macOS-only). Builds
29
+ * without the Metal archive get this NULL weak default and fall
30
+ * through to the CPU backend. */
31
+ __attribute__((weak)) ggml_backend_t tnn_backend_metal_init_internal(void) {
32
+ return NULL;
33
+ }
34
+
35
+ /* Weak hook: returns a CUDA-side ggml_backend_cuda_buffer_from_ptr
36
+ * wrapping the given host region (typically an mmap'd GGUF). The
37
+ * CPU-only build leaves this NULL; the CUDA archive
38
+ * (tinynn_backend_cuda.c) overrides with a strong definition that
39
+ * calls into the patched ggml-cuda. */
40
+ __attribute__((weak)) ggml_backend_buffer_t
41
+ tnn_cuda_buffer_from_ptr_internal(void *host_ptr, size_t size, int device) {
42
+ (void)host_ptr; (void)size; (void)device;
43
+ return NULL;
44
+ }
45
+
46
+ #define TNN_SCRATCH_BYTES (16 * 1024 * 1024) /* 16 MiB: 4M f32 */
47
+
48
+ /* P6: per-op timing via sched eval callback. Routes one Chrome-Trace
49
+ * duration event per ggml node when tnn_trace_op_capture_active().
50
+ * Cost when off: one capture-flag load + early return per node.
51
+ *
52
+ * Semantics from ggml_backend_sched_eval_callback (ggml-backend.h):
53
+ * ask=true → return true to request a post call, false to skip.
54
+ * ask=false → return true to continue compute, false to ABORT.
55
+ * We never abort: returning true on post is unconditional. */
56
+ static bool tnn_sched_op_eval_cb(struct ggml_tensor *t, bool ask, void *user_data) {
57
+ (void)user_data;
58
+ if (ask) {
59
+ if (!tnn_trace_op_capture_active()) return false;
60
+ tnn_trace_op_record_begin();
61
+ return true;
62
+ }
63
+ if (tnn_trace_op_capture_active()) {
64
+ tnn_trace_op_record_end(ggml_op_name(t->op));
65
+ }
66
+ return true;
67
+ }
68
+
69
+ /* Engine: persistent across the program's lifetime. Holds the backend
70
+ * objects + scheduler. Cached per backend flavor so multiple
71
+ * session_new calls share one backend init. */
72
+ typedef struct {
73
+ ggml_backend_t backend; /* CUDA / Metal / CPU */
74
+ ggml_backend_t cpu_backend; /* sched fallback when primary is GPU */
75
+ ggml_backend_sched_t sched;
76
+ const char *backend_name;
77
+ } tnn_engine;
78
+
79
+ /* GH#3 — multi-GPU mode 1 (replicated inference). CUDA engine cache
80
+ * widened from a scalar to a per-device array so each GPU can have
81
+ * its own backend/sched and sessions can be pinned to a device via
82
+ * tnn_session_new_on(kind, device). CPU and Metal stay scalar (Metal
83
+ * = single Apple GPU; CPU = single host).
84
+ *
85
+ * Bound is conservative — 8 GPUs on one host is more than any
86
+ * realistic toy deployment. If you need more, bump and rebuild. */
87
+ #define TNN_MAX_CUDA_DEVICES 8
88
+
89
+ static tnn_engine *g_engine_cpu = NULL;
90
+ static tnn_engine *g_engine_cuda[TNN_MAX_CUDA_DEVICES] = { NULL };
91
+ static tnn_engine *g_engine_metal = NULL;
92
+
93
+ /* CUDA backend init with device selection. Weak stub returns NULL;
94
+ * strong override lives in tinynn_backend_cuda.c. */
95
+ __attribute__((weak))
96
+ ggml_backend_t tnn_backend_cuda_init_internal_on(int device) {
97
+ (void)device;
98
+ return NULL;
99
+ }
100
+
/* Default (weak) binding for ggml_backend_cuda_get_device_count, exposed
 * so that Ruby can discover the number of GPUs instead of hard-coding it.
 *
 * On CPU-only builds this weak definition is used and reports 0 devices.
 * The strong override in tinynn_backend_cuda.c calls the real ggml API.
 */
__attribute__((weak))
int tnn_cuda_get_device_count_internal(void)
{
    const int no_devices = 0;
    return no_devices;
}
109
+
/* Public entry point: number of CUDA devices as reported by
 * tnn_cuda_get_device_count_internal(). */
int tnn_cuda_get_device_count(void)
{
    const int n_devices = tnn_cuda_get_device_count_internal();
    return n_devices;
}
113
+
114
+ /* backend_kind: 0 = CPU, 1 = CUDA, 2 = Metal. device: for CUDA, the
115
+ * GPU index (0..TNN_MAX_CUDA_DEVICES-1); ignored for CPU and Metal.
116
+ * Falls back to CPU if the requested GPU backend isn't linked into
117
+ * the binary (weak init stub returns NULL). */
118
+ static tnn_engine *tnn_engine_get_on(int backend_kind, int device)
119
+ {
120
+ tnn_engine **slot;
121
+ switch (backend_kind) {
122
+ case 1: {
123
+ if (device < 0 || device >= TNN_MAX_CUDA_DEVICES) {
124
+ fprintf(stderr,
125
+ "[tnn] tnn_engine_get_on: CUDA device=%d out of range "
126
+ "[0,%d). Bump TNN_MAX_CUDA_DEVICES if you really have "
127
+ "this many GPUs.\n", device, TNN_MAX_CUDA_DEVICES);
128
+ return NULL;
129
+ }
130
+ slot = &g_engine_cuda[device];
131
+ break;
132
+ }
133
+ case 2: slot = &g_engine_metal; break;
134
+ default: slot = &g_engine_cpu; break;
135
+ }
136
+ if (*slot) return *slot;
137
+
138
+ ggml_backend_load_all();
139
+ tnn_engine *e = (tnn_engine *)calloc(1, sizeof(tnn_engine));
140
+ if (!e) return NULL;
141
+
142
+ if (backend_kind == 1) {
143
+ e->backend = tnn_backend_cuda_init_internal_on(device);
144
+ if (e->backend) e->backend_name = "cuda";
145
+ } else if (backend_kind == 2) {
146
+ e->backend = tnn_backend_metal_init_internal();
147
+ if (e->backend) e->backend_name = "metal";
148
+ }
149
+ if (!e->backend) {
150
+ /* Fail loud on the GPU→CPU fallback: a consumer that asked for
151
+ * CUDA/Metal but didn't link the backend archive (e.g. missing
152
+ * -Wl,-u,tnn_cuda_force_link) would otherwise silently compute
153
+ * on CPU — the loss curve flips with no other symptom. */
154
+ if (backend_kind == 1 || backend_kind == 2) {
155
+ fprintf(stderr,
156
+ "[tnn] WARNING: %s backend requested but not linked into "
157
+ "this binary (init returned NULL) — falling back to CPU. "
158
+ "CUDA consumers must link with "
159
+ "-Wl,-u,tnn_cuda_force_link; see docs/consuming-toy.md.\n",
160
+ backend_kind == 1 ? "CUDA" : "Metal");
161
+ }
162
+ e->backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
163
+ e->backend_name = "cpu";
164
+ }
165
+ if (!e->backend) { free(e); return NULL; }
166
+
167
+ e->cpu_backend = (e->backend_name[0] == 'c' && e->backend_name[1] == 'p')
168
+ ? NULL
169
+ : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
170
+
171
+ ggml_backend_t backends[2];
172
+ int n_backends = 0;
173
+ backends[n_backends++] = e->backend;
174
+ if (e->cpu_backend) backends[n_backends++] = e->cpu_backend;
175
+ /* Scheduler graph-size hint. Must be >= n_nodes + n_leafs of the
176
+ * largest graph we'll alloc. 65536 covers seq-mode training of
177
+ * Qwen2.5-3B at T<=32 with LoRA + AdamW + pin_all_graph_b_nodes
178
+ * (~30K backward nodes once every grad-chain intermediate is
179
+ * pinned-as-output for the ggml-cpu sched-alias workaround).
180
+ * Older path (KV-cache decode, T=1) used 16384; we leave headroom. */
181
+ e->sched = ggml_backend_sched_new(backends, NULL, n_backends,
182
+ 65536, false, true);
183
+ /* P6: per-op eval callback. Installed unconditionally; the callback
184
+ * itself early-outs when capture is off, so the overhead in the
185
+ * common (untraced) case is one branch per ggml node. */
186
+ ggml_backend_sched_set_eval_callback(e->sched, tnn_sched_op_eval_cb, NULL);
187
+
188
+ *slot = e;
189
+ return e;
190
+ }
191
+
192
+ /* Explicit teardown of every cached engine. Idempotent. Programs that
193
+ * want a clean exit (notably Metal — its ggml backend asserts in the
194
+ * static destructor if a residency set wasn't drained beforehand)
195
+ * should call this before main() returns. CUDA + CPU don't strictly
196
+ * need it but tolerate the call. After tnn_shutdown_engines the
197
+ * caches are NULL, so a fresh tnn_session_new will re-init the
198
+ * backend from scratch — handy if your program wants to release the
199
+ * GPU between phases. */
200
+ void tnn_shutdown_engines(void)
201
+ {
202
+ /* CPU + Metal: single slot each. */
203
+ tnn_engine **scalar_slots[] = { &g_engine_cpu, &g_engine_metal };
204
+ for (int i = 0; i < 2; ++i) {
205
+ tnn_engine *e = *scalar_slots[i];
206
+ if (!e) continue;
207
+ if (e->sched) ggml_backend_sched_free(e->sched);
208
+ if (e->cpu_backend) ggml_backend_free(e->cpu_backend);
209
+ if (e->backend) ggml_backend_free(e->backend);
210
+ free(e);
211
+ *scalar_slots[i] = NULL;
212
+ }
213
+ /* CUDA: walk the per-device array (GH#3 multi-GPU mode 1). */
214
+ for (int dev = 0; dev < TNN_MAX_CUDA_DEVICES; ++dev) {
215
+ tnn_engine *e = g_engine_cuda[dev];
216
+ if (!e) continue;
217
+ if (e->sched) ggml_backend_sched_free(e->sched);
218
+ if (e->cpu_backend) ggml_backend_free(e->cpu_backend);
219
+ if (e->backend) ggml_backend_free(e->backend);
220
+ free(e);
221
+ g_engine_cuda[dev] = NULL;
222
+ }
223
+ }
224
+
225
+ /* Session: per "compute frame" — owns its ctx + graph + scratch, but
226
+ * references a cached engine. tnn_session_free frees the per-frame
227
+ * resources only; the engine persists for reuse.
228
+ *
229
+ * Two contexts:
230
+ * - ctx_w (weights_ctx): persistent tensors (parameters, moments).
231
+ * Allocated once via ggml_backend_alloc_ctx_tensors into a stable
232
+ * backend buffer that survives sched_reset cycles.
233
+ * - ctx (compute_ctx): per-step tensors (inputs, intermediates).
234
+ * Managed by backend_sched, re-allocated per compute cycle.
235
+ *
236
+ * Cross-ctx tensors in a single graph are supported by ggml — nodes
237
+ * just hold tensor pointers. The compute graph references both ctxs;
238
+ * sched skips persistent tensors (they already have a buffer). */
239
+ typedef struct {
240
+ tnn_engine *engine; /* unowned */
241
+ struct ggml_context *ctx; /* compute (no_alloc=true) */
242
+ struct ggml_context *ctx_w; /* weights (no_alloc=true until finalized) */
243
+ struct ggml_context *ctx_w_mmap; /* mmap'd weights (no_alloc=true forever;
244
+ * tensors get data via
245
+ * ggml_backend_tensor_alloc against
246
+ * weight_buf_mmap) */
247
+ struct ggml_cgraph *graph; /* primary (e.g. forward) */
248
+ struct ggml_cgraph *graph_b; /* secondary (e.g. adam_step) */
249
+ uint8_t *ctx_buf;
250
+ size_t ctx_buf_size;
251
+ uint8_t *ctx_w_buf;
252
+ size_t ctx_w_buf_size;
253
+ uint8_t *ctx_w_mmap_buf;
254
+ size_t ctx_w_mmap_buf_size;
255
+ ggml_backend_buffer_t weights_buf; /* set by tnn_finalize_weights */
256
+ ggml_backend_buffer_t weights_buf_mmap; /* cpu_buffer_from_ptr wrapping
257
+ * a caller-owned mmap region. We
258
+ * free the buffer; we do NOT free
259
+ * the underlying memory. */
260
+ void *weights_map_base; /* mmap base, caller-owned */
261
+ size_t weights_map_size;
262
+ float *scratch;
263
+ int scratch_pinned; /* 1 if cudaHostAlloc'd */
264
+ int realized;
265
+ int realized_b;
266
+ int weights_finalized;
267
+ int last_graph; /* 0 = none, 1 = a, 2 = b */
268
+ int scratch_overflow_warned; /* once-per-session diag */
269
+ int graph_capacity; /* GH#17: persists across rebuilds */
270
+ } tnn_session;
271
+
/* Pinned-memory allocator hooks.
 *
 * The weak definitions here simply forward to calloc/free. The CUDA
 * backend object replaces them with cudaHostAlloc/cudaFreeHost so that
 * ggml_backend_tensor_set can DMA straight from the scratch buffer
 * rather than staging through a pinned bounce buffer inside the driver.
 * CPU-only builds keep these fallbacks and pay no extra cost.
 */

/* Allocate `bytes` bytes of zero-initialized memory; NULL on failure. */
__attribute__((weak))
void *tnn_pinned_alloc(size_t bytes)
{
    void *zeroed = calloc(1, bytes);
    return zeroed;
}

/* Release memory obtained from tnn_pinned_alloc. */
__attribute__((weak))
void tnn_pinned_free(void *p)
{
    free(p);
}
282
+
/* Source-compatible single-device constructor from before GH#3.
 *
 * Existing callers keep working unchanged: CPU and Metal sessions never
 * had a device choice, and CUDA always defaulted to device 0. This just
 * forwards to tnn_session_new_on with device 0.
 */
void *tnn_session_new(int backend_kind)
{
    const int default_device = 0;
    return tnn_session_new_on(backend_kind, default_device);
}
290
+
291
+ /* GH#3 — multi-GPU device-aware session constructor. For backend_kind
292
+ * == 1 (CUDA), `device` is the GPU index. For CPU/Metal the device
293
+ * argument is ignored. Returns NULL if the requested backend isn't
294
+ * linked or if the device index is out of range. */
295
+ void *tnn_session_new_on(int backend_kind, int device)
296
+ {
297
+ tnn_engine *e = tnn_engine_get_on(backend_kind, device);
298
+ if (!e) return NULL;
299
+
300
+ tnn_session *s = (tnn_session *)calloc(1, sizeof(tnn_session));
301
+ if (!s) return NULL;
302
+ s->engine = e;
303
+
304
+ /* Reset the (shared) scheduler so any prior allocation state is
305
+ * wiped before this session builds its graph. */
306
+ ggml_backend_sched_reset(e->sched);
307
+
308
+ /* Two cgraphs share ctx, so reserve room for both. ctx grows
309
+ * monotonically across tnn_reset_for_rebuild cycles (each rebuild
310
+ * allocates new compute-tensor metadata in the same ctx). At
311
+ * GPT-2-distil shape one decode-step graph has ~1280 ops:
312
+ * 6 layers × (12 heads × ~16 ops + concat/proj/FFN/LN/residual)
313
+ * × N rebuilds = 1280 × N tensor headers (~376 B each).
314
+ * Reserve enough headroom for ~10k rebuilds = ~5M tensor headers.
315
+ * The no_alloc=true ctx only holds metadata so this is cheap
316
+ * bytes-wise. */
317
+ s->ctx_buf_size = ggml_tensor_overhead() * 262144
318
+ + ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false) * 4
319
+ + 32 * 1024 * 1024;
320
+ s->ctx_buf = (uint8_t *)calloc(1, s->ctx_buf_size);
321
+ struct ggml_init_params params = {
322
+ /*.mem_size =*/ s->ctx_buf_size,
323
+ /*.mem_buffer =*/ s->ctx_buf,
324
+ /*.no_alloc =*/ true,
325
+ };
326
+ s->ctx = ggml_init(params);
327
+ /* Graph node-count budget. Default GGML_DEFAULT_GRAPH_SIZE=2048
328
+ * is enough for distilgpt2 (6 layers, ~1200 nodes/step) but not
329
+ * for gpt2-small (12 layers, ~2500) and larger. 65536 covers
330
+ * seq-mode training (Qwen2.5-3B, T<=32, LoRA + AdamW + pinned
331
+ * graph_b) — matches the engine sched hash-set size. Cost is one
332
+ * int slot per node header. */
333
+ s->graph_capacity = 65536;
334
+ s->graph = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
335
+ s->graph_b = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
336
+
337
+ /* Weights ctx pool. Sized for ~1024 weight tensors -- generous
338
+ * upper bound that covers FullForwardFFICache at LLM scale
339
+ * (per layer: 2 norms + 3*n_heads + 3 = up to ~50 tensors; for
340
+ * 16 layers that's 800; plus global). no_alloc=true so this is
341
+ * just metadata bytes. */
342
+ /* Persistent-weights ctx. One slot per tensor declared via
343
+ * tnn_input_*_f32_persistent. GPT-2 sizes:
344
+ * distilgpt2 6 layers ~ 636 tensors
345
+ * gpt2-small 12 layers ~ 1272 tensors
346
+ * gpt2-large 36 layers ~ 7560 tensors
347
+ * gpt2-xl 48 layers ~10080 tensors (KV cache per head adds)
348
+ * 16384 covers up to gpt2-xl comfortably; the no_alloc ctx only
349
+ * holds metadata so the extra bytes cost nothing on small models. */
350
+ s->ctx_w_buf_size = ggml_tensor_overhead() * 16384;
351
+ s->ctx_w_buf = (uint8_t *)calloc(1, s->ctx_w_buf_size);
352
+ struct ggml_init_params w_params = {
353
+ /*.mem_size =*/ s->ctx_w_buf_size,
354
+ /*.mem_buffer =*/ s->ctx_w_buf,
355
+ /*.no_alloc =*/ true,
356
+ };
357
+ s->ctx_w = ggml_init(w_params);
358
+
359
+ /* ctx_w_mmap is created LAZILY (on first tnn_session_attach_weight_mmap
360
+ * call) rather than at session_new. Eager creation has a CUDA
361
+ * regression: even an empty no_alloc ggml_context with no
362
+ * attached backend buffer causes ggml-cuda's scheduler to
363
+ * produce wrong matmul output for downstream ops on the SAME
364
+ * session (verified 2026-05-18 — CUDA inference goes from
365
+ * wrong (top=112919) to correct (top=71 matching CPU) when
366
+ * this ctx is absent). Lazy creation keeps the BYO-pointer
367
+ * path working when needed without poisoning sessions that
368
+ * don't use it. */
369
+ s->ctx_w_mmap_buf_size = 0;
370
+ s->ctx_w_mmap_buf = NULL;
371
+ s->ctx_w_mmap = NULL;
372
+
373
+ /* Pinned scratch on CUDA: cudaHostAlloc'd pages let
374
+ * ggml_backend_tensor_set DMA directly without staging through a
375
+ * pinned bounce buffer inside the driver. Cuts per-step
376
+ * labels-upload cost (heavy LoRA bench: ~19 ms → ~target). The
377
+ * pinned_alloc symbols are weak in this object; the CUDA backend
378
+ * archive overrides them with cudaHostAlloc, CPU-only binaries
379
+ * keep the calloc fallback. */
380
+ s->scratch = (float *)tnn_pinned_alloc(TNN_SCRATCH_BYTES);
381
+ s->scratch_pinned = (s->scratch != NULL);
382
+ s->realized = 0;
383
+ s->realized_b = 0;
384
+ s->weights_finalized = 0;
385
+ s->weights_buf = NULL;
386
+ s->weights_buf_mmap = NULL;
387
+ s->weights_map_base = NULL;
388
+ s->weights_map_size = 0;
389
+ s->last_graph = 0;
390
+ return (void *)s;
391
+ }
392
+
393
+ void tnn_session_free(void *sess)
394
+ {
395
+ if (!sess) return;
396
+ tnn_session *s = (tnn_session *)sess;
397
+ if (s->weights_buf) ggml_backend_buffer_free(s->weights_buf);
398
+ if (s->weights_buf_mmap) ggml_backend_buffer_free(s->weights_buf_mmap);
399
+ if (s->ctx) ggml_free(s->ctx);
400
+ if (s->ctx_w) ggml_free(s->ctx_w);
401
+ if (s->ctx_w_mmap) ggml_free(s->ctx_w_mmap);
402
+ free(s->ctx_buf);
403
+ free(s->ctx_w_buf);
404
+ free(s->ctx_w_mmap_buf);
405
+ if (s->scratch_pinned) tnn_pinned_free(s->scratch);
406
+ else free(s->scratch);
407
+ free(s);
408
+ /* Engine + sched are cached globally; do not free here. */
409
+ }
410
+
411
+ const char *tnn_backend_name(void *sess)
412
+ {
413
+ if (!sess) return "(null)";
414
+ return ((tnn_session *)sess)->engine->backend_name;
415
+ }
416
+
417
/* Link probe: takes no input and always returns the constant 73.
 * Presumably used by the FFI side to confirm the library resolved. */
int tnn_link_check(void)
{
    const int link_probe_value = 73;
    return link_probe_value;
}
418
+
419
+ void *tnn_input_2d_f32(void *sess, int rows, int cols)
420
+ {
421
+ if (!sess || rows <= 0 || cols <= 0) return NULL;
422
+ tnn_session *s = (tnn_session *)sess;
423
+ (void)s; /* future: validate ctx hasn't been realized */
424
+ return (void *)ggml_new_tensor_2d(((tnn_session *)sess)->ctx, GGML_TYPE_F32,
425
+ (int64_t)cols, (int64_t)rows);
426
+ }
427
+
428
+ /* Create a PERSISTENT 2D F32 tensor in ctx_w. Its backend buffer is
429
+ * allocated by tnn_finalize_weights (call once after all persistent
430
+ * tensors are declared) and retained across sched_reset cycles, so
431
+ * uploaded data survives multiple compute calls without re-upload. */
432
+ void *tnn_input_2d_f32_persistent(void *sess, int rows, int cols)
433
+ {
434
+ if (!sess || rows <= 0 || cols <= 0) return NULL;
435
+ tnn_session *s = (tnn_session *)sess;
436
+ if (s->weights_finalized) return NULL;
437
+ return (void *)ggml_new_tensor_2d(s->ctx_w, GGML_TYPE_F32,
438
+ (int64_t)cols, (int64_t)rows);
439
+ }
440
+
441
+ /* Same shape as tnn_input_2d_f32_persistent but with a caller-chosen
442
+ * ggml type (e.g. GGML_TYPE_Q8_0 for Q8-stays-Q8 inference). For
443
+ * block-quantized types the column count (ne0) must be a multiple of
444
+ * the block size — GGML_BLCK_SIZE handles this. Returns NULL on bad
445
+ * shape; callers should sanity-check the result. */
446
+ void *tnn_input_2d_persistent_typed(void *sess, int rows, int cols, int ggml_type)
447
+ {
448
+ if (!sess || rows <= 0 || cols <= 0) return NULL;
449
+ tnn_session *s = (tnn_session *)sess;
450
+ if (s->weights_finalized) return NULL;
451
+ enum ggml_type t = (enum ggml_type)ggml_type;
452
+ int blck = ggml_blck_size(t);
453
+ if (blck > 1 && (cols % blck != 0)) return NULL;
454
+ return (void *)ggml_new_tensor_2d(s->ctx_w, t,
455
+ (int64_t)cols, (int64_t)rows);
456
+ }
457
+
458
+ long tnn_row_size(int ggml_type, int ne0)
459
+ {
460
+ if (ne0 <= 0) return 0;
461
+ return (long)ggml_row_size((enum ggml_type)ggml_type, (int64_t)ne0);
462
+ }
463
+
464
/* Documentation for tnn_session_attach_weight_mmap (defined after the
 * ensure_ctx_w_mmap helper below).
 *
 * Phase 2 BYO-pointer: register an mmap'd region as the backing
 * buffer for weight tensors created via tnn_input_*_persistent_mmap.
 * The session does NOT own the underlying memory — the caller (e.g.
 * a tnn_gguf_session) must keep `base` valid for the session's
 * lifetime. Returns 0 on success; -1 on bad args, an already-attached
 * region, or a failed context / CPU buffer setup; -2 when the CUDA
 * buffer could not be created. */
469
+ /* Lazy-create ctx_w_mmap. Called from tnn_session_attach_weight_mmap
470
+ * (Phase 2 entry point). NOT called from tnn_session_new — see the
471
+ * note there for why (eager creation breaks CUDA inference). */
472
+ static int ensure_ctx_w_mmap(tnn_session *s)
473
+ {
474
+ if (s->ctx_w_mmap) return 0;
475
+ s->ctx_w_mmap_buf_size = ggml_tensor_overhead() * 16384;
476
+ s->ctx_w_mmap_buf = (uint8_t *)calloc(1, s->ctx_w_mmap_buf_size);
477
+ if (!s->ctx_w_mmap_buf) return -1;
478
+ struct ggml_init_params m_params = {
479
+ /*.mem_size =*/ s->ctx_w_mmap_buf_size,
480
+ /*.mem_buffer =*/ s->ctx_w_mmap_buf,
481
+ /*.no_alloc =*/ true,
482
+ };
483
+ s->ctx_w_mmap = ggml_init(m_params);
484
+ if (!s->ctx_w_mmap) {
485
+ free(s->ctx_w_mmap_buf);
486
+ s->ctx_w_mmap_buf = NULL;
487
+ s->ctx_w_mmap_buf_size = 0;
488
+ return -1;
489
+ }
490
+ return 0;
491
+ }
492
+
493
+ int tnn_session_attach_weight_mmap(void *sess, void *base, size_t size)
494
+ {
495
+ if (!sess || !base || size == 0) return -1;
496
+ tnn_session *s = (tnn_session *)sess;
497
+ if (s->weights_buf_mmap) return -1; /* already attached */
498
+ if (ensure_ctx_w_mmap(s) != 0) return -1;
499
+ /* The buffer_from_ptr APIs assert ptr % TENSOR_ALIGNMENT == 0.
500
+ * mmap returns page-aligned pointers (>= 4 KiB), so a GGUF mmap
501
+ * always satisfies this.
502
+ *
503
+ * CUDA sessions get the patched ggml_backend_cuda_buffer_from_ptr
504
+ * (vendored in this repo; see docs/cuda-byo-pointer-design.md).
505
+ * The host region is cudaHostRegister'd and made device-addressable
506
+ * via UVA; on GB10 unified memory the device pointer equals the
507
+ * host pointer and kernels read the mmap'd file pages directly. */
508
+ int is_cuda = (s->engine && s->engine->backend_name &&
509
+ s->engine->backend_name[0] == 'c' &&
510
+ s->engine->backend_name[1] == 'u');
511
+ if (is_cuda) {
512
+ s->weights_buf_mmap = tnn_cuda_buffer_from_ptr_internal(base, size, 0);
513
+ if (!s->weights_buf_mmap) return -2; /* CUDA archive not linked / GPU error */
514
+ } else {
515
+ s->weights_buf_mmap = ggml_backend_cpu_buffer_from_ptr(base, size);
516
+ if (!s->weights_buf_mmap) return -1;
517
+ }
518
+ /* Store the buffer's "view" of the base, NOT the raw host pointer.
519
+ * On CPU these are the same; on CUDA the buffer's base is the
520
+ * UVA-mapped device pointer (equal to host_ptr on unified-memory
521
+ * SKUs, different on discrete GPUs). Tensor data pointers are
522
+ * computed as weights_map_base + offset; using the buffer's base
523
+ * keeps ggml_backend_tensor_alloc's range-check happy. */
524
+ s->weights_map_base = ggml_backend_buffer_get_base(s->weights_buf_mmap);
525
+ s->weights_map_size = size;
526
+ return 0;
527
+ }
528
+
529
+ /* Allocate a 2D persistent tensor in ctx_w_mmap whose `data` points
530
+ * at `base + buf_offset` in the attached mmap region. The tensor's
531
+ * `buffer` is set so the scheduler treats it as already-resident.
532
+ * Returns NULL on bad args or out-of-range offset.
533
+ *
534
+ * For block-quantized types, `cols` (ne0) must be a multiple of the
535
+ * type's block size and `buf_offset` must land on a 32-byte boundary
536
+ * (GGUF guarantees this).
537
+ *
538
+ * Caller computes buf_offset as
539
+ * gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, idx).
540
+ */
541
+ void *tnn_input_2d_persistent_mmap(void *sess, int rows, int cols,
542
+ int ggml_type, size_t buf_offset)
543
+ {
544
+ if (!sess || rows <= 0 || cols <= 0) return NULL;
545
+ tnn_session *s = (tnn_session *)sess;
546
+ if (!s->weights_buf_mmap || !s->weights_map_base) return NULL;
547
+ enum ggml_type t = (enum ggml_type)ggml_type;
548
+ int blck = ggml_blck_size(t);
549
+ if (blck > 1 && (cols % blck != 0)) return NULL;
550
+ if (buf_offset >= s->weights_map_size) return NULL;
551
+
552
+ struct ggml_tensor *tensor = ggml_new_tensor_2d(s->ctx_w_mmap, t,
553
+ (int64_t)cols,
554
+ (int64_t)rows);
555
+ if (!tensor) return NULL;
556
+
557
+ void *addr = (char *)s->weights_map_base + buf_offset;
558
+ enum ggml_status st = ggml_backend_tensor_alloc(s->weights_buf_mmap,
559
+ tensor, addr);
560
+ if (st != GGML_STATUS_SUCCESS) return NULL;
561
+ return (void *)tensor;
562
+ }
563
+
564
+ /* 3D variant for M2.3 MoE expert stacks. Per-expert weight matrices
565
+ * concatenated along ne[2] in the GGUF (e.g. ffn_gate_exps.weight has
566
+ * ne=[d_model, d_ff, n_experts]). Loads them in place via mmap so a
567
+ * Mixtral-8x7B Q4_K_M (26GB) doesn't require any RAM copy. */
568
+ void *tnn_input_3d_persistent_mmap(void *sess, int ne0, int ne1, int ne2,
569
+ int ggml_type, size_t buf_offset)
570
+ {
571
+ if (!sess || ne0 <= 0 || ne1 <= 0 || ne2 <= 0) return NULL;
572
+ tnn_session *s = (tnn_session *)sess;
573
+ if (!s->weights_buf_mmap || !s->weights_map_base) return NULL;
574
+ enum ggml_type t = (enum ggml_type)ggml_type;
575
+ int blck = ggml_blck_size(t);
576
+ if (blck > 1 && (ne0 % blck != 0)) return NULL;
577
+ if (buf_offset >= s->weights_map_size) return NULL;
578
+
579
+ struct ggml_tensor *tensor = ggml_new_tensor_3d(s->ctx_w_mmap, t,
580
+ (int64_t)ne0,
581
+ (int64_t)ne1,
582
+ (int64_t)ne2);
583
+ if (!tensor) return NULL;
584
+
585
+ void *addr = (char *)s->weights_map_base + buf_offset;
586
+ enum ggml_status st = ggml_backend_tensor_alloc(s->weights_buf_mmap,
587
+ tensor, addr);
588
+ if (st != GGML_STATUS_SUCCESS) return NULL;
589
+ return (void *)tensor;
590
+ }
591
+
592
+ /* 1D variant for norms / biases — same semantics. */
593
+ void *tnn_input_1d_persistent_mmap(void *sess, int n, int ggml_type,
594
+ size_t buf_offset)
595
+ {
596
+ if (!sess || n <= 0) return NULL;
597
+ tnn_session *s = (tnn_session *)sess;
598
+ if (!s->weights_buf_mmap || !s->weights_map_base) return NULL;
599
+ enum ggml_type t = (enum ggml_type)ggml_type;
600
+ if (buf_offset >= s->weights_map_size) return NULL;
601
+
602
+ struct ggml_tensor *tensor = ggml_new_tensor_1d(s->ctx_w_mmap, t,
603
+ (int64_t)n);
604
+ if (!tensor) return NULL;
605
+
606
+ void *addr = (char *)s->weights_map_base + buf_offset;
607
+ enum ggml_status st = ggml_backend_tensor_alloc(s->weights_buf_mmap,
608
+ tensor, addr);
609
+ if (st != GGML_STATUS_SUCCESS) return NULL;
610
+ return (void *)tensor;
611
+ }
612
+
613
+ /* Same as above but 1D — used for the 7-elem adamw_params vector. */
614
+ void *tnn_input_1d_f32_persistent(void *sess, int n)
615
+ {
616
+ if (!sess || n <= 0) return NULL;
617
+ tnn_session *s = (tnn_session *)sess;
618
+ if (s->weights_finalized) return NULL;
619
+ return (void *)ggml_new_tensor_1d(s->ctx_w, GGML_TYPE_F32, (int64_t)n);
620
+ }
621
+
622
+ /* M2: 3-D persistent F32 tensor — needed for MoE expert stacks,
623
+ * shape [d_in, d_out, n_experts]. Same lifecycle as the 2-D variant. */
624
+ void *tnn_input_3d_f32_persistent(void *sess, int ne0, int ne1, int ne2)
625
+ {
626
+ if (!sess || ne0 <= 0 || ne1 <= 0 || ne2 <= 0) return NULL;
627
+ tnn_session *s = (tnn_session *)sess;
628
+ if (s->weights_finalized) return NULL;
629
+ return (void *)ggml_new_tensor_3d(s->ctx_w, GGML_TYPE_F32,
630
+ (int64_t)ne0, (int64_t)ne1, (int64_t)ne2);
631
+ }
632
+
633
+ /* 3D variant of tnn_input_2d_persistent_typed for M2.3 MoE expert
634
+ * stacks. ne0/ne1 are the per-expert matrix dims; ne2 is n_experts.
635
+ * For Q8_0 we require ne0 % 32 == 0 (block alignment). */
636
+ void *tnn_input_3d_persistent_typed(void *sess, int ne0, int ne1, int ne2,
637
+ int ggml_type)
638
+ {
639
+ if (!sess || ne0 <= 0 || ne1 <= 0 || ne2 <= 0) return NULL;
640
+ tnn_session *s = (tnn_session *)sess;
641
+ if (s->weights_finalized) return NULL;
642
+ enum ggml_type t = (enum ggml_type)ggml_type;
643
+ int blck = ggml_blck_size(t);
644
+ if (blck > 1 && (ne0 % blck != 0)) return NULL;
645
+ return (void *)ggml_new_tensor_3d(s->ctx_w, t,
646
+ (int64_t)ne0, (int64_t)ne1, (int64_t)ne2);
647
+ }
648
+
649
+ /* E1.1 — 4D persistent F32 (conv kernels: ne=[KW, KH, IC, OC]). */
650
+ void *tnn_input_4d_f32_persistent(void *sess, int ne0, int ne1, int ne2, int ne3)
651
+ {
652
+ if (!sess || ne0 <= 0 || ne1 <= 0 || ne2 <= 0 || ne3 <= 0) return NULL;
653
+ tnn_session *s = (tnn_session *)sess;
654
+ if (s->weights_finalized) return NULL;
655
+ return (void *)ggml_new_tensor_4d(s->ctx_w, GGML_TYPE_F32,
656
+ (int64_t)ne0, (int64_t)ne1,
657
+ (int64_t)ne2, (int64_t)ne3);
658
+ }
659
+
660
+ /* Allocate the backend buffer for all persistent tensors in ctx_w.
661
+ * Must be called AFTER declaring all persistent tensors and BEFORE
662
+ * any tnn_realize/compute. After this, the persistent tensors have
663
+ * stable backend storage independent of sched.
664
+ *
665
+ * Returns 0 on success, negative on failure. */
666
+ int tnn_finalize_weights(void *sess)
667
+ {
668
+ if (!sess) return -1;
669
+ tnn_session *s = (tnn_session *)sess;
670
+ if (s->weights_finalized) return -2;
671
+ s->weights_buf = ggml_backend_alloc_ctx_tensors(s->ctx_w, s->engine->backend);
672
+ if (!s->weights_buf) return -3;
673
+ s->weights_finalized = 1;
674
+ return 0;
675
+ }
676
+
677
+ /* Zero an entire persistent tensor via backend memset_tensor. Faster
678
+ * than building a Mat-of-zeros + upload_row_major when the tensor is
679
+ * big (e.g. Adam state for vocab×d_model embeddings: ~1 GB of zeros).
680
+ * Works on both CPU (memset) and CUDA (cudaMemsetAsync). */
681
+ int tnn_zero_tensor(void *sess, void *tensor)
682
+ {
683
+ if (!sess || !tensor) return -1;
684
+ tnn_session *s = (tnn_session *)sess;
685
+ (void)s;
686
+ struct ggml_tensor *t = (struct ggml_tensor *)tensor;
687
+ ggml_backend_tensor_memset(t, 0, 0, ggml_nbytes(t));
688
+ return 0;
689
+ }
690
+
691
+ void *tnn_matmul(void *sess, void *a, void *b)
692
+ {
693
+ if (!sess || !a || !b) return NULL;
694
+ tnn_session *s = (tnn_session *)sess;
695
+ return (void *)ggml_mul_mat(s->ctx,
696
+ (struct ggml_tensor *)a,
697
+ (struct ggml_tensor *)b);
698
+ }
699
+
700
+ void *tnn_out_prod(void *sess, void *a, void *b)
701
+ {
702
+ /* ggml_out_prod: result[m, n] = sum_k a[k, m] * b[k, n]. Same
703
+ * input shape constraints as ggml_mul_mat (a.ne0 == b.ne0). Used
704
+ * by ggml's autograd for weight-gradient computations. Exposed
705
+ * here so A/B smokes can compare per-op cost vs ggml_mul_mat. */
706
+ if (!sess || !a || !b) return NULL;
707
+ tnn_session *s = (tnn_session *)sess;
708
+ return (void *)ggml_out_prod(s->ctx,
709
+ (struct ggml_tensor *)a,
710
+ (struct ggml_tensor *)b);
711
+ }
712
+
713
+ void *tnn_swiglu_split(void *sess, void *gate, void *up)
714
+ {
715
+ /* ggml_swiglu_split: silu(gate) * up — fused activation+mul for
716
+ * the Llama-family SwiGLU FFN gating step. Replaces the explicit
717
+ * silu(gate) → mul(_, up) pair in toy's FFN block. On CUDA the
718
+ * fusion lets ggml-cuda issue one kernel instead of two. */
719
+ if (!sess || !gate || !up) return NULL;
720
+ tnn_session *s = (tnn_session *)sess;
721
+ return (void *)ggml_swiglu_split(s->ctx,
722
+ (struct ggml_tensor *)gate,
723
+ (struct ggml_tensor *)up);
724
+ }
725
+
726
+ /* M2 MoE primitives. Thin wrappers — ggml does the work; we just expose
727
+ * the entry points through the FFI. See tinynn_ggml.h for shape docs. */
728
+ void *tnn_mul_mat_id(void *sess, void *as, void *b, void *ids)
729
+ {
730
+ if (!sess || !as || !b || !ids) return NULL;
731
+ tnn_session *s = (tnn_session *)sess;
732
+ return (void *)ggml_mul_mat_id(s->ctx,
733
+ (struct ggml_tensor *)as,
734
+ (struct ggml_tensor *)b,
735
+ (struct ggml_tensor *)ids);
736
+ }
737
+
738
+ void *tnn_add_id(void *sess, void *a, void *b, void *ids)
739
+ {
740
+ if (!sess || !a || !b || !ids) return NULL;
741
+ tnn_session *s = (tnn_session *)sess;
742
+ return (void *)ggml_add_id(s->ctx,
743
+ (struct ggml_tensor *)a,
744
+ (struct ggml_tensor *)b,
745
+ (struct ggml_tensor *)ids);
746
+ }
747
+
748
+ void *tnn_argsort(void *sess, void *a, int descending)
749
+ {
750
+ if (!sess || !a) return NULL;
751
+ tnn_session *s = (tnn_session *)sess;
752
+ enum ggml_sort_order order = descending ? GGML_SORT_ORDER_DESC : GGML_SORT_ORDER_ASC;
753
+ return (void *)ggml_argsort(s->ctx, (struct ggml_tensor *)a, order);
754
+ }
755
+
756
+ void *tnn_top_k(void *sess, void *a, int k)
757
+ {
758
+ if (!sess || !a || k <= 0) return NULL;
759
+ tnn_session *s = (tnn_session *)sess;
760
+ return (void *)ggml_top_k(s->ctx, (struct ggml_tensor *)a, k);
761
+ }
762
+
763
+ void *tnn_matmul_axb(void *sess, void *a, void *b)
764
+ {
765
+ /* Compute A · B (no transpose at the caller). ggml_mul_mat does
766
+ * A · B^T natively, so we transpose B first. ggml_transpose is a
767
+ * stride-permutation view; ggml_cont materializes it as contiguous
768
+ * so mul_mat's contiguity-required input is satisfied. */
769
+ if (!sess || !a || !b) return NULL;
770
+ tnn_session *s = (tnn_session *)sess;
771
+ struct ggml_tensor *bT = ggml_cont(s->ctx, ggml_transpose(s->ctx, (struct ggml_tensor *)b));
772
+ return (void *)ggml_mul_mat(s->ctx, (struct ggml_tensor *)a, bT);
773
+ }
774
+
775
+ void *tnn_add(void *sess, void *a, void *b)
776
+ {
777
+ if (!sess || !a || !b) return NULL;
778
+ tnn_session *s = (tnn_session *)sess;
779
+ return (void *)ggml_add(s->ctx,
780
+ (struct ggml_tensor *)a,
781
+ (struct ggml_tensor *)b);
782
+ }
783
+
784
+ void *tnn_tanh(void *sess, void *a)
785
+ {
786
+ if (!sess || !a) return NULL;
787
+ tnn_session *s = (tnn_session *)sess;
788
+ /* Element-wise tanh. Used by Gemma 2's logit soft-cap:
789
+ * y = softcap * tanh(x / softcap)
790
+ * Composed via tnn_scale + tnn_tanh + tnn_scale in the graph builder. */
791
+ return (void *)ggml_tanh(s->ctx, (struct ggml_tensor *)a);
792
+ }
793
+
794
+ void *tnn_ssm_conv(void *sess, void *sx, void *c)
795
+ {
796
+ if (!sess || !sx || !c) return NULL;
797
+ tnn_session *s = (tnn_session *)sess;
798
+ return (void *)ggml_ssm_conv(s->ctx,
799
+ (struct ggml_tensor *)sx,
800
+ (struct ggml_tensor *)c);
801
+ }
802
+
803
+ void *tnn_ssm_scan(void *sess, void *state, void *x, void *dt,
804
+ void *A, void *B, void *C, void *ids)
805
+ {
806
+ if (!sess || !state || !x || !dt || !A || !B || !C || !ids) return NULL;
807
+ tnn_session *s = (tnn_session *)sess;
808
+ return (void *)ggml_ssm_scan(s->ctx,
809
+ (struct ggml_tensor *)state,
810
+ (struct ggml_tensor *)x,
811
+ (struct ggml_tensor *)dt,
812
+ (struct ggml_tensor *)A,
813
+ (struct ggml_tensor *)B,
814
+ (struct ggml_tensor *)C,
815
+ (struct ggml_tensor *)ids);
816
+ }
817
+
818
+ void *tnn_gelu(void *sess, void *a)
819
+ {
820
+ if (!sess || !a) return NULL;
821
+ tnn_session *s = (tnn_session *)sess;
822
+ /* ggml_gelu uses the tanh approximation:
823
+ * 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))
824
+ * which matches the project's feed_forward GeLU exactly. */
825
+ return (void *)ggml_gelu(s->ctx, (struct ggml_tensor *)a);
826
+ }
827
+
828
+ void *tnn_rms_norm(void *sess, void *x, void *gamma_row, double eps)
829
+ {
830
+ if (!sess || !x || !gamma_row) return NULL;
831
+ tnn_session *s = (tnn_session *)sess;
832
+ /* ggml_rms_norm normalizes along ne[0] (the feature dim). The result
833
+ * is the unscaled normalized tensor; we then multiply by gamma_row
834
+ * (shape 1 x feature) which ggml_mul broadcasts over the leading dim. */
835
+ struct ggml_tensor *normed = ggml_rms_norm(s->ctx,
836
+ (struct ggml_tensor *)x,
837
+ (float)eps);
838
+ return (void *)ggml_mul(s->ctx, normed, (struct ggml_tensor *)gamma_row);
839
+ }
840
+
841
+ /* LayerNorm: y = gamma * (x - mean) / sqrt(var + eps) + beta. ggml_norm
842
+ * computes the normalized (x - mean)/sqrt(var+eps) part; we then
843
+ * multiply by gamma and add beta. Used for HF-style models (GPT-2 /
844
+ * GPT-Neo / TinyStories) that use LayerNorm rather than RMSNorm. */
845
+ void *tnn_layer_norm(void *sess, void *x, void *gamma_row, void *beta_row, double eps)
846
+ {
847
+ if (!sess || !x || !gamma_row || !beta_row) return NULL;
848
+ tnn_session *s = (tnn_session *)sess;
849
+ struct ggml_tensor *normed = ggml_norm(s->ctx,
850
+ (struct ggml_tensor *)x,
851
+ (float)eps);
852
+ struct ggml_tensor *scaled = ggml_mul(s->ctx, normed,
853
+ (struct ggml_tensor *)gamma_row);
854
+ return (void *)ggml_add(s->ctx, scaled,
855
+ (struct ggml_tensor *)beta_row);
856
+ }
857
+
858
+ /* Write `b` into `a` at byte offset, with row stride nb1. Result has
859
+ * `a`'s shape (unlike ggml_cpy which returns the small dst view) so
860
+ * downstream ops can read the modified `a` directly. Used for V[:, pos]
861
+ * column writes in KV cache (V layout = [max_T, d_head], offset =
862
+ * pos * 4, nb1 = max_T * 4). */
863
+ void *tnn_set_2d(void *sess, void *a, void *b, long nb1, long offset)
864
+ {
865
+ if (!sess || !a || !b) return NULL;
866
+ tnn_session *s = (tnn_session *)sess;
867
+ return (void *)ggml_set_2d(s->ctx,
868
+ (struct ggml_tensor *)a,
869
+ (struct ggml_tensor *)b,
870
+ (size_t)nb1,
871
+ (size_t)offset);
872
+ }
873
+
874
+ /* Write `b`'s rows into `a` at row indices `idx`. For our KV cache:
875
+ * a = persistent K (ne=[d_head, max_T])
876
+ * b = compute k_new (ne=[d_head, 1])
877
+ * idx = compute (1,) int32 holding the current decode position
878
+ * The new k row lands at K[idx[0]] (other rows untouched). Same shape
879
+ * pattern for V. Position is a RUNTIME tensor — the graph stays
880
+ * static across decode steps, so we don't need to rebuild it. */
881
+ void *tnn_set_rows(void *sess, void *a, void *b, void *idx)
882
+ {
883
+ if (!sess || !a || !b || !idx) return NULL;
884
+ tnn_session *s = (tnn_session *)sess;
885
+ return (void *)ggml_set_rows(s->ctx,
886
+ (struct ggml_tensor *)a,
887
+ (struct ggml_tensor *)b,
888
+ (struct ggml_tensor *)idx);
889
+ }
890
+
891
+ /* Softmax-with-mask. Adds `mask` to `a`, scales by `scale`, then runs
892
+ * softmax along ne[0]. For KV-cache attention: scores shape (max_T, 1),
893
+ * mask shape (max_T, 1), result shape (max_T, 1). The mask is uploaded
894
+ * per step with 0.0 for positions <= pos and -inf for positions > pos
895
+ * so the softmax zeroes out future-key attention even though K's
896
+ * future-position slots may hold stale or uninitialised values. */
897
+ void *tnn_soft_max_ext(void *sess, void *a, void *mask, double scale, double max_bias)
898
+ {
899
+ if (!sess || !a) return NULL;
900
+ tnn_session *s = (tnn_session *)sess;
901
+ return (void *)ggml_soft_max_ext(s->ctx,
902
+ (struct ggml_tensor *)a,
903
+ (struct ggml_tensor *)mask,
904
+ (float)scale,
905
+ (float)max_bias);
906
+ }
907
+
908
/* Hand back a NULL pointer typed as :ptr. Handy as the seed value of an
 * Array<:ptr> so that Spinel infers a PtrArray, instead of typing the
 * array from a `[nil]` literal (which can resolve to IntArray). */
void *tnn_null_ptr(void)
{
    void *nothing = NULL;
    return nothing;
}
915
+
916
+ /* 1-D view of a tensor at byte `offset`, of length `ne0`. Used to
917
+ * slice a single row out of a (max_T, d_head) KV buffer at a runtime
918
+ * position computed by the caller (offset = pos * d_head * 4). */
919
+ void *tnn_view_1d(void *sess, void *a, int ne0, long offset)
920
+ {
921
+ if (!sess || !a) return NULL;
922
+ tnn_session *s = (tnn_session *)sess;
923
+ return (void *)ggml_view_1d(s->ctx, (struct ggml_tensor *)a,
924
+ (int64_t)ne0, (size_t)offset);
925
+ }
926
+
927
+ /* 2-D view of a tensor: rows of length ne0 stride nb1, ne1 rows
928
+ * total, starting at byte `offset`. Used for slicing K/V[0:pos+1] in
929
+ * attention. nb1 = d_head * 4 for our row-of-floats KV layout. */
930
+ void *tnn_view_2d(void *sess, void *a, int ne0, int ne1, long nb1, long offset)
931
+ {
932
+ if (!sess || !a) return NULL;
933
+ tnn_session *s = (tnn_session *)sess;
934
+ return (void *)ggml_view_2d(s->ctx, (struct ggml_tensor *)a,
935
+ (int64_t)ne0, (int64_t)ne1,
936
+ (size_t)nb1, (size_t)offset);
937
+ }
938
+
939
+ /* Reshape a contiguous tensor to (ne0, ne1, ne2). The total element
940
+ * count must match. Used by the sequence-mode forward (M3) to lift
941
+ * Q/K from ne=[d_head, T] to ne=[d_head, 1, T] before ggml_rope_ext —
942
+ * rope_ext asserts a->ne[2] == positions->ne[0]. */
943
+ void *tnn_reshape_3d(void *sess, void *a, int ne0, int ne1, int ne2)
944
+ {
945
+ if (!sess || !a) return NULL;
946
+ tnn_session *s = (tnn_session *)sess;
947
+ return (void *)ggml_reshape_3d(s->ctx, (struct ggml_tensor *)a,
948
+ (int64_t)ne0, (int64_t)ne1, (int64_t)ne2);
949
+ }
950
+
951
+ /* Reshape a contiguous tensor back to (ne0, ne1). After rope_ext on
952
+ * a [d_head, 1, T] tensor, downstream matmul wants [d_head, T] again. */
953
+ void *tnn_reshape_2d(void *sess, void *a, int ne0, int ne1)
954
+ {
955
+ if (!sess || !a) return NULL;
956
+ tnn_session *s = (tnn_session *)sess;
957
+ return (void *)ggml_reshape_2d(s->ctx, (struct ggml_tensor *)a,
958
+ (int64_t)ne0, (int64_t)ne1);
959
+ }
960
+
961
+ /* Copy a -> b. Used to write k_new into a view of the persistent K
962
+ * buffer (b = view_2d(K, d_head, 1, ..., offset=pos*d_head*4)). */
963
+ void *tnn_cpy(void *sess, void *a, void *b)
964
+ {
965
+ if (!sess || !a || !b) return NULL;
966
+ tnn_session *s = (tnn_session *)sess;
967
+ return (void *)ggml_cpy(s->ctx, (struct ggml_tensor *)a,
968
+ (struct ggml_tensor *)b);
969
+ }
970
+
971
+ /* Cast a tensor to a target dtype (GH#9 mixed-precision training).
972
+ * Returns a NEW tensor of the requested type with the same shape.
973
+ * dtype enum values are from ggml_type (0=F32, 1=F16, 30=BF16, …).
974
+ * Backed by ggml_cast which under the hood is GGML_OP_CPY with a
975
+ * fresh dst of the target dtype — backward flows correctly through
976
+ * the cpy backward case (grad of cast(src) = reshape(grad, src)
977
+ * which preserves src's dtype = the F32 master in the typical
978
+ * weight-cast-to-bf16 use case). */
979
+ void *tnn_cast(void *sess, void *a, int dtype)
980
+ {
981
+ if (!sess || !a) return NULL;
982
+ tnn_session *s = (tnn_session *)sess;
983
+ return (void *)ggml_cast(s->ctx, (struct ggml_tensor *)a,
984
+ (enum ggml_type)dtype);
985
+ }
986
+
987
+ /* Concatenate `a` and `b` along the given dim (0 = ne[0], 1 = ne[1]).
988
+ * Other dims must match. Used to glue per-head attention outputs into
989
+ * a single (d_model, T) tensor by stacking d_head slices along ne0. */
990
+ void *tnn_concat(void *sess, void *a, void *b, int dim)
991
+ {
992
+ if (!sess || !a || !b) return NULL;
993
+ tnn_session *s = (tnn_session *)sess;
994
+ return (void *)ggml_concat(s->ctx,
995
+ (struct ggml_tensor *)a,
996
+ (struct ggml_tensor *)b,
997
+ dim);
998
+ }
999
+
1000
+ /* Causal mask: sets elements ABOVE the diagonal (i.e. positions where
1001
+ * key_idx > query_idx + n_past) to -inf, so subsequent softmax zeroes
1002
+ * them. n_past = 0 gives the standard causal mask for training. For
1003
+ * KV-cache inference, n_past = current position so attention can see
1004
+ * cached keys plus the current token but not future tokens. */
1005
+ void *tnn_diag_mask_inf(void *sess, void *a, int n_past)
1006
+ {
1007
+ if (!sess || !a) return NULL;
1008
+ tnn_session *s = (tnn_session *)sess;
1009
+ return (void *)ggml_diag_mask_inf(s->ctx, (struct ggml_tensor *)a, n_past);
1010
+ }
1011
+
1012
+ /* --- Vision / Conv ops (E1.1) ------------------------------------------ */
1013
+
1014
+ /* im2col: extracts sliding kernel windows from the input image into a
1015
+ * 2D matrix suitable for matmul-as-conv. ggml's im2col output ne for
1016
+ * is_2D=true is [IC*KH*KW, OH*OW, N, 1].
1017
+ *
1018
+ * dst_type: 0=F32, 1=F16, 26=I32 (full enum in ggml.h:ggml_type). */
1019
+ void *tnn_im2col(void *sess, void *kernel, void *data,
1020
+ int s0, int s1, int p0, int p1, int d0, int d1,
1021
+ int is_2D, int dst_type)
1022
+ {
1023
+ if (!sess || !kernel || !data) return NULL;
1024
+ tnn_session *s = (tnn_session *)sess;
1025
+ return (void *)ggml_im2col(s->ctx,
1026
+ (struct ggml_tensor *)kernel,
1027
+ (struct ggml_tensor *)data,
1028
+ s0, s1, p0, p1, d0, d1,
1029
+ is_2D ? true : false,
1030
+ (enum ggml_type)dst_type);
1031
+ }
1032
+
1033
+ /* im2col_back: gradient of im2col w.r.t. the input image. Caller
1034
+ * must pass the original input image shape via input_w/input_h/input_c/input_n
1035
+ * (ggml's API wants an int64_t ne[4]). */
1036
+ void *tnn_im2col_back(void *sess, void *kernel, void *grad_im2col,
1037
+ int input_w, int input_h, int input_c, int input_n,
1038
+ int s0, int s1, int p0, int p1, int d0, int d1,
1039
+ int is_2D)
1040
+ {
1041
+ if (!sess || !kernel || !grad_im2col) return NULL;
1042
+ tnn_session *s = (tnn_session *)sess;
1043
+ int64_t ne[4];
1044
+ ne[0] = (int64_t)input_w;
1045
+ ne[1] = (int64_t)input_h;
1046
+ ne[2] = (int64_t)input_c;
1047
+ ne[3] = (int64_t)input_n;
1048
+ return (void *)ggml_im2col_back(s->ctx,
1049
+ (struct ggml_tensor *)kernel,
1050
+ (struct ggml_tensor *)grad_im2col,
1051
+ ne,
1052
+ s0, s1, p0, p1, d0, d1,
1053
+ is_2D ? true : false);
1054
+ }
1055
+
1056
+ /* conv_2d: composite (im2col + matmul). ggml internally folds the
1057
+ * kernel + im2col output and emits the [OW, OH, OC, N] result. */
1058
+ void *tnn_conv_2d(void *sess, void *kernel, void *data,
1059
+ int s0, int s1, int p0, int p1, int d0, int d1)
1060
+ {
1061
+ if (!sess || !kernel || !data) return NULL;
1062
+ tnn_session *s = (tnn_session *)sess;
1063
+ return (void *)ggml_conv_2d(s->ctx,
1064
+ (struct ggml_tensor *)kernel,
1065
+ (struct ggml_tensor *)data,
1066
+ s0, s1, p0, p1, d0, d1);
1067
+ }
1068
+
1069
+ /* Reorder dims as a view (no copy). Result must be passed through
1070
+ * ggml_cont before any op that requires contiguous memory. */
1071
+ void *tnn_permute(void *sess, void *a, int axis0, int axis1, int axis2, int axis3)
1072
+ {
1073
+ if (!sess || !a) return NULL;
1074
+ tnn_session *s = (tnn_session *)sess;
1075
+ return (void *)ggml_permute(s->ctx, (struct ggml_tensor *)a,
1076
+ axis0, axis1, axis2, axis3);
1077
+ }
1078
+
1079
+ /* Make contiguous + reshape to 2D in one op. Used after a permute
1080
+ * to flatten the spatial dims for the transformer input. */
1081
+ void *tnn_cont_2d(void *sess, void *a, int ne0, int ne1)
1082
+ {
1083
+ if (!sess || !a) return NULL;
1084
+ tnn_session *s = (tnn_session *)sess;
1085
+ return (void *)ggml_cont_2d(s->ctx, (struct ggml_tensor *)a,
1086
+ (int64_t)ne0, (int64_t)ne1);
1087
+ }
1088
+
1089
+ /* --- Llama-family ops -------------------------------------------------- */
1090
+
1091
+ /* SiLU activation: silu(x) = x * sigmoid(x). Used in SwiGLU FFNs
1092
+ * (Llama / SmolLM2 / Qwen / Phi). */
1093
+ void *tnn_silu(void *sess, void *a)
1094
+ {
1095
+ if (!sess || !a) return NULL;
1096
+ tnn_session *s = (tnn_session *)sess;
1097
+ return (void *)ggml_silu(s->ctx, (struct ggml_tensor *)a);
1098
+ }
1099
+
1100
+ /* Elementwise multiply c = a * b. Used to combine the gate and up
1101
+ * projections of SwiGLU before the down projection. */
1102
+ void *tnn_mul(void *sess, void *a, void *b)
1103
+ {
1104
+ if (!sess || !a || !b) return NULL;
1105
+ tnn_session *s = (tnn_session *)sess;
1106
+ return (void *)ggml_mul(s->ctx,
1107
+ (struct ggml_tensor *)a,
1108
+ (struct ggml_tensor *)b);
1109
+ }
1110
+
1111
+ /* Rotary Position Embedding (rotate_half / NEOX mode), as used by
1112
+ * Llama / SmolLM2 / Qwen2 / Mistral. Applied to Q and K before the
1113
+ * dot product.
1114
+ *
1115
+ * a: input tensor, shape [Dh, T, ...] (one head's worth)
1116
+ * pos: int32 tensor of length T, absolute positions per token
1117
+ * n_dims: number of dimensions to rotate (= Dh for full rotary,
1118
+ * smaller for partial — Pythia uses Dh/4)
1119
+ * freq_base: theta base. 10000 (Llama-1/2), 100000 (SmolLM2),
1120
+ * 1000000 (Qwen2 long-context)
1121
+ *
1122
+ * Pass freq_scale=1.0, ext_factor=0.0, attn_factor=1.0, beta_fast=32.0,
1123
+ * beta_slow=1.0, freq_factors=NULL for the no-scaling (vanilla GPT-2 /
1124
+ * SmolLM2 / Qwen2-short-context) default. YaRN tunes the scalars;
1125
+ * llama3 + LongRoPE supply freq_factors via tnn_rope_freq_factors_*. */
1126
+ void *tnn_rope_ext(void *sess, void *a, void *pos, int n_dims,
1127
+ double freq_base, double freq_scale,
1128
+ double ext_factor, double attn_factor,
1129
+ double beta_fast, double beta_slow,
1130
+ void *freq_factors)
1131
+ {
1132
+ if (!sess || !a || !pos) return NULL;
1133
+ tnn_session *s = (tnn_session *)sess;
1134
+ const int mode = 2; /* GGML_ROPE_TYPE_NEOX — matches HF llama rotate_half */
1135
+ /* n_ctx_orig is only consulted when ext_factor != 0 (YaRN). Pass
1136
+ * 0 when no YaRN is in play; callers using YaRN encode orig_ctx
1137
+ * via the freq_factors path or pass it via attn_factor scaling. */
1138
+ const int n_ctx_orig = 0;
1139
+ return (void *)ggml_rope_ext(s->ctx,
1140
+ (struct ggml_tensor *)a,
1141
+ (struct ggml_tensor *)pos,
1142
+ (struct ggml_tensor *)freq_factors,
1143
+ n_dims,
1144
+ mode,
1145
+ n_ctx_orig,
1146
+ (float)freq_base,
1147
+ (float)freq_scale,
1148
+ (float)ext_factor,
1149
+ (float)attn_factor,
1150
+ (float)beta_fast,
1151
+ (float)beta_slow);
1152
+ }
1153
+
1154
+ /* Allocate a persistent (n_dims/2)-element F32 tensor in ctx_w to hold
1155
+ * RoPE freq_factors for llama3-style or LongRoPE scaling. Must be
1156
+ * called BEFORE tnn_finalize_weights, like any other persistent.
1157
+ *
1158
+ * The values are computed by the Ruby side (see
1159
+ * Toy::RopeScaling.compute_llama3_freq_factors) and uploaded via the
1160
+ * standard tnn_upload_from_float_array path after finalize. Doing the
1161
+ * math in Ruby (i) keeps the C wrapper simple, (ii) avoids the
1162
+ * "write to t->data with no_alloc=true" trap, and (iii) makes the
1163
+ * scaling formula trivially testable from MRI without recompiling. */
1164
+ void *tnn_rope_freq_factors_alloc(void *sess, int n_dims)
1165
+ {
1166
+ if (!sess || n_dims <= 0) return NULL;
1167
+ tnn_session *s = (tnn_session *)sess;
1168
+ if (s->weights_finalized) return NULL;
1169
+ return (void *)ggml_new_tensor_1d(s->ctx_w, GGML_TYPE_F32,
1170
+ (int64_t)(n_dims / 2));
1171
+ }
1172
+
1173
+ /* Allocate a 1-D int32 tensor in the *session* context. Used to hold
1174
+ * RoPE position indices. The caller fills it via tnn_scratch_set_i32 +
1175
+ * tnn_upload_int_array (or fills directly during graph build). */
1176
+ void *tnn_input_1d_i32_ctx(void *sess, int n)
1177
+ {
1178
+ if (!sess) return NULL;
1179
+ tnn_session *s = (tnn_session *)sess;
1180
+ return (void *)ggml_new_tensor_1d(s->ctx, GGML_TYPE_I32, n);
1181
+ }
1182
+
1183
+ void *tnn_softmax(void *sess, void *a)
1184
+ {
1185
+ if (!sess || !a) return NULL;
1186
+ tnn_session *s = (tnn_session *)sess;
1187
+ /* ggml_soft_max normalizes along ne[0]. With our convention
1188
+ * (ne0=cols, ne1=rows) this is per-row softmax, matching the
1189
+ * project's softmax_rows!. */
1190
+ return (void *)ggml_soft_max(s->ctx, (struct ggml_tensor *)a);
1191
+ }
1192
+
1193
+ void *tnn_flash_attn_ext(void *sess, void *q, void *k, void *v, void *mask,
1194
+ double scale, double max_bias, double logit_softcap)
1195
+ {
1196
+ if (!sess || !q || !k || !v) return NULL;
1197
+ tnn_session *s = (tnn_session *)sess;
1198
+ /* mask may be NULL when no causal/sequence mask is wanted (e.g. fully
1199
+ * dense T_q=1 decode with no padding). ggml's impl handles NULL. */
1200
+ return (void *)ggml_flash_attn_ext(s->ctx,
1201
+ (struct ggml_tensor *)q,
1202
+ (struct ggml_tensor *)k,
1203
+ (struct ggml_tensor *)v,
1204
+ (struct ggml_tensor *)mask,
1205
+ (float)scale,
1206
+ (float)max_bias,
1207
+ (float)logit_softcap);
1208
+ }
1209
+
1210
+ void *tnn_transpose(void *sess, void *a)
1211
+ {
1212
+ if (!sess || !a) return NULL;
1213
+ tnn_session *s = (tnn_session *)sess;
1214
+ /* ggml_transpose is a stride-permutation view (no data movement).
1215
+ * Wrap in ggml_cont so the result is contiguous f32 and downloadable. */
1216
+ return (void *)ggml_cont(s->ctx,
1217
+ ggml_transpose(s->ctx, (struct ggml_tensor *)a));
1218
+ }
1219
+
1220
+ void *tnn_scale(void *sess, void *a, double scale)
1221
+ {
1222
+ if (!sess || !a) return NULL;
1223
+ tnn_session *s = (tnn_session *)sess;
1224
+ return (void *)ggml_scale(s->ctx, (struct ggml_tensor *)a, (float)scale);
1225
+ }
1226
+
1227
+ void *tnn_rms_norm_back(void *sess, void *x, void *dy, double eps)
1228
+ {
1229
+ if (!sess || !x || !dy) return NULL;
1230
+ tnn_session *s = (tnn_session *)sess;
1231
+ return (void *)ggml_rms_norm_back(s->ctx,
1232
+ (struct ggml_tensor *)x,
1233
+ (struct ggml_tensor *)dy,
1234
+ (float)eps);
1235
+ }
1236
+
1237
+ void *tnn_softmax_back(void *sess, void *a, void *dy)
1238
+ {
1239
+ if (!sess || !a || !dy) return NULL;
1240
+ tnn_session *s = (tnn_session *)sess;
1241
+ /* Plain softmax backward: scale=1.0, max_bias=0.0 (no ALiBi). */
1242
+ return (void *)ggml_soft_max_ext_back(s->ctx,
1243
+ (struct ggml_tensor *)a,
1244
+ (struct ggml_tensor *)dy,
1245
+ 1.0f, 0.0f);
1246
+ }
1247
+
1248
+ /* Backward for SiLU activation. SiLU(x) = x * sigmoid(x);
1249
+ * dSiLU/dx = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
1250
+ * Given x and dy (gradient from upstream), returns dx.
1251
+ *
1252
+ * NOTE: ggml_silu_back's public header comment swaps the args
1253
+ * ("a - x, b - dy"). Reading the actual CPU op, src[0]=dy and
1254
+ * src[1]=x. We pass (dy, x) to match the implementation. */
1255
+ void *tnn_silu_back(void *sess, void *x, void *dy)
1256
+ {
1257
+ if (!sess || !x || !dy) return NULL;
1258
+ tnn_session *s = (tnn_session *)sess;
1259
+ return (void *)ggml_silu_back(s->ctx,
1260
+ (struct ggml_tensor *)dy,
1261
+ (struct ggml_tensor *)x);
1262
+ }
1263
+
1264
+ /* Backward for RoPE-NEOX. Same arg convention as tnn_rope_ext but
1265
+ * also takes dy (gradient of the rope_ext output). Returns dx.
1266
+ * Callers must pass the same YaRN/scaling args used in the forward;
1267
+ * mismatch silently corrupts gradients. */
1268
+ void *tnn_rope_ext_back(void *sess, void *dy, void *pos, int n_dims,
1269
+ double freq_base, double freq_scale,
1270
+ double ext_factor, double attn_factor,
1271
+ double beta_fast, double beta_slow,
1272
+ void *freq_factors)
1273
+ {
1274
+ if (!sess || !dy || !pos) return NULL;
1275
+ tnn_session *s = (tnn_session *)sess;
1276
+ const int mode = 2; /* GGML_ROPE_TYPE_NEOX */
1277
+ const int n_ctx_orig = 0;
1278
+ return (void *)ggml_rope_ext_back(s->ctx,
1279
+ (struct ggml_tensor *)dy,
1280
+ (struct ggml_tensor *)pos,
1281
+ (struct ggml_tensor *)freq_factors,
1282
+ n_dims,
1283
+ mode,
1284
+ n_ctx_orig,
1285
+ (float)freq_base,
1286
+ (float)freq_scale,
1287
+ (float)ext_factor,
1288
+ (float)attn_factor,
1289
+ (float)beta_fast,
1290
+ (float)beta_slow);
1291
+ }
1292
+
1293
+ void *tnn_get_rows(void *sess, void *table, void *idx)
1294
+ {
1295
+ if (!sess || !table || !idx) return NULL;
1296
+ tnn_session *s = (tnn_session *)sess;
1297
+ return (void *)ggml_get_rows(s->ctx,
1298
+ (struct ggml_tensor *)table,
1299
+ (struct ggml_tensor *)idx);
1300
+ }
1301
+
1302
+ void *tnn_get_rows_back(void *sess, void *d_out, void *idx, void *table_shape)
1303
+ {
1304
+ if (!sess || !d_out || !idx || !table_shape) return NULL;
1305
+ tnn_session *s = (tnn_session *)sess;
1306
+ return (void *)ggml_get_rows_back(s->ctx,
1307
+ (struct ggml_tensor *)d_out,
1308
+ (struct ggml_tensor *)idx,
1309
+ (struct ggml_tensor *)table_shape);
1310
+ }
1311
+
1312
+ void *tnn_input_1d_i32(void *sess, int n)
1313
+ {
1314
+ if (!sess || n <= 0) return NULL;
1315
+ tnn_session *s = (tnn_session *)sess;
1316
+ return (void *)ggml_new_tensor_1d(s->ctx, GGML_TYPE_I32, (int64_t)n);
1317
+ }
1318
+
1319
+ void tnn_gelu_back_scratch(void *sess, int n)
1320
+ {
1321
+ if (!sess || n <= 0) return;
1322
+ tnn_session *s = (tnn_session *)sess;
1323
+ int max_slots = TNN_SCRATCH_BYTES / (int)sizeof(float);
1324
+ if (3 * n > max_slots) return; /* not enough scratch */
1325
+
1326
+ const float *x = s->scratch + 0;
1327
+ const float *dh = s->scratch + n;
1328
+ float *dx = s->scratch + 2 * n;
1329
+
1330
+ const float c = 0.7978845608028654f; /* sqrt(2/pi) */
1331
+ const float k = 0.044715f;
1332
+
1333
+ for (int i = 0; i < n; ++i) {
1334
+ float xi = x[i];
1335
+ float xi2 = xi * xi;
1336
+ float u = c * (xi + k * xi * xi2);
1337
+ float tu = tanhf(u);
1338
+ float sech2 = 1.0f - tu * tu;
1339
+ float dudx = c * (1.0f + 3.0f * k * xi2);
1340
+ float dgelu = 0.5f * (1.0f + tu) + 0.5f * xi * sech2 * dudx;
1341
+ dx[i] = dh[i] * dgelu;
1342
+ }
1343
+ }
1344
+
1345
+ void tnn_adam_step_scratch(void *sess, int n,
1346
+ double lr, double b1, double b2, double eps,
1347
+ double omc1, double omc2)
1348
+ {
1349
+ if (!sess || n <= 0) return;
1350
+ tnn_session *s = (tnn_session *)sess;
1351
+ int max_slots = TNN_SCRATCH_BYTES / (int)sizeof(float);
1352
+ if (4 * n > max_slots) return;
1353
+
1354
+ float *p = s->scratch + 0;
1355
+ const float *g = s->scratch + n;
1356
+ float *m = s->scratch + 2 * n;
1357
+ float *v = s->scratch + 3 * n;
1358
+
1359
+ const float one_minus_b1 = (float)(1.0 - b1);
1360
+ const float one_minus_b2 = (float)(1.0 - b2);
1361
+ const float fb1 = (float)b1;
1362
+ const float fb2 = (float)b2;
1363
+ const float flr = (float)lr;
1364
+ const float feps = (float)eps;
1365
+ const float fomc1 = (float)omc1;
1366
+ const float fomc2 = (float)omc2;
1367
+
1368
+ for (int i = 0; i < n; ++i) {
1369
+ float gi = g[i];
1370
+ float new_m = fb1 * m[i] + one_minus_b1 * gi;
1371
+ float new_v = fb2 * v[i] + one_minus_b2 * gi * gi;
1372
+ m[i] = new_m;
1373
+ v[i] = new_v;
1374
+ float m_hat = new_m / fomc1;
1375
+ float v_hat = new_v / fomc2;
1376
+ p[i] = p[i] - flr * m_hat / (sqrtf(v_hat) + feps);
1377
+ }
1378
+ }
1379
+
1380
+ void tnn_set_output(void *tensor)
1381
+ {
1382
+ if (!tensor) return;
1383
+ ggml_set_output((struct ggml_tensor *)tensor);
1384
+ }
1385
+
1386
+ /* Sum all elements → scalar. Used to build a loss from a vector
1387
+ * output (e.g. sum(y * y) for an L2 squared loss). */
1388
+ void *tnn_sum(void *sess, void *a)
1389
+ {
1390
+ if (!sess || !a) return NULL;
1391
+ tnn_session *s = (tnn_session *)sess;
1392
+ return (void *)ggml_sum(s->ctx, (struct ggml_tensor *)a);
1393
+ }
1394
+
1395
+ void *tnn_sum_rows(void *sess, void *a)
1396
+ {
1397
+ if (!sess || !a) return NULL;
1398
+ tnn_session *s = (tnn_session *)sess;
1399
+ return (void *)ggml_sum_rows(s->ctx, (struct ggml_tensor *)a);
1400
+ }
1401
+
1402
+ /* Cross-entropy loss against a probability-distribution label tensor.
1403
+ * Wraps ggml_cross_entropy_loss: returns a scalar. The label tensor
1404
+ * has the same shape as the logits and should be a probability dist
1405
+ * (one-hot for hard targets, label-smoothed for soft). Output is the
1406
+ * mean negative log-likelihood across the columns of a (a column =
1407
+ * one example). Used for F1.2 SmolLM2 LoRA fine-tuning. */
1408
+ void *tnn_cross_entropy_loss(void *sess, void *a, void *b)
1409
+ {
1410
+ if (!sess || !a || !b) return NULL;
1411
+ tnn_session *s = (tnn_session *)sess;
1412
+ return (void *)ggml_cross_entropy_loss(s->ctx,
1413
+ (struct ggml_tensor *)a,
1414
+ (struct ggml_tensor *)b);
1415
+ }
1416
+
1417
+ void tnn_set_param(void *tensor)
1418
+ {
1419
+ if (!tensor) return;
1420
+ ggml_set_param((struct ggml_tensor *)tensor);
1421
+ }
1422
+
1423
+ /* Mark a tensor as the training loss. Required for autograd via
1424
+ * ggml_build_backward_expand — it asserts at least one node is marked
1425
+ * as loss and at least one as param. Typically the scalar output of
1426
+ * a sum-reduce or cross-entropy. */
1427
+ void tnn_set_loss(void *tensor)
1428
+ {
1429
+ if (!tensor) return;
1430
+ ggml_set_loss((struct ggml_tensor *)tensor);
1431
+ }
1432
+
1433
+ /* Phase F0.4 autograd: after building a forward graph + marking params
1434
+ * + marking loss, call this to extend the graph with backward nodes.
1435
+ *
1436
+ * Workflow (caller side):
1437
+ * 1. tnn_input_*_persistent(...) for params, mark each with tnn_set_param
1438
+ * 2. Build forward ops (matmul, gelu, ...) ending in a scalar loss
1439
+ * 3. tnn_set_loss(loss_tensor); tnn_set_output(loss_tensor)
1440
+ * 4. tnn_realize(sess, loss_tensor)
1441
+ * 5. tnn_build_backward(sess) ← extends s->graph_b with backward nodes
1442
+ * 6. tnn_compute_backward(sess) ← runs forward+backward
1443
+ * 7. tnn_tensor_grad(param) ← retrieve the gradient tensor
1444
+ *
1445
+ * The backward extends s->graph_b (we keep s->graph as forward-only
1446
+ * for inference use); a freshly-duped copy of s->graph is taken with
1447
+ * grads=true so ggml_build_backward_expand has the slots it needs.
1448
+ * Returns 0 on success, -1 on failure. */
1449
+ /* Split tnn_build_backward into two phases so callers can extend the
1450
+ * graph with optimizer-step nodes between build and alloc. Typical
1451
+ * in-graph-optimizer flow:
1452
+ *
1453
+ * tnn_realize(sess, loss) // forward graph
1454
+ * tnn_build_backward(sess) // dup + build_backward_expand
1455
+ * for each param:
1456
+ * opt_node = tnn_opt_step_adamw(sess, p, grad, m, v, hp)
1457
+ * tnn_extend_backward_graph(sess, opt_node)
1458
+ * tnn_realize_backward(sess) // sched-alloc the final graph
1459
+ * loop:
1460
+ * tnn_compute_backward(sess) // fwd + bwd + adam in one call
1461
+ * read scalar loss; repeat
1462
+ */
1463
+ int tnn_build_backward(void *sess)
1464
+ {
1465
+ if (!sess) return -1;
1466
+ tnn_session *s = (tnn_session *)sess;
1467
+ if (!s->realized) return -2; /* must build forward first */
1468
+
1469
+ /* ggml_build_backward_expand requires cgraph->grads + grad_accs
1470
+ * to be non-NULL, which ggml_new_graph_custom only allocates when
1471
+ * `grads=true`. Our session's graph is created with grads=false
1472
+ * (forward-only). Solve by dup'ing with force_grads=true. The
1473
+ * duped graph SHARES tensor pointers with the original — leaves
1474
+ * and compute nodes alike. */
1475
+ s->graph_b = ggml_graph_dup(s->ctx, s->graph, /*force_grads=*/true);
1476
+ if (!s->graph_b) return -3;
1477
+
1478
+ /* Expand with backward nodes for every node tagged as param. */
1479
+ ggml_build_backward_expand(s->ctx, s->graph_b, NULL);
1480
+ /* Note: NOT allocated yet — caller may extend with opt_step nodes,
1481
+ * then call tnn_realize_backward to finalize the allocation. */
1482
+ return 0;
1483
+ }
1484
+
1485
+ /* Add a node to the backward graph (typically an opt_step output).
1486
+ * Used between tnn_build_backward and tnn_realize_backward. */
1487
+ int tnn_extend_backward_graph(void *sess, void *node)
1488
+ {
1489
+ if (!sess || !node) return -1;
1490
+ tnn_session *s = (tnn_session *)sess;
1491
+ if (!s->graph_b) return -2;
1492
+ ggml_build_forward_expand(s->graph_b, (struct ggml_tensor *)node);
1493
+ return 0;
1494
+ }
1495
+
1496
+ /* Finalize the backward graph allocation. Called once, after all
1497
+ * opt_step nodes have been added. Subsequent compute_backward calls
1498
+ * are cheap re-runs. */
1499
+ int tnn_realize_backward(void *sess)
1500
+ {
1501
+ if (!sess) return -1;
1502
+ tnn_session *s = (tnn_session *)sess;
1503
+ if (!s->graph_b) return -2;
1504
+ int64_t _t = tnn_trace_begin("realize_backward");
1505
+ ggml_backend_sched_reset(s->engine->sched);
1506
+ int ok = ggml_backend_sched_alloc_graph(s->engine->sched, s->graph_b) ? 1 : 0;
1507
+ tnn_trace_end("realize_backward", _t);
1508
+ if (!ok) return -3;
1509
+ s->realized_b = 1;
1510
+ return 0;
1511
+ }
1512
+
1513
+ /* Initialize the backward-graph state: zero all gradient
1514
+ * accumulators + Adam moments (m, v) for any opt_step nodes; set the
1515
+ * loss tensor's incoming gradient to 1.0. Call this ONCE between
1516
+ * tnn_realize_backward and the first tnn_compute_backward. Subsequent
1517
+ * compute calls accumulate normally — momenta persist across steps. */
1518
+ int tnn_graph_reset(void *sess)
1519
+ {
1520
+ if (!sess) return -1;
1521
+ tnn_session *s = (tnn_session *)sess;
1522
+ if (!s->graph_b) return -2;
1523
+ ggml_graph_reset(s->graph_b);
1524
+ return 0;
1525
+ }
1526
+
1527
+ /* F1.2 step 5: zero grad accumulators (and reset loss_grad to 1) but
1528
+ * leave opt_step's m / v momenta alone. Lets AdamW survive across
1529
+ * training steps without losing momentum, while still clearing the
1530
+ * grads between iterations so the next compute_backward recomputes
1531
+ * them from scratch (not accumulates).
1532
+ *
1533
+ * Mirrors ggml_graph_reset minus the GGML_OP_OPT_STEP_ADAMW arm that
1534
+ * zeros src[2] (m) and src[3] (v). For SGD this primitive and
1535
+ * tnn_graph_reset behave identically. For AdamW the difference is
1536
+ * load-bearing: graph_reset would clobber momentum every step. */
1537
+ int tnn_graph_reset_grads_only(void *sess)
1538
+ {
1539
+ if (!sess) return -1;
1540
+ tnn_session *s = (tnn_session *)sess;
1541
+ if (!s->graph_b) return -2;
1542
+ int n_nodes = ggml_graph_n_nodes(s->graph_b);
1543
+ int i = 0;
1544
+ while (i < n_nodes) {
1545
+ struct ggml_tensor * node = ggml_graph_node(s->graph_b, i);
1546
+ struct ggml_tensor * grad_acc = ggml_graph_get_grad_acc(s->graph_b, node);
1547
+ if (grad_acc) {
1548
+ if (node->flags & GGML_TENSOR_FLAG_LOSS) {
1549
+ const float onef = 1.0f;
1550
+ if (grad_acc->buffer) {
1551
+ ggml_backend_tensor_set(grad_acc, &onef, 0, sizeof(float));
1552
+ } else if (grad_acc->data) {
1553
+ *((float *) grad_acc->data) = onef;
1554
+ }
1555
+ } else {
1556
+ ggml_set_zero(grad_acc);
1557
+ }
1558
+ }
1559
+ i++;
1560
+ }
1561
+ return 0;
1562
+ }
1563
+
1564
+ /* Task #70 diagnostic — pin EVERY node in graph_b as an output, so
1565
+ * sched is forbidden from reusing any intermediate's buffer slot
1566
+ * once the node is computed. Used to test the hypothesis that the
1567
+ * CPU/CUDA training divergence is caused by sched aliasing of
1568
+ * intermediate grad tensors in long backward chains. Returns the
1569
+ * number of nodes pinned.
1570
+ *
1571
+ * Call AFTER tnn_build_backward (so the backward nodes exist) but
1572
+ * BEFORE tnn_realize_backward (so the sched sees the output flags
1573
+ * when it allocates buffers).
1574
+ *
1575
+ * This is a diagnostic primitive, NOT a recommended training path —
1576
+ * pinning every node defeats the sched's buffer-reuse optimization
1577
+ * and inflates memory by ~node-count tensors. Use only to localize
1578
+ * sched aliasing as the cause. */
1579
+ int tnn_pin_all_graph_b_nodes(void *sess)
1580
+ {
1581
+ if (!sess) return -1;
1582
+ tnn_session *s = (tnn_session *)sess;
1583
+ if (!s->graph_b) return -2;
1584
+ int n = ggml_graph_n_nodes(s->graph_b);
1585
+ int i = 0;
1586
+ while (i < n) {
1587
+ struct ggml_tensor *t = ggml_graph_node(s->graph_b, i);
1588
+ if (t) ggml_set_output(t);
1589
+ i++;
1590
+ }
1591
+ return n;
1592
+ }
1593
+
1594
+ /* Run the backward graph (forward + backward in one compute call). */
1595
+ int tnn_compute_backward(void *sess)
1596
+ {
1597
+ if (!sess) return -1;
1598
+ tnn_session *s = (tnn_session *)sess;
1599
+ if (!s->realized_b) return -2;
1600
+ int64_t _t = tnn_trace_begin("compute_backward");
1601
+ enum ggml_status rc = ggml_backend_sched_graph_compute(s->engine->sched, s->graph_b);
1602
+ tnn_trace_end("compute_backward", _t);
1603
+ return (rc == GGML_STATUS_SUCCESS) ? 0 : (int)rc;
1604
+ }
1605
+
1606
+ /* Return the gradient tensor for a param. Caller can then read its
1607
+ * data via tnn_download. Returns NULL if no gradient exists (param
1608
+ * wasn't marked, or backward wasn't built/computed). */
1609
+ void *tnn_tensor_grad(void *sess, void *tensor)
1610
+ {
1611
+ if (!sess || !tensor) return NULL;
1612
+ tnn_session *s = (tnn_session *)sess;
1613
+ if (!s->graph_b) return NULL;
1614
+ return (void *)ggml_graph_get_grad(s->graph_b, (struct ggml_tensor *)tensor);
1615
+ }
1616
+
1617
+ void *tnn_input_1d_f32(void *sess, int n)
1618
+ {
1619
+ if (!sess || n <= 0) return NULL;
1620
+ tnn_session *s = (tnn_session *)sess;
1621
+ return (void *)ggml_new_tensor_1d(s->ctx, GGML_TYPE_F32, (int64_t)n);
1622
+ }
1623
+
1624
+ void *tnn_opt_step_adamw(void *sess, void *a, void *grad, void *m, void *v, void *params)
1625
+ {
1626
+ if (!sess || !a || !grad || !m || !v || !params) return NULL;
1627
+ tnn_session *s = (tnn_session *)sess;
1628
+ return (void *)ggml_opt_step_adamw(s->ctx,
1629
+ (struct ggml_tensor *)a,
1630
+ (struct ggml_tensor *)grad,
1631
+ (struct ggml_tensor *)m,
1632
+ (struct ggml_tensor *)v,
1633
+ (struct ggml_tensor *)params);
1634
+ }
1635
+
1636
+ /* SGD step: w = w - alpha * grad - alpha * wd * w. Simpler than Adam,
1637
+ * useful for sanity-checking the autograd gradient direction (no
1638
+ * momentum to obscure things). params is a 1-D 2-element tensor:
1639
+ * [alpha, weight_decay]. */
1640
+ void *tnn_opt_step_sgd(void *sess, void *a, void *grad, void *params)
1641
+ {
1642
+ if (!sess || !a || !grad || !params) return NULL;
1643
+ tnn_session *s = (tnn_session *)sess;
1644
+ return (void *)ggml_opt_step_sgd(s->ctx,
1645
+ (struct ggml_tensor *)a,
1646
+ (struct ggml_tensor *)grad,
1647
+ (struct ggml_tensor *)params);
1648
+ }
1649
+
1650
+ int tnn_realize(void *sess, void *result)
1651
+ {
1652
+ if (!sess || !result) return -1;
1653
+ tnn_session *s = (tnn_session *)sess;
1654
+ if (s->realized) return -2;
1655
+ int64_t _t = tnn_trace_begin("realize");
1656
+ ggml_build_forward_expand(s->graph, (struct ggml_tensor *)result);
1657
+ ggml_backend_sched_reset(s->engine->sched);
1658
+ int ok = ggml_backend_sched_alloc_graph(s->engine->sched, s->graph) ? 1 : 0;
1659
+ tnn_trace_end("realize", _t);
1660
+ if (!ok) return -3;
1661
+ s->realized = 1;
1662
+ s->last_graph = 1;
1663
+ return 0;
1664
+ }
1665
+
1666
+ /* Same as tnn_realize minus the sched-alloc. Training callers use this:
1667
+ * tnn_build_forward_only(sess, loss) → tnn_build_backward(sess) →
1668
+ * (optional tnn_extend_backward_graph for opt_step) → tnn_realize_backward.
1669
+ * The follow-up tnn_realize_backward does the single sched-alloc on the
1670
+ * combined graph_b. Calling tnn_realize THEN tnn_realize_backward is
1671
+ * broken: the sched_reset between the two leaves tensor buffer pointers
1672
+ * stale and the second alloc lands tensors on freed-pool memory (validated
1673
+ * 2026-05-20 with a standalone ggml POC reproducing micro5's failure
1674
+ * byte-for-byte; see docs/design/phase-f1-status.md). */
1675
+ int tnn_build_forward_only(void *sess, void *result)
1676
+ {
1677
+ if (!sess || !result) return -1;
1678
+ tnn_session *s = (tnn_session *)sess;
1679
+ if (s->realized) return -2;
1680
+ ggml_build_forward_expand(s->graph, (struct ggml_tensor *)result);
1681
+ s->realized = 1;
1682
+ s->last_graph = 1;
1683
+ return 0;
1684
+ }
1685
+
1686
/* NOTE: this paragraph documents tnn_add_to_graph, which is defined
 * further down in this file (after tnn_session_set_graph_capacity).
 *
 * Add an extra tensor's compute tree to the graph BEFORE tnn_realize.
 * Use for side-effect ops (ggml_cpy into a view) that aren't reachable
 * from the final result tensor — without this they'd be pruned. The
 * realize-target's tree is appended later by tnn_realize itself. */

/* E2.4 — streaming corpus loader primitives. The description below is
 * written for tnn_read_i32_file; tnn_read_f32_file (defined first) has
 * the same structure but reads float32 values and widens them to double.
 *
 * tnn_read_i32_file reads n_ints int32s from `path` starting at
 * byte_offset (byte-addressed, not token-addressed — the caller computes
 * offset = token_offset * 4). It widens the disk-format i32s to int64 to
 * match Spinel's :int_array ABI (Ruby Integers are 64-bit on this
 * platform).
 * Returns the count of i32s actually read (== n_ints on a full read,
 * < n_ints at EOF), or a negative value on open/seek/alloc failure. */
1697
/* Read up to n_floats float32 values from `path`, starting at
 * byte_offset (byte-addressed — the caller multiplies an element offset
 * by 4), and widen each one to double in dst. dst must have room for
 * n_floats doubles.
 *
 * Returns the number of values actually read (== n_floats on a full
 * read, fewer when EOF is reached first), or a negative code:
 *   -1  NULL path/dst or n_floats <= 0
 *   -2  the file could not be opened
 *   -3  negative byte_offset or seek failure
 *   -4  temporary buffer allocation failed
 *   -5  read error (a short read that is not a plain EOF)
 * dst is left untouched on any negative return. */
int tnn_read_f32_file(const char *path, int byte_offset, int n_floats, double *dst)
{
    if (!path || !dst || n_floats <= 0) return -1;

    FILE *f = fopen(path, "rb");
    if (!f) return -2;

    int rc = 0;
    size_t got = 0;
    float *tmp = NULL;

    /* Reject negative offsets explicitly rather than relying on how the
     * platform's fseek treats them. */
    if (byte_offset < 0 || fseek(f, (long)byte_offset, SEEK_SET) != 0) {
        rc = -3;
        goto out;
    }

    tmp = (float *)malloc((size_t)n_floats * sizeof *tmp);
    if (!tmp) {
        rc = -4;
        goto out;
    }

    got = fread(tmp, sizeof *tmp, (size_t)n_floats, f);
    /* A short read is only an EOF if the stream carries no error flag;
     * otherwise the caller would mistake an I/O failure for end of data. */
    if (got < (size_t)n_floats && ferror(f)) {
        rc = -5;
        goto out;
    }

    for (size_t i = 0; i < got; i++) {
        dst[i] = (double)tmp[i];
    }
    rc = (int)got;

out:
    free(tmp);
    fclose(f);
    return rc;
}
1719
+
1720
/* Read up to n_ints int32 values from `path`, starting at byte_offset
 * (byte-addressed — the caller computes offset = token_offset * 4), and
 * sign-extend each one to int64_t in dst. dst must have room for n_ints
 * int64_t values.
 *
 * Returns the number of values actually read (== n_ints on a full read,
 * fewer when EOF is reached first), or a negative code:
 *   -1  NULL path/dst or n_ints <= 0
 *   -2  the file could not be opened
 *   -3  negative byte_offset or seek failure
 *   -4  temporary buffer allocation failed
 *   -5  read error (a short read that is not a plain EOF)
 * dst is left untouched on any negative return. */
int tnn_read_i32_file(const char *path, int byte_offset, int n_ints, int64_t *dst)
{
    if (!path || !dst || n_ints <= 0) return -1;

    FILE *f = fopen(path, "rb");
    if (!f) return -2;

    int rc = 0;
    size_t got = 0;
    int32_t *tmp = NULL;

    /* Reject negative offsets explicitly rather than relying on how the
     * platform's fseek treats them. */
    if (byte_offset < 0 || fseek(f, (long)byte_offset, SEEK_SET) != 0) {
        rc = -3;
        goto out;
    }

    tmp = (int32_t *)malloc((size_t)n_ints * sizeof *tmp);
    if (!tmp) {
        rc = -4;
        goto out;
    }

    got = fread(tmp, sizeof *tmp, (size_t)n_ints, f);
    /* A short read is only an EOF if the stream carries no error flag;
     * otherwise the caller would mistake an I/O failure for end of data. */
    if (got < (size_t)n_ints && ferror(f)) {
        rc = -5;
        goto out;
    }

    for (size_t i = 0; i < got; i++) {
        dst[i] = (int64_t)tmp[i];
    }
    rc = (int)got;

out:
    free(tmp);
    fclose(f);
    return rc;
}
1742
+
1743
+ /* toy#embed-api (#145) — dequantize-aware single-row read from a
1744
+ * 2-D tensor whose data lives in CPU-readable memory (mmap'd GGUF
1745
+ * pages are the common case). Reads row `row_idx` of `tensor`,
1746
+ * dequantizes via the per-type to_float, and writes d_model doubles
1747
+ * into dst.
1748
+ *
1749
+ * Returns 0 on success, negative on failure:
1750
+ * -1 null arg, -2 bad row_idx, -3 mismatched d_model,
1751
+ * -4 t->data is NULL (GPU-resident; needs download path instead),
1752
+ * -5 type has no to_float (no known dequantizer).
1753
+ *
1754
+ * Use case: Tep's future /v1/embeddings. The mmap'd token_embd table
1755
+ * is CPU-readable regardless of compute backend, so this primitive
1756
+ * works under :cpu, :cuda, and :metal sessions. */
1757
+ int tnn_embed_lookup_to_doubles(void *sess, void *tensor, int row_idx,
1758
+ double *dst, int d_model)
1759
+ {
1760
+ (void)sess; /* not consulted; embed table lives in mmap region */
1761
+ if (!tensor || !dst) return -1;
1762
+ struct ggml_tensor *t = (struct ggml_tensor *)tensor;
1763
+ if (row_idx < 0 || row_idx >= (int)t->ne[1]) return -2;
1764
+ if (d_model != (int)t->ne[0]) return -3;
1765
+ if (!t->data) return -4;
1766
+
1767
+ /* Row offset in bytes: stride along ne[1] is nb[1]. */
1768
+ const uint8_t *src = (const uint8_t *)t->data + (size_t)row_idx * t->nb[1];
1769
+
1770
+ /* F32 needs no dequant; ggml's type_traits.to_float is NULL for it. */
1771
+ if (t->type == GGML_TYPE_F32) {
1772
+ const float *frow = (const float *)src;
1773
+ for (int j = 0; j < d_model; j++) dst[j] = (double)frow[j];
1774
+ return 0;
1775
+ }
1776
+
1777
+ const struct ggml_type_traits *tr = ggml_get_type_traits(t->type);
1778
+ if (!tr || !tr->to_float) return -5;
1779
+
1780
+ /* Dequantize into a float scratch then widen to double. */
1781
+ float *fbuf = (float *)malloc((size_t)d_model * sizeof(float));
1782
+ if (!fbuf) return -6;
1783
+ tr->to_float(src, fbuf, (int64_t)d_model);
1784
+ for (int j = 0; j < d_model; j++) dst[j] = (double)fbuf[j];
1785
+ free(fbuf);
1786
+ return 0;
1787
+ }
1788
+
1789
+ /* GH#17 — re-allocate the session's forward + backward graphs with a
1790
+ * larger node-count budget. Must be called BEFORE realize so the ctx
1791
+ * hasn't yet stored any compute tensors.
1792
+ *
1793
+ * Why this exists: per-head attention decomposition makes node count
1794
+ * scale as O(n_layers × n_heads); the default 65536 cap overflows on
1795
+ * 24L × 16-head Qwen-shape models at backward-expand time. Callers in
1796
+ * realize_for_random_init / _mmap pass a size derived from cfg.
1797
+ *
1798
+ * Implementation: this tears down the compute ctx and re-inits it with
1799
+ * a buffer large enough to hold:
1800
+ * - the forward graph (capacity nodes)
1801
+ * - the backward graph (capacity nodes, with grads → 2× tensor-ptr
1802
+ * arrays + a hash_set sized proportional to capacity)
1803
+ * - rebuild headroom for many decode steps (the original 32 MB slack
1804
+ * served distil-GPT-2 at 10k rebuilds; we keep that slack additive)
1805
+ * The persistent-weights ctx (ctx_w) is untouched. */
1806
+ int tnn_session_set_graph_capacity(void *sess, int capacity)
1807
+ {
1808
+ if (!sess) return -1;
1809
+ tnn_session *s = (tnn_session *)sess;
1810
+ if (capacity <= 0) return -2;
1811
+ if (s->realized) return -3;
1812
+
1813
+ /* Size the buffer so two grad-flagged graphs at this capacity fit
1814
+ * comfortably, plus the original rebuild slack. */
1815
+ size_t graph_bytes = ggml_graph_overhead_custom((size_t)capacity, true);
1816
+ size_t needed = graph_bytes * 2
1817
+ + ggml_tensor_overhead() * 262144 /* preserve original tensor-header slack */
1818
+ + 32 * 1024 * 1024; /* rebuild headroom */
1819
+ if (needed > s->ctx_buf_size) {
1820
+ ggml_free(s->ctx);
1821
+ free(s->ctx_buf);
1822
+ s->ctx_buf_size = needed;
1823
+ s->ctx_buf = (uint8_t *)calloc(1, s->ctx_buf_size);
1824
+ struct ggml_init_params params = {
1825
+ /*.mem_size =*/ s->ctx_buf_size,
1826
+ /*.mem_buffer =*/ s->ctx_buf,
1827
+ /*.no_alloc =*/ true,
1828
+ };
1829
+ s->ctx = ggml_init(params);
1830
+ }
1831
+ s->graph_capacity = capacity;
1832
+ s->graph = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
1833
+ s->graph_b = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
1834
+ return 0;
1835
+ }
1836
+
1837
+ int tnn_add_to_graph(void *sess, void *tensor)
1838
+ {
1839
+ if (!sess || !tensor) return -1;
1840
+ tnn_session *s = (tnn_session *)sess;
1841
+ if (s->realized) return -2;
1842
+ ggml_build_forward_expand(s->graph, (struct ggml_tensor *)tensor);
1843
+ return 0;
1844
+ }
1845
+
1846
+ /* Reset for rebuild: free the compute ctx entirely and start fresh.
1847
+ * The persistent ctx_w + its backend buffer are untouched, so weights
1848
+ * keep their data. Previously this only swapped graphs in the same
1849
+ * ctx — that grew monotonically and overflowed after ~80 decode steps
1850
+ * at gpt2-small + max_T=1024 (each step creates ~1300 new tensor
1851
+ * headers, none get reclaimed). Tearing ctx down per step makes the
1852
+ * per-decode-step compute fully bounded in metadata footprint.
1853
+ *
1854
+ * The scheduler also has internal state tied to tensor pointers; we
1855
+ * reset it before realize, so this is safe. Per decode step:
1856
+ * tnn_reset_for_rebuild(sess)
1857
+ * ... build ops with current pos baked in ...
1858
+ * tnn_realize(sess, result_tensor)
1859
+ * ... upload, compute, download ... */
1860
+ int tnn_reset_for_rebuild(void *sess)
1861
+ {
1862
+ if (!sess) return -1;
1863
+ tnn_session *s = (tnn_session *)sess;
1864
+ /* Profile timing showed that free()+init() of the (now 130-ish MB)
1865
+ * ctx_buf adds ~500 ms per call — dominates compute. So we ONLY
1866
+ * teardown when the ctx is approaching capacity. The (small)
1867
+ * accumulated dead headers between teardowns are bounded by
1868
+ * ctx_used / ctx_buf_size, which we check before each rebuild
1869
+ * via ggml_used_mem.
1870
+ *
1871
+ * Threshold: half the buffer. Headroom ensures the *next* step's
1872
+ * graph build can complete without overflowing. */
1873
+ size_t used = ggml_used_mem(s->ctx);
1874
+ if (used > s->ctx_buf_size / 2) {
1875
+ ggml_free(s->ctx);
1876
+ struct ggml_init_params params = {
1877
+ /*.mem_size =*/ s->ctx_buf_size,
1878
+ /*.mem_buffer =*/ s->ctx_buf,
1879
+ /*.no_alloc =*/ true,
1880
+ };
1881
+ s->ctx = ggml_init(params);
1882
+ s->graph_b = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
1883
+ s->realized_b = 0;
1884
+ }
1885
+ s->realized = 0;
1886
+ s->graph = ggml_new_graph_custom(s->ctx, (size_t)s->graph_capacity, false);
1887
+ s->last_graph = 0;
1888
+ return 0;
1889
+ }
1890
+
1891
+ int tnn_compute(void *sess)
1892
+ {
1893
+ if (!sess) return -1;
1894
+ tnn_session *s = (tnn_session *)sess;
1895
+ if (!s->realized) return -2;
1896
+ int64_t _t = tnn_trace_begin("compute");
1897
+ enum ggml_status rc = ggml_backend_sched_graph_compute(s->engine->sched, s->graph);
1898
+ tnn_trace_end("compute", _t);
1899
+ return (rc == GGML_STATUS_SUCCESS) ? 0 : (int)rc;
1900
+ }
1901
+
1902
+ /* Build a SECONDARY graph (graph_b) in the same session, sharing ctx
1903
+ * and tensors with the primary. Does NOT alloc — call tnn_switch_b
1904
+ * before tnn_compute_b each cycle. */
1905
+ int tnn_realize_b(void *sess, void *result)
1906
+ {
1907
+ if (!sess || !result) return -1;
1908
+ tnn_session *s = (tnn_session *)sess;
1909
+ if (s->realized_b) return -2;
1910
+ ggml_build_forward_expand(s->graph_b, (struct ggml_tensor *)result);
1911
+ s->realized_b = 1;
1912
+ return 0;
1913
+ }
1914
+
1915
+ /* Switch sched allocation to graph_b (or back to graph). Resets the
1916
+ * scheduler then allocates buffer slots for the requested graph's
1917
+ * compute tensors. Persistent tensors (allocated via ctx_w) keep
1918
+ * their stable buffer locations. Compute tensors (h, intermediates)
1919
+ * get fresh slots that may differ from prior cycles -- caller MUST
1920
+ * re-upload any compute inputs before tnn_compute*. */
1921
+ int tnn_switch_b(void *sess)
1922
+ {
1923
+ if (!sess) return -1;
1924
+ tnn_session *s = (tnn_session *)sess;
1925
+ if (!s->realized_b) return -2;
1926
+ ggml_backend_sched_reset(s->engine->sched);
1927
+ if (!ggml_backend_sched_alloc_graph(s->engine->sched, s->graph_b)) return -3;
1928
+ s->last_graph = 2;
1929
+ return 0;
1930
+ }
1931
+
1932
+ int tnn_switch_a(void *sess)
1933
+ {
1934
+ if (!sess) return -1;
1935
+ tnn_session *s = (tnn_session *)sess;
1936
+ if (!s->realized) return -2;
1937
+ ggml_backend_sched_reset(s->engine->sched);
1938
+ if (!ggml_backend_sched_alloc_graph(s->engine->sched, s->graph)) return -3;
1939
+ s->last_graph = 1;
1940
+ return 0;
1941
+ }
1942
+
1943
+ int tnn_compute_b(void *sess)
1944
+ {
1945
+ if (!sess) return -1;
1946
+ tnn_session *s = (tnn_session *)sess;
1947
+ if (!s->realized_b) return -2;
1948
+ enum ggml_status rc = ggml_backend_sched_graph_compute(s->engine->sched, s->graph_b);
1949
+ return (rc == GGML_STATUS_SUCCESS) ? 0 : (int)rc;
1950
+ }
1951
+
1952
+ /* Out-of-range scratch_set used to silently drop writes — a stage+upload
1953
+ * pair operating on a tensor larger than the scratch buffer would
1954
+ * truncate at the boundary and then `tnn_upload` would memcpy past
1955
+ * the scratch end into the next backend buffer. That bug bit
1956
+ * Qwen2.5-0.5B (ffn_gate = 4.36M floats > 4M scratch slots; 17.4 MB
1957
+ * upload past a 16 MiB scratch) and produced NaN logits at L=1 with
1958
+ * no visible error. Now we fprintf a one-line warning the FIRST time
1959
+ * we see an out-of-range write per session — noisy enough to catch
1960
+ * future regressions without spamming the logs. */
1961
+ void tnn_scratch_set(void *sess, int idx, double v)
1962
+ {
1963
+ if (!sess) return;
1964
+ tnn_session *s = (tnn_session *)sess;
1965
+ int max_n = TNN_SCRATCH_BYTES / (int)sizeof(float);
1966
+ if (idx < 0 || idx >= max_n) {
1967
+ if (!s->scratch_overflow_warned) {
1968
+ fprintf(stderr, "[tnn] WARN: tnn_scratch_set idx=%d out of range "
1969
+ "(max=%d, scratch=%d bytes). Subsequent uploads "
1970
+ "from this scratch are corrupt — use a chunked "
1971
+ "uploader (e.g. tnn_upload_transposed_f64).\n",
1972
+ idx, max_n, TNN_SCRATCH_BYTES);
1973
+ s->scratch_overflow_warned = 1;
1974
+ }
1975
+ return;
1976
+ }
1977
+ s->scratch[idx] = (float)v;
1978
+ }
1979
+
1980
+ /* Out-of-range reads used to silently return 0.0 — indistinguishable
1981
+ * from a legitimate zero in the scratch slot. Now we still return 0.0
1982
+ * for backward compatibility, but emit a once-per-session warning so
1983
+ * the failure is visible. Callers that need the legitimate zero/OOR
1984
+ * distinction should check bounds themselves. */
1985
+ double tnn_scratch_get(void *sess, int idx)
1986
+ {
1987
+ if (!sess) return 0.0;
1988
+ tnn_session *s = (tnn_session *)sess;
1989
+ int max_n = TNN_SCRATCH_BYTES / (int)sizeof(float);
1990
+ if (idx < 0 || idx >= max_n) {
1991
+ if (!s->scratch_overflow_warned) {
1992
+ fprintf(stderr, "[tnn] WARN: tnn_scratch_get idx=%d out of range "
1993
+ "(max=%d). Returning 0.0 — but this is now a "
1994
+ "silent zero, not a real one. Check your indexing.\n",
1995
+ idx, max_n);
1996
+ s->scratch_overflow_warned = 1;
1997
+ }
1998
+ return 0.0;
1999
+ }
2000
+ return (double)s->scratch[idx];
2001
+ }
2002
+
2003
+ /* The scratch buffer is just bytes; we let i32 values share it. Caller
2004
+ * must not mix i32 + f32 writes within a single tensor's upload window.
2005
+ * Same overflow warning as tnn_scratch_set — once-per-session fprintf. */
2006
+ void tnn_scratch_set_i32(void *sess, int idx, int value)
2007
+ {
2008
+ if (!sess) return;
2009
+ tnn_session *s = (tnn_session *)sess;
2010
+ int max_n = TNN_SCRATCH_BYTES / (int)sizeof(int32_t);
2011
+ if (idx < 0 || idx >= max_n) {
2012
+ if (!s->scratch_overflow_warned) {
2013
+ fprintf(stderr, "[tnn] WARN: tnn_scratch_set_i32 idx=%d out of "
2014
+ "range (max=%d). Use a chunked uploader.\n",
2015
+ idx, max_n);
2016
+ s->scratch_overflow_warned = 1;
2017
+ }
2018
+ return;
2019
+ }
2020
+ ((int32_t *)s->scratch)[idx] = (int32_t)value;
2021
+ }
2022
+
2023
+ int tnn_scratch_get_i32(void *sess, int idx)
2024
+ {
2025
+ if (!sess) return 0;
2026
+ tnn_session *s = (tnn_session *)sess;
2027
+ int max_n = TNN_SCRATCH_BYTES / (int)sizeof(int32_t);
2028
+ if (idx < 0 || idx >= max_n) {
2029
+ if (!s->scratch_overflow_warned) {
2030
+ fprintf(stderr, "[tnn] WARN: tnn_scratch_get_i32 idx=%d out of "
2031
+ "range (max=%d). Returning 0 — but this is a "
2032
+ "silent zero, not a real one.\n",
2033
+ idx, max_n);
2034
+ s->scratch_overflow_warned = 1;
2035
+ }
2036
+ return 0;
2037
+ }
2038
+ return (int)((int32_t *)s->scratch)[idx];
2039
+ }
2040
+
2041
+ /* Bounds-checked upload: tensor must fit in the 16 MiB scratch. Larger
2042
+ * tensors caused the silent UB that produced NaN logits at L=1 on
2043
+ * Qwen2.5-0.5B (ffn_gate = 17.4 MB > 16 MB scratch); the memcpy past
2044
+ * the scratch end overwrote adjacent heap. Use chunked uploaders for
2045
+ * anything that might be large:
2046
+ * - tnn_upload_from_float_array (chunked f32 upload)
2047
+ * - tnn_upload_transposed_f64 (chunked transposed f64 upload)
2048
+ * Returns 0 on success, -1 on null sess/tensor, -2 on size overflow. */
2049
+ int tnn_upload(void *sess, void *tensor)
2050
+ {
2051
+ if (!sess || !tensor) return -1;
2052
+ tnn_session *s = (tnn_session *)sess;
2053
+ struct ggml_tensor *t = (struct ggml_tensor *)tensor;
2054
+ size_t nbytes = ggml_nbytes(t);
2055
+ if (nbytes > (size_t)TNN_SCRATCH_BYTES) {
2056
+ if (!s->scratch_overflow_warned) {
2057
+ fprintf(stderr, "[tnn] WARN: tnn_upload tensor=%zu bytes exceeds "
2058
+ "scratch=%d bytes. Skipping upload (was: silent UB). "
2059
+ "Use tnn_upload_from_float_array or "
2060
+ "tnn_upload_transposed_f64 for tensors > 16 MiB.\n",
2061
+ nbytes, TNN_SCRATCH_BYTES);
2062
+ s->scratch_overflow_warned = 1;
2063
+ }
2064
+ return -2;
2065
+ }
2066
+ int64_t _t = tnn_trace_begin("upload");
2067
+ ggml_backend_tensor_set(t, s->scratch, 0, nbytes);
2068
+ tnn_trace_end("upload", _t);
2069
+ return 0;
2070
+ }
2071
+
2072
+ /* Same bounds check as tnn_upload — a download into an oversized
2073
+ * tensor would memcpy past the scratch end into adjacent heap. */
2074
+ int tnn_download(void *sess, void *tensor)
2075
+ {
2076
+ if (!sess || !tensor) return -1;
2077
+ tnn_session *s = (tnn_session *)sess;
2078
+ struct ggml_tensor *t = (struct ggml_tensor *)tensor;
2079
+ size_t nbytes = ggml_nbytes(t);
2080
+ if (nbytes > (size_t)TNN_SCRATCH_BYTES) {
2081
+ if (!s->scratch_overflow_warned) {
2082
+ fprintf(stderr, "[tnn] WARN: tnn_download tensor=%zu bytes exceeds "
2083
+ "scratch=%d bytes. Skipping download (was: silent UB). "
2084
+ "Use tnn_download_to_f64_array for tensors > 16 MiB.\n",
2085
+ nbytes, TNN_SCRATCH_BYTES);
2086
+ s->scratch_overflow_warned = 1;
2087
+ }
2088
+ return -2;
2089
+ }
2090
+ int64_t _t = tnn_trace_begin("download");
2091
+ ggml_backend_tensor_get(t, s->scratch, 0, nbytes);
2092
+ tnn_trace_end("download", _t);
2093
+ return 0;
2094
+ }
2095
+
2096
+ /* Transpose-and-upload a row-major f64 Mat into a ggml f32 tensor of
2097
+ * shape ne=[br, bc] in chunked passes — so it works for tensors larger
2098
+ * than the 16 MiB scratch buffer.
2099
+ *
2100
+ * Source layout: src[i*bc + j] = (i, j) of an (br × bc) row-major Mat.
2101
+ * Destination ggml layout: T[ne0=k0, ne1=k1] at byte offset k1*br + k0
2102
+ * (in float positions). We want T[i, j] = src[i, j] (transpose semantics
2103
+ * is in the *consumer* — ggml_mul_mat treats (br, bc) as (K, M) where
2104
+ * the K axis is contracted; we get B^T · h that way).
2105
+ *
2106
+ * Chunking: pick `cols_per_chunk` ≤ scratch_slots / br. For each chunk
2107
+ * [j_start, j_end) of columns: stage src[i, j] → scratch[(j - j_start)*br + i]
2108
+ * for i ∈ [0, br) and j ∈ [j_start, j_end). Then upload that contiguous
2109
+ * slice into the tensor at byte offset j_start*br*sizeof(float).
2110
+ *
2111
+ * Same shape as tnn_upload_from_float_array's chunking, but for the
2112
+ * transposed-input case used by stage_transposed_and_upload. Fixes the
2113
+ * scratch-overflow bug that produced garbage uploads for Qwen's
2114
+ * ffn_gate / ffn_up / ffn_down (each ~17 MB, scratch is 16 MB). */
2115
+ int tnn_upload_transposed_f64(void *sess, void *tensor,
2116
+ const double *src, int br, int bc)
2117
+ {
2118
+ if (!sess || !tensor || !src || br <= 0 || bc <= 0) return -1;
2119
+ tnn_session *s = (tnn_session *)sess;
2120
+ struct ggml_tensor *t = (struct ggml_tensor *)tensor;
2121
+
2122
+ size_t expected_bytes = (size_t)br * (size_t)bc * sizeof(float);
2123
+ if (expected_bytes > ggml_nbytes(t)) return -2;
2124
+
2125
+ const int max_slots = TNN_SCRATCH_BYTES / (int)sizeof(float);
2126
+ int cols_per_chunk = max_slots / br;
2127
+ if (cols_per_chunk <= 0) return -3; /* br > scratch — wider than ~4M */
2128
+
2129
+ int j_start = 0;
2130
+ while (j_start < bc) {
2131
+ int j_end = j_start + cols_per_chunk;
2132
+ if (j_end > bc) j_end = bc;
2133
+
2134
+ int j = j_start;
2135
+ while (j < j_end) {
2136
+ int i = 0;
2137
+ const double *src_row_base = src + (size_t)j;
2138
+ float *dst_col = s->scratch + (size_t)(j - j_start) * (size_t)br;
2139
+ while (i < br) {
2140
+ dst_col[i] = (float)src_row_base[(size_t)i * (size_t)bc];
2141
+ i++;
2142
+ }
2143
+ j++;
2144
+ }
2145
+
2146
+ size_t byte_off = (size_t)j_start * (size_t)br * sizeof(float);
2147
+ size_t byte_len = (size_t)(j_end - j_start) * (size_t)br * sizeof(float);
2148
+ ggml_backend_tensor_set(t, s->scratch, byte_off, byte_len);
2149
+
2150
+ j_start = j_end;
2151
+ }
2152
+ return 0;
2153
+ }
2154
+
2155
+ int tnn_upload_from_float_array(void *sess, void *tensor, const double *data, size_t n)
2156
+ {
2157
+ if (!sess || !tensor || !data) return -1;
2158
+ int64_t _trace = tnn_trace_begin("upload_from_float_array");
2159
+ tnn_session *s = (tnn_session *)sess;
2160
+ struct ggml_tensor *t = (struct ggml_tensor *)tensor;
2161
+ const size_t chunk_floats = TNN_SCRATCH_BYTES / sizeof(float);
2162
+
2163
+ /* Chunked f64 → f32 conversion into scratch, then ggml_backend_tensor_set
2164
+ * per chunk at the right byte offset. Lets us upload tensors larger
2165
+ * than scratch (e.g. distilgpt2's 38.6 M-element token_embd) without
2166
+ * growing the scratch buffer for everyone. */
2167
+ size_t off = 0;
2168
+ while (off < n) {
2169
+ size_t this_chunk = (n - off) < chunk_floats ? (n - off) : chunk_floats;
2170
+ for (size_t i = 0; i < this_chunk; ++i) {
2171
+ s->scratch[i] = (float)data[off + i];
2172
+ }
2173
+ ggml_backend_tensor_set(t, s->scratch,
2174
+ off * sizeof(float),
2175
+ this_chunk * sizeof(float));
2176
+ off += this_chunk;
2177
+ }
2178
+ tnn_trace_end("upload_from_float_array", _trace);
2179
+ return 0;
2180
+ }
2181
+
2182
+ /* Mirror of tnn_upload_from_float_array: read a tensor's f32 contents
2183
+ * back into a host f64 buffer in scratch-sized chunks. Enables full
2184
+ * Mat-roundtrip on weights loaded via the direct GGUF→FFI path —
2185
+ * required by the user-stated rule that the API mustn't paint into
2186
+ * an inference-only corner. */
2187
+ int tnn_download_to_f64_array(void *sess, void *tensor, double *dst, size_t n)
2188
+ {
2189
+ if (!sess || !tensor || !dst) return -1;
2190
+ tnn_session *s = (tnn_session *)sess;
2191
+ struct ggml_tensor *t = (struct ggml_tensor *)tensor;
2192
+ size_t available = ggml_nelements(t);
2193
+ if (n > available) return -2;
2194
+
2195
+ const size_t chunk_floats = TNN_SCRATCH_BYTES / sizeof(float);
2196
+ size_t off = 0;
2197
+ while (off < n) {
2198
+ size_t this_chunk = (n - off) < chunk_floats ? (n - off) : chunk_floats;
2199
+ ggml_backend_tensor_get(t, s->scratch,
2200
+ off * sizeof(float),
2201
+ this_chunk * sizeof(float));
2202
+ for (size_t i = 0; i < this_chunk; ++i) {
2203
+ dst[off + i] = (double)s->scratch[i];
2204
+ }
2205
+ off += this_chunk;
2206
+ }
2207
+ return 0;
2208
+ }
2209
+
2210
+ int tnn_upload_from_int_array(void *sess, void *tensor, const long *data, size_t n)
2211
+ {
2212
+ if (!sess || !tensor || !data) return -1;
2213
+ tnn_session *s = (tnn_session *)sess;
2214
+ struct ggml_tensor *t = (struct ggml_tensor *)tensor;
2215
+ size_t max_n = TNN_SCRATCH_BYTES / sizeof(int32_t);
2216
+ if (n > max_n) return -2;
2217
+
2218
+ int64_t _trace = tnn_trace_begin("upload_from_int_array");
2219
+ int32_t *dst = (int32_t *)s->scratch;
2220
+ /* i64 → i32 narrowing. Spinel's :int_array is `const int64_t *`; ggml's
2221
+ * GGML_TYPE_I32 row-index tensors are 32-bit. Caller responsibility
2222
+ * not to pass out-of-range indices (vocab fits easily in int32). */
2223
+ for (size_t i = 0; i < n; ++i) dst[i] = (int32_t)data[i];
2224
+
2225
+ ggml_backend_tensor_set(t, dst, 0, n * sizeof(int32_t));
2226
+ tnn_trace_end("upload_from_int_array", _trace);
2227
+ return 0;
2228
+ }
2229
+
2230
+ /* Scratch-buffer stats. Caller has just done tnn_download(sess, t)
2231
+ * which copied a tensor's f32 contents into the session's scratch
2232
+ * buffer. These helpers reduce over the first `n` floats without
2233
+ * crossing the FFI boundary per element — one Ruby↔C call per stat,
2234
+ * O(n) in C. Used by the trace-tap diagnostic path; not on any
2235
+ * production hot path. */
2236
+ double tnn_scratch_min_f32(void *sess, int n)
2237
+ {
2238
+ if (!sess || n <= 0) return 0.0;
2239
+ tnn_session *s = (tnn_session *)sess;
2240
+ float mn = s->scratch[0];
2241
+ int i = 1;
2242
+ while (i < n) { if (s->scratch[i] < mn) mn = s->scratch[i]; i++; }
2243
+ return (double)mn;
2244
+ }
2245
+
2246
+ double tnn_scratch_max_f32(void *sess, int n)
2247
+ {
2248
+ if (!sess || n <= 0) return 0.0;
2249
+ tnn_session *s = (tnn_session *)sess;
2250
+ float mx = s->scratch[0];
2251
+ int i = 1;
2252
+ while (i < n) { if (s->scratch[i] > mx) mx = s->scratch[i]; i++; }
2253
+ return (double)mx;
2254
+ }
2255
+
2256
+ double tnn_scratch_sum_abs_f32(void *sess, int n)
2257
+ {
2258
+ if (!sess || n <= 0) return 0.0;
2259
+ tnn_session *s = (tnn_session *)sess;
2260
+ double sum = 0.0;
2261
+ int i = 0;
2262
+ while (i < n) {
2263
+ float v = s->scratch[i];
2264
+ sum += v < 0.0f ? -(double)v : (double)v;
2265
+ i++;
2266
+ }
2267
+ return sum;
2268
+ }
2269
+
2270
+ /* Sum of squares; for L2 norm take sqrt() on the Ruby side. */
2271
+ double tnn_scratch_sum_sq_f32(void *sess, int n)
2272
+ {
2273
+ if (!sess || n <= 0) return 0.0;
2274
+ tnn_session *s = (tnn_session *)sess;
2275
+ double sum = 0.0;
2276
+ int i = 0;
2277
+ while (i < n) {
2278
+ double v = (double)s->scratch[i];
2279
+ sum += v * v;
2280
+ i++;
2281
+ }
2282
+ return sum;
2283
+ }
2284
+
2285
+ /* Plain sum (for mean = sum/n). */
2286
+ double tnn_scratch_sum_f32(void *sess, int n)
2287
+ {
2288
+ if (!sess || n <= 0) return 0.0;
2289
+ tnn_session *s = (tnn_session *)sess;
2290
+ double sum = 0.0;
2291
+ int i = 0;
2292
+ while (i < n) {
2293
+ sum += (double)s->scratch[i];
2294
+ i++;
2295
+ }
2296
+ return sum;
2297
+ }
2298
+
2299
+ /* Count of NaN-or-inf elements. NaN comparison: v != v is true iff NaN.
2300
+ * Inf: abs(v) > 1e30 is conservative (real f32 inf is 3.4e38). */
2301
+ int tnn_scratch_nan_count_f32(void *sess, int n)
2302
+ {
2303
+ if (!sess || n <= 0) return 0;
2304
+ tnn_session *s = (tnn_session *)sess;
2305
+ int c = 0;
2306
+ int i = 0;
2307
+ while (i < n) {
2308
+ float v = s->scratch[i];
2309
+ float av = v < 0.0f ? -v : v;
2310
+ if (v != v || av > 1.0e30f) c++;
2311
+ i++;
2312
+ }
2313
+ return c;
2314
+ }
2315
+
2316
+ int tnn_tensor_ne0(void *t) { return t ? (int)((struct ggml_tensor *)t)->ne[0] : 0; }
2317
+ int tnn_tensor_ne1(void *t) { return t ? (int)((struct ggml_tensor *)t)->ne[1] : 0; }
2318
+ int tnn_tensor_ne2(void *t) { return t ? (int)((struct ggml_tensor *)t)->ne[2] : 0; }
2319
+ int tnn_tensor_ne3(void *t) { return t ? (int)((struct ggml_tensor *)t)->ne[3] : 0; }
2320
+ size_t tnn_tensor_nbytes(void *t) { return t ? ggml_nbytes((struct ggml_tensor *)t) : 0; }
2321
+ int tnn_tensor_nelements(void *t) { return t ? (int)ggml_nelements((struct ggml_tensor *)t) : 0; }
2322
+
2323
+ /* Introspection primitives for kv.describe_flow (tao#kv-describe-flow).
2324
+ * All are read-only walks over the built compute graph + leaf set;
2325
+ * cheap enough to invoke ad-hoc after a graph has been realized. */
2326
+ const char *tnn_tensor_name(void *t) {
2327
+ return t ? ((struct ggml_tensor *)t)->name : "";
2328
+ }
2329
+ /* ggml_type enum value: 0=F32, 8=Q8_0, etc. See vendor/ggml/include/ggml.h. */
2330
+ int tnn_tensor_dtype(void *t) {
2331
+ return t ? (int)((struct ggml_tensor *)t)->type : 0;
2332
+ }
2333
+ /* Bitmask of GGML_TENSOR_FLAG_INPUT(1) | OUTPUT(2) | PARAM(4) | LOSS(8) | COMPUTE(16). */
2334
+ int tnn_tensor_flags(void *t) {
2335
+ return t ? (int)((struct ggml_tensor *)t)->flags : 0;
2336
+ }
2337
+ /* Op id (ggml_op enum) — 0=NONE, then MUL_MAT, ADD, …. Useful to label
2338
+ * compute nodes by their op kind in the description. */
2339
+ int tnn_tensor_op(void *t) {
2340
+ return t ? (int)((struct ggml_tensor *)t)->op : 0;
2341
+ }
2342
+ const char *tnn_tensor_op_name(void *t) {
2343
+ if (!t) return "";
2344
+ return ggml_op_name(((struct ggml_tensor *)t)->op);
2345
+ }
2346
+ /* Source-tensor pointers for an op node: src[0]..src[N]. Returns NULL
2347
+ * past the last source. ggml caps at GGML_MAX_SRC=10 — typical ops
2348
+ * use 2 srcs, opt_step_adamw uses 5, no current op uses more than 10. */
2349
+ void *tnn_tensor_src(void *t, int i) {
2350
+ if (!t || i < 0 || i >= GGML_MAX_SRC) return NULL;
2351
+ return (void *)((struct ggml_tensor *)t)->src[i];
2352
+ }
2353
+
2354
+ /* Graph walk: number of compute nodes, indexed accessor. Walks the
2355
+ * primary graph (graph_a) — the one populated by tnn_build_forward_only
2356
+ * or tnn_realize. Use tnn_graph_b_n_nodes / tnn_graph_b_node for the
2357
+ * backward graph when needed. */
2358
+ int tnn_graph_n_nodes(void *sess) {
2359
+ if (!sess) return 0;
2360
+ tnn_session *s = (tnn_session *)sess;
2361
+ if (!s->graph) return 0;
2362
+ return ggml_graph_n_nodes(s->graph);
2363
+ }
2364
+ void *tnn_graph_node(void *sess, int i) {
2365
+ if (!sess || i < 0) return NULL;
2366
+ tnn_session *s = (tnn_session *)sess;
2367
+ if (!s->graph) return NULL;
2368
+ if (i >= ggml_graph_n_nodes(s->graph)) return NULL;
2369
+ return (void *)ggml_graph_node(s->graph, i);
2370
+ }
2371
+ /* No tnn_graph_n_leafs / tnn_graph_leaf: ggml's cgraph leafs[] is
2372
+ * private (no public accessor). The describe_flow walker discovers
2373
+ * leaves from the Ruby side by scanning node srcs that aren't
2374
+ * themselves nodes — same set, just computed differently. */
2375
+
2376
+ /* tao#gguf-checkpoint-writer thin wrappers over ggml's gguf writer
2377
+ * API. The lifecycle is:
2378
+ * ctx = tnn_gguf_w_init()
2379
+ * tnn_gguf_w_set_str/u32/f32(ctx, key, value) — metadata
2380
+ * tnn_tensor_set_name(t, "...") — name each param
2381
+ * tnn_gguf_w_add_tensor(ctx, t) — record + data ptr
2382
+ * tnn_gguf_w_finalize(ctx, path) — fsync + close
2383
+ * tnn_gguf_w_free(ctx)
2384
+ *
2385
+ * `tnn_gguf_w_add_tensor` reads the tensor's `data` field; for CPU
2386
+ * backend that's the host pointer in the persistent backend buffer.
2387
+ * For CUDA backend a download step (not implemented here — see
2388
+ * toy#gguf-checkpoint-writer-cuda) would be required. */
2389
+
2390
/* Assign `name` to a tensor via ggml_set_name. No-op when either argument
 * is NULL. */
void tnn_tensor_set_name(void *t, const char *name)
{
    struct ggml_tensor *tensor = (struct ggml_tensor *)t;
    if (tensor != NULL && name != NULL) {
        ggml_set_name(tensor, name);
    }
}
2394
+
2395
/* Create an empty gguf writer context (gguf_init_empty). The handle is
 * passed to the tnn_gguf_w_* calls below and released with
 * tnn_gguf_w_free. */
void *tnn_gguf_w_init(void)
{
    struct gguf_context *writer = gguf_init_empty();
    return (void *)writer;
}

/* Set a string metadata value. No-op if any argument is NULL. */
void tnn_gguf_w_set_str(void *ctx, const char *key, const char *val)
{
    struct gguf_context *writer = (struct gguf_context *)ctx;
    if (writer == NULL || key == NULL || val == NULL) {
        return;
    }
    gguf_set_val_str(writer, key, val);
}

/* Set a u32 metadata value; `val` is converted to uint32_t. No-op if ctx
 * or key is NULL. */
void tnn_gguf_w_set_u32(void *ctx, const char *key, int val)
{
    struct gguf_context *writer = (struct gguf_context *)ctx;
    if (writer == NULL || key == NULL) {
        return;
    }
    gguf_set_val_u32(writer, key, (uint32_t)val);
}

/* Set an f32 metadata value; `val` is narrowed to float. No-op if ctx or
 * key is NULL. */
void tnn_gguf_w_set_f32(void *ctx, const char *key, double val)
{
    struct gguf_context *writer = (struct gguf_context *)ctx;
    if (writer == NULL || key == NULL) {
        return;
    }
    gguf_set_val_f32(writer, key, (float)val);
}

/* Set a bool metadata value; any non-zero `val` is stored as true. No-op
 * if ctx or key is NULL. */
void tnn_gguf_w_set_bool(void *ctx, const char *key, int val)
{
    struct gguf_context *writer = (struct gguf_context *)ctx;
    if (writer == NULL || key == NULL) {
        return;
    }
    gguf_set_val_bool(writer, key, val != 0);
}

/* Register tensor `t` with the writer (gguf_add_tensor). No-op if either
 * argument is NULL. */
void tnn_gguf_w_add_tensor(void *ctx, void *t)
{
    struct gguf_context *writer = (struct gguf_context *)ctx;
    const struct ggml_tensor *tensor = (const struct ggml_tensor *)t;
    if (writer == NULL || tensor == NULL) {
        return;
    }
    gguf_add_tensor(writer, tensor);
}

/* Write metadata and tensor data to `path` (only_meta = false).
 * Returns 0 on success, -1 on null args, -2 on file write failure. */
int tnn_gguf_w_finalize(void *ctx, const char *path)
{
    const struct gguf_context *writer = (const struct gguf_context *)ctx;
    if (writer == NULL || path == NULL) {
        return -1;
    }
    if (gguf_write_to_file(writer, path, /*only_meta=*/ false)) {
        return 0;
    }
    return -2;
}

/* Release a writer context. No-op for NULL. */
void tnn_gguf_w_free(void *ctx)
{
    struct gguf_context *writer = (struct gguf_context *)ctx;
    if (writer != NULL) {
        gguf_free(writer);
    }
}
2437
+
2438
/* Atomic symlink replace (sym_path → target). Used by the checkpoint
 * writer to maintain `weights/latest`.
 *
 * The new link is first created under a temporary name next to sym_path
 * ("<sym_path>.tmp.<pid>") and then moved into place with rename(), which
 * replaces an existing destination atomically. Readers therefore always
 * see either the old link or the new one, never a missing link.
 *
 * Returns 0 on success, -1 on failure (NULL argument, temporary path too
 * long, or symlink()/rename() failing). On failure an existing sym_path
 * is left untouched. */
int tnn_filesystem_symlink(const char *target, const char *sym_path) {
    if (!target || !sym_path) return -1;

    char tmp_path[4096];
    int len = snprintf(tmp_path, sizeof tmp_path, "%s.tmp.%ld",
                       sym_path, (long)getpid());
    if (len < 0 || (size_t)len >= sizeof tmp_path) return -1;

    /* A leftover temp link from a crashed run would make symlink() fail
     * with EEXIST; ENOENT here is the normal case and is ignored. */
    unlink(tmp_path);
    if (symlink(target, tmp_path) != 0) return -1;

    if (rename(tmp_path, sym_path) != 0) {
        unlink(tmp_path);   /* do not leave the temp link behind */
        return -1;
    }
    return 0;
}
2449
+
2450
/* Create directory `path` (mode 0755) if it is missing.
 *
 * Single-level only: the caller is responsible for parent directories
 * (typically TAO_RUN_DIR already exists because Tao created it).
 *
 * Returns 0 when the directory was created or already exists as a
 * directory; -1 on a NULL path, on any mkdir failure other than EEXIST,
 * or when `path` exists but is not a directory. */
int tnn_filesystem_mkdir(const char *path) {
    if (!path) return -1;
    if (mkdir(path, 0755) == 0) return 0;
    if (errno != EEXIST) return -1;

    /* EEXIST is also reported when a regular file occupies the name;
     * treat only a real directory as "already there". */
    struct stat st;
    if (stat(path, &st) != 0) return -1;
    return S_ISDIR(st.st_mode) ? 0 : -1;
}