local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626)
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,417 @@
1
+ //------------------------------------------------------------------------------
2
+ // This file contains kernels for data conversion.
3
+ // These kernels are used when loading the model, so their performance is less
4
+ // important.
5
+ //------------------------------------------------------------------------------
6
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
7
+
8
+ #ifdef cl_intel_required_subgroup_size
9
+ #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
10
+ #define INTEL_GPU 1
11
+ #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
12
+ #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
13
+ #elif defined(cl_qcom_reqd_sub_group_size)
14
+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
15
+ #define ADRENO_GPU 1
16
+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
17
+ #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
18
+ #endif
19
+
20
+ #define QK4_0 32
21
+ #define QR4_0 2
22
+ #define QK4_1 32
23
+ #define QR4_1 2
24
+ #define QK5_0 32
25
+ #define QR5_0 2
26
+ #define QK5_1 32
27
+ #define QR5_1 2
28
+ #define QK8_0 32
29
+ #define QR8_0 1
30
+ #define QK_K 256
31
+ #define K_QUANTS_PER_ITERATION 2
32
+
33
+ typedef char int8_t;
34
+ typedef uchar uint8_t;
35
+ typedef short int16_t;
36
+ typedef ushort uint16_t;
37
+ typedef int int32_t;
38
+ typedef uint uint32_t;
39
+
40
+ //------------------------------------------------------------------------------
41
+ // block_q4_0
42
+ //------------------------------------------------------------------------------
43
+ struct block_q4_0 // array-of-structs layout of one q4_0 block as read/written by the kernels below
44
+ {
45
+ half d; // fp16 per-block value; presumably the delta/scale, as in block_q4_1 - confirm
46
+ uint8_t qs[QK4_0 / 2]; // QK4_0/2 = 16 bytes of packed quants (the noshuffle kernels treat each byte as two nibbles)
47
+ };
48
+
49
+ //------------------------------------------------------------------------------
50
+ // block_q4_1
51
+ //------------------------------------------------------------------------------
52
+ struct block_q4_1 { // array-of-structs layout of one q4_1 block
53
+ half d; // delta (fp16)
54
+ half m; // min (fp16)
55
+ uchar qs[QK4_1 / 2]; // nibbles / quants: QK4_1/2 = 16 bytes per block
56
+ };
57
+
58
+ //------------------------------------------------------------------------------
59
+ // block_q6_K
60
+ //------------------------------------------------------------------------------
61
+ struct block_q6_K { // array-of-structs layout of one q6_K super block (QK_K = 256)
62
+ uint8_t ql[QK_K/2]; // quants, lower 4 bits (QK_K/2 = 128 bytes)
63
+ uint8_t qh[QK_K/4]; // quants, upper 2 bits (QK_K/4 = 64 bytes)
64
+ int8_t scales[QK_K/16]; // scales, quantized with 8 bits (QK_K/16 = 16 entries)
65
+ half d; // super-block scale (fp16)
66
+ };
67
+
68
+ //------------------------------------------------------------------------------
69
+ // kernel_convert_block_q4_0
70
+ // Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
71
+ // This kernel does not deshuffle the bits.
72
+ //------------------------------------------------------------------------------
73
// One work-item per block: work-item `blk` splits src0[blk] into its fp16
// value (written to dst_d[blk]) and its QK4_0/2 packed quant bytes (written
// contiguously to dst_q starting at blk * QK4_0/2). Bytes are copied verbatim.
kernel void kernel_convert_block_q4_0(
    global struct block_q4_0 * src0,
    global uchar * dst_q,
    global half  * dst_d
) {
    const size_t blk    = get_global_id(0);
    const size_t q_base = (QK4_0/2) * blk;

    dst_d[blk] = src0[blk].d;

    for (int k = 0; k < QK4_0/2; ++k) {
        dst_q[q_base + k] = src0[blk].qs[k];
    }
}
88
+
89
// Rebuild block_q4_0 structs from the flattened arrays (SOA -> AOS).
// Work-item `blk` reads src_d[blk] and the QK4_0/2 bytes of src_q starting at
// blk * QK4_0/2, and stores them unchanged into dst[blk].
kernel void kernel_restore_block_q4_0(
    global uchar * src_q,
    global half  * src_d,
    global struct block_q4_0 * dst
) {
    const size_t blk    = get_global_id(0);
    const size_t q_base = (QK4_0/2) * blk;

    dst[blk].d = src_d[blk];

    for (int k = 0; k < QK4_0/2; ++k) {
        dst[blk].qs[k] = src_q[q_base + k];
    }
}
103
+
104
+ //------------------------------------------------------------------------------
105
+ // kernel_convert_block_q4_0_noshuffle
106
+ // Flatten q4_0 weights and unshuffle the bits
107
+ //------------------------------------------------------------------------------
108
+
109
+ kernel void kernel_convert_block_q4_0_noshuffle( // one work-item per block_q4_0; splits it into dst_d / dst_q and regroups nibbles
110
+ global struct block_q4_0 * src0,
111
+ global uchar * dst_q,
112
+ global half * dst_d
113
+ ) {
114
+ global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0); // this work-item's source block
115
+ global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0); // this block's QK4_0/2 output bytes
116
+ global half * d = (global half *) dst_d + get_global_id(0); // this block's output fp16 slot
117
+
118
+ *d = b->d;
119
+ for (int i = 0; i < QK4_0/4; ++i) { // each iteration consumes two source bytes and emits two output bytes
120
+ uchar x0 = b->qs[2*i + 0];
121
+ uchar x1 = b->qs[2*i + 1];
122
+
123
+ q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); // first half of q: low nibbles of x0 (bits 0-3) and x1 (bits 4-7)
124
+ q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); // second half of q: high nibbles of x0 (bits 0-3) and x1 (bits 4-7)
125
+
126
+ #ifdef ADRENO_GPU
127
+ // Workaround for adreno - must have the following printf statement for
128
+ // the kernel to work properly. Otherwise it produces incorrect result.
129
+ // convert_uchar above also seems necessary.
130
+ // Compare against a large number so that it does not print anything.
131
+ // get_sub_group_local_id() also works.
132
+ if (get_global_id(0) == 65536*4096) { // 65536*4096 = 2^28; do not simplify or remove this block (see workaround note above)
133
+ printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
134
+ }
135
+ #endif
136
+ }
137
+ }
138
+
139
+ kernel void kernel_restore_block_q4_0_noshuffle( // one work-item per block; rebuilds dst[gid] from the arrays written by kernel_convert_block_q4_0_noshuffle
140
+ global uchar * src_q,
141
+ global half * src_d,
142
+ global struct block_q4_0 * dst,
143
+ uchar mask_0F, // presumably 0x0F, supplied by the host - confirm against the caller
144
+ uchar mask_F0 // presumably 0xF0, supplied by the host - confirm against the caller
145
+ ) {
146
+ global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0); // this work-item's destination block
147
+ global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0); // this block's QK4_0/2 flattened bytes
148
+ global half * d = (global half *) src_d + get_global_id(0); // this block's fp16 value
149
+
150
+ b->d = *d;
151
+ for (int i = 0; i < QK4_0/4; ++i) { // each iteration reads one byte from each half of q and emits two block bytes
152
+ uchar x0 = q[i + 0 ] ; // byte i of the first half
153
+ uchar x1 = q[i + QK4_0/4]; // byte i of the second half
154
+
155
+ b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4)); // mask_0F bits of x0, plus mask_0F bits of x1 shifted up by 4
156
+ b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0)); // mask_F0 bits of x0 shifted down by 4, plus mask_F0 bits of x1
157
+ }
158
+ }
159
+
160
+ //------------------------------------------------------------------------------
161
+ // kernel_convert_block_q4_1
162
+ // Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
163
+ // This kernel does not deshuffle the bits.
164
+ //------------------------------------------------------------------------------
165
// One work-item per block: work-item `blk` writes the delta of src0[blk] to
// dst_d[blk], its min to dst_m[blk], and its QK4_1/2 packed quant bytes to
// dst_q starting at blk * QK4_1/2. Bytes are copied verbatim.
kernel void kernel_convert_block_q4_1(
    global struct block_q4_1 * src0,
    global uchar * dst_q,
    global half  * dst_d,
    global half  * dst_m
) {
    const size_t blk    = get_global_id(0);
    const size_t q_base = (QK4_1/2) * blk;

    dst_d[blk] = src0[blk].d;
    dst_m[blk] = src0[blk].m;

    for (int k = 0; k < QK4_1/2; ++k) {
        dst_q[q_base + k] = src0[blk].qs[k];
    }
}
183
+
184
// Rebuild block_q4_1 structs from the flattened arrays (SOA -> AOS).
// Work-item `blk` reads src_d[blk], src_m[blk] and QK4_1/2 bytes of src_q
// starting at blk * QK4_1/2, and stores them unchanged into dst[blk].
kernel void kernel_restore_block_q4_1(
    global uchar * src_q,
    global half  * src_d,
    global half  * src_m,
    global struct block_q4_1 * dst
) {
    const size_t blk    = get_global_id(0);
    const size_t q_base = (QK4_1/2) * blk;

    dst[blk].d = src_d[blk];
    dst[blk].m = src_m[blk];

    for (int k = 0; k < QK4_1/2; ++k) {
        dst[blk].qs[k] = src_q[q_base + k];
    }
}
201
+
202
+ //------------------------------------------------------------------------------
203
+ // block_mxfp4
204
+ //------------------------------------------------------------------------------
205
+ #define QK_MXFP4 32
206
+ struct block_mxfp4 { // array-of-structs layout of one mxfp4 block: 1 + QK_MXFP4/2 = 17 bytes, all uchar, so qs is not 16-byte aligned
207
+ uchar e; // E8M0
208
+ uchar qs[QK_MXFP4 / 2]; // QK_MXFP4/2 = 16 bytes of packed quants
209
+ };
210
+
211
+ //------------------------------------------------------------------------------
212
+ // kernel_convert_block_mxfp4
213
+ // Convert the block_mxfp4 format to 2 separate arrays (AOS -> SOA).
214
+ // This kernel does not deshuffle the bits.
215
+ //------------------------------------------------------------------------------
216
// One work-item per block: work-item `blk` writes the E8M0 byte of src0[blk]
// to dst_e[blk] and its QK_MXFP4/2 packed quant bytes to dst_q starting at
// blk * QK_MXFP4/2. Bytes are copied verbatim.
kernel void kernel_convert_block_mxfp4(
    global struct block_mxfp4 * src0,
    global uchar * dst_q,
    global uchar * dst_e
) {
    const size_t blk    = get_global_id(0);
    const size_t q_base = (QK_MXFP4 / 2) * blk;

    dst_e[blk] = src0[blk].e;

    for (int k = 0; k < QK_MXFP4 / 2; ++k) {
        dst_q[q_base + k] = src0[blk].qs[k];
    }
}
231
+
232
// Split block_mxfp4 structs into separate quant / exponent arrays while
// transposing the block grid.
//
// Work-item (i01, i00, i02) = (global id 0, 1, 2) reads source block
//   i00 + i01 * ne00_blk + i02 * ne00_blk * ne01
// and writes destination slot
//   i01 + i00 * ne01     + i02 * ne00_blk * ne01
// where ne00_blk = ne00 / QK_MXFP4. Each dst_q slot receives the block's 16
// quant bytes; each dst_e slot receives its E8M0 byte.
kernel void kernel_convert_block_mxfp4_trans(
    global struct block_mxfp4 * src0,
    __global uint4 * dst_q,
    __global uchar * dst_e,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk       = ne00 / QK_MXFP4;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = src0 + src_blk_offset;

    // block_mxfp4 is 17 bytes and qs starts at byte 1, so &b->qs[0] is not
    // 16-byte aligned and must not be dereferenced as a uint4. vload16 /
    // vstore16 on uchar only require byte alignment and copy the same 16
    // bytes in the same order.
    vstore16(vload16(0, b->qs), 0, (global uchar *)(dst_q + dst_blk_offset));
    dst_e[dst_blk_offset] = b->e;
}
252
+
253
// Rebuild block_mxfp4 structs from the flattened arrays (SOA -> AOS).
// Work-item `blk` reads src_e[blk] and the QK_MXFP4/2 bytes of src_q starting
// at blk * QK_MXFP4/2, and stores them unchanged into dst[blk].
//
// src_e holds one E8M0 byte per block (see kernel_convert_block_mxfp4), so it
// is declared as uchar rather than half; the buffer contents and the bytes
// read are unchanged.
kernel void kernel_restore_block_mxfp4(
    global uchar * src_q,
    global uchar * src_e,
    global struct block_mxfp4 * dst
) {
    global struct block_mxfp4 * b = dst + get_global_id(0);
    global uchar * q = src_q + QK_MXFP4 / 2 * get_global_id(0);
    global uchar * e = src_e + get_global_id(0);

    b->e = *e;
    for (int i = 0; i < QK_MXFP4 / 2; ++i) {
        b->qs[i] = q[i];
    }
}
267
+
268
// Inverse of kernel_convert_block_mxfp4_trans: rebuild block_mxfp4 structs
// from the transposed quant / exponent arrays.
//
// Work-item (i01, i00, i02) = (global id 0, 1, 2) reads source slot
//   i01 + i00 * ne01     + i02 * ne00_blk * ne01
// and writes destination block
//   i00 + i01 * ne00_blk + i02 * ne00_blk * ne01
// where ne00_blk = ne00 / QK_MXFP4.
kernel void kernel_restore_block_mxfp4_trans(
    __global uint4 * src_q,
    __global uchar * src_e,
    global struct block_mxfp4 * dst,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk       = ne00 / QK_MXFP4;
    uint src_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = dst + dst_blk_offset;

    // block_mxfp4 is 17 bytes and qs starts at byte 1, so &b->qs[0] is not
    // 16-byte aligned and must not be written through a uint4 pointer.
    // vload16 / vstore16 on uchar only require byte alignment and copy the
    // same 16 bytes in the same order.
    vstore16(vload16(0, (global uchar *)(src_q + src_blk_offset)), 0, b->qs);
    b->e = src_e[src_blk_offset];
}
288
+
289
+ //------------------------------------------------------------------------------
290
+ // block_q8_0
291
+ //------------------------------------------------------------------------------
292
+ typedef struct { // array-of-structs layout of one q8_0 block
293
+ half d; // delta (fp16)
294
+ char qs[QK8_0]; // quants: QK8_0 = 32 signed bytes per block
295
+ } block_q8_0;
296
+
297
// One work-item per block: work-item `blk` writes the delta of src0[blk] to
// dst_d[blk] and its QK8_0 quants to dst_q starting at blk * QK8_0. Each
// signed quant is stored into the uchar destination via implicit conversion.
kernel void kernel_convert_block_q8_0(
    global block_q8_0 * src0,
    global uchar * dst_q,
    global half  * dst_d
) {
    const size_t blk    = get_global_id(0);
    const size_t q_base = QK8_0 * blk;

    dst_d[blk] = src0[blk].d;

    for (int k = 0; k < QK8_0; ++k) {
        dst_q[q_base + k] = src0[blk].qs[k];
    }
}
312
+
313
// Rebuild block_q8_0 structs from the flattened arrays (SOA -> AOS).
// Work-item `blk` reads src_d[blk] and QK8_0 bytes of src_q starting at
// blk * QK8_0, and stores them into dst[blk] (uchar -> char implicit
// conversion on each quant).
kernel void kernel_restore_block_q8_0(
    global uchar * src_q,
    global half  * src_d,
    global block_q8_0 * dst
) {
    const size_t blk    = get_global_id(0);
    const size_t q_base = QK8_0 * blk;

    dst[blk].d = src_d[blk];

    for (int k = 0; k < QK8_0; ++k) {
        dst[blk].qs[k] = src_q[q_base + k];
    }
}
327
+
328
// Rebuild block_q8_0 structs from transposed flattened arrays.
// Each work-item restores one full row of ne00 / QK8_0 blocks:
//   - scales are read from src_d starting at index `row`, stepping by ne01
//     from one block to the next;
//   - quants are read from src_q in groups of 4 bytes starting at byte
//     row * 4, stepping by 4 * ne01 bytes from one group to the next.
kernel void kernel_restore_block_q8_0_trans(
    global uchar * src_q,
    global half  * src_d,
    global block_q8_0 * dst,
    uint ne00,
    uint ne01
){
    const size_t row           = get_global_id(0);
    const uint   blocks_in_row = ne00 / QK8_0;

    global block_q8_0 * out    = dst   + row * blocks_in_row;
    global uchar      * quants = src_q + row * 4; // 4 8-bit packed
    global half       * scale  = src_d + row;

    for (uint n = 0; n < blocks_in_row; ++n, ++out) {
        out->d = *scale;
        scale += ne01;

        for (uint base = 0; base < QK8_0; base += 4) {
            for (uint j = 0; j < 4; ++j) {
                out->qs[base + j] = quants[j];
            }
            quants += 4 * ne01; // M stride
        }
    }
}
358
+
359
+ //------------------------------------------------------------------------------
360
+ // kernel_convert_block_q6_K
361
+ // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
362
+ // This kernel does not deshuffle the bits.
363
+ // Each thread processes a super block.
364
+ //------------------------------------------------------------------------------
365
// One work-item per super block: work-item `sb` copies the four fields of
// src0[sb] into four separate arrays, unchanged:
//   ql     -> dst_ql, QK_K/2  bytes starting at sb * QK_K/2
//   qh     -> dst_qh, QK_K/4  bytes starting at sb * QK_K/4
//   scales -> dst_s,  QK_K/16 bytes starting at sb * QK_K/16
//   d      -> dst_d[sb]
kernel void kernel_convert_block_q6_K(
    global struct block_q6_K * src0,
    global uchar * dst_ql,
    global uchar * dst_qh,
    global char  * dst_s,
    global half  * dst_d
) {
    const size_t sb = get_global_id(0);

    global struct block_q6_K * blk = src0 + sb;
    global uchar * out_ql = dst_ql + (QK_K/2)  * sb;
    global uchar * out_qh = dst_qh + (QK_K/4)  * sb;
    global char  * out_s  = dst_s  + (QK_K/16) * sb;

    dst_d[sb] = blk->d;

    for (int k = 0; k < QK_K/2; ++k) {
        out_ql[k] = blk->ql[k];
    }
    for (int k = 0; k < QK_K/4; ++k) {
        out_qh[k] = blk->qh[k];
    }
    for (int k = 0; k < QK_K/16; ++k) {
        out_s[k] = blk->scales[k];
    }
}
390
+
391
+ // Restore block_q6_K from flattened arrays.
392
+ // Each thread processes a super block.
393
// Rebuild block_q6_K structs from the four flattened arrays.
// Despite their dst_ prefix, dst_ql / dst_qh / dst_s / dst_d are only read
// here; `dst` is the only buffer written. Work-item `sb` restores dst[sb]
// from QK_K/2, QK_K/4 and QK_K/16 bytes of the respective arrays and from
// dst_d[sb].
kernel void kernel_restore_block_q6_K(
    global uchar * dst_ql,
    global uchar * dst_qh,
    global char  * dst_s,
    global half  * dst_d,
    global struct block_q6_K * dst
) {
    const size_t sb = get_global_id(0);

    global struct block_q6_K * blk = dst + sb;
    global uchar * in_ql = dst_ql + (QK_K/2)  * sb;
    global uchar * in_qh = dst_qh + (QK_K/4)  * sb;
    global char  * in_s  = dst_s  + (QK_K/16) * sb;

    blk->d = dst_d[sb];

    for (int k = 0; k < QK_K/2; ++k) {
        blk->ql[k] = in_ql[k];
    }
    for (int k = 0; k < QK_K/4; ++k) {
        blk->qh[k] = in_qh[k];
    }
    for (int k = 0; k < QK_K/16; ++k) {
        blk->scales[k] = in_s[k];
    }
}
@@ -0,0 +1,58 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // diag_mask_inf kernels
5
+ //------------------------------------------------------------------------------
6
// One work-item per element (i00, i01, i02) = global ids (0, 1, 2).
// Copies src0 to dst, replacing every element whose column index i00 exceeds
// n_past + i01 with -INFINITY. offset0 / offsetd are byte offsets applied to
// the respective base pointers.
kernel void kernel_diag_mask_inf(
    global float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int n_past
) {
    global float * in  = (global float*)((global char*)src0 + offset0);
    global float * out = (global float*)((global char*)dst + offsetd);

    const int i00 = get_global_id(0);
    const int i01 = get_global_id(1);
    const int i02 = get_global_id(2);

    const int idx = i02*ne01*ne00 + i01*ne00 + i00;

    out[idx] = (i00 > n_past + i01) ? -INFINITY : in[idx];
}
28
+
29
// Vectorized variant of kernel_diag_mask_inf: each work-item handles 8
// consecutive floats (two float4 values, dst[i] and dst[i+1]).
// The 8 values are first copied from src0, then every element whose column
// index exceeds n_past + i01 is overwritten with -INFINITY.
// offset0 / offsetd are byte offsets applied to the respective base pointers.
kernel void kernel_diag_mask_inf_8(
    global float4 * src0,
    ulong offset0,
    global float4 * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int n_past
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    dst  = (global float4*)((global char*)dst + offsetd);

    int i = 2*get_global_id(0);

    dst[i+0] = src0[i+0];
    dst[i+1] = src0[i+1];

    // Decompose the flat index of the first float into (i00, i01, i02).
    int i4 = 4*i;
    int i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
    int i01 = i4/(ne00);      i4 -= i01*ne00;
    int i00 = i4;

    // Scalar views of the two vectors. Indexing the float4 pointer itself,
    // as in (&dst[i+1])[k], selects the whole vector dst[i+1+k] rather than
    // component k, and would overwrite data owned by other work-items.
    global float * lo = (global float *)&dst[i];
    global float * hi = (global float *)&dst[i+1];

    // Walk components from the highest column down; once a component of the
    // upper vector is unmasked, every lower column is unmasked too.
    for (int k = 3; k >= 0; --k) {
        if (i00 + 4 + k <= n_past + i01) {
            break;
        }
        hi[k] = -INFINITY;
        if (i00 + k > n_past + i01) {
            lo[k] = -INFINITY;
        }
    }
}
@@ -0,0 +1,138 @@
1
+ #pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
+
3
+ //------------------------------------------------------------------------------
4
+ // div
5
+ //------------------------------------------------------------------------------
6
// Element-wise float division dst = src0 / src1 with src1 broadcast.
// One work-group per (i01, i02, i03) = group ids (0, 1, 2); the work-items of
// the group stride over the ne0 elements of that row. src1 indices are taken
// modulo ne10 / ne11 / ne12 / ne13. All nb* values are byte strides and
// offset0 / offset1 / offsetd are byte offsets into the respective buffers.
kernel void kernel_div(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * dst,
    ulong offsetd,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    const int i01 = get_group_id(0);
    const int i02 = get_group_id(1);
    const int i03 = get_group_id(2);

    const int i11 = i01 % ne11;
    const int i12 = i02 % ne12;
    const int i13 = i03 % ne13;

    global char * num_row = src0 + offset0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * den_row = src1 + offset1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * out_row = dst  + offsetd + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;

        const float num = *((global float *)(num_row + i0*nb00));
        const float den = *((global float *)(den_row + i10*nb10));

        *((global float *)(out_row + i0*nb0)) = num / den;
    }
}
52
+
53
+ // assumption: src1 is a row
54
+ // broadcast src1 into src0
55
// One work-item per float4: dst[gid] = src0[gid] / src1[gid % ne], i.e. a
// src1 row of `ne` float4 values is repeated across src0.
// offset0 / offset1 / offsetd are byte offsets into the respective buffers.
kernel void kernel_div_row(
    global float4 * src0,
    ulong offset0,
    global float4 * src1,
    ulong offset1,
    global float4 * dst,
    ulong offsetd,
    int ne
) {
    global float4 * num = (global float4*)((global char*)src0 + offset0);
    global float4 * den = (global float4*)((global char*)src1 + offset1);
    global float4 * out = (global float4*)((global char*)dst + offsetd);

    const uint gid = get_global_id(0);

    // gid - (gid/ne)*ne is gid % ne; this form performs better than using %.
    const uint row_start = (gid/ne)*ne;

    out[gid] = num[gid] / den[gid - row_start];
}
73
+
74
// Element-wise half-precision division dst = src0 / src1 with src1 broadcast.
// One work-group per (i01, i02, i03) = group ids (0, 1, 2); the work-items of
// the group stride over the ne0 elements of that row. src1 indices are taken
// modulo ne10 / ne11 / ne12 / ne13. All nb* values are byte strides and
// offset0 / offset1 / offsetd are byte offsets into the respective buffers.
kernel void kernel_div_f16(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * dst,
    ulong offsetd,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    const int i01 = get_group_id(0);
    const int i02 = get_group_id(1);
    const int i03 = get_group_id(2);

    const int i11 = i01 % ne11;
    const int i12 = i02 % ne12;
    const int i13 = i03 % ne13;

    global char * num_row = src0 + offset0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * den_row = src1 + offset1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * out_row = dst  + offsetd + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;

        const half num = *((global half *)(num_row + i0*nb00));
        const half den = *((global half *)(den_row + i10*nb10));

        *((global half *)(out_row + i0*nb0)) = num / den;
    }
}
120
+
121
// One work-item per half4: dst[gid] = src0[gid] / src1[gid % ne], i.e. a
// src1 row of `ne` half4 values is repeated across src0.
// offset0 / offset1 / offsetd are byte offsets into the respective buffers.
kernel void kernel_div_row_f16(
    global half4 * src0,
    ulong offset0,
    global half4 * src1,
    ulong offset1,
    global half4 * dst,
    ulong offsetd,
    int ne
) {
    global half4 * num = (global half4*)((global char*)src0 + offset0);
    global half4 * den = (global half4*)((global char*)src1 + offset1);
    global half4 * out = (global half4*)((global char*)dst + offsetd);

    const uint gid = get_global_id(0);

    // gid - (gid/ne)*ne is gid % ne; this form performs better than using %.
    const uint row_start = (gid/ne)*ne;

    out[gid] = num[gid] / den[gid - row_start];
}
@@ -0,0 +1,26 @@
1
+ #
2
+
3
+ import sys
4
+ import logging
5
+ logger = logging.getLogger("opencl-embed-kernel")
6
+
7
+
8
def main():
    """Wrap every line of a kernel source file in a C++ raw string literal.

    Expects exactly two command-line arguments: the input file path and the
    output file path. Each input line (including its trailing newline) is
    written to the output as ``R"(<line>)"`` followed by a newline.
    Logs the usage string and exits with status 1 on a wrong argument count.
    """
    logging.basicConfig(level=logging.INFO)

    if len(sys.argv) != 3:
        logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
        sys.exit(1)

    # Context managers close both files even if reading or writing raises.
    with open(sys.argv[1], "r") as ifile, open(sys.argv[2], "w") as ofile:
        for line in ifile:
            ofile.write('R"({})"\n'.format(line))
23
+
24
+
25
+ if __name__ == "__main__":
26
+ main()