local-llm-rn 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/CMakeLists.txt +285 -0
- package/cpp/common/CMakeLists.txt +149 -0
- package/cpp/common/arg.cpp +3799 -0
- package/cpp/common/arg.h +131 -0
- package/cpp/common/base64.hpp +392 -0
- package/cpp/common/build-info.cpp.in +4 -0
- package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
- package/cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/common/chat-parser.cpp +1649 -0
- package/cpp/common/chat-parser.h +133 -0
- package/cpp/common/chat-peg-parser.cpp +124 -0
- package/cpp/common/chat-peg-parser.h +105 -0
- package/cpp/common/chat.cpp +3355 -0
- package/cpp/common/chat.h +252 -0
- package/cpp/common/common.cpp +1824 -0
- package/cpp/common/common.h +930 -0
- package/cpp/common/console.cpp +1137 -0
- package/cpp/common/console.h +41 -0
- package/cpp/common/debug.cpp +167 -0
- package/cpp/common/debug.h +43 -0
- package/cpp/common/download.cpp +792 -0
- package/cpp/common/download.h +84 -0
- package/cpp/common/http.h +84 -0
- package/cpp/common/jinja/README.md +88 -0
- package/cpp/common/jinja/caps.cpp +285 -0
- package/cpp/common/jinja/caps.h +30 -0
- package/cpp/common/jinja/lexer.cpp +341 -0
- package/cpp/common/jinja/lexer.h +157 -0
- package/cpp/common/jinja/parser.cpp +591 -0
- package/cpp/common/jinja/parser.h +21 -0
- package/cpp/common/jinja/runtime.cpp +867 -0
- package/cpp/common/jinja/runtime.h +638 -0
- package/cpp/common/jinja/string.cpp +213 -0
- package/cpp/common/jinja/string.h +61 -0
- package/cpp/common/jinja/utils.h +149 -0
- package/cpp/common/jinja/value.cpp +1393 -0
- package/cpp/common/jinja/value.h +756 -0
- package/cpp/common/json-partial.cpp +324 -0
- package/cpp/common/json-partial.h +39 -0
- package/cpp/common/json-schema-to-grammar.cpp +1153 -0
- package/cpp/common/json-schema-to-grammar.h +43 -0
- package/cpp/common/llguidance.cpp +258 -0
- package/cpp/common/log.cpp +446 -0
- package/cpp/common/log.h +119 -0
- package/cpp/common/ngram-cache.cpp +285 -0
- package/cpp/common/ngram-cache.h +101 -0
- package/cpp/common/ngram-map.cpp +530 -0
- package/cpp/common/ngram-map.h +115 -0
- package/cpp/common/ngram-mod.cpp +60 -0
- package/cpp/common/ngram-mod.h +38 -0
- package/cpp/common/peg-parser.cpp +1712 -0
- package/cpp/common/peg-parser.h +459 -0
- package/cpp/common/preset.cpp +483 -0
- package/cpp/common/preset.h +83 -0
- package/cpp/common/regex-partial.cpp +204 -0
- package/cpp/common/regex-partial.h +56 -0
- package/cpp/common/sampling.cpp +745 -0
- package/cpp/common/sampling.h +119 -0
- package/cpp/common/speculative.cpp +1074 -0
- package/cpp/common/speculative.h +41 -0
- package/cpp/common/unicode.cpp +64 -0
- package/cpp/common/unicode.h +22 -0
- package/cpp/ggml/CMakeLists.txt +494 -0
- package/cpp/ggml/cmake/GitVars.cmake +22 -0
- package/cpp/ggml/cmake/common.cmake +50 -0
- package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
- package/cpp/ggml/include/ggml-alloc.h +85 -0
- package/cpp/ggml/include/ggml-backend.h +373 -0
- package/cpp/ggml/include/ggml-blas.h +25 -0
- package/cpp/ggml/include/ggml-cann.h +123 -0
- package/cpp/ggml/include/ggml-cpp.h +39 -0
- package/cpp/ggml/include/ggml-cpu.h +151 -0
- package/cpp/ggml/include/ggml-cuda.h +47 -0
- package/cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/ggml/include/ggml-metal.h +61 -0
- package/cpp/ggml/include/ggml-opencl.h +26 -0
- package/cpp/ggml/include/ggml-opt.h +256 -0
- package/cpp/ggml/include/ggml-rpc.h +30 -0
- package/cpp/ggml/include/ggml-sycl.h +49 -0
- package/cpp/ggml/include/ggml-virtgpu.h +14 -0
- package/cpp/ggml/include/ggml-vulkan.h +29 -0
- package/cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/ggml/include/ggml-zdnn.h +17 -0
- package/cpp/ggml/include/ggml-zendnn.h +22 -0
- package/cpp/ggml/include/ggml.h +2753 -0
- package/cpp/ggml/include/gguf.h +204 -0
- package/cpp/ggml/src/CMakeLists.txt +492 -0
- package/cpp/ggml/src/ggml-alloc.c +1244 -0
- package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
- package/cpp/ggml/src/ggml-backend-dl.h +45 -0
- package/cpp/ggml/src/ggml-backend-impl.h +255 -0
- package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
- package/cpp/ggml/src/ggml-backend.cpp +2270 -0
- package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
- package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
- package/cpp/ggml/src/ggml-common.h +1878 -0
- package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
- package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- package/cpp/ggml/src/ggml-cpu/common.h +95 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
- package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
- package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
- package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
- package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
- package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
- package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
- package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
- package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
- package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
- package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
- package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
- package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
- package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
- package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
- package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
- package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
- package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
- package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
- package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
- package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
- package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
- package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
- package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
- package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
- package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
- package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- package/cpp/ggml/src/ggml-impl.h +724 -0
- package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
- package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
- package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
- package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
- package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
- package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
- package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
- package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
- package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
- package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- package/cpp/ggml/src/ggml-opt.cpp +1093 -0
- package/cpp/ggml/src/ggml-quants.c +5325 -0
- package/cpp/ggml/src/ggml-quants.h +106 -0
- package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
- package/cpp/ggml/src/ggml-threading.cpp +12 -0
- package/cpp/ggml/src/ggml-threading.h +14 -0
- package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
- package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- package/cpp/ggml/src/ggml.c +7669 -0
- package/cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/ggml/src/gguf.cpp +1699 -0
- package/cpp/include/llama-cpp.h +32 -0
- package/cpp/include/llama.h +1568 -0
- package/cpp/mtmd/CMakeLists.txt +98 -0
- package/cpp/mtmd/README.md +63 -0
- package/cpp/mtmd/clip-graph.h +117 -0
- package/cpp/mtmd/clip-impl.h +586 -0
- package/cpp/mtmd/clip-model.h +390 -0
- package/cpp/mtmd/clip.cpp +4154 -0
- package/cpp/mtmd/clip.h +121 -0
- package/cpp/mtmd/deprecation-warning.cpp +22 -0
- package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
- package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
- package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
- package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
- package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
- package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
- package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
- package/cpp/mtmd/models/cogvlm.cpp +98 -0
- package/cpp/mtmd/models/conformer.cpp +216 -0
- package/cpp/mtmd/models/glm4v.cpp +122 -0
- package/cpp/mtmd/models/internvl.cpp +69 -0
- package/cpp/mtmd/models/kimik25.cpp +101 -0
- package/cpp/mtmd/models/kimivl.cpp +63 -0
- package/cpp/mtmd/models/llama4.cpp +96 -0
- package/cpp/mtmd/models/llava.cpp +374 -0
- package/cpp/mtmd/models/minicpmv.cpp +114 -0
- package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
- package/cpp/mtmd/models/models.h +128 -0
- package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
- package/cpp/mtmd/models/paddleocr.cpp +52 -0
- package/cpp/mtmd/models/pixtral.cpp +86 -0
- package/cpp/mtmd/models/qwen2vl.cpp +183 -0
- package/cpp/mtmd/models/qwen3vl.cpp +193 -0
- package/cpp/mtmd/models/siglip.cpp +86 -0
- package/cpp/mtmd/models/whisper-enc.cpp +115 -0
- package/cpp/mtmd/models/youtuvl.cpp +179 -0
- package/cpp/mtmd/mtmd-audio.cpp +730 -0
- package/cpp/mtmd/mtmd-audio.h +113 -0
- package/cpp/mtmd/mtmd-cli.cpp +437 -0
- package/cpp/mtmd/mtmd-helper.cpp +521 -0
- package/cpp/mtmd/mtmd-helper.h +96 -0
- package/cpp/mtmd/mtmd.cpp +1156 -0
- package/cpp/mtmd/mtmd.h +319 -0
- package/cpp/mtmd/requirements.txt +5 -0
- package/cpp/mtmd/test-1.jpeg +0 -0
- package/cpp/mtmd/test-2.mp3 +0 -0
- package/cpp/mtmd/tests.sh +192 -0
- package/cpp/src/CMakeLists.txt +169 -0
- package/cpp/src/llama-adapter.cpp +488 -0
- package/cpp/src/llama-adapter.h +89 -0
- package/cpp/src/llama-arch.cpp +2855 -0
- package/cpp/src/llama-arch.h +619 -0
- package/cpp/src/llama-batch.cpp +917 -0
- package/cpp/src/llama-batch.h +173 -0
- package/cpp/src/llama-chat.cpp +896 -0
- package/cpp/src/llama-chat.h +71 -0
- package/cpp/src/llama-context.cpp +3512 -0
- package/cpp/src/llama-context.h +359 -0
- package/cpp/src/llama-cparams.cpp +5 -0
- package/cpp/src/llama-cparams.h +44 -0
- package/cpp/src/llama-grammar.cpp +1464 -0
- package/cpp/src/llama-grammar.h +194 -0
- package/cpp/src/llama-graph.cpp +2685 -0
- package/cpp/src/llama-graph.h +1026 -0
- package/cpp/src/llama-hparams.cpp +234 -0
- package/cpp/src/llama-hparams.h +339 -0
- package/cpp/src/llama-impl.cpp +171 -0
- package/cpp/src/llama-impl.h +73 -0
- package/cpp/src/llama-io.cpp +15 -0
- package/cpp/src/llama-io.h +35 -0
- package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
- package/cpp/src/llama-kv-cache-iswa.h +137 -0
- package/cpp/src/llama-kv-cache.cpp +2271 -0
- package/cpp/src/llama-kv-cache.h +388 -0
- package/cpp/src/llama-kv-cells.h +533 -0
- package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
- package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
- package/cpp/src/llama-memory-hybrid.cpp +268 -0
- package/cpp/src/llama-memory-hybrid.h +139 -0
- package/cpp/src/llama-memory-recurrent.cpp +1165 -0
- package/cpp/src/llama-memory-recurrent.h +182 -0
- package/cpp/src/llama-memory.cpp +59 -0
- package/cpp/src/llama-memory.h +122 -0
- package/cpp/src/llama-mmap.cpp +785 -0
- package/cpp/src/llama-mmap.h +92 -0
- package/cpp/src/llama-model-loader.cpp +1414 -0
- package/cpp/src/llama-model-loader.h +203 -0
- package/cpp/src/llama-model-saver.cpp +286 -0
- package/cpp/src/llama-model-saver.h +37 -0
- package/cpp/src/llama-model.cpp +9253 -0
- package/cpp/src/llama-model.h +576 -0
- package/cpp/src/llama-quant.cpp +1119 -0
- package/cpp/src/llama-quant.h +1 -0
- package/cpp/src/llama-sampler.cpp +3885 -0
- package/cpp/src/llama-sampler.h +42 -0
- package/cpp/src/llama-vocab.cpp +3970 -0
- package/cpp/src/llama-vocab.h +187 -0
- package/cpp/src/llama.cpp +1313 -0
- package/cpp/src/models/afmoe.cpp +191 -0
- package/cpp/src/models/apertus.cpp +125 -0
- package/cpp/src/models/arcee.cpp +135 -0
- package/cpp/src/models/arctic.cpp +138 -0
- package/cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/src/models/baichuan.cpp +122 -0
- package/cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/src/models/bert.cpp +178 -0
- package/cpp/src/models/bitnet.cpp +160 -0
- package/cpp/src/models/bloom.cpp +101 -0
- package/cpp/src/models/chameleon.cpp +178 -0
- package/cpp/src/models/chatglm.cpp +132 -0
- package/cpp/src/models/codeshell.cpp +111 -0
- package/cpp/src/models/cogvlm.cpp +102 -0
- package/cpp/src/models/cohere2-iswa.cpp +134 -0
- package/cpp/src/models/command-r.cpp +122 -0
- package/cpp/src/models/dbrx.cpp +123 -0
- package/cpp/src/models/deci.cpp +135 -0
- package/cpp/src/models/deepseek.cpp +144 -0
- package/cpp/src/models/deepseek2.cpp +262 -0
- package/cpp/src/models/delta-net-base.cpp +376 -0
- package/cpp/src/models/dots1.cpp +134 -0
- package/cpp/src/models/dream.cpp +105 -0
- package/cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/src/models/eurobert.cpp +97 -0
- package/cpp/src/models/exaone-moe.cpp +146 -0
- package/cpp/src/models/exaone.cpp +114 -0
- package/cpp/src/models/exaone4.cpp +123 -0
- package/cpp/src/models/falcon-h1.cpp +111 -0
- package/cpp/src/models/falcon.cpp +120 -0
- package/cpp/src/models/gemma-embedding.cpp +116 -0
- package/cpp/src/models/gemma.cpp +112 -0
- package/cpp/src/models/gemma2-iswa.cpp +128 -0
- package/cpp/src/models/gemma3.cpp +155 -0
- package/cpp/src/models/gemma3n-iswa.cpp +384 -0
- package/cpp/src/models/glm4-moe.cpp +170 -0
- package/cpp/src/models/glm4.cpp +157 -0
- package/cpp/src/models/gpt2.cpp +105 -0
- package/cpp/src/models/gptneox.cpp +144 -0
- package/cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/src/models/granite.cpp +211 -0
- package/cpp/src/models/grok.cpp +159 -0
- package/cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/src/models/internlm2.cpp +120 -0
- package/cpp/src/models/jais.cpp +86 -0
- package/cpp/src/models/jais2.cpp +123 -0
- package/cpp/src/models/jamba.cpp +106 -0
- package/cpp/src/models/kimi-linear.cpp +392 -0
- package/cpp/src/models/lfm2.cpp +190 -0
- package/cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/src/models/llada.cpp +99 -0
- package/cpp/src/models/llama-iswa.cpp +178 -0
- package/cpp/src/models/llama.cpp +168 -0
- package/cpp/src/models/maincoder.cpp +117 -0
- package/cpp/src/models/mamba-base.cpp +285 -0
- package/cpp/src/models/mamba.cpp +54 -0
- package/cpp/src/models/mimo2-iswa.cpp +123 -0
- package/cpp/src/models/minicpm3.cpp +200 -0
- package/cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/src/models/mistral3.cpp +160 -0
- package/cpp/src/models/models.h +684 -0
- package/cpp/src/models/modern-bert.cpp +109 -0
- package/cpp/src/models/mpt.cpp +126 -0
- package/cpp/src/models/nemotron-h.cpp +148 -0
- package/cpp/src/models/nemotron.cpp +122 -0
- package/cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/src/models/olmo.cpp +121 -0
- package/cpp/src/models/olmo2.cpp +150 -0
- package/cpp/src/models/olmoe.cpp +124 -0
- package/cpp/src/models/openai-moe-iswa.cpp +127 -0
- package/cpp/src/models/openelm.cpp +124 -0
- package/cpp/src/models/orion.cpp +123 -0
- package/cpp/src/models/paddleocr.cpp +122 -0
- package/cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/src/models/phi2.cpp +121 -0
- package/cpp/src/models/phi3.cpp +152 -0
- package/cpp/src/models/plamo.cpp +110 -0
- package/cpp/src/models/plamo2.cpp +318 -0
- package/cpp/src/models/plamo3.cpp +128 -0
- package/cpp/src/models/plm.cpp +169 -0
- package/cpp/src/models/qwen.cpp +108 -0
- package/cpp/src/models/qwen2.cpp +126 -0
- package/cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/src/models/qwen3.cpp +117 -0
- package/cpp/src/models/qwen35.cpp +386 -0
- package/cpp/src/models/qwen35moe.cpp +420 -0
- package/cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/src/models/qwen3next.cpp +525 -0
- package/cpp/src/models/qwen3vl-moe.cpp +140 -0
- package/cpp/src/models/qwen3vl.cpp +132 -0
- package/cpp/src/models/refact.cpp +94 -0
- package/cpp/src/models/rnd1.cpp +126 -0
- package/cpp/src/models/rwkv6-base.cpp +164 -0
- package/cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/src/models/rwkv7-base.cpp +137 -0
- package/cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/src/models/smallthinker.cpp +126 -0
- package/cpp/src/models/smollm3.cpp +128 -0
- package/cpp/src/models/stablelm.cpp +146 -0
- package/cpp/src/models/starcoder.cpp +100 -0
- package/cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/src/models/step35-iswa.cpp +168 -0
- package/cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/src/models/xverse.cpp +108 -0
- package/cpp/src/unicode-data.cpp +7034 -0
- package/cpp/src/unicode-data.h +20 -0
- package/cpp/src/unicode.cpp +1103 -0
- package/cpp/src/unicode.h +111 -0
- package/cpp/vendor/nlohmann/json.hpp +25526 -0
- package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/vendor/stb/stb_image.h +7988 -0
- package/ios/LocalLLM-Bridging-Header.h +2 -0
- package/ios/LocalLLM.h +5 -0
- package/ios/LocalLLM.mm +1267 -0
- package/local-llm-rn.podspec +60 -0
- package/package.json +35 -0
- package/src/NativeLocalLLM.ts +73 -0
- package/src/device.ts +50 -0
- package/src/download-adapter.ts +17 -0
- package/src/index.ts +21 -0
- package/src/native-bridge.ts +142 -0
- package/src/rn-downloader.ts +37 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
|
3
|
+
|
|
4
|
+
// Most devices have max workgroup size of 1024, so this is enough for subgroup
|
|
5
|
+
// sizes of 16, 32, 64 and 128. Increase this value for smaller subgroup sizes.
|
|
6
|
+
#define MAX_SUBGROUPS 64
|
|
7
|
+
// Mean over dimension 0 of an f32 tensor.
//
// One work-group reduces one source row, selected by the work-group id
// (i1, i2, i3), and work-item 0 writes a single float (row sum / ne00) to the
// matching position in dst.
//
//   src0, offset0  source buffer and byte offset of the tensor inside it
//   dst,  offsetd  destination buffer and byte offset
//   ne00..ne03     source extents; ne00 is the length of the averaged row
//   nb01..nb03     source byte strides for dimensions 1..3
//   nb1..nb3       destination byte strides for dimensions 1..3
//
// The reduction runs in two stages: every sub-group reduces its own partial
// sums and parks the result in lmem[sub-group id]; after a barrier the slots
// are folded together. The launch must not use more than MAX_SUBGROUPS
// sub-groups per work-group, because each sub-group owns one lmem slot.
kernel void kernel_mean_f32(
        global char * src0,
        ulong         offset0,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        int           ne02,
        int           ne03,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    src0 = src0 + offset0;
    dst  = dst  + offsetd;

    const int i3 = get_group_id(2);
    const int i2 = get_group_id(1);
    const int i1 = get_group_id(0);

    const int lid   = get_local_id(0);
    const int lsize = get_local_size(0);

    const uint sg_size = get_sub_group_size();
    const uint sg_id   = get_sub_group_id();
    const uint sg_lid  = get_sub_group_local_id();
    const uint n_sg    = get_num_sub_groups();

    __local float lmem[MAX_SUBGROUPS];

    // The test only involves work-group ids, so the whole work-group takes the
    // same branch and the barrier below is never reached by a partial group.
    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
        return;
    }

    global float * src_row = (global float *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
    global float * dst_row = (global float *) (dst  + i1*nb1  + i2*nb2  + i3*nb3);

    // Each work-item accumulates a strided slice of the row.
    float sumf = 0.0f;
    for (int i0 = lid; i0 < ne00; i0 += lsize) {
        sumf += src_row[i0];
    }

    // Stage 1: one partial sum per sub-group, published by its first lane.
    sumf = sub_group_reduce_add(sumf);
    if (sg_lid == 0) {
        lmem[sg_id] = sumf;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Stage 2: fold the n_sg published partials. Only slots [0, n_sg) are ever
    // read, so lmem is never indexed by a lane id beyond the written range
    // (sub-groups wider than MAX_SUBGROUPS), and the stride keeps every partial
    // in the sum when there are more sub-groups than lanes.
    float total = 0.0f;
    for (uint i = sg_lid; i < n_sg; i += sg_size) {
        total += lmem[i];
    }
    total = sub_group_reduce_add(total);

    if (lid == 0) {
        dst_row[0] = total / ne00;
    }
}
|
|
73
|
+
|
|
74
|
+
// Mean over dimension 0 of an f32 tensor, reading the row as float4 vectors.
//
// One work-group reduces one source row, selected by the work-group id
// (i1, i2, i3), and work-item 0 writes a single float (row sum / ne00) to the
// matching position in dst.
//
//   src0, offset0  source buffer and byte offset of the tensor inside it
//   dst,  offsetd  destination buffer and byte offset
//   ne00..ne03     source extents; ne00 is the length of the averaged row
//   nb01..nb03     source byte strides for dimensions 1..3
//   nb1..nb3       destination byte strides for dimensions 1..3
//
// Only the first ne00 / 4 float4 vectors of a row are summed, so trailing
// elements are not read when ne00 is not a multiple of 4.
// NOTE(review): presumably the host selects this variant only when
// ne00 % 4 == 0 and rows are float4-aligned - confirm against the caller.
//
// The reduction runs in two stages: every sub-group reduces its own partial
// sums and parks the result in lmem[sub-group id]; after a barrier the slots
// are folded together. The launch must not use more than MAX_SUBGROUPS
// sub-groups per work-group, because each sub-group owns one lmem slot.
kernel void kernel_mean_f32_4(
        global char * src0,
        ulong         offset0,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        int           ne02,
        int           ne03,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    src0 = src0 + offset0;
    dst  = dst  + offsetd;

    const int i3 = get_group_id(2);
    const int i2 = get_group_id(1);
    const int i1 = get_group_id(0);

    const int lid   = get_local_id(0);
    const int lsize = get_local_size(0);

    const uint sg_size = get_sub_group_size();
    const uint sg_id   = get_sub_group_id();
    const uint sg_lid  = get_sub_group_local_id();
    const uint n_sg    = get_num_sub_groups();

    __local float lmem[MAX_SUBGROUPS];

    // The test only involves work-group ids, so the whole work-group takes the
    // same branch and the barrier below is never reached by a partial group.
    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
        return;
    }

    global float4 * src_row = (global float4 *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
    global float  * dst_row = (global float  *) (dst  + i1*nb1  + i2*nb2  + i3*nb3);

    // Each work-item accumulates a strided slice of the row, four lanes wide.
    float4 sum_vec = (float4)0.0f;
    for (int i0 = lid; i0 < ne00 / 4; i0 += lsize) {
        sum_vec += src_row[i0];
    }

    // Horizontal add of the four vector lanes.
    float sumf = dot(sum_vec, (float4)(1.0f));

    // Stage 1: one partial sum per sub-group, published by its first lane.
    sumf = sub_group_reduce_add(sumf);
    if (sg_lid == 0) {
        lmem[sg_id] = sumf;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Stage 2: fold the n_sg published partials. Only slots [0, n_sg) are ever
    // read, so lmem is never indexed by a lane id beyond the written range
    // (sub-groups wider than MAX_SUBGROUPS), and the stride keeps every partial
    // in the sum when there are more sub-groups than lanes.
    float total = 0.0f;
    for (uint i = sg_lid; i < n_sg; i += sg_size) {
        total += lmem[i];
    }
    total = sub_group_reduce_add(total);

    if (lid == 0) {
        dst_row[0] = total / ne00;
    }
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
//------------------------------------------------------------------------------
|
|
4
|
+
// mul
|
|
5
|
+
//------------------------------------------------------------------------------
|
|
6
|
+
// Element-wise f32 multiply: dst = src0 * src1, with src1 broadcast over src0.
//
// One work-group handles one row of src0/dst, chosen by the work-group id in
// dimensions 1..3; its work-items stride across dimension 0. src1 coordinates
// are taken modulo src1's own extents (ne10..ne13), which repeats a smaller
// src1 across the larger operand.
//
//   src0/src1/dst + offset0/offset1/offsetd  buffers and byte offsets
//   ne0x / nb0x   src0 extents and byte strides
//   ne1x / nb1x   src1 extents and byte strides
//   nex  / nbx    dst  extents and byte strides
kernel void kernel_mul(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        int           ne02,
        int           ne03,
        ulong         nb00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne10,
        int           ne11,
        int           ne12,
        int           ne13,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        int           ne0,
        int           ne1,
        int           ne2,
        int           ne3,
        ulong         nb0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    // Row coordinates of this work-group.
    const int r1 = get_group_id(0);
    const int r2 = get_group_id(1);
    const int r3 = get_group_id(2);

    // Rebase every buffer by its byte offset, then step to the start of the
    // row; the src1 row index wraps for broadcasting.
    global char * a_row = (src0 + offset0) + r3*nb03 + r2*nb02 + r1*nb01;
    global char * b_row = (src1 + offset1) + (r3 % ne13)*nb13 + (r2 % ne12)*nb12 + (r1 % ne11)*nb11;
    global char * c_row = (dst  + offsetd) + r3*nb3  + r2*nb2  + r1*nb1;

    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const float lhs = *((global float *)(a_row + col*nb00));
        const float rhs = *((global float *)(b_row + (col % ne10)*nb10));

        *((global float *)(c_row + col*nb0)) = lhs * rhs;
    }
}
|
|
59
|
+
|
|
60
|
+
// Row-broadcast f32 multiply on float4 vectors: dst[g] = src0[g] * src1[g % ne].
//
// Assumes src1 is a single row that is broadcast over src0. One work-item
// handles one float4; ne is the wrap-around period of the src1 index, counted
// in float4 elements. The offsets are byte offsets applied to the buffers.
kernel void kernel_mul_row(
        global float4 * src0,
        ulong           offset0,
        global float4 * src1,
        ulong           offset1,
        global float4 * dst,
        ulong           offsetd,
        int             ne
) {
    global float4 * lhs = (global float4 *)((global char *)src0 + offset0);
    global float4 * rhs = (global float4 *)((global char *)src1 + offset1);
    global float4 * out = (global float4 *)((global char *)dst  + offsetd);

    const uint gid = get_global_id(0);

    // gid % ne, spelled as a subtraction of the truncated quotient.
    // This performs better than using %.
    const uint quot    = gid / ne;
    const uint wrapped = gid - quot*ne;

    out[gid] = lhs[gid] * rhs[wrapped];
}
|
|
80
|
+
|
|
81
|
+
// Element-wise f16 multiply: dst = src0 * src1, with src1 broadcast over src0.
//
// One work-group handles one row of src0/dst, chosen by the work-group id in
// dimensions 1..3; its work-items stride across dimension 0. src1 coordinates
// are taken modulo src1's own extents (ne10..ne13), which repeats a smaller
// src1 across the larger operand. All three tensors are accessed as half.
//
//   src0/src1/dst + offset0/offset1/offsetd  buffers and byte offsets
//   ne0x / nb0x   src0 extents and byte strides
//   ne1x / nb1x   src1 extents and byte strides
//   nex  / nbx    dst  extents and byte strides
kernel void kernel_mul_f16(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        int           ne02,
        int           ne03,
        ulong         nb00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne10,
        int           ne11,
        int           ne12,
        int           ne13,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        int           ne0,
        int           ne1,
        int           ne2,
        int           ne3,
        ulong         nb0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    // Row coordinates of this work-group.
    const int r1 = get_group_id(0);
    const int r2 = get_group_id(1);
    const int r3 = get_group_id(2);

    // Rebase every buffer by its byte offset, then step to the start of the
    // row; the src1 row index wraps for broadcasting.
    global char * a_row = (src0 + offset0) + r3*nb03 + r2*nb02 + r1*nb01;
    global char * b_row = (src1 + offset1) + (r3 % ne13)*nb13 + (r2 % ne12)*nb12 + (r1 % ne11)*nb11;
    global char * c_row = (dst  + offsetd) + r3*nb3  + r2*nb2  + r1*nb1;

    for (int col = get_local_id(0); col < ne0; col += get_local_size(0)) {
        const half lhs = *((global half *)(a_row + col*nb00));
        const half rhs = *((global half *)(b_row + (col % ne10)*nb10));

        *((global half *)(c_row + col*nb0)) = lhs * rhs;
    }
}
|
|
134
|
+
|
|
135
|
+
// Row-broadcast f16 multiply on half4 vectors: dst[g] = src0[g] * src1[g % ne].
//
// One work-item handles one half4; ne is the wrap-around period of the src1
// index, counted in half4 elements. The offsets are byte offsets applied to
// the buffers.
kernel void kernel_mul_row_f16(
        global half4 * src0,
        ulong          offset0,
        global half4 * src1,
        ulong          offset1,
        global half4 * dst,
        ulong          offsetd,
        int            ne
) {
    global half4 * lhs = (global half4 *)((global char *)src0 + offset0);
    global half4 * rhs = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst  + offsetd);

    const uint gid = get_global_id(0);

    // gid % ne, spelled as a subtraction of the truncated quotient.
    // This performs better than using %.
    const uint quot    = gid / ne;
    const uint wrapped = gid - quot*ne;

    out[gid] = lhs[gid] * rhs[wrapped];
}
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
// src0_q, src0_d, src1 are transposed as a preprocessing step
|
|
2
|
+
// 4-bit weights are transposed in groups of 4 (unsigned short int)
|
|
3
|
+
// consider weights originally "next to each other", now "on top of each other"
|
|
4
|
+
// each fiber computes an 8x4 tile of output elements
|
|
5
|
+
// using unshuffled weights
|
|
6
|
+
|
|
7
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
8
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
|
9
|
+
|
|
10
|
+
#ifdef cl_qcom_reqd_sub_group_size
|
|
11
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
|
12
|
+
#define ADRENO_GPU 1
|
|
13
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
|
14
|
+
#endif
|
|
15
|
+
|
|
16
|
+
// Matrix multiply of 4-bit quantized weights (A) with half activations (B).
//
// Each work-item produces an 8x4 tile of dst: 8 consecutive values along
// global id 0 (gy) by 4 consecutive values along global id 1 (gx). Sums are
// accumulated in half precision and stored as float.
//
//   src0_q        packed weights; every ushort carries four 4-bit values that
//                 are consumed on four consecutive K steps
//   src0_d        half scales; one scale is reused for 32 consecutive K steps
//   src1          activations, fetched as half4 texels from a 1D image buffer
//   dst           output, written at dst[(gy*8 + r)*m + gx*4 + c]
//   m             stride (in elements) between successive K groups of
//                 src0_q / src0_d, and between successive output rows of dst
//   n             padded N; n/4 texels of src1 separate successive K steps
//   k             reduction length, consumed four steps per loop iteration
//   n_no_padding  unpadded N; bounds the stores so padded rows are skipped
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_128
#endif
kernel void kernel_mul_mat_Ab_Bi_8x4(
        global const ushort * src0_q,       // quantized A
        global const half  * src0_d,        // A scales
        __read_only image1d_buffer_t src1,  // B (1d image)
        global float * dst,                 // C
        int m,                              // M
        int n,                              // N with padding
        int k,                              // K
        int n_no_padding                    // N without padding
) {
    const int texels_per_k = n >> 2;        // src1 texels between successive K steps

    const int gy   = get_global_id(0);
    const int gx   = get_global_id(1);
    const int col0 = gx << 2;               // first of this tile's 4 weight columns

    const int texel0 = gy*2;                // first of the two half4 texels holding 8 activations

    half8 acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;   // 8x4 output elements
    half8 act;                                      // registers for activations
    half4 w;                                        // registers for dequantized weights

    __global const ushort * q_ptr = src0_q + col0;  // pointer for weights
    __global const half   * d_ptr = src0_d + col0;  // pointer for scales

    for (int kk = 0; kk < k; kk += 4) {     // loop through K dimension

        act.s0123 = read_imageh(src1, texel0 + (kk)*texels_per_k);
        act.s4567 = read_imageh(src1, texel0 + (kk)*texels_per_k + 1);

        // keep (kk/4) and (kk/32) in parentheses: integer division rounds down
        // four ushorts = four columns, each packing four K steps
        const ushort4 packed = vload4(0, q_ptr + (kk/4)*m);

        // four scales, one per column
        const half4 d = vload4(0, d_ptr + (kk/32)*m);

        // K step 0: bits 0..3 of each ushort, re-centred by -8 and scaled
        w.s0 = ((packed.s0 & (0x000F)) - 8) * d.s0;
        w.s1 = ((packed.s1 & (0x000F)) - 8) * d.s1;
        w.s2 = ((packed.s2 & (0x000F)) - 8) * d.s2;
        w.s3 = ((packed.s3 & (0x000F)) - 8) * d.s3;
        acc0 += act * w.s0;                 // vector-scalar multiply-accumulate
        acc1 += act * w.s1;
        acc2 += act * w.s2;
        acc3 += act * w.s3;

        // K step 1: bits 4..7
        act.s0123 = read_imageh(src1, texel0 + (kk+1)*texels_per_k);
        act.s4567 = read_imageh(src1, texel0 + (kk+1)*texels_per_k + 1);
        w.s0 = (((packed.s0 & (0x00F0)) >> 4) - 8) * d.s0;
        w.s1 = (((packed.s1 & (0x00F0)) >> 4) - 8) * d.s1;
        w.s2 = (((packed.s2 & (0x00F0)) >> 4) - 8) * d.s2;
        w.s3 = (((packed.s3 & (0x00F0)) >> 4) - 8) * d.s3;
        acc0 += act * w.s0;
        acc1 += act * w.s1;
        acc2 += act * w.s2;
        acc3 += act * w.s3;

        // K step 2: bits 8..11
        act.s0123 = read_imageh(src1, texel0 + (kk+2)*texels_per_k);
        act.s4567 = read_imageh(src1, texel0 + (kk+2)*texels_per_k + 1);
        w.s0 = (((packed.s0 & (0x0F00)) >> 8) - 8) * d.s0;
        w.s1 = (((packed.s1 & (0x0F00)) >> 8) - 8) * d.s1;
        w.s2 = (((packed.s2 & (0x0F00)) >> 8) - 8) * d.s2;
        w.s3 = (((packed.s3 & (0x0F00)) >> 8) - 8) * d.s3;
        acc0 += act * w.s0;
        acc1 += act * w.s1;
        acc2 += act * w.s2;
        acc3 += act * w.s3;

        // K step 3: bits 12..15
        act.s0123 = read_imageh(src1, texel0 + (kk+3)*texels_per_k);
        act.s4567 = read_imageh(src1, texel0 + (kk+3)*texels_per_k + 1);
        w.s0 = (((packed.s0 & (0xF000)) >> 12) - 8) * d.s0;
        w.s1 = (((packed.s1 & (0xF000)) >> 12) - 8) * d.s1;
        w.s2 = (((packed.s2 & (0xF000)) >> 12) - 8) * d.s2;
        w.s3 = (((packed.s3 & (0xF000)) >> 12) - 8) * d.s3;
        acc0 += act * w.s0;
        acc1 += act * w.s1;
        acc2 += act * w.s2;
        acc3 += act * w.s3;
    }

    const int limit = m*n_no_padding;       // one past the last valid dst element
    int out = (gy<<3)*m + col0;             // first of up to 8 four-wide stores

    // Each store is guarded so rows that exist only because of N padding are
    // skipped (needed when N is not a multiple of 8). The cursor advances only
    // after a successful store, so once one guard fails all later ones do too.
    // Separate if statements let registers be reused between stores; the
    // original notes this lowers register pressure and increases the number
    // of concurrent waves.
    if (out+3 < limit) {
        vstore4((float4)(acc0.s0, acc1.s0, acc2.s0, acc3.s0), 0, dst + out);
        out += m;
    }
    if (out+3 < limit) {
        vstore4((float4)(acc0.s1, acc1.s1, acc2.s1, acc3.s1), 0, dst + out);
        out += m;
    }
    if (out+3 < limit) {
        vstore4((float4)(acc0.s2, acc1.s2, acc2.s2, acc3.s2), 0, dst + out);
        out += m;
    }
    if (out+3 < limit) {
        vstore4((float4)(acc0.s3, acc1.s3, acc2.s3, acc3.s3), 0, dst + out);
        out += m;
    }
    if (out+3 < limit) {
        vstore4((float4)(acc0.s4, acc1.s4, acc2.s4, acc3.s4), 0, dst + out);
        out += m;
    }
    if (out+3 < limit) {
        vstore4((float4)(acc0.s5, acc1.s5, acc2.s5, acc3.s5), 0, dst + out);
        out += m;
    }
    if (out+3 < limit) {
        vstore4((float4)(acc0.s6, acc1.s6, acc2.s6, acc3.s6), 0, dst + out);
        out += m;
    }
    if (out+3 < limit) {
        vstore4((float4)(acc0.s7, acc1.s7, acc2.s7, acc3.s7), 0, dst + out);
    }
}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
#if defined(cl_qcom_reqd_sub_group_size)
|
|
4
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
|
5
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
|
6
|
+
#else
|
|
7
|
+
#define REQD_SUBGROUP_SIZE_128
|
|
8
|
+
#endif
|
|
9
|
+
|
|
10
|
+
#define OPWM 64
|
|
11
|
+
#define OPWN 64
|
|
12
|
+
#define CPWK 8
|
|
13
|
+
#define OPTM 4
|
|
14
|
+
#define OPTN 8
|
|
15
|
+
|
|
16
|
+
#define WG_M (OPWM / OPTM)
|
|
17
|
+
#define WG_N (OPWN / OPTN)
|
|
18
|
+
#define VEC_K (CPWK / 4)
|
|
19
|
+
|
|
20
|
+
// Tiled matrix multiply with half A and float B, accumulating in float:
//
//     C[n*M + m] = sum over k of A[m*K + k] * B[n*K + k]
//
// A is read as M rows of K halfs, B as N rows of K floats; the *_offset
// arguments are byte offsets applied to the corresponding buffers.
//
// A work-group owns an OPWM x OPWN tile of C and walks K in slices of CPWK.
// For every slice the work-group stages its A and B rows in local memory
// (one 4-wide vector per work-item per matrix), then each work-item updates
// its OPTM x OPTN private accumulators. Out-of-range rows and K tails are
// staged as zeros, so M, N and K need not be multiples of the tile sizes.
//
// NOTE(review): the staging index math (lid = lidn*WG_M + lidm, split by
// OPWM / OPWN) fills the local tiles completely only for a WG_M x WG_N local
// size - confirm the host launches with exactly that shape.
REQD_SUBGROUP_SIZE_128
__kernel void mul_mat_f16_f32(
    const int M, const int N, const int K,
    __global const void* A_void, ulong A_offset,
    __global const void* B_void, ulong B_offset,
    __global       void* C_void, ulong C_offset) {

    __global const half*  A = (__global const half* )((__global const char*)A_void + A_offset);
    __global const float* B = (__global const float*)((__global const char*)B_void + B_offset);
    __global       float* C = (__global       float*)((__global       char*)C_void + C_offset);

    const int lidm = get_local_id(0);
    const int lidn = get_local_id(1);
    const int lid  = lidn * WG_M + lidm;        // flattened id used for staging

    const int tile_m0 = get_group_id(0) * OPWM; // first C row (A row) of this tile
    const int tile_n0 = get_group_id(1) * OPWN; // first C column (B row) of this tile

    __local half4  Alocal[OPWM][VEC_K];
    __local float4 Blocal[OPWN][VEC_K];

    float sum[OPTM][OPTN];
    for (int wm = 0; wm < OPTM; wm++) {
        for (int wn = 0; wn < OPTN; wn++) {
            sum[wm][wn] = 0.0f;
        }
    }

    // Which local-tile entry this work-item stages, for A and for B.
    const int a_slot_row = lid % OPWM;
    const int a_slot_vec = lid / OPWM;
    const int a_row      = tile_m0 + a_slot_row;

    const int b_slot_row = lid % OPWN;
    const int b_slot_vec = lid / OPWN;
    const int b_row      = tile_n0 + b_slot_row;

    const int n_slices = (K + CPWK - 1) / CPWK;

    for (int s = 0; s < n_slices; s++) {
        const int k0  = s * CPWK;
        const int a_k = k0 + a_slot_vec * 4;
        const int b_k = k0 + b_slot_vec * 4;

        // Stage A: a full vector load when all four elements are inside K,
        // element-wise with zero fill on the K tail, zeros when out of range.
        half4 a_stage = (half4)(0.0h);
        if (a_row < M && a_k < K) {
            if (a_k + 3 < K) {
                a_stage = vload4(0, A + a_row * K + a_k);
            } else {
                a_stage.s0 = A[a_row * K + a_k];
                if (a_k + 1 < K) a_stage.s1 = A[a_row * K + a_k + 1];
                if (a_k + 2 < K) a_stage.s2 = A[a_row * K + a_k + 2];
            }
        }
        Alocal[a_slot_row][a_slot_vec] = a_stage;

        // Stage B the same way.
        float4 b_stage = (float4)(0.0f);
        if (b_row < N && b_k < K) {
            if (b_k + 3 < K) {
                b_stage = vload4(0, B + b_row * K + b_k);
            } else {
                b_stage.s0 = B[b_row * K + b_k];
                if (b_k + 1 < K) b_stage.s1 = B[b_row * K + b_k + 1];
                if (b_k + 2 < K) b_stage.s2 = B[b_row * K + b_k + 2];
            }
        }
        Blocal[b_slot_row][b_slot_vec] = b_stage;

        // All staged data must be visible before anyone consumes it.
        barrier(CLK_LOCAL_MEM_FENCE);

        #pragma unroll
        for (int kv = 0; kv < VEC_K; kv++) {
            // This work-item's A rows are lidm, lidm + WG_M, ...
            float4 a_regs[OPTM];
            for (int wm = 0; wm < OPTM; wm++) {
                a_regs[wm] = convert_float4(Alocal[lidm + wm * WG_M][kv]);
            }

            // ... and its B rows are lidn, lidn + WG_N, ...
            float4 b_regs[OPTN];
            for (int wn = 0; wn < OPTN; wn++) {
                b_regs[wn] = Blocal[lidn + wn * WG_N][kv];
            }

            for (int wm = 0; wm < OPTM; wm++) {
                for (int wn = 0; wn < OPTN; wn++) {
                    sum[wm][wn] += dot(a_regs[wm], b_regs[wn]);
                }
            }
        }

        // Nobody may overwrite the local tiles while they are still being read.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write the accumulators back, skipping anything outside the M x N result.
    for (int wm = 0; wm < OPTM; wm++) {
        const int out_row = tile_m0 + lidm + wm * WG_M;
        if (out_row >= M) {
            continue;
        }
        for (int wn = 0; wn < OPTN; wn++) {
            const int out_col = tile_n0 + lidn + wn * WG_N;
            if (out_col >= N) {
                continue;
            }
            C[out_col * M + out_row] = sum[wm][wn];
        }
    }
}
|