local-llm-rn 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/CMakeLists.txt +285 -0
- package/cpp/common/CMakeLists.txt +149 -0
- package/cpp/common/arg.cpp +3799 -0
- package/cpp/common/arg.h +131 -0
- package/cpp/common/base64.hpp +392 -0
- package/cpp/common/build-info.cpp.in +4 -0
- package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
- package/cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/common/chat-parser.cpp +1649 -0
- package/cpp/common/chat-parser.h +133 -0
- package/cpp/common/chat-peg-parser.cpp +124 -0
- package/cpp/common/chat-peg-parser.h +105 -0
- package/cpp/common/chat.cpp +3355 -0
- package/cpp/common/chat.h +252 -0
- package/cpp/common/common.cpp +1824 -0
- package/cpp/common/common.h +930 -0
- package/cpp/common/console.cpp +1137 -0
- package/cpp/common/console.h +41 -0
- package/cpp/common/debug.cpp +167 -0
- package/cpp/common/debug.h +43 -0
- package/cpp/common/download.cpp +792 -0
- package/cpp/common/download.h +84 -0
- package/cpp/common/http.h +84 -0
- package/cpp/common/jinja/README.md +88 -0
- package/cpp/common/jinja/caps.cpp +285 -0
- package/cpp/common/jinja/caps.h +30 -0
- package/cpp/common/jinja/lexer.cpp +341 -0
- package/cpp/common/jinja/lexer.h +157 -0
- package/cpp/common/jinja/parser.cpp +591 -0
- package/cpp/common/jinja/parser.h +21 -0
- package/cpp/common/jinja/runtime.cpp +867 -0
- package/cpp/common/jinja/runtime.h +638 -0
- package/cpp/common/jinja/string.cpp +213 -0
- package/cpp/common/jinja/string.h +61 -0
- package/cpp/common/jinja/utils.h +149 -0
- package/cpp/common/jinja/value.cpp +1393 -0
- package/cpp/common/jinja/value.h +756 -0
- package/cpp/common/json-partial.cpp +324 -0
- package/cpp/common/json-partial.h +39 -0
- package/cpp/common/json-schema-to-grammar.cpp +1153 -0
- package/cpp/common/json-schema-to-grammar.h +43 -0
- package/cpp/common/llguidance.cpp +258 -0
- package/cpp/common/log.cpp +446 -0
- package/cpp/common/log.h +119 -0
- package/cpp/common/ngram-cache.cpp +285 -0
- package/cpp/common/ngram-cache.h +101 -0
- package/cpp/common/ngram-map.cpp +530 -0
- package/cpp/common/ngram-map.h +115 -0
- package/cpp/common/ngram-mod.cpp +60 -0
- package/cpp/common/ngram-mod.h +38 -0
- package/cpp/common/peg-parser.cpp +1712 -0
- package/cpp/common/peg-parser.h +459 -0
- package/cpp/common/preset.cpp +483 -0
- package/cpp/common/preset.h +83 -0
- package/cpp/common/regex-partial.cpp +204 -0
- package/cpp/common/regex-partial.h +56 -0
- package/cpp/common/sampling.cpp +745 -0
- package/cpp/common/sampling.h +119 -0
- package/cpp/common/speculative.cpp +1074 -0
- package/cpp/common/speculative.h +41 -0
- package/cpp/common/unicode.cpp +64 -0
- package/cpp/common/unicode.h +22 -0
- package/cpp/ggml/CMakeLists.txt +494 -0
- package/cpp/ggml/cmake/GitVars.cmake +22 -0
- package/cpp/ggml/cmake/common.cmake +50 -0
- package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
- package/cpp/ggml/include/ggml-alloc.h +85 -0
- package/cpp/ggml/include/ggml-backend.h +373 -0
- package/cpp/ggml/include/ggml-blas.h +25 -0
- package/cpp/ggml/include/ggml-cann.h +123 -0
- package/cpp/ggml/include/ggml-cpp.h +39 -0
- package/cpp/ggml/include/ggml-cpu.h +151 -0
- package/cpp/ggml/include/ggml-cuda.h +47 -0
- package/cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/ggml/include/ggml-metal.h +61 -0
- package/cpp/ggml/include/ggml-opencl.h +26 -0
- package/cpp/ggml/include/ggml-opt.h +256 -0
- package/cpp/ggml/include/ggml-rpc.h +30 -0
- package/cpp/ggml/include/ggml-sycl.h +49 -0
- package/cpp/ggml/include/ggml-virtgpu.h +14 -0
- package/cpp/ggml/include/ggml-vulkan.h +29 -0
- package/cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/ggml/include/ggml-zdnn.h +17 -0
- package/cpp/ggml/include/ggml-zendnn.h +22 -0
- package/cpp/ggml/include/ggml.h +2753 -0
- package/cpp/ggml/include/gguf.h +204 -0
- package/cpp/ggml/src/CMakeLists.txt +492 -0
- package/cpp/ggml/src/ggml-alloc.c +1244 -0
- package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
- package/cpp/ggml/src/ggml-backend-dl.h +45 -0
- package/cpp/ggml/src/ggml-backend-impl.h +255 -0
- package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
- package/cpp/ggml/src/ggml-backend.cpp +2270 -0
- package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
- package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
- package/cpp/ggml/src/ggml-common.h +1878 -0
- package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
- package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- package/cpp/ggml/src/ggml-cpu/common.h +95 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
- package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
- package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
- package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
- package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
- package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
- package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
- package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
- package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
- package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
- package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
- package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
- package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
- package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
- package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
- package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
- package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
- package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
- package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
- package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
- package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
- package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
- package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
- package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
- package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
- package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
- package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- package/cpp/ggml/src/ggml-impl.h +724 -0
- package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
- package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
- package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
- package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
- package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
- package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
- package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
- package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
- package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
- package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- package/cpp/ggml/src/ggml-opt.cpp +1093 -0
- package/cpp/ggml/src/ggml-quants.c +5325 -0
- package/cpp/ggml/src/ggml-quants.h +106 -0
- package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
- package/cpp/ggml/src/ggml-threading.cpp +12 -0
- package/cpp/ggml/src/ggml-threading.h +14 -0
- package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
- package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- package/cpp/ggml/src/ggml.c +7669 -0
- package/cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/ggml/src/gguf.cpp +1699 -0
- package/cpp/include/llama-cpp.h +32 -0
- package/cpp/include/llama.h +1568 -0
- package/cpp/mtmd/CMakeLists.txt +98 -0
- package/cpp/mtmd/README.md +63 -0
- package/cpp/mtmd/clip-graph.h +117 -0
- package/cpp/mtmd/clip-impl.h +586 -0
- package/cpp/mtmd/clip-model.h +390 -0
- package/cpp/mtmd/clip.cpp +4154 -0
- package/cpp/mtmd/clip.h +121 -0
- package/cpp/mtmd/deprecation-warning.cpp +22 -0
- package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
- package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
- package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
- package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
- package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
- package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
- package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
- package/cpp/mtmd/models/cogvlm.cpp +98 -0
- package/cpp/mtmd/models/conformer.cpp +216 -0
- package/cpp/mtmd/models/glm4v.cpp +122 -0
- package/cpp/mtmd/models/internvl.cpp +69 -0
- package/cpp/mtmd/models/kimik25.cpp +101 -0
- package/cpp/mtmd/models/kimivl.cpp +63 -0
- package/cpp/mtmd/models/llama4.cpp +96 -0
- package/cpp/mtmd/models/llava.cpp +374 -0
- package/cpp/mtmd/models/minicpmv.cpp +114 -0
- package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
- package/cpp/mtmd/models/models.h +128 -0
- package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
- package/cpp/mtmd/models/paddleocr.cpp +52 -0
- package/cpp/mtmd/models/pixtral.cpp +86 -0
- package/cpp/mtmd/models/qwen2vl.cpp +183 -0
- package/cpp/mtmd/models/qwen3vl.cpp +193 -0
- package/cpp/mtmd/models/siglip.cpp +86 -0
- package/cpp/mtmd/models/whisper-enc.cpp +115 -0
- package/cpp/mtmd/models/youtuvl.cpp +179 -0
- package/cpp/mtmd/mtmd-audio.cpp +730 -0
- package/cpp/mtmd/mtmd-audio.h +113 -0
- package/cpp/mtmd/mtmd-cli.cpp +437 -0
- package/cpp/mtmd/mtmd-helper.cpp +521 -0
- package/cpp/mtmd/mtmd-helper.h +96 -0
- package/cpp/mtmd/mtmd.cpp +1156 -0
- package/cpp/mtmd/mtmd.h +319 -0
- package/cpp/mtmd/requirements.txt +5 -0
- package/cpp/mtmd/test-1.jpeg +0 -0
- package/cpp/mtmd/test-2.mp3 +0 -0
- package/cpp/mtmd/tests.sh +192 -0
- package/cpp/src/CMakeLists.txt +169 -0
- package/cpp/src/llama-adapter.cpp +488 -0
- package/cpp/src/llama-adapter.h +89 -0
- package/cpp/src/llama-arch.cpp +2855 -0
- package/cpp/src/llama-arch.h +619 -0
- package/cpp/src/llama-batch.cpp +917 -0
- package/cpp/src/llama-batch.h +173 -0
- package/cpp/src/llama-chat.cpp +896 -0
- package/cpp/src/llama-chat.h +71 -0
- package/cpp/src/llama-context.cpp +3512 -0
- package/cpp/src/llama-context.h +359 -0
- package/cpp/src/llama-cparams.cpp +5 -0
- package/cpp/src/llama-cparams.h +44 -0
- package/cpp/src/llama-grammar.cpp +1464 -0
- package/cpp/src/llama-grammar.h +194 -0
- package/cpp/src/llama-graph.cpp +2685 -0
- package/cpp/src/llama-graph.h +1026 -0
- package/cpp/src/llama-hparams.cpp +234 -0
- package/cpp/src/llama-hparams.h +339 -0
- package/cpp/src/llama-impl.cpp +171 -0
- package/cpp/src/llama-impl.h +73 -0
- package/cpp/src/llama-io.cpp +15 -0
- package/cpp/src/llama-io.h +35 -0
- package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
- package/cpp/src/llama-kv-cache-iswa.h +137 -0
- package/cpp/src/llama-kv-cache.cpp +2271 -0
- package/cpp/src/llama-kv-cache.h +388 -0
- package/cpp/src/llama-kv-cells.h +533 -0
- package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
- package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
- package/cpp/src/llama-memory-hybrid.cpp +268 -0
- package/cpp/src/llama-memory-hybrid.h +139 -0
- package/cpp/src/llama-memory-recurrent.cpp +1165 -0
- package/cpp/src/llama-memory-recurrent.h +182 -0
- package/cpp/src/llama-memory.cpp +59 -0
- package/cpp/src/llama-memory.h +122 -0
- package/cpp/src/llama-mmap.cpp +785 -0
- package/cpp/src/llama-mmap.h +92 -0
- package/cpp/src/llama-model-loader.cpp +1414 -0
- package/cpp/src/llama-model-loader.h +203 -0
- package/cpp/src/llama-model-saver.cpp +286 -0
- package/cpp/src/llama-model-saver.h +37 -0
- package/cpp/src/llama-model.cpp +9253 -0
- package/cpp/src/llama-model.h +576 -0
- package/cpp/src/llama-quant.cpp +1119 -0
- package/cpp/src/llama-quant.h +1 -0
- package/cpp/src/llama-sampler.cpp +3885 -0
- package/cpp/src/llama-sampler.h +42 -0
- package/cpp/src/llama-vocab.cpp +3970 -0
- package/cpp/src/llama-vocab.h +187 -0
- package/cpp/src/llama.cpp +1313 -0
- package/cpp/src/models/afmoe.cpp +191 -0
- package/cpp/src/models/apertus.cpp +125 -0
- package/cpp/src/models/arcee.cpp +135 -0
- package/cpp/src/models/arctic.cpp +138 -0
- package/cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/src/models/baichuan.cpp +122 -0
- package/cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/src/models/bert.cpp +178 -0
- package/cpp/src/models/bitnet.cpp +160 -0
- package/cpp/src/models/bloom.cpp +101 -0
- package/cpp/src/models/chameleon.cpp +178 -0
- package/cpp/src/models/chatglm.cpp +132 -0
- package/cpp/src/models/codeshell.cpp +111 -0
- package/cpp/src/models/cogvlm.cpp +102 -0
- package/cpp/src/models/cohere2-iswa.cpp +134 -0
- package/cpp/src/models/command-r.cpp +122 -0
- package/cpp/src/models/dbrx.cpp +123 -0
- package/cpp/src/models/deci.cpp +135 -0
- package/cpp/src/models/deepseek.cpp +144 -0
- package/cpp/src/models/deepseek2.cpp +262 -0
- package/cpp/src/models/delta-net-base.cpp +376 -0
- package/cpp/src/models/dots1.cpp +134 -0
- package/cpp/src/models/dream.cpp +105 -0
- package/cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/src/models/eurobert.cpp +97 -0
- package/cpp/src/models/exaone-moe.cpp +146 -0
- package/cpp/src/models/exaone.cpp +114 -0
- package/cpp/src/models/exaone4.cpp +123 -0
- package/cpp/src/models/falcon-h1.cpp +111 -0
- package/cpp/src/models/falcon.cpp +120 -0
- package/cpp/src/models/gemma-embedding.cpp +116 -0
- package/cpp/src/models/gemma.cpp +112 -0
- package/cpp/src/models/gemma2-iswa.cpp +128 -0
- package/cpp/src/models/gemma3.cpp +155 -0
- package/cpp/src/models/gemma3n-iswa.cpp +384 -0
- package/cpp/src/models/glm4-moe.cpp +170 -0
- package/cpp/src/models/glm4.cpp +157 -0
- package/cpp/src/models/gpt2.cpp +105 -0
- package/cpp/src/models/gptneox.cpp +144 -0
- package/cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/src/models/granite.cpp +211 -0
- package/cpp/src/models/grok.cpp +159 -0
- package/cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/src/models/internlm2.cpp +120 -0
- package/cpp/src/models/jais.cpp +86 -0
- package/cpp/src/models/jais2.cpp +123 -0
- package/cpp/src/models/jamba.cpp +106 -0
- package/cpp/src/models/kimi-linear.cpp +392 -0
- package/cpp/src/models/lfm2.cpp +190 -0
- package/cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/src/models/llada.cpp +99 -0
- package/cpp/src/models/llama-iswa.cpp +178 -0
- package/cpp/src/models/llama.cpp +168 -0
- package/cpp/src/models/maincoder.cpp +117 -0
- package/cpp/src/models/mamba-base.cpp +285 -0
- package/cpp/src/models/mamba.cpp +54 -0
- package/cpp/src/models/mimo2-iswa.cpp +123 -0
- package/cpp/src/models/minicpm3.cpp +200 -0
- package/cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/src/models/mistral3.cpp +160 -0
- package/cpp/src/models/models.h +684 -0
- package/cpp/src/models/modern-bert.cpp +109 -0
- package/cpp/src/models/mpt.cpp +126 -0
- package/cpp/src/models/nemotron-h.cpp +148 -0
- package/cpp/src/models/nemotron.cpp +122 -0
- package/cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/src/models/olmo.cpp +121 -0
- package/cpp/src/models/olmo2.cpp +150 -0
- package/cpp/src/models/olmoe.cpp +124 -0
- package/cpp/src/models/openai-moe-iswa.cpp +127 -0
- package/cpp/src/models/openelm.cpp +124 -0
- package/cpp/src/models/orion.cpp +123 -0
- package/cpp/src/models/paddleocr.cpp +122 -0
- package/cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/src/models/phi2.cpp +121 -0
- package/cpp/src/models/phi3.cpp +152 -0
- package/cpp/src/models/plamo.cpp +110 -0
- package/cpp/src/models/plamo2.cpp +318 -0
- package/cpp/src/models/plamo3.cpp +128 -0
- package/cpp/src/models/plm.cpp +169 -0
- package/cpp/src/models/qwen.cpp +108 -0
- package/cpp/src/models/qwen2.cpp +126 -0
- package/cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/src/models/qwen3.cpp +117 -0
- package/cpp/src/models/qwen35.cpp +386 -0
- package/cpp/src/models/qwen35moe.cpp +420 -0
- package/cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/src/models/qwen3next.cpp +525 -0
- package/cpp/src/models/qwen3vl-moe.cpp +140 -0
- package/cpp/src/models/qwen3vl.cpp +132 -0
- package/cpp/src/models/refact.cpp +94 -0
- package/cpp/src/models/rnd1.cpp +126 -0
- package/cpp/src/models/rwkv6-base.cpp +164 -0
- package/cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/src/models/rwkv7-base.cpp +137 -0
- package/cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/src/models/smallthinker.cpp +126 -0
- package/cpp/src/models/smollm3.cpp +128 -0
- package/cpp/src/models/stablelm.cpp +146 -0
- package/cpp/src/models/starcoder.cpp +100 -0
- package/cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/src/models/step35-iswa.cpp +168 -0
- package/cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/src/models/xverse.cpp +108 -0
- package/cpp/src/unicode-data.cpp +7034 -0
- package/cpp/src/unicode-data.h +20 -0
- package/cpp/src/unicode.cpp +1103 -0
- package/cpp/src/unicode.h +111 -0
- package/cpp/vendor/nlohmann/json.hpp +25526 -0
- package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/vendor/stb/stb_image.h +7988 -0
- package/ios/LocalLLM-Bridging-Header.h +2 -0
- package/ios/LocalLLM.h +5 -0
- package/ios/LocalLLM.mm +1267 -0
- package/local-llm-rn.podspec +60 -0
- package/package.json +35 -0
- package/src/NativeLocalLLM.ts +73 -0
- package/src/device.ts +50 -0
- package/src/download-adapter.ts +17 -0
- package/src/index.ts +21 -0
- package/src/native-bridge.ts +142 -0
- package/src/rn-downloader.ts +37 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
#ifdef cl_intel_subgroups
|
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
|
5
|
+
#else
|
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
|
7
|
+
#endif
|
|
8
|
+
|
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
|
11
|
+
#define INTEL_GPU 1
|
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
|
16
|
+
#define ADRENO_GPU 1
|
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#define QK_MXFP4 32
|
|
22
|
+
// One MXFP4 block: a single scale byte followed by QK_MXFP4/2 bytes of packed
// 4-bit codes (two codes per byte).
typedef struct {
    uchar e;               // block scale, stored as an E8M0 exponent byte
    uchar qs[QK_MXFP4/2];  // packed 4-bit codes
} block_mxfp4;
|
|
26
|
+
|
|
27
|
+
// Value of each of the 16 possible 4-bit codes. Codes 8..15 repeat codes 0..7
// with the sign flipped. Code 8 is spelled as the integer literal -0, which
// converts to +0.0f (not -0.0f).
constant static float kvalues_mxfp4_f[16] = {
     0,  .5f,  1.f,  1.5f,  2.f,  3.f,  4.f,  6.f,
    -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
};
|
|
30
|
+
|
|
31
|
+
// Convert an E8M0 scale byte to float by building the IEEE-754 bit pattern.
//
// For x != 0 the byte is placed directly in the float exponent field, giving
// 2^(x - 127). x == 0 cannot be expressed that way (a zero exponent field is
// subnormal), so the subnormal pattern 0x00400000 == 2^-127 is used instead.
// x == 255 is not special-cased.
static inline float e8m0_to_fp32(uchar x) {
    const int raw = (x != 0) ? (int) ((uint) x << 23) : 0x00400000;
    return as_float(raw);
}
|
|
42
|
+
|
|
43
|
+
#ifdef INTEL_GPU
|
|
44
|
+
#define N_R0_MXFP4 2 // number of rows each subgroup works on
|
|
45
|
+
#define N_SG_MXFP4 2 // number of subgroups in a work group
|
|
46
|
+
#define N_SIMDWIDTH 16 // subgroup size
|
|
47
|
+
#elif defined (ADRENO_GPU)
|
|
48
|
+
#define N_R0_MXFP4 2
|
|
49
|
+
#define N_SG_MXFP4 2
|
|
50
|
+
#define N_SIMDWIDTH 64
|
|
51
|
+
#endif
|
|
52
|
+
|
|
53
|
+
// Matrix-vector product of MXFP4 rows (src0) with one f32 vector (src1).
//
// Work split:
//   - each subgroup produces N_R0_MXFP4 consecutive output rows, starting at
//     first_row (derived from get_group_id(0) and the subgroup id);
//   - inside a subgroup, each pair of adjacent lanes shares one block: the
//     even lane handles packed bytes 0..7, the odd lane bytes 8..15;
//   - lane pairs walk the row in steps of N_SIMDWIDTH/2 blocks.
//
// Parameters:
//   src0            - MXFP4 data (block_mxfp4 records); rows are nb01 bytes apart
//   src1            - f32 vector data; get_group_id(1) selects the row via nb11
//   dst             - f32 output; get_group_id(1) selects the row via ne0
//   ne00            - row length in elements (ne00/QK_MXFP4 blocks per row)
//   nb02/nb03, nb12/nb13, ne12, r2, r3, ne1
//                   - only enter the offset arithmetic through the batch
//                     index, which is fixed at 0 here
//   ne0             - number of output rows; bounds the final stores
//   shmem           - local scratch; written at indices 0..subgroup size-1
//
// The accumulation loop reads all N_R0_MXFP4 rows unconditionally; only the
// final store is guarded by ne0.
inline void mul_mv_mxfp4_f32(
    global char * src0,
    global char * src1,
    global char * dst,
    int ne00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne12,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    int ne1,
    int r2,
    int r3,
    local char * shmem
) {
    const uint lane = get_sub_group_local_id();

    // Copy the 16-entry code -> value table into local memory. Every lane
    // writes one slot, so slots beyond 15 hold repeated copies of the table.
    local float * lut = (local float *) shmem;
    lut[lane] = kvalues_mxfp4_f[lane%16];
    barrier(CLK_LOCAL_MEM_FENCE);

    const int blocks_per_row = ne00/QK_MXFP4;

    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int batch   = 0;

    const int first_row = (group_x * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;

    const uint i12 = batch%ne12;
    const uint i13 = batch/ne12;

    const ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
    const ulong offset_src1 =   group_y*nb11 + (i12   )*nb12 + (i13   )*nb13;

    global block_mxfp4 * rows = (global block_mxfp4 *) (src0 + offset_src0);
    global float       * vec  = (global float       *) (src1 + offset_src1);

    const short first_blk = lane/2;  // first block handled by this lane pair
    const short half_sel  = lane%2;  // 0: packed bytes 0..7, 1: packed bytes 8..15

    float sumf[N_R0_MXFP4] = {0.f};

    for (int ib = first_blk; ib < blocks_per_row; ib += N_SIMDWIDTH/2) {
        // Low nibbles of bytes 8*half_sel + 0..7 pair with elements
        // 8*half_sel + 0..7 of the block, high nibbles with the elements 16
        // positions later.
        global float4 * y4 = (global float4 *) (vec + ib * QK_MXFP4 + half_sel * 8);
        const float4 y_lo_a = y4[0];
        const float4 y_hi_a = y4[4];
        const float4 y_lo_b = y4[1];
        const float4 y_hi_b = y4[5];

        for (short row = 0; row < N_R0_MXFP4; row++) {
            global block_mxfp4 * blk = rows + row*blocks_per_row + ib;
            global uchar       * q   = (global uchar *)(blk->qs + 8*half_sel);

            // Products are kept as separate statements so the evaluation
            // order of the float arithmetic stays fixed.
            const float4 p_lo_a = y_lo_a*(float4)(lut[q[0] &  0x0F], lut[q[1] &  0x0F], lut[q[2] &  0x0F], lut[q[3] &  0x0F]);
            const float4 p_hi_a = y_hi_a*(float4)(lut[q[0] >> 4   ], lut[q[1] >> 4   ], lut[q[2] >> 4   ], lut[q[3] >> 4   ]);
            const float4 p_lo_b = y_lo_b*(float4)(lut[q[4] &  0x0F], lut[q[5] &  0x0F], lut[q[6] &  0x0F], lut[q[7] &  0x0F]);
            const float4 p_hi_b = y_hi_b*(float4)(lut[q[4] >> 4   ], lut[q[5] >> 4   ], lut[q[6] >> 4   ], lut[q[7] >> 4   ]);

            const float4 acc = (p_lo_a + p_lo_b) + (p_hi_a + p_hi_b);

            sumf[row] += e8m0_to_fp32(blk->e) * ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
        }
    }

    global float * out = (global float *) dst + (ulong)batch*ne0*ne1 + (ulong)group_y*ne0;

    for (int row = 0; row < N_R0_MXFP4; ++row) {
        if (first_row + row >= ne0) {
            break;
        }
        // Combine the per-lane partial sums; lane 0 writes the row result.
        const float total = sub_group_reduce_add(sumf[row]);
        if (lane == 0) {
            out[first_row + row] = total;
        }
    }
}
|
|
133
|
+
|
|
134
|
+
// Id-indexed MXFP4 x f32 matrix-vector product.
//
// get_group_id(2) is split into a row (/ne20) and a column (%ne20) of the
// int table at src2 (rows nb21 bytes apart). The int found there selects the
// src0 slice (stride nb02). The column also selects the src1 row (modulo
// ne11) and the dst row; the table row selects the src1/dst plane. The actual
// dot products are done by mul_mv_mxfp4_f32.
//
// offset0/offset1/offset2/offsetd are byte offsets applied to the respective
// base pointers. ne21 is not used.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_mxfp4_f32(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * src2,
    ulong offset2,
    global char * dst,
    ulong offsetd,
    int ne00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne11,
    int ne12,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne20,
    int ne21,
    ulong nb21,
    int ne0,
    int ne1,
    int r2,
    int r3,
    local char * shmem
) {
    src0 += offset0;
    src1 += offset1;
    src2 += offset2;
    dst  += offsetd;

    const int id_row = get_group_id(2)/ne20;
    const int id_col = get_group_id(2)%ne20;

    global int * ids = (global int *) (src2 + id_row*nb21);
    const int src0_slice = ids[id_col];

    const int src1_row = id_col % ne11;

    global char * src0_cur = src0 + src0_slice*nb02;
    global char * src1_cur = src1 + src1_row*nb11 + id_row*nb12;

    global char * dst_cur = dst + (id_col*ne0 + id_row*ne1*ne0)*sizeof(float);

    mul_mv_mxfp4_f32(src0_cur, src1_cur, dst_cur,
        ne00, nb01, nb02, nb03, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shmem);
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
#ifdef cl_intel_subgroups
|
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
|
5
|
+
#else
|
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
|
7
|
+
#endif
|
|
8
|
+
|
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
|
11
|
+
#define INTEL_GPU 1
|
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
|
16
|
+
#define ADRENO_GPU 1
|
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#define QK_MXFP4 32
|
|
22
|
+
|
|
23
|
+
// Expand four packed 4-bit codes into four half values using integer bit
// manipulation only.
//
// Nibble k of fp4x4 (bits 4k..4k+3) becomes component k of the result. Within
// a nibble, bit 3 is the sign and bits 0..2 are the magnitude code m:
//   m = 0 -> 0,  m = 1 -> 0.5,  m = 2..7 -> 1, 1.5, 2, 3, 4, 6
// The magnitude code is moved to bits 9..11 of the half pattern and the
// exponent bias 0x3800 is added for every non-zero code. Code 1 is the one
// value that does not follow that rule, so its field is cleared and the bias
// alone (0x3800 == 0.5) forms the result.
static inline half4 mxfp4_to_fp16_packed(ushort fp4x4) {
    ushort4 mag, sign, bias;

    // Magnitude code of each nibble, aligned to bits 9..11.
    mag.s0 = (fp4x4 << 9) & 0x0E00;
    mag.s1 = (fp4x4 << 5) & 0x0E00;
    mag.s2 = (fp4x4 << 1) & 0x0E00;
    mag.s3 = (fp4x4 >> 3) & 0x0E00;

    // Sign bit of each nibble, aligned to bit 15.
    sign.s0 = (fp4x4 << 12) & 0x8000;
    sign.s1 = (fp4x4 << 8) & 0x8000;
    sign.s2 = (fp4x4 << 4) & 0x8000;
    sign.s3 = fp4x4 & 0x8000;

    // Zero stays zero; every other code gets the exponent bias.
    bias.s0 = (mag.s0 != 0) ? 0x3800 : 0x0;
    bias.s1 = (mag.s1 != 0) ? 0x3800 : 0x0;
    bias.s2 = (mag.s2 != 0) ? 0x3800 : 0x0;
    bias.s3 = (mag.s3 != 0) ? 0x3800 : 0x0;

    // Code 1 (0x0200 after alignment) maps to the bias alone.
    mag.s0 = (mag.s0 != 0x0200) ? mag.s0 : 0x0;
    mag.s1 = (mag.s1 != 0x0200) ? mag.s1 : 0x0;
    mag.s2 = (mag.s2 != 0x0200) ? mag.s2 : 0x0;
    mag.s3 = (mag.s3 != 0x0200) ? mag.s3 : 0x0;

    return as_half4(sign + bias + mag);
}
|
|
50
|
+
|
|
51
|
+
// Convert an E8M0 scale byte to float via its IEEE-754 bit pattern.
//
// A non-zero byte is shifted into the exponent field (value 2^(x - 127)).
// Zero would produce an all-zero pattern, so the subnormal 0x00400000
// (2^-127) is returned instead.
static inline float e8m0_to_fp32(uchar x) {
    if (x == 0) {
        return as_float((int) 0x00400000);
    }
    return as_float((int) ((uint) x << 23));
}
|
|
56
|
+
|
|
57
|
+
#ifdef INTEL_GPU
|
|
58
|
+
#define N_R0_MXFP4 2 // number of rows each subgroup works on
|
|
59
|
+
#define N_SG_MXFP4 2 // number of subgroups in a work group
|
|
60
|
+
#define N_SIMDWIDTH 16 // subgroup size
|
|
61
|
+
#elif defined (ADRENO_GPU)
|
|
62
|
+
#define N_R0_MXFP4 4
|
|
63
|
+
#define N_SG_MXFP4 1
|
|
64
|
+
#define N_SIMDWIDTH 64
|
|
65
|
+
#define SRC0Q_IMG
|
|
66
|
+
#endif
|
|
67
|
+
|
|
68
|
+
// Id-indexed MXFP4 x f32 matrix-vector product on "flat" weights: the packed
// 4-bit codes (src0_q, 16 bytes per block) and the E8M0 scale bytes (src0_e,
// 1 byte per block) live in separate arrays. With SRC0Q_IMG defined, src0_q
// is read through an image1d_buffer_t, two texel reads per block (.xy of each
// texel supplies 8 bytes).
//
// get_group_id(2) is split into a row (/ne20) and a column (%ne20) of the
// uint table at src2 + offset2 (rows nb21 bytes apart); the value found there
// selects the src0 slice. nb01/nb02 are byte strides of the non-flat layout
// and are converted to block counts by dividing by 17.
//
// Each subgroup produces N_R0_MXFP4 consecutive rows starting at first_row;
// adjacent lane pairs share a block (even lane: bytes 0..7, odd: bytes 8..15)
// and step through the row N_SIMDWIDTH/2 blocks at a time. All N_R0_MXFP4
// rows are read unconditionally; only the final stores are guarded by ne0.
//
// nb03, ne12, nb13, ne21, r2 and r3 are not used.
kernel void kernel_mul_mv_id_mxfp4_f32_flat(
#ifdef SRC0Q_IMG
    __read_only image1d_buffer_t src0_q,
#else
    global uchar * src0_q,
#endif
    global uchar * src0_e,
    global uchar * src1,
    ulong offset1,
    global uchar * src2,
    ulong offset2,
    global uchar * dst,
    ulong offsetd,
    int ne00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne11,
    int ne12,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne20,
    int ne21,
    ulong nb21,
    int ne0,
    int ne1,
    int r2,
    int r3
) {
    dst = dst + offsetd;

    const int iid1 = get_group_id(2) / ne20;
    const int idx  = get_group_id(2) % ne20;

    uint i02 = ((global uint *) (src2 + offset2 + iid1 * nb21))[idx];

    int i11 = idx % ne11;

    int nb = ne00 / QK_MXFP4;

    // Keep the byte offset in 64 bits until after the division: narrowing the
    // product to 32 bits first would wrap for slices that start 4 GiB or more
    // into src0.
    ulong src0_off = (ulong) i02 * nb02;
    src0_off /= 17; // 17 = sizeof(block_mxfp4)

    src0_e = src0_e + src0_off;

    dst = dst + (idx * ne0 + iid1 * ne1 * ne0) * sizeof(float);

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);

    int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;

    // Same reasoning as src0_off: divide the full 64-bit byte offset.
    ulong offset_src0 = (ulong) first_row * nb01;
    offset_src0 /= 17; // 17 = sizeof(block_mxfp4)
#ifdef SRC0Q_IMG
    ulong offset_q = src0_off + offset_src0;
#else
    src0_q = src0_q + src0_off*16;
    global uchar16 * x_q = (global uchar16 *)(src0_q) + offset_src0;
#endif
    global uchar * x_e = src0_e + offset_src0;

    const short ix = get_sub_group_local_id() >> 1; // first block for this lane pair
    const short it = get_sub_group_local_id() & 1;  // which 8-byte half of the block

    float sumf[N_R0_MXFP4] = {0.f};

    src1 = src1 + offset1 + i11 * nb11 + iid1 * nb12;
    global float * y = (global float *) (src1 + r1 * nb11);
    global float * yb = y + ix * QK_MXFP4 + it * 8;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH / 2) {
        global float4 * y4 = (global float4 *)yb;

        #pragma unroll
        for (short row = 0; row < N_R0_MXFP4; row++) {
            uchar xb_e = x_e[row * nb + ib];
#ifdef SRC0Q_IMG
            ushort4 xb_q = as_ushort4(read_imageui(src0_q, (offset_q + row * nb + ib) * 2 + it).xy);
#else
            ushort4 xb_q = vload4(0, (global ushort *)((global uchar *)(x_q + row * nb + ib) + 8 * it));
#endif

            // Each ushort holds two bytes: components s0/s2 of the decoded
            // half4 are the low nibbles, s1/s3 the high nibbles. Low nibbles
            // pair with y4[0]/y4[1], high nibbles with the elements 16
            // positions later (y4[4]/y4[5]).
            half4 fp16x4_0 = mxfp4_to_fp16_packed(xb_q.s0);
            half4 fp16x4_1 = mxfp4_to_fp16_packed(xb_q.s1);
            float4 acc1 = y4[0] * (float4)(fp16x4_0.s0, fp16x4_0.s2, fp16x4_1.s0, fp16x4_1.s2);
            acc1 += y4[4] * (float4)(fp16x4_0.s1, fp16x4_0.s3, fp16x4_1.s1, fp16x4_1.s3);

            fp16x4_0 = mxfp4_to_fp16_packed(xb_q.s2);
            fp16x4_1 = mxfp4_to_fp16_packed(xb_q.s3);
            acc1 += y4[1] * (float4)(fp16x4_0.s0, fp16x4_0.s2, fp16x4_1.s0, fp16x4_1.s2);
            acc1 += y4[5] * (float4)(fp16x4_0.s1, fp16x4_0.s3, fp16x4_1.s1, fp16x4_1.s3);

            sumf[row] += e8m0_to_fp32(xb_e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
        }

        yb += (N_SIMDWIDTH / 2) * QK_MXFP4;
    }

    global float * dst_f32 = (global float *)dst + (ulong)r1 * ne0;

    // Reduce the per-lane partial sums; lane 0 writes each in-range row.
    for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
        float sum_all = sub_group_reduce_add(sumf[row]);
        if (get_sub_group_local_id() == 0) {
            dst_f32[first_row + row] = sum_all;
        }
    }
}
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
#ifdef cl_intel_subgroups
|
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
|
5
|
+
#else
|
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
|
7
|
+
#endif
|
|
8
|
+
|
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
|
11
|
+
#define INTEL_GPU 1
|
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
|
16
|
+
#define ADRENO_GPU 1
|
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#define QK4_0 32
|
|
22
|
+
|
|
23
|
+
typedef char int8_t;
|
|
24
|
+
typedef uchar uint8_t;
|
|
25
|
+
typedef short int16_t;
|
|
26
|
+
typedef ushort uint16_t;
|
|
27
|
+
typedef int int32_t;
|
|
28
|
+
typedef uint uint32_t;
|
|
29
|
+
|
|
30
|
+
//------------------------------------------------------------------------------
// block_q4_0: one half-precision value followed by QK4_0/2 bytes of packed
// 4-bit quants (two per byte). The flat kernels in this file take the packed
// quants and the half values as two separate arrays instead.
//------------------------------------------------------------------------------
struct block_q4_0
{
    half    d;               // per-block half value
    uint8_t qs[QK4_0 / 2];   // packed 4-bit quants
};
|
|
38
|
+
|
|
39
|
+
// Dot product of half a q4_0 block with pre-scaled activations.
//
// Expects the original shuffled weight layout, in which q[i] and q[i + 16]
// share one byte: (q[0], q[16]), (q[1], q[17]), and so on.
//
//   x    - first packed byte of the block
//   dh   - scale of the block
//   sumy - sum of the (unscaled) activations covered by this call; used to
//          apply the -8 offset to all quants at once
//   yl   - activations, already divided by 1, 256, 16 or 4096 so that the
//          nibbles can be masked out of a 16-bit word without shifting
//   il   - byte offset into the block; callers in this file pass 0 or 8
//
// Returns d * (sum(yl * masked nibbles) - 8 * sumy).
inline float block_q_4_0_dot_y_flat(
        global uchar * x,
        global half  * dh,
        float sumy,
        float16 yl,
        int il
) {
    // Four 16-bit words = eight packed bytes = sixteen quants.
    global ushort * qs = ((global ushort *)x + il/2);
    const ushort w0 = qs[0];
    const ushort w1 = qs[1];
    const ushort w2 = qs[2];
    const ushort w3 = qs[3];

    float acc = 0.f;

    acc += yl.s0 * (w0 & 0x000F);
    acc += yl.s1 * (w0 & 0x0F00);
    acc += yl.s8 * (w0 & 0x00F0);
    acc += yl.s9 * (w0 & 0xF000);

    acc += yl.s2 * (w1 & 0x000F);
    acc += yl.s3 * (w1 & 0x0F00);
    acc += yl.sa * (w1 & 0x00F0);
    acc += yl.sb * (w1 & 0xF000);

    acc += yl.s4 * (w2 & 0x000F);
    acc += yl.s5 * (w2 & 0x0F00);
    acc += yl.sc * (w2 & 0x00F0);
    acc += yl.sd * (w2 & 0xF000);

    acc += yl.s6 * (w3 & 0x000F);
    acc += yl.s7 * (w3 & 0x0F00);
    acc += yl.se * (w3 & 0x00F0);
    acc += yl.sf * (w3 & 0xF000);

    const float scale = *dh;
    return scale * (sumy * -8.f + acc);
}
|
|
75
|
+
|
|
76
|
+
//
|
|
77
|
+
// This variant outputs 8 values.
|
|
78
|
+
//
|
|
79
|
+
#undef N_DST
|
|
80
|
+
#undef N_SIMDGROUP
|
|
81
|
+
#undef N_SIMDWIDTH
|
|
82
|
+
|
|
83
|
+
#ifdef INTEL_GPU
|
|
84
|
+
#define N_DST 8 // each SIMD group works on 8 rows
|
|
85
|
+
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
|
86
|
+
#define N_SIMDWIDTH 16 // subgroup size
|
|
87
|
+
#elif defined (ADRENO_GPU)
|
|
88
|
+
#define N_DST 8
|
|
89
|
+
#define N_SIMDGROUP 1
|
|
90
|
+
#define N_SIMDWIDTH 64
|
|
91
|
+
#endif
|
|
92
|
+
|
|
93
|
+
// Matrix-vector product of flat q4_0 weights with one f32 vector, eight
// output rows per subgroup.
//
//   src0_q - packed quants, QK4_0/2 bytes per block
//   src0_d - one half scale per block
//   src1   - f32 activations; get_group_id(1) selects the row (ne10 apart)
//   dst    - f32 output; get_group_id(1) selects the row (ne0 apart)
//   ne00   - row length in elements (ne00/QK4_0 blocks per row)
//   ne01   - number of weight rows; bounds the final stores
//   ne02, ne12, ne1, r2, r3
//          - only enter the offset arithmetic through the batch index,
//            which is fixed at 0 here
//
// Adjacent lane pairs share a block (even lane: packed bytes 0..7, odd lane:
// bytes 8..15) and step through the row N_SIMDWIDTH/2 blocks at a time. All
// eight rows are read unconditionally; only the stores are guarded by ne01.
inline void mul_vec_q_n_f32_8x_flat(
        global char  * src0_q,
        global half  * src0_d,
        global float * src1,
        global float * dst,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    const ulong nb = ne00/QK4_0;
    const uint lane = get_sub_group_local_id();

    const int r0 = get_group_id(0);
    const int r1 = get_group_id(1);
    const int im = 0;

    const int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    const int i12 = im%ne12;
    const int i13 = im/ne12;

    // One scale per block, so the scale offset equals the block offset; the
    // quant offset is that block offset times QK4_0/2 bytes.
    const ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
    const ulong offset0_q = offset0_d * QK4_0/2;

    global uchar * x = (global uchar *) src0_q + offset0_q;
    global half  * d = (global half  *) src0_d + offset0_d;
    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;

    // Distance between consecutive rows in the quant array, in bytes.
    const ulong q_row_stride = nb*QK4_0/2;

    const int ix = lane/2;      // first block handled by this lane pair
    const int il = 8*(lane%2);  // byte offset of this lane's half block

    float8 sumf = 0.f;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        global float * yb = y + ib*QK4_0 + il;

        // Sum of the sixteen activations this lane pairs with its quants.
        float sumy = 0.f;
        for (int k = 0; k < 8; ++k) {
            sumy += yb[k];
        }
        for (int k = 16; k < 24; ++k) {
            sumy += yb[k];
        }

        // The dot helper masks nibbles out of 16-bit words without shifting,
        // so a nibble at bits 4..7, 8..11 or 12..15 arrives multiplied by
        // 16, 256 or 4096; the activations are divided to compensate.
        const float16 yl = (float16)(
            yb[0],        yb[1]/256.f,
            yb[2],        yb[3]/256.f,
            yb[4],        yb[5]/256.f,
            yb[6],        yb[7]/256.f,
            yb[16]/16.f,  yb[17]/4096.f,
            yb[18]/16.f,  yb[19]/4096.f,
            yb[20]/16.f,  yb[21]/4096.f,
            yb[22]/16.f,  yb[23]/4096.f
        );

        global uchar * xq = x + ib*QK4_0/2;
        global half  * xd = d + ib;

        sumf.s0 += block_q_4_0_dot_y_flat(xq + 0*q_row_stride, xd + 0*nb, sumy, yl, il);
        sumf.s1 += block_q_4_0_dot_y_flat(xq + 1*q_row_stride, xd + 1*nb, sumy, yl, il);
        sumf.s2 += block_q_4_0_dot_y_flat(xq + 2*q_row_stride, xd + 2*nb, sumy, yl, il);
        sumf.s3 += block_q_4_0_dot_y_flat(xq + 3*q_row_stride, xd + 3*nb, sumy, yl, il);

        sumf.s4 += block_q_4_0_dot_y_flat(xq + 4*q_row_stride, xd + 4*nb, sumy, yl, il);
        sumf.s5 += block_q_4_0_dot_y_flat(xq + 5*q_row_stride, xd + 5*nb, sumy, yl, il);
        sumf.s6 += block_q_4_0_dot_y_flat(xq + 6*q_row_stride, xd + 6*nb, sumy, yl, il);
        sumf.s7 += block_q_4_0_dot_y_flat(xq + 7*q_row_stride, xd + 7*nb, sumy, yl, il);
    }

    // Every lane takes part in the reductions; only lane 0 stores.
    const float8 tot = (float8)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
    );

    if (lane != 0) {
        return;
    }

    const int out_base = r1*ne0 + im*ne0*ne1 + first_row;

    if (first_row + 0 < ne01) {
        dst[out_base + 0] = tot.s0;
    }
    if (first_row + 1 < ne01) {
        dst[out_base + 1] = tot.s1;
    }
    if (first_row + 2 < ne01) {
        dst[out_base + 2] = tot.s2;
    }
    if (first_row + 3 < ne01) {
        dst[out_base + 3] = tot.s3;
    }
    if (first_row + 4 < ne01) {
        dst[out_base + 4] = tot.s4;
    }
    if (first_row + 5 < ne01) {
        dst[out_base + 5] = tot.s5;
    }
    if (first_row + 6 < ne01) {
        dst[out_base + 6] = tot.s6;
    }
    if (first_row + 7 < ne01) {
        dst[out_base + 7] = tot.s7;
    }
}
|
|
229
|
+
|
|
230
|
+
// Id-indexed q4_0 x f32 matrix-vector product on flat weights (packed quants
// in src0_q, half scales in src0_d).
//
// get_group_id(2) is split into a row (/ne20) and a column (%ne20) of the
// int table at src2 (rows nb21 bytes apart). The int found there selects the
// src0 slice: its byte offset in the non-flat layout (id * nb02) is divided
// by nb00 to get a block index, which addresses both flat arrays. The column
// also selects the src1 row (modulo ne11) and the dst row; the table row
// selects the src1/dst plane. The dot products are done by
// mul_vec_q_n_f32_8x_flat.
//
// offset1/offset2/offsetd are byte offsets applied to the respective base
// pointers. ne21 is not used.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
        global char  * src0_q,
        global half  * src0_d,
        global float * src1,
        ulong offset1,
        global char  * src2,
        ulong offset2,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        ulong nb00,
        ulong nb02,
        int ne10,
        int ne11,
        int ne12,
        ulong nb11,
        ulong nb12,
        int ne20,
        int ne21,
        ulong nb21,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src1 = (global float *)((global char *)src1 + offset1);
    src2 += offset2;
    dst  = (global float *)((global char *)dst  + offsetd);

    const int id_row = get_group_id(2)/ne20;
    const int id_col = get_group_id(2)%ne20;

    global int * ids = (global int *)(src2 + id_row*nb21);
    const int src0_slice = ids[id_col];

    const int src1_row = id_col%ne11;

    // Index of the slice's first block, shared by the quant and scale arrays.
    const ulong first_block = src0_slice*nb02/nb00;

    global char  * src0_q_cur = src0_q + first_block*(QK4_0/2);
    global half  * src0_d_cur = src0_d + first_block;
    global float * src1_cur = (global float *)((global char *) src1 + src1_row*nb11 + id_row*nb12);
    global float * dst_cur = dst + id_col*ne0 + id_row*ne1*ne0;

    mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
|