local-llm-rn 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/CMakeLists.txt +285 -0
- package/cpp/common/CMakeLists.txt +149 -0
- package/cpp/common/arg.cpp +3799 -0
- package/cpp/common/arg.h +131 -0
- package/cpp/common/base64.hpp +392 -0
- package/cpp/common/build-info.cpp.in +4 -0
- package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
- package/cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/common/chat-parser.cpp +1649 -0
- package/cpp/common/chat-parser.h +133 -0
- package/cpp/common/chat-peg-parser.cpp +124 -0
- package/cpp/common/chat-peg-parser.h +105 -0
- package/cpp/common/chat.cpp +3355 -0
- package/cpp/common/chat.h +252 -0
- package/cpp/common/common.cpp +1824 -0
- package/cpp/common/common.h +930 -0
- package/cpp/common/console.cpp +1137 -0
- package/cpp/common/console.h +41 -0
- package/cpp/common/debug.cpp +167 -0
- package/cpp/common/debug.h +43 -0
- package/cpp/common/download.cpp +792 -0
- package/cpp/common/download.h +84 -0
- package/cpp/common/http.h +84 -0
- package/cpp/common/jinja/README.md +88 -0
- package/cpp/common/jinja/caps.cpp +285 -0
- package/cpp/common/jinja/caps.h +30 -0
- package/cpp/common/jinja/lexer.cpp +341 -0
- package/cpp/common/jinja/lexer.h +157 -0
- package/cpp/common/jinja/parser.cpp +591 -0
- package/cpp/common/jinja/parser.h +21 -0
- package/cpp/common/jinja/runtime.cpp +867 -0
- package/cpp/common/jinja/runtime.h +638 -0
- package/cpp/common/jinja/string.cpp +213 -0
- package/cpp/common/jinja/string.h +61 -0
- package/cpp/common/jinja/utils.h +149 -0
- package/cpp/common/jinja/value.cpp +1393 -0
- package/cpp/common/jinja/value.h +756 -0
- package/cpp/common/json-partial.cpp +324 -0
- package/cpp/common/json-partial.h +39 -0
- package/cpp/common/json-schema-to-grammar.cpp +1153 -0
- package/cpp/common/json-schema-to-grammar.h +43 -0
- package/cpp/common/llguidance.cpp +258 -0
- package/cpp/common/log.cpp +446 -0
- package/cpp/common/log.h +119 -0
- package/cpp/common/ngram-cache.cpp +285 -0
- package/cpp/common/ngram-cache.h +101 -0
- package/cpp/common/ngram-map.cpp +530 -0
- package/cpp/common/ngram-map.h +115 -0
- package/cpp/common/ngram-mod.cpp +60 -0
- package/cpp/common/ngram-mod.h +38 -0
- package/cpp/common/peg-parser.cpp +1712 -0
- package/cpp/common/peg-parser.h +459 -0
- package/cpp/common/preset.cpp +483 -0
- package/cpp/common/preset.h +83 -0
- package/cpp/common/regex-partial.cpp +204 -0
- package/cpp/common/regex-partial.h +56 -0
- package/cpp/common/sampling.cpp +745 -0
- package/cpp/common/sampling.h +119 -0
- package/cpp/common/speculative.cpp +1074 -0
- package/cpp/common/speculative.h +41 -0
- package/cpp/common/unicode.cpp +64 -0
- package/cpp/common/unicode.h +22 -0
- package/cpp/ggml/CMakeLists.txt +494 -0
- package/cpp/ggml/cmake/GitVars.cmake +22 -0
- package/cpp/ggml/cmake/common.cmake +50 -0
- package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
- package/cpp/ggml/include/ggml-alloc.h +85 -0
- package/cpp/ggml/include/ggml-backend.h +373 -0
- package/cpp/ggml/include/ggml-blas.h +25 -0
- package/cpp/ggml/include/ggml-cann.h +123 -0
- package/cpp/ggml/include/ggml-cpp.h +39 -0
- package/cpp/ggml/include/ggml-cpu.h +151 -0
- package/cpp/ggml/include/ggml-cuda.h +47 -0
- package/cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/ggml/include/ggml-metal.h +61 -0
- package/cpp/ggml/include/ggml-opencl.h +26 -0
- package/cpp/ggml/include/ggml-opt.h +256 -0
- package/cpp/ggml/include/ggml-rpc.h +30 -0
- package/cpp/ggml/include/ggml-sycl.h +49 -0
- package/cpp/ggml/include/ggml-virtgpu.h +14 -0
- package/cpp/ggml/include/ggml-vulkan.h +29 -0
- package/cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/ggml/include/ggml-zdnn.h +17 -0
- package/cpp/ggml/include/ggml-zendnn.h +22 -0
- package/cpp/ggml/include/ggml.h +2753 -0
- package/cpp/ggml/include/gguf.h +204 -0
- package/cpp/ggml/src/CMakeLists.txt +492 -0
- package/cpp/ggml/src/ggml-alloc.c +1244 -0
- package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
- package/cpp/ggml/src/ggml-backend-dl.h +45 -0
- package/cpp/ggml/src/ggml-backend-impl.h +255 -0
- package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
- package/cpp/ggml/src/ggml-backend.cpp +2270 -0
- package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
- package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
- package/cpp/ggml/src/ggml-common.h +1878 -0
- package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
- package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- package/cpp/ggml/src/ggml-cpu/common.h +95 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
- package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
- package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
- package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
- package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
- package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
- package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
- package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
- package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
- package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
- package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
- package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
- package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
- package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
- package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
- package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
- package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
- package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
- package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
- package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
- package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
- package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
- package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
- package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
- package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
- package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
- package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- package/cpp/ggml/src/ggml-impl.h +724 -0
- package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
- package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
- package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
- package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
- package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
- package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
- package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
- package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
- package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
- package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- package/cpp/ggml/src/ggml-opt.cpp +1093 -0
- package/cpp/ggml/src/ggml-quants.c +5325 -0
- package/cpp/ggml/src/ggml-quants.h +106 -0
- package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
- package/cpp/ggml/src/ggml-threading.cpp +12 -0
- package/cpp/ggml/src/ggml-threading.h +14 -0
- package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
- package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- package/cpp/ggml/src/ggml.c +7669 -0
- package/cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/ggml/src/gguf.cpp +1699 -0
- package/cpp/include/llama-cpp.h +32 -0
- package/cpp/include/llama.h +1568 -0
- package/cpp/mtmd/CMakeLists.txt +98 -0
- package/cpp/mtmd/README.md +63 -0
- package/cpp/mtmd/clip-graph.h +117 -0
- package/cpp/mtmd/clip-impl.h +586 -0
- package/cpp/mtmd/clip-model.h +390 -0
- package/cpp/mtmd/clip.cpp +4154 -0
- package/cpp/mtmd/clip.h +121 -0
- package/cpp/mtmd/deprecation-warning.cpp +22 -0
- package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
- package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
- package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
- package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
- package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
- package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
- package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
- package/cpp/mtmd/models/cogvlm.cpp +98 -0
- package/cpp/mtmd/models/conformer.cpp +216 -0
- package/cpp/mtmd/models/glm4v.cpp +122 -0
- package/cpp/mtmd/models/internvl.cpp +69 -0
- package/cpp/mtmd/models/kimik25.cpp +101 -0
- package/cpp/mtmd/models/kimivl.cpp +63 -0
- package/cpp/mtmd/models/llama4.cpp +96 -0
- package/cpp/mtmd/models/llava.cpp +374 -0
- package/cpp/mtmd/models/minicpmv.cpp +114 -0
- package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
- package/cpp/mtmd/models/models.h +128 -0
- package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
- package/cpp/mtmd/models/paddleocr.cpp +52 -0
- package/cpp/mtmd/models/pixtral.cpp +86 -0
- package/cpp/mtmd/models/qwen2vl.cpp +183 -0
- package/cpp/mtmd/models/qwen3vl.cpp +193 -0
- package/cpp/mtmd/models/siglip.cpp +86 -0
- package/cpp/mtmd/models/whisper-enc.cpp +115 -0
- package/cpp/mtmd/models/youtuvl.cpp +179 -0
- package/cpp/mtmd/mtmd-audio.cpp +730 -0
- package/cpp/mtmd/mtmd-audio.h +113 -0
- package/cpp/mtmd/mtmd-cli.cpp +437 -0
- package/cpp/mtmd/mtmd-helper.cpp +521 -0
- package/cpp/mtmd/mtmd-helper.h +96 -0
- package/cpp/mtmd/mtmd.cpp +1156 -0
- package/cpp/mtmd/mtmd.h +319 -0
- package/cpp/mtmd/requirements.txt +5 -0
- package/cpp/mtmd/test-1.jpeg +0 -0
- package/cpp/mtmd/test-2.mp3 +0 -0
- package/cpp/mtmd/tests.sh +192 -0
- package/cpp/src/CMakeLists.txt +169 -0
- package/cpp/src/llama-adapter.cpp +488 -0
- package/cpp/src/llama-adapter.h +89 -0
- package/cpp/src/llama-arch.cpp +2855 -0
- package/cpp/src/llama-arch.h +619 -0
- package/cpp/src/llama-batch.cpp +917 -0
- package/cpp/src/llama-batch.h +173 -0
- package/cpp/src/llama-chat.cpp +896 -0
- package/cpp/src/llama-chat.h +71 -0
- package/cpp/src/llama-context.cpp +3512 -0
- package/cpp/src/llama-context.h +359 -0
- package/cpp/src/llama-cparams.cpp +5 -0
- package/cpp/src/llama-cparams.h +44 -0
- package/cpp/src/llama-grammar.cpp +1464 -0
- package/cpp/src/llama-grammar.h +194 -0
- package/cpp/src/llama-graph.cpp +2685 -0
- package/cpp/src/llama-graph.h +1026 -0
- package/cpp/src/llama-hparams.cpp +234 -0
- package/cpp/src/llama-hparams.h +339 -0
- package/cpp/src/llama-impl.cpp +171 -0
- package/cpp/src/llama-impl.h +73 -0
- package/cpp/src/llama-io.cpp +15 -0
- package/cpp/src/llama-io.h +35 -0
- package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
- package/cpp/src/llama-kv-cache-iswa.h +137 -0
- package/cpp/src/llama-kv-cache.cpp +2271 -0
- package/cpp/src/llama-kv-cache.h +388 -0
- package/cpp/src/llama-kv-cells.h +533 -0
- package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
- package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
- package/cpp/src/llama-memory-hybrid.cpp +268 -0
- package/cpp/src/llama-memory-hybrid.h +139 -0
- package/cpp/src/llama-memory-recurrent.cpp +1165 -0
- package/cpp/src/llama-memory-recurrent.h +182 -0
- package/cpp/src/llama-memory.cpp +59 -0
- package/cpp/src/llama-memory.h +122 -0
- package/cpp/src/llama-mmap.cpp +785 -0
- package/cpp/src/llama-mmap.h +92 -0
- package/cpp/src/llama-model-loader.cpp +1414 -0
- package/cpp/src/llama-model-loader.h +203 -0
- package/cpp/src/llama-model-saver.cpp +286 -0
- package/cpp/src/llama-model-saver.h +37 -0
- package/cpp/src/llama-model.cpp +9253 -0
- package/cpp/src/llama-model.h +576 -0
- package/cpp/src/llama-quant.cpp +1119 -0
- package/cpp/src/llama-quant.h +1 -0
- package/cpp/src/llama-sampler.cpp +3885 -0
- package/cpp/src/llama-sampler.h +42 -0
- package/cpp/src/llama-vocab.cpp +3970 -0
- package/cpp/src/llama-vocab.h +187 -0
- package/cpp/src/llama.cpp +1313 -0
- package/cpp/src/models/afmoe.cpp +191 -0
- package/cpp/src/models/apertus.cpp +125 -0
- package/cpp/src/models/arcee.cpp +135 -0
- package/cpp/src/models/arctic.cpp +138 -0
- package/cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/src/models/baichuan.cpp +122 -0
- package/cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/src/models/bert.cpp +178 -0
- package/cpp/src/models/bitnet.cpp +160 -0
- package/cpp/src/models/bloom.cpp +101 -0
- package/cpp/src/models/chameleon.cpp +178 -0
- package/cpp/src/models/chatglm.cpp +132 -0
- package/cpp/src/models/codeshell.cpp +111 -0
- package/cpp/src/models/cogvlm.cpp +102 -0
- package/cpp/src/models/cohere2-iswa.cpp +134 -0
- package/cpp/src/models/command-r.cpp +122 -0
- package/cpp/src/models/dbrx.cpp +123 -0
- package/cpp/src/models/deci.cpp +135 -0
- package/cpp/src/models/deepseek.cpp +144 -0
- package/cpp/src/models/deepseek2.cpp +262 -0
- package/cpp/src/models/delta-net-base.cpp +376 -0
- package/cpp/src/models/dots1.cpp +134 -0
- package/cpp/src/models/dream.cpp +105 -0
- package/cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/src/models/eurobert.cpp +97 -0
- package/cpp/src/models/exaone-moe.cpp +146 -0
- package/cpp/src/models/exaone.cpp +114 -0
- package/cpp/src/models/exaone4.cpp +123 -0
- package/cpp/src/models/falcon-h1.cpp +111 -0
- package/cpp/src/models/falcon.cpp +120 -0
- package/cpp/src/models/gemma-embedding.cpp +116 -0
- package/cpp/src/models/gemma.cpp +112 -0
- package/cpp/src/models/gemma2-iswa.cpp +128 -0
- package/cpp/src/models/gemma3.cpp +155 -0
- package/cpp/src/models/gemma3n-iswa.cpp +384 -0
- package/cpp/src/models/glm4-moe.cpp +170 -0
- package/cpp/src/models/glm4.cpp +157 -0
- package/cpp/src/models/gpt2.cpp +105 -0
- package/cpp/src/models/gptneox.cpp +144 -0
- package/cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/src/models/granite.cpp +211 -0
- package/cpp/src/models/grok.cpp +159 -0
- package/cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/src/models/internlm2.cpp +120 -0
- package/cpp/src/models/jais.cpp +86 -0
- package/cpp/src/models/jais2.cpp +123 -0
- package/cpp/src/models/jamba.cpp +106 -0
- package/cpp/src/models/kimi-linear.cpp +392 -0
- package/cpp/src/models/lfm2.cpp +190 -0
- package/cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/src/models/llada.cpp +99 -0
- package/cpp/src/models/llama-iswa.cpp +178 -0
- package/cpp/src/models/llama.cpp +168 -0
- package/cpp/src/models/maincoder.cpp +117 -0
- package/cpp/src/models/mamba-base.cpp +285 -0
- package/cpp/src/models/mamba.cpp +54 -0
- package/cpp/src/models/mimo2-iswa.cpp +123 -0
- package/cpp/src/models/minicpm3.cpp +200 -0
- package/cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/src/models/mistral3.cpp +160 -0
- package/cpp/src/models/models.h +684 -0
- package/cpp/src/models/modern-bert.cpp +109 -0
- package/cpp/src/models/mpt.cpp +126 -0
- package/cpp/src/models/nemotron-h.cpp +148 -0
- package/cpp/src/models/nemotron.cpp +122 -0
- package/cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/src/models/olmo.cpp +121 -0
- package/cpp/src/models/olmo2.cpp +150 -0
- package/cpp/src/models/olmoe.cpp +124 -0
- package/cpp/src/models/openai-moe-iswa.cpp +127 -0
- package/cpp/src/models/openelm.cpp +124 -0
- package/cpp/src/models/orion.cpp +123 -0
- package/cpp/src/models/paddleocr.cpp +122 -0
- package/cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/src/models/phi2.cpp +121 -0
- package/cpp/src/models/phi3.cpp +152 -0
- package/cpp/src/models/plamo.cpp +110 -0
- package/cpp/src/models/plamo2.cpp +318 -0
- package/cpp/src/models/plamo3.cpp +128 -0
- package/cpp/src/models/plm.cpp +169 -0
- package/cpp/src/models/qwen.cpp +108 -0
- package/cpp/src/models/qwen2.cpp +126 -0
- package/cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/src/models/qwen3.cpp +117 -0
- package/cpp/src/models/qwen35.cpp +386 -0
- package/cpp/src/models/qwen35moe.cpp +420 -0
- package/cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/src/models/qwen3next.cpp +525 -0
- package/cpp/src/models/qwen3vl-moe.cpp +140 -0
- package/cpp/src/models/qwen3vl.cpp +132 -0
- package/cpp/src/models/refact.cpp +94 -0
- package/cpp/src/models/rnd1.cpp +126 -0
- package/cpp/src/models/rwkv6-base.cpp +164 -0
- package/cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/src/models/rwkv7-base.cpp +137 -0
- package/cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/src/models/smallthinker.cpp +126 -0
- package/cpp/src/models/smollm3.cpp +128 -0
- package/cpp/src/models/stablelm.cpp +146 -0
- package/cpp/src/models/starcoder.cpp +100 -0
- package/cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/src/models/step35-iswa.cpp +168 -0
- package/cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/src/models/xverse.cpp +108 -0
- package/cpp/src/unicode-data.cpp +7034 -0
- package/cpp/src/unicode-data.h +20 -0
- package/cpp/src/unicode.cpp +1103 -0
- package/cpp/src/unicode.h +111 -0
- package/cpp/vendor/nlohmann/json.hpp +25526 -0
- package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/vendor/stb/stb_image.h +7988 -0
- package/ios/LocalLLM-Bridging-Header.h +2 -0
- package/ios/LocalLLM.h +5 -0
- package/ios/LocalLLM.mm +1267 -0
- package/local-llm-rn.podspec +60 -0
- package/package.json +35 -0
- package/src/NativeLocalLLM.ts +73 -0
- package/src/device.ts +50 -0
- package/src/download-adapter.ts +17 -0
- package/src/index.ts +21 -0
- package/src/native-bridge.ts +142 -0
- package/src/rn-downloader.ts +37 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
// Element-wise affine transform: dst[i] = src0[i] * scale + bias.
//
// offset0 / offsetd are byte offsets added to src0 / dst before indexing.
// Each work-item processes the single float at index get_global_id(0).
// NOTE(review): there is no bounds check; presumably the host sizes the
// global range to the element count - confirm against the caller.
kernel void kernel_scale_f32(
        global float * src0,
        ulong          offset0,
        global float * dst,
        ulong          offsetd,
        float          scale,
        float          bias
) {
    const size_t gid = get_global_id(0);

    // Apply the byte offsets through a char pointer, then view as float again.
    global float * in  = (global float *)((global char *)src0 + offset0);
    global float * out = (global float *)((global char *)dst  + offsetd);

    out[gid] = in[gid] * scale + bias;
}
|
|
15
|
+
|
|
16
|
+
// Vectorised affine transform: dst[i] = src0[i] * scale + bias, where each
// element is a float4 and scale / bias are scalars applied to all four lanes.
//
// offset0 / offsetd are byte offsets added to src0 / dst before indexing.
// Each work-item processes the single float4 at index get_global_id(0).
// NOTE(review): there is no bounds check; presumably the host sizes the
// global range to the float4 count - confirm against the caller.
kernel void kernel_scale_f32_4(
        global float4 * src0,
        ulong           offset0,
        global float4 * dst,
        ulong           offsetd,
        float           scale,
        float           bias
) {
    const size_t gid = get_global_id(0);

    // Apply the byte offsets through a char pointer, then view as float4 again.
    global float4 * in  = (global float4 *)((global char *)src0 + offset0);
    global float4 * out = (global float4 *)((global char *)dst  + offsetd);

    out[gid] = in[gid] * scale + bias;
}
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
// Division of n by a divisor that has been pre-packed into v = { mp, L, d }.
//   v.s0 (mp): multiplier; only the high 32 bits of n * mp are used
//   v.s1 (L) : right-shift applied to (high bits + n)
//   v.s2 (d) : not read here (fastmod uses it as the divisor)
// Returns the value (mul_hi(n, mp) + n) >> L.
inline uint fastdiv(uint n, uint4 v) {
    const uint hi = mul_hi(n, v.s0);
    return (hi + n) >> v.s1;
}
|
|
9
|
+
// Remainder of n with respect to the pre-packed divisor description v:
// computes the quotient with fastdiv(n, v) and returns n - quotient * v.s2,
// i.e. v.s2 is used as the divisor itself.
inline uint fastmod(uint n, uint4 v) {
    return n - fastdiv(n, v) * v.s2;
}
|
|
13
|
+
|
|
14
|
+
// Scatter rows of src0 (float) into dst (float); the destination row number
// for each source row is read from src1 as a 64-bit integer.
//
//   offset0/offset1/offsetd : byte offsets added to src0/src1/dst
//   ne01                    : number of source rows; work-items past it exit
//   nb01..nb03              : byte strides of src0 for indices i01, i02, i03
//   ne11, ne12              : packed fastmod descriptors used to wrap i02 / i03
//                             into the index tensor (i11 = i02 % ne11,
//                             i12 = i03 % ne12)
//   nb10..nb12              : byte strides of src1
//   nblk0                   : number of elements copied per row
//   nb1..nb3                : byte strides of dst
//
// Row index: group_id(0) * local_size(1) + local_id(1); the work-items along
// local dimension 0 share the copy of one row.
kernel void kernel_set_rows_f32_i64(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        int           ne01,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        uint4         ne11,
        uint4         ne12,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        int           nblk0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    global char * src_base = src0 + offset0;
    global char * idx_base = src1 + offset1;
    global char * dst_base = dst  + offsetd;

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);

    // Surplus work-items beyond the last source row have nothing to do.
    if (i01 >= ne01) {
        return;
    }

    // Wrap the outer indices into the index tensor.
    const int i12 = fastmod(i03, ne12);
    const int i11 = fastmod(i02, ne11);
    const int i10 = i01;

    // Destination row number for this source row.
    global long * idx_ptr = (global long *)(idx_base + i10*nb10 + i11*nb11 + i12*nb12);
    const long    row     = idx_ptr[0];

    global float * dst_row = (global float *)(dst_base + row*nb1 + i02*nb2 + i03*nb3);
    global float * src_row = (global float *)(src_base + i01*nb01 + i02*nb02 + i03*nb03);

    // Work-items along local dimension 0 stride over the row.
    const int step = get_local_size(0);
    int       col  = get_local_id(0);
    while (col < nblk0) {
        dst_row[col] = src_row[col];
        col += step;
    }
}
|
|
62
|
+
|
|
63
|
+
// Scatter rows of src0 (float) into dst (half); the destination row number
// for each source row is read from src1 as a 64-bit integer. Values are
// converted from float to half by the store.
//
//   offset0/offset1/offsetd : byte offsets added to src0/src1/dst
//   ne01                    : number of source rows; work-items past it exit
//   nb01..nb03              : byte strides of src0 for indices i01, i02, i03
//   ne11, ne12              : packed fastmod descriptors used to wrap i02 / i03
//                             into the index tensor (i11 = i02 % ne11,
//                             i12 = i03 % ne12)
//   nb10..nb12              : byte strides of src1
//   nblk0                   : number of elements copied per row
//   nb1..nb3                : byte strides of dst
//
// Row index: group_id(0) * local_size(1) + local_id(1); the work-items along
// local dimension 0 share the copy of one row.
kernel void kernel_set_rows_f16_i64(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        int           ne01,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        uint4         ne11,
        uint4         ne12,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        int           nblk0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    global char * src_base = src0 + offset0;
    global char * idx_base = src1 + offset1;
    global char * dst_base = dst  + offsetd;

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);

    // Surplus work-items beyond the last source row have nothing to do.
    if (i01 >= ne01) {
        return;
    }

    // Wrap the outer indices into the index tensor.
    const int i12 = fastmod(i03, ne12);
    const int i11 = fastmod(i02, ne11);
    const int i10 = i01;

    // Destination row number for this source row.
    global long * idx_ptr = (global long *)(idx_base + i10*nb10 + i11*nb11 + i12*nb12);
    const long    row     = idx_ptr[0];

    global half  * dst_row = (global half  *)(dst_base + row*nb1 + i02*nb2 + i03*nb3);
    global float * src_row = (global float *)(src_base + i01*nb01 + i02*nb02 + i03*nb03);

    // Work-items along local dimension 0 stride over the row.
    const int step = get_local_size(0);
    int       col  = get_local_id(0);
    while (col < nblk0) {
        dst_row[col] = src_row[col];
        col += step;
    }
}
|
|
111
|
+
|
|
112
|
+
// Scatter rows of src0 (float) into dst (float); the destination row number
// for each source row is read from src1 as a 32-bit integer.
//
//   offset0/offset1/offsetd : byte offsets added to src0/src1/dst
//   ne01                    : number of source rows; work-items past it exit
//   nb01..nb03              : byte strides of src0 for indices i01, i02, i03
//   ne11, ne12              : packed fastmod descriptors used to wrap i02 / i03
//                             into the index tensor (i11 = i02 % ne11,
//                             i12 = i03 % ne12)
//   nb10..nb12              : byte strides of src1
//   nblk0                   : number of elements copied per row
//   nb1..nb3                : byte strides of dst
//
// Row index: group_id(0) * local_size(1) + local_id(1); the work-items along
// local dimension 0 share the copy of one row.
kernel void kernel_set_rows_f32_i32(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        int           ne01,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        uint4         ne11,
        uint4         ne12,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        int           nblk0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    global char * src_base = src0 + offset0;
    global char * idx_base = src1 + offset1;
    global char * dst_base = dst  + offsetd;

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);

    // Surplus work-items beyond the last source row have nothing to do.
    if (i01 >= ne01) {
        return;
    }

    // Wrap the outer indices into the index tensor.
    const int i12 = fastmod(i03, ne12);
    const int i11 = fastmod(i02, ne11);
    const int i10 = i01;

    // Destination row number for this source row.
    global int * idx_ptr = (global int *)(idx_base + i10*nb10 + i11*nb11 + i12*nb12);
    const int    row     = idx_ptr[0];

    global float * dst_row = (global float *)(dst_base + row*nb1 + i02*nb2 + i03*nb3);
    global float * src_row = (global float *)(src_base + i01*nb01 + i02*nb02 + i03*nb03);

    // Work-items along local dimension 0 stride over the row.
    const int step = get_local_size(0);
    int       col  = get_local_id(0);
    while (col < nblk0) {
        dst_row[col] = src_row[col];
        col += step;
    }
}
|
|
160
|
+
|
|
161
|
+
// Scatter rows of src0 (float) into dst (half); the destination row number
// for each source row is read from src1 as a 32-bit integer. Values are
// converted from float to half by the store.
//
//   offset0/offset1/offsetd : byte offsets added to src0/src1/dst
//   ne01                    : number of source rows; work-items past it exit
//   nb01..nb03              : byte strides of src0 for indices i01, i02, i03
//   ne11, ne12              : packed fastmod descriptors used to wrap i02 / i03
//                             into the index tensor (i11 = i02 % ne11,
//                             i12 = i03 % ne12)
//   nb10..nb12              : byte strides of src1
//   nblk0                   : number of elements copied per row
//   nb1..nb3                : byte strides of dst
//
// Row index: group_id(0) * local_size(1) + local_id(1); the work-items along
// local dimension 0 share the copy of one row.
kernel void kernel_set_rows_f16_i32(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        int           ne01,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        uint4         ne11,
        uint4         ne12,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        int           nblk0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    global char * src_base = src0 + offset0;
    global char * idx_base = src1 + offset1;
    global char * dst_base = dst  + offsetd;

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);

    // Surplus work-items beyond the last source row have nothing to do.
    if (i01 >= ne01) {
        return;
    }

    // Wrap the outer indices into the index tensor.
    const int i12 = fastmod(i03, ne12);
    const int i11 = fastmod(i02, ne11);
    const int i10 = i01;

    // Destination row number for this source row.
    global int * idx_ptr = (global int *)(idx_base + i10*nb10 + i11*nb11 + i12*nb12);
    const int    row     = idx_ptr[0];

    global half  * dst_row = (global half  *)(dst_base + row*nb1 + i02*nb2 + i03*nb3);
    global float * src_row = (global float *)(src_base + i01*nb01 + i02*nb02 + i03*nb03);

    // Work-items along local dimension 0 stride over the row.
    const int step = get_local_size(0);
    int       col  = get_local_id(0);
    while (col < nblk0) {
        dst_row[col] = src_row[col];
        col += step;
    }
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
//------------------------------------------------------------------------------
|
|
4
|
+
// sigmoid
|
|
5
|
+
//------------------------------------------------------------------------------
|
|
6
|
+
|
|
7
|
+
// Element-wise logistic sigmoid on floats: dst[i] = 1 / (1 + exp(-src0[i])).
//
// offset0 / offsetd are byte offsets added to src0 / dst before indexing.
// Each work-item processes the single element at index get_global_id(0).
// NOTE(review): there is no bounds check; presumably the host sizes the
// global range to the element count - confirm against the caller.
kernel void kernel_sigmoid_f32(
        global float * src0,
        ulong          offset0,
        global float * dst,
        ulong          offsetd
) {
    const size_t gid = get_global_id(0);

    global float * in  = (global float *)((global char *)src0 + offset0);
    global float * out = (global float *)((global char *)dst  + offsetd);

    const float x = in[gid];
    out[gid] = 1.0f / (1.0f + exp(-x));
}
|
|
18
|
+
|
|
19
|
+
// Element-wise logistic sigmoid on halfs: dst[i] = 1 / (1 + exp(-src0[i])).
// exp() is called on the half input; the float literals then take part in
// the remaining arithmetic and the result is stored back as half.
//
// offset0 / offsetd are byte offsets added to src0 / dst before indexing.
// Each work-item processes the single element at index get_global_id(0).
// NOTE(review): there is no bounds check; presumably the host sizes the
// global range to the element count - confirm against the caller.
kernel void kernel_sigmoid_f16(
        global half * src0,
        ulong         offset0,
        global half * dst,
        ulong         offsetd
) {
    const size_t gid = get_global_id(0);

    global half * in  = (global half *)((global char *)src0 + offset0);
    global half * out = (global half *)((global char *)dst  + offsetd);

    const half x = in[gid];
    out[gid] = 1.0f / (1.0f + exp(-x));
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
//------------------------------------------------------------------------------
|
|
4
|
+
// silu
|
|
5
|
+
//------------------------------------------------------------------------------
|
|
6
|
+
// Element-wise SiLU on floats: dst[i] = x / (1 + exp(-x)) with x = src0[i].
//
// offset0 / offsetd are byte offsets added to src0 / dst before indexing.
// Each work-item processes the single element at index get_global_id(0).
// NOTE(review): there is no bounds check; presumably the host sizes the
// global range to the element count - confirm against the caller.
kernel void kernel_silu(
        global float * src0,
        ulong          offset0,
        global float * dst,
        ulong          offsetd
) {
    const size_t gid = get_global_id(0);

    global float * in  = (global float *)((global char *)src0 + offset0);
    global float * out = (global float *)((global char *)dst  + offsetd);

    const float x = in[gid];
    out[gid] = x / (1.0f + exp(-x));
}
|
|
18
|
+
|
|
19
|
+
// Vectorised SiLU: dst[i] = x / (1 + exp(-x)) with x = src0[i], applied
// lane-wise to float4 elements.
//
// offset0 / offsetd are byte offsets added to src0 / dst before indexing.
// Each work-item processes the single float4 at index get_global_id(0).
// NOTE(review): there is no bounds check; presumably the host sizes the
// global range to the float4 count - confirm against the caller.
kernel void kernel_silu_4(
        global float4 * src0,
        ulong           offset0,
        global float4 * dst,
        ulong           offsetd
) {
    const size_t gid = get_global_id(0);

    global float4 * in  = (global float4 *)((global char *)src0 + offset0);
    global float4 * out = (global float4 *)((global char *)dst  + offsetd);

    const float4 x = in[gid];
    out[gid] = x / (1.0f + exp(-x));
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
// Sub-group support: use Intel's extension when the compiler advertises it,
// otherwise request the Khronos one.
#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

// Vendor-specific "required sub-group size" attributes.
//   Intel : INTEL_GPU,  REQD_SUBGROUP_SIZE_16 / _32
//   Qualcomm: ADRENO_GPU, REQD_SUBGROUP_SIZE_64 ("half") / _128 ("full")
// When neither extension is advertised none of these macros is defined.
#if defined(cl_intel_required_subgroup_size)
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
|
|
20
|
+
|
|
21
|
+
// Row-wise soft-max over float4-packed rows with an optional half4 mask.
//
// For the row selected by (group_id 0, 1, 2) = (i01, i02, i03) it computes
//     dst = exp(src0*scale + slope*mask - max) / sum
// processing ne00/4 float4 elements per row.
//
//   src0 / dst  : float rows, addressed with byte strides nb01..nb03 / nb1..nb3
//   src1        : mask rows read as half4 (strides nb11..nb13, outer indices
//                 wrapped with ne12 / ne13); treated as absent when
//                 src1 + offset1 == src0 + offset0
//   src2        : per-i02 float value that also takes part in the max and in
//                 the sum; treated as absent when src2 + offset2 equals
//                 src0 + offset0
//   max_bias, m0, m1, n_head_log2 : ALiBi slope parameters; the slope is 1
//                 unless max_bias > 0
//
// NOTE(review): max and sum are reduced with sub_group_reduce_* only, so this
// presumably expects each work-group to be a single sub-group - confirm with
// the host launch configuration.
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_soft_max_4_f16(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * src2,
        ulong         offset2,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne12,
        int           ne13,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3,
        float         scale,
        float         max_bias,
        float         m0,
        float         m1,
        int           n_head_log2
) {
    global char * in_base   = src0 + offset0;
    global char * mask_base = src1 + offset1;
    global char * sink_base = src2 + offset2;
    global char * out_base  = dst  + offsetd;

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0);

    // Mask indices: outer dimensions wrap around the mask extents.
    const int i13 = i03%ne13;
    const int i12 = i02%ne12;
    const int i11 = i01;

    global float4 * psrc4 = (global float4 *)(in_base  + i01*nb01 + i02*nb02 + i03*nb03);
    global float4 * pdst4 = (global float4 *)(out_base + i01*nb1  + i02*nb2  + i03*nb3);

    // A pointer equal to the (offset) src0 pointer means "operand not present".
    global half4 * pmask = 0;
    if (mask_base != in_base) {
        pmask = (global half4 *)(mask_base + i11*nb11 + i12*nb12 + i13*nb13);
    }
    global float * psrc2 = 0;
    if (sink_base != in_base) {
        psrc2 = (global float *)(sink_base);
    }

    float slope = 1.0f;

    // ALiBi
    if (max_bias > 0.0f) {
        const int h = i02;

        float base;
        int   exponent;
        if (h < n_head_log2) {
            base     = m0;
            exponent = h + 1;
        } else {
            base     = m1;
            exponent = 2*(h - n_head_log2) + 1;
        }

        slope = pow(base, exponent);
    }

    const int n4   = ne00/4;
    const int lid  = get_local_id(0);
    const int step = get_local_size(0);

    // parallel max
    float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY;
    for (int k = lid; k < n4; k += step) {
        lmax4 = fmax(lmax4, psrc4[k]*scale + slope*(pmask ? convert_float4(pmask[k]) : 0.0f));
    }
    const float lmax    = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
    const float row_max = sub_group_reduce_max(lmax);

    // parallel sum; the exponentials are stored in dst so they are computed once
    float4 lsum4 = 0.0f;
    for (int k = lid; k < n4; k += step) {
        const float4 e = exp((psrc4[k]*scale + slope*(pmask ? convert_float4(pmask[k]) : 0.0f)) - row_max);
        lsum4   += e;
        pdst4[k] = e;
    }
    const float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;

    float sum = sub_group_reduce_add(lsum);

    if (psrc2) {
        sum += exp(psrc2[i02] - row_max);
    }

    // Normalise; each work-item revisits exactly the elements it wrote above.
    for (int k = lid; k < n4; k += step) {
        pdst4[k] /= sum;
    }
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
// Sub-group support: use Intel's extension when the compiler advertises it,
// otherwise request the Khronos one.
#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

// Vendor-specific "required sub-group size" attributes.
//   Intel : INTEL_GPU,  REQD_SUBGROUP_SIZE_16 / _32
//   Qualcomm: ADRENO_GPU, REQD_SUBGROUP_SIZE_64 ("half") / _128 ("full")
// When neither extension is advertised none of these macros is defined.
#if defined(cl_intel_required_subgroup_size)
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
|
|
20
|
+
|
|
21
|
+
// Row-wise soft-max over float4-packed rows with an optional float4 mask.
//
// For the row selected by (group_id 0, 1, 2) = (i01, i02, i03) it computes
//     dst = exp(src0*scale + slope*mask - max) / sum
// processing ne00/4 float4 elements per row.
//
//   src0 / dst  : float rows, addressed with byte strides nb01..nb03 / nb1..nb3
//   src1        : mask rows read as float4 (strides nb11..nb13, outer indices
//                 wrapped with ne12 / ne13); treated as absent when
//                 src1 + offset1 == src0 + offset0
//   src2        : per-i02 float value that also takes part in the max and in
//                 the sum; treated as absent when src2 + offset2 equals
//                 src0 + offset0
//   max_bias, m0, m1, n_head_log2 : ALiBi slope parameters; the slope is 1
//                 unless max_bias > 0
//
// NOTE(review): max and sum are reduced with sub_group_reduce_* only, so this
// presumably expects each work-group to be a single sub-group - confirm with
// the host launch configuration.
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_soft_max_4(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * src2,
        ulong         offset2,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne12,
        int           ne13,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3,
        float         scale,
        float         max_bias,
        float         m0,
        float         m1,
        int           n_head_log2
) {
    global char * in_base   = src0 + offset0;
    global char * mask_base = src1 + offset1;
    global char * sink_base = src2 + offset2;
    global char * out_base  = dst  + offsetd;

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0);

    // Mask indices: outer dimensions wrap around the mask extents.
    const int i13 = i03%ne13;
    const int i12 = i02%ne12;
    const int i11 = i01;

    global float4 * psrc4 = (global float4 *)(in_base  + i01*nb01 + i02*nb02 + i03*nb03);
    global float4 * pdst4 = (global float4 *)(out_base + i01*nb1  + i02*nb2  + i03*nb3);

    // A pointer equal to the (offset) src0 pointer means "operand not present".
    global float4 * pmask = 0;
    if (mask_base != in_base) {
        pmask = (global float4 *)(mask_base + i11*nb11 + i12*nb12 + i13*nb13);
    }
    global float * psrc2 = 0;
    if (sink_base != in_base) {
        psrc2 = (global float *)(sink_base);
    }

    float slope = 1.0f;

    // ALiBi
    if (max_bias > 0.0f) {
        const int h = i02;

        float base;
        int   exponent;
        if (h < n_head_log2) {
            base     = m0;
            exponent = h + 1;
        } else {
            base     = m1;
            exponent = 2*(h - n_head_log2) + 1;
        }

        slope = pow(base, exponent);
    }

    const int n4   = ne00/4;
    const int lid  = get_local_id(0);
    const int step = get_local_size(0);

    // parallel max
    float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY;
    for (int k = lid; k < n4; k += step) {
        lmax4 = fmax(lmax4, psrc4[k]*scale + (pmask ? slope*pmask[k] : 0.0f));
    }
    const float lmax    = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
    const float row_max = sub_group_reduce_max(lmax);

    // parallel sum; the exponentials are stored in dst so they are computed once
    float4 lsum4 = 0.0f;
    for (int k = lid; k < n4; k += step) {
        const float4 e = exp((psrc4[k]*scale + (pmask ? slope*pmask[k] : 0.0f)) - row_max);
        lsum4   += e;
        pdst4[k] = e;
    }
    const float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;

    float sum = sub_group_reduce_add(lsum);

    if (psrc2) {
        sum += exp(psrc2[i02] - row_max);
    }

    // Normalise; each work-item revisits exactly the elements it wrote above.
    for (int k = lid; k < n4; k += step) {
        pdst4[k] /= sum;
    }
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
// Sub-group support: use Intel's extension when the compiler advertises it,
// otherwise request the Khronos one.
#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

// Vendor-specific "required sub-group size" attributes.
//   Intel : INTEL_GPU,  REQD_SUBGROUP_SIZE_16 / _32
//   Qualcomm: ADRENO_GPU, REQD_SUBGROUP_SIZE_64 ("half") / _128 ("full")
// When neither extension is advertised none of these macros is defined.
#if defined(cl_intel_required_subgroup_size)
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
|
|
20
|
+
|
|
21
|
+
// Row-wise soft-max over float rows with an optional half mask (scalar
// version: one float per loop step, ne00 elements per row).
//
// For the row selected by (group_id 0, 1, 2) = (i01, i02, i03) it computes
//     dst = exp(src0*scale + slope*mask - max) / sum
//
//   src0 / dst  : float rows, addressed with byte strides nb01..nb03 / nb1..nb3
//   src1        : mask rows read as half (strides nb11..nb13, outer indices
//                 wrapped with ne12 / ne13); treated as absent when
//                 src1 + offset1 == src0 + offset0
//   src2        : per-i02 float value that also takes part in the max and in
//                 the sum; treated as absent when src2 + offset2 equals
//                 src0 + offset0
//   max_bias, m0, m1, n_head_log2 : ALiBi slope parameters; the slope is 1
//                 unless max_bias > 0
//
// NOTE(review): max and sum are reduced with sub_group_reduce_* only, so this
// presumably expects each work-group to be a single sub-group - confirm with
// the host launch configuration.
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_soft_max_f16(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * src2,
        ulong         offset2,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne12,
        int           ne13,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3,
        float         scale,
        float         max_bias,
        float         m0,
        float         m1,
        int           n_head_log2
) {
    global char * in_base   = src0 + offset0;
    global char * mask_base = src1 + offset1;
    global char * sink_base = src2 + offset2;
    global char * out_base  = dst  + offsetd;

    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0);

    // Mask indices: outer dimensions wrap around the mask extents.
    const int i13 = i03%ne13;
    const int i12 = i02%ne12;
    const int i11 = i01;

    global float * psrc0 = (global float *)(in_base  + i01*nb01 + i02*nb02 + i03*nb03);
    global float * pdst  = (global float *)(out_base + i01*nb1  + i02*nb2  + i03*nb3);

    // A pointer equal to the (offset) src0 pointer means "operand not present".
    global half * pmask = 0;
    if (mask_base != in_base) {
        pmask = (global half *)(mask_base + i11*nb11 + i12*nb12 + i13*nb13);
    }
    global float * psrc2 = 0;
    if (sink_base != in_base) {
        psrc2 = (global float *)(sink_base);
    }

    float slope = 1.0f;

    // ALiBi
    if (max_bias > 0.0f) {
        const int h = i02;

        float base;
        int   exponent;
        if (h < n_head_log2) {
            base     = m0;
            exponent = h + 1;
        } else {
            base     = m1;
            exponent = 2*(h - n_head_log2) + 1;
        }

        slope = pow(base, exponent);
    }

    const int lid  = get_local_id(0);
    const int step = get_local_size(0);

    // parallel max
    float lmax = psrc2 ? psrc2[i02] : -INFINITY;
    for (int k = lid; k < ne00; k += step) {
        lmax = fmax(lmax, psrc0[k]*scale + (pmask ? slope*pmask[k] : 0.0f));
    }
    float row_max = sub_group_reduce_max(lmax);

    // parallel sum
    float lsum = 0.0f;
    for (int k = lid; k < ne00; k += step) {
        float e = exp((psrc0[k]*scale + (pmask ? slope*pmask[k] : 0.0f)) - row_max);
        lsum += e;
        // Keep the exponential in dst: exp is expensive, so it is computed
        // once here and only rescaled in the final pass.
        pdst[k] = e;
    }

    float sum = sub_group_reduce_add(lsum);

    if (psrc2) {
        sum += exp(psrc2[i02] - row_max);
    }

    // Normalise; each work-item revisits exactly the elements it wrote above.
    for (int k = lid; k < ne00; k += step) {
        pdst[k] /= sum;
    }
}
|