local-llm-rn 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/CMakeLists.txt +285 -0
- package/cpp/common/CMakeLists.txt +149 -0
- package/cpp/common/arg.cpp +3799 -0
- package/cpp/common/arg.h +131 -0
- package/cpp/common/base64.hpp +392 -0
- package/cpp/common/build-info.cpp.in +4 -0
- package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
- package/cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/common/chat-parser.cpp +1649 -0
- package/cpp/common/chat-parser.h +133 -0
- package/cpp/common/chat-peg-parser.cpp +124 -0
- package/cpp/common/chat-peg-parser.h +105 -0
- package/cpp/common/chat.cpp +3355 -0
- package/cpp/common/chat.h +252 -0
- package/cpp/common/common.cpp +1824 -0
- package/cpp/common/common.h +930 -0
- package/cpp/common/console.cpp +1137 -0
- package/cpp/common/console.h +41 -0
- package/cpp/common/debug.cpp +167 -0
- package/cpp/common/debug.h +43 -0
- package/cpp/common/download.cpp +792 -0
- package/cpp/common/download.h +84 -0
- package/cpp/common/http.h +84 -0
- package/cpp/common/jinja/README.md +88 -0
- package/cpp/common/jinja/caps.cpp +285 -0
- package/cpp/common/jinja/caps.h +30 -0
- package/cpp/common/jinja/lexer.cpp +341 -0
- package/cpp/common/jinja/lexer.h +157 -0
- package/cpp/common/jinja/parser.cpp +591 -0
- package/cpp/common/jinja/parser.h +21 -0
- package/cpp/common/jinja/runtime.cpp +867 -0
- package/cpp/common/jinja/runtime.h +638 -0
- package/cpp/common/jinja/string.cpp +213 -0
- package/cpp/common/jinja/string.h +61 -0
- package/cpp/common/jinja/utils.h +149 -0
- package/cpp/common/jinja/value.cpp +1393 -0
- package/cpp/common/jinja/value.h +756 -0
- package/cpp/common/json-partial.cpp +324 -0
- package/cpp/common/json-partial.h +39 -0
- package/cpp/common/json-schema-to-grammar.cpp +1153 -0
- package/cpp/common/json-schema-to-grammar.h +43 -0
- package/cpp/common/llguidance.cpp +258 -0
- package/cpp/common/log.cpp +446 -0
- package/cpp/common/log.h +119 -0
- package/cpp/common/ngram-cache.cpp +285 -0
- package/cpp/common/ngram-cache.h +101 -0
- package/cpp/common/ngram-map.cpp +530 -0
- package/cpp/common/ngram-map.h +115 -0
- package/cpp/common/ngram-mod.cpp +60 -0
- package/cpp/common/ngram-mod.h +38 -0
- package/cpp/common/peg-parser.cpp +1712 -0
- package/cpp/common/peg-parser.h +459 -0
- package/cpp/common/preset.cpp +483 -0
- package/cpp/common/preset.h +83 -0
- package/cpp/common/regex-partial.cpp +204 -0
- package/cpp/common/regex-partial.h +56 -0
- package/cpp/common/sampling.cpp +745 -0
- package/cpp/common/sampling.h +119 -0
- package/cpp/common/speculative.cpp +1074 -0
- package/cpp/common/speculative.h +41 -0
- package/cpp/common/unicode.cpp +64 -0
- package/cpp/common/unicode.h +22 -0
- package/cpp/ggml/CMakeLists.txt +494 -0
- package/cpp/ggml/cmake/GitVars.cmake +22 -0
- package/cpp/ggml/cmake/common.cmake +50 -0
- package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
- package/cpp/ggml/include/ggml-alloc.h +85 -0
- package/cpp/ggml/include/ggml-backend.h +373 -0
- package/cpp/ggml/include/ggml-blas.h +25 -0
- package/cpp/ggml/include/ggml-cann.h +123 -0
- package/cpp/ggml/include/ggml-cpp.h +39 -0
- package/cpp/ggml/include/ggml-cpu.h +151 -0
- package/cpp/ggml/include/ggml-cuda.h +47 -0
- package/cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/ggml/include/ggml-metal.h +61 -0
- package/cpp/ggml/include/ggml-opencl.h +26 -0
- package/cpp/ggml/include/ggml-opt.h +256 -0
- package/cpp/ggml/include/ggml-rpc.h +30 -0
- package/cpp/ggml/include/ggml-sycl.h +49 -0
- package/cpp/ggml/include/ggml-virtgpu.h +14 -0
- package/cpp/ggml/include/ggml-vulkan.h +29 -0
- package/cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/ggml/include/ggml-zdnn.h +17 -0
- package/cpp/ggml/include/ggml-zendnn.h +22 -0
- package/cpp/ggml/include/ggml.h +2753 -0
- package/cpp/ggml/include/gguf.h +204 -0
- package/cpp/ggml/src/CMakeLists.txt +492 -0
- package/cpp/ggml/src/ggml-alloc.c +1244 -0
- package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
- package/cpp/ggml/src/ggml-backend-dl.h +45 -0
- package/cpp/ggml/src/ggml-backend-impl.h +255 -0
- package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
- package/cpp/ggml/src/ggml-backend.cpp +2270 -0
- package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
- package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
- package/cpp/ggml/src/ggml-common.h +1878 -0
- package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
- package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- package/cpp/ggml/src/ggml-cpu/common.h +95 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
- package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
- package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
- package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
- package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
- package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
- package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
- package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
- package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
- package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
- package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
- package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
- package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
- package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
- package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
- package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
- package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
- package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
- package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
- package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
- package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
- package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
- package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
- package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
- package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
- package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
- package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- package/cpp/ggml/src/ggml-impl.h +724 -0
- package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
- package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
- package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
- package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
- package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
- package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
- package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
- package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
- package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
- package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- package/cpp/ggml/src/ggml-opt.cpp +1093 -0
- package/cpp/ggml/src/ggml-quants.c +5325 -0
- package/cpp/ggml/src/ggml-quants.h +106 -0
- package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
- package/cpp/ggml/src/ggml-threading.cpp +12 -0
- package/cpp/ggml/src/ggml-threading.h +14 -0
- package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
- package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- package/cpp/ggml/src/ggml.c +7669 -0
- package/cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/ggml/src/gguf.cpp +1699 -0
- package/cpp/include/llama-cpp.h +32 -0
- package/cpp/include/llama.h +1568 -0
- package/cpp/mtmd/CMakeLists.txt +98 -0
- package/cpp/mtmd/README.md +63 -0
- package/cpp/mtmd/clip-graph.h +117 -0
- package/cpp/mtmd/clip-impl.h +586 -0
- package/cpp/mtmd/clip-model.h +390 -0
- package/cpp/mtmd/clip.cpp +4154 -0
- package/cpp/mtmd/clip.h +121 -0
- package/cpp/mtmd/deprecation-warning.cpp +22 -0
- package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
- package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
- package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
- package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
- package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
- package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
- package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
- package/cpp/mtmd/models/cogvlm.cpp +98 -0
- package/cpp/mtmd/models/conformer.cpp +216 -0
- package/cpp/mtmd/models/glm4v.cpp +122 -0
- package/cpp/mtmd/models/internvl.cpp +69 -0
- package/cpp/mtmd/models/kimik25.cpp +101 -0
- package/cpp/mtmd/models/kimivl.cpp +63 -0
- package/cpp/mtmd/models/llama4.cpp +96 -0
- package/cpp/mtmd/models/llava.cpp +374 -0
- package/cpp/mtmd/models/minicpmv.cpp +114 -0
- package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
- package/cpp/mtmd/models/models.h +128 -0
- package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
- package/cpp/mtmd/models/paddleocr.cpp +52 -0
- package/cpp/mtmd/models/pixtral.cpp +86 -0
- package/cpp/mtmd/models/qwen2vl.cpp +183 -0
- package/cpp/mtmd/models/qwen3vl.cpp +193 -0
- package/cpp/mtmd/models/siglip.cpp +86 -0
- package/cpp/mtmd/models/whisper-enc.cpp +115 -0
- package/cpp/mtmd/models/youtuvl.cpp +179 -0
- package/cpp/mtmd/mtmd-audio.cpp +730 -0
- package/cpp/mtmd/mtmd-audio.h +113 -0
- package/cpp/mtmd/mtmd-cli.cpp +437 -0
- package/cpp/mtmd/mtmd-helper.cpp +521 -0
- package/cpp/mtmd/mtmd-helper.h +96 -0
- package/cpp/mtmd/mtmd.cpp +1156 -0
- package/cpp/mtmd/mtmd.h +319 -0
- package/cpp/mtmd/requirements.txt +5 -0
- package/cpp/mtmd/test-1.jpeg +0 -0
- package/cpp/mtmd/test-2.mp3 +0 -0
- package/cpp/mtmd/tests.sh +192 -0
- package/cpp/src/CMakeLists.txt +169 -0
- package/cpp/src/llama-adapter.cpp +488 -0
- package/cpp/src/llama-adapter.h +89 -0
- package/cpp/src/llama-arch.cpp +2855 -0
- package/cpp/src/llama-arch.h +619 -0
- package/cpp/src/llama-batch.cpp +917 -0
- package/cpp/src/llama-batch.h +173 -0
- package/cpp/src/llama-chat.cpp +896 -0
- package/cpp/src/llama-chat.h +71 -0
- package/cpp/src/llama-context.cpp +3512 -0
- package/cpp/src/llama-context.h +359 -0
- package/cpp/src/llama-cparams.cpp +5 -0
- package/cpp/src/llama-cparams.h +44 -0
- package/cpp/src/llama-grammar.cpp +1464 -0
- package/cpp/src/llama-grammar.h +194 -0
- package/cpp/src/llama-graph.cpp +2685 -0
- package/cpp/src/llama-graph.h +1026 -0
- package/cpp/src/llama-hparams.cpp +234 -0
- package/cpp/src/llama-hparams.h +339 -0
- package/cpp/src/llama-impl.cpp +171 -0
- package/cpp/src/llama-impl.h +73 -0
- package/cpp/src/llama-io.cpp +15 -0
- package/cpp/src/llama-io.h +35 -0
- package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
- package/cpp/src/llama-kv-cache-iswa.h +137 -0
- package/cpp/src/llama-kv-cache.cpp +2271 -0
- package/cpp/src/llama-kv-cache.h +388 -0
- package/cpp/src/llama-kv-cells.h +533 -0
- package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
- package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
- package/cpp/src/llama-memory-hybrid.cpp +268 -0
- package/cpp/src/llama-memory-hybrid.h +139 -0
- package/cpp/src/llama-memory-recurrent.cpp +1165 -0
- package/cpp/src/llama-memory-recurrent.h +182 -0
- package/cpp/src/llama-memory.cpp +59 -0
- package/cpp/src/llama-memory.h +122 -0
- package/cpp/src/llama-mmap.cpp +785 -0
- package/cpp/src/llama-mmap.h +92 -0
- package/cpp/src/llama-model-loader.cpp +1414 -0
- package/cpp/src/llama-model-loader.h +203 -0
- package/cpp/src/llama-model-saver.cpp +286 -0
- package/cpp/src/llama-model-saver.h +37 -0
- package/cpp/src/llama-model.cpp +9253 -0
- package/cpp/src/llama-model.h +576 -0
- package/cpp/src/llama-quant.cpp +1119 -0
- package/cpp/src/llama-quant.h +1 -0
- package/cpp/src/llama-sampler.cpp +3885 -0
- package/cpp/src/llama-sampler.h +42 -0
- package/cpp/src/llama-vocab.cpp +3970 -0
- package/cpp/src/llama-vocab.h +187 -0
- package/cpp/src/llama.cpp +1313 -0
- package/cpp/src/models/afmoe.cpp +191 -0
- package/cpp/src/models/apertus.cpp +125 -0
- package/cpp/src/models/arcee.cpp +135 -0
- package/cpp/src/models/arctic.cpp +138 -0
- package/cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/src/models/baichuan.cpp +122 -0
- package/cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/src/models/bert.cpp +178 -0
- package/cpp/src/models/bitnet.cpp +160 -0
- package/cpp/src/models/bloom.cpp +101 -0
- package/cpp/src/models/chameleon.cpp +178 -0
- package/cpp/src/models/chatglm.cpp +132 -0
- package/cpp/src/models/codeshell.cpp +111 -0
- package/cpp/src/models/cogvlm.cpp +102 -0
- package/cpp/src/models/cohere2-iswa.cpp +134 -0
- package/cpp/src/models/command-r.cpp +122 -0
- package/cpp/src/models/dbrx.cpp +123 -0
- package/cpp/src/models/deci.cpp +135 -0
- package/cpp/src/models/deepseek.cpp +144 -0
- package/cpp/src/models/deepseek2.cpp +262 -0
- package/cpp/src/models/delta-net-base.cpp +376 -0
- package/cpp/src/models/dots1.cpp +134 -0
- package/cpp/src/models/dream.cpp +105 -0
- package/cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/src/models/eurobert.cpp +97 -0
- package/cpp/src/models/exaone-moe.cpp +146 -0
- package/cpp/src/models/exaone.cpp +114 -0
- package/cpp/src/models/exaone4.cpp +123 -0
- package/cpp/src/models/falcon-h1.cpp +111 -0
- package/cpp/src/models/falcon.cpp +120 -0
- package/cpp/src/models/gemma-embedding.cpp +116 -0
- package/cpp/src/models/gemma.cpp +112 -0
- package/cpp/src/models/gemma2-iswa.cpp +128 -0
- package/cpp/src/models/gemma3.cpp +155 -0
- package/cpp/src/models/gemma3n-iswa.cpp +384 -0
- package/cpp/src/models/glm4-moe.cpp +170 -0
- package/cpp/src/models/glm4.cpp +157 -0
- package/cpp/src/models/gpt2.cpp +105 -0
- package/cpp/src/models/gptneox.cpp +144 -0
- package/cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/src/models/granite.cpp +211 -0
- package/cpp/src/models/grok.cpp +159 -0
- package/cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/src/models/internlm2.cpp +120 -0
- package/cpp/src/models/jais.cpp +86 -0
- package/cpp/src/models/jais2.cpp +123 -0
- package/cpp/src/models/jamba.cpp +106 -0
- package/cpp/src/models/kimi-linear.cpp +392 -0
- package/cpp/src/models/lfm2.cpp +190 -0
- package/cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/src/models/llada.cpp +99 -0
- package/cpp/src/models/llama-iswa.cpp +178 -0
- package/cpp/src/models/llama.cpp +168 -0
- package/cpp/src/models/maincoder.cpp +117 -0
- package/cpp/src/models/mamba-base.cpp +285 -0
- package/cpp/src/models/mamba.cpp +54 -0
- package/cpp/src/models/mimo2-iswa.cpp +123 -0
- package/cpp/src/models/minicpm3.cpp +200 -0
- package/cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/src/models/mistral3.cpp +160 -0
- package/cpp/src/models/models.h +684 -0
- package/cpp/src/models/modern-bert.cpp +109 -0
- package/cpp/src/models/mpt.cpp +126 -0
- package/cpp/src/models/nemotron-h.cpp +148 -0
- package/cpp/src/models/nemotron.cpp +122 -0
- package/cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/src/models/olmo.cpp +121 -0
- package/cpp/src/models/olmo2.cpp +150 -0
- package/cpp/src/models/olmoe.cpp +124 -0
- package/cpp/src/models/openai-moe-iswa.cpp +127 -0
- package/cpp/src/models/openelm.cpp +124 -0
- package/cpp/src/models/orion.cpp +123 -0
- package/cpp/src/models/paddleocr.cpp +122 -0
- package/cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/src/models/phi2.cpp +121 -0
- package/cpp/src/models/phi3.cpp +152 -0
- package/cpp/src/models/plamo.cpp +110 -0
- package/cpp/src/models/plamo2.cpp +318 -0
- package/cpp/src/models/plamo3.cpp +128 -0
- package/cpp/src/models/plm.cpp +169 -0
- package/cpp/src/models/qwen.cpp +108 -0
- package/cpp/src/models/qwen2.cpp +126 -0
- package/cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/src/models/qwen3.cpp +117 -0
- package/cpp/src/models/qwen35.cpp +386 -0
- package/cpp/src/models/qwen35moe.cpp +420 -0
- package/cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/src/models/qwen3next.cpp +525 -0
- package/cpp/src/models/qwen3vl-moe.cpp +140 -0
- package/cpp/src/models/qwen3vl.cpp +132 -0
- package/cpp/src/models/refact.cpp +94 -0
- package/cpp/src/models/rnd1.cpp +126 -0
- package/cpp/src/models/rwkv6-base.cpp +164 -0
- package/cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/src/models/rwkv7-base.cpp +137 -0
- package/cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/src/models/smallthinker.cpp +126 -0
- package/cpp/src/models/smollm3.cpp +128 -0
- package/cpp/src/models/stablelm.cpp +146 -0
- package/cpp/src/models/starcoder.cpp +100 -0
- package/cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/src/models/step35-iswa.cpp +168 -0
- package/cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/src/models/xverse.cpp +108 -0
- package/cpp/src/unicode-data.cpp +7034 -0
- package/cpp/src/unicode-data.h +20 -0
- package/cpp/src/unicode.cpp +1103 -0
- package/cpp/src/unicode.h +111 -0
- package/cpp/vendor/nlohmann/json.hpp +25526 -0
- package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/vendor/stb/stb_image.h +7988 -0
- package/ios/LocalLLM-Bridging-Header.h +2 -0
- package/ios/LocalLLM.h +5 -0
- package/ios/LocalLLM.mm +1267 -0
- package/local-llm-rn.podspec +60 -0
- package/package.json +35 -0
- package/src/NativeLocalLLM.ts +73 -0
- package/src/device.ts +50 -0
- package/src/download-adapter.ts +17 -0
- package/src/index.ts +21 -0
- package/src/native-bridge.ts +142 -0
- package/src/rn-downloader.ts +37 -0
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
//------------------------------------------------------------------------------
|
|
2
|
+
// This file contains kernels for data conversion.
|
|
3
|
+
// These kernels are used when loading the model, so their performance is less
|
|
4
|
+
// important.
|
|
5
|
+
//------------------------------------------------------------------------------
|
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
7
|
+
|
|
8
|
+
#ifdef cl_intel_required_subgroup_size
|
|
9
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
|
10
|
+
#define INTEL_GPU 1
|
|
11
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
|
12
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
|
13
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
|
14
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
|
15
|
+
#define ADRENO_GPU 1
|
|
16
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
|
17
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
|
18
|
+
#endif
|
|
19
|
+
|
|
20
|
+
#define QK4_0 32
|
|
21
|
+
#define QR4_0 2
|
|
22
|
+
#define QK4_1 32
|
|
23
|
+
#define QR4_1 2
|
|
24
|
+
#define QK5_0 32
|
|
25
|
+
#define QR5_0 2
|
|
26
|
+
#define QK5_1 32
|
|
27
|
+
#define QR5_1 2
|
|
28
|
+
#define QK8_0 32
|
|
29
|
+
#define QR8_0 1
|
|
30
|
+
#define QK_K 256
|
|
31
|
+
#define K_QUANTS_PER_ITERATION 2
|
|
32
|
+
|
|
33
|
+
typedef char int8_t;
|
|
34
|
+
typedef uchar uint8_t;
|
|
35
|
+
typedef short int16_t;
|
|
36
|
+
typedef ushort uint16_t;
|
|
37
|
+
typedef int int32_t;
|
|
38
|
+
typedef uint uint32_t;
|
|
39
|
+
|
|
40
|
+
//------------------------------------------------------------------------------
|
|
41
|
+
// block_q4_0
|
|
42
|
+
//------------------------------------------------------------------------------
|
|
43
|
+
// One Q4_0 block: a half-precision value `d` followed by QK4_0/2 bytes of
// quants. Two quants share each byte (the noshuffle kernels in this file split
// the bytes with 0x0F / 0xF0 nibble masks).
struct block_q4_0 {
    half  d;              // presumably the per-block scale -- confirm against the host-side definition
    uchar qs[QK4_0 / 2];  // packed quants, two nibbles per byte
};
|
|
48
|
+
|
|
49
|
+
//------------------------------------------------------------------------------
|
|
50
|
+
// block_q4_1
|
|
51
|
+
//------------------------------------------------------------------------------
|
|
52
|
+
// One Q4_1 block: delta and min as half-precision values, followed by
// QK4_1/2 bytes of nibble-packed quants.
struct block_q4_1
{
    half    d;              // delta
    half    m;              // min
    uint8_t qs[QK4_1 / 2];  // quants stored as nibbles
};
|
|
57
|
+
|
|
58
|
+
//------------------------------------------------------------------------------
|
|
59
|
+
// block_q6_K
|
|
60
|
+
//------------------------------------------------------------------------------
|
|
61
|
+
// One Q6_K super block covering QK_K values. The member order defines the
// memory layout and must not change.
struct block_q6_K
{
    uchar ql[QK_K / 2];       // lower 4 bits of the quants
    uchar qh[QK_K / 4];       // upper 2 bits of the quants
    char  scales[QK_K / 16];  // scales, quantized with 8 bits
    half  d;                  // super-block scale
};
|
|
67
|
+
|
|
68
|
+
//------------------------------------------------------------------------------
|
|
69
|
+
// kernel_convert_block_q4_0
|
|
70
|
+
// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
|
|
71
|
+
// This kernel does not deshuffle the bits.
|
|
72
|
+
//------------------------------------------------------------------------------
|
|
73
|
+
kernel void kernel_convert_block_q4_0(
    global struct block_q4_0 * src0,
    global uchar * dst_q,
    global half  * dst_d
) {
    // One work-item per block; global id 0 selects the block.
    const size_t blk = get_global_id(0);

    global struct block_q4_0 * in    = src0 + blk;
    global uchar             * out_q = dst_q + (QK4_0/2)*blk;

    // The half value goes to the flat dst_d array ...
    dst_d[blk] = in->d;

    // ... and the packed quant bytes are copied unchanged into dst_q.
    for (int k = 0; k < QK4_0/2; ++k) {
        out_q[k] = in->qs[k];
    }
}
|
|
88
|
+
|
|
89
|
+
// Inverse of kernel_convert_block_q4_0: rebuilds block_q4_0 structs from the
// flat quant array (src_q) and the flat half array (src_d).
kernel void kernel_restore_block_q4_0(
    global uchar * src_q,
    global half  * src_d,
    global struct block_q4_0 * dst
) {
    // One work-item per block; global id 0 selects the block.
    const size_t blk = get_global_id(0);

    global struct block_q4_0 * out  = dst + blk;
    global uchar             * in_q = src_q + (QK4_0/2)*blk;

    out->d = src_d[blk];

    for (int k = 0; k < QK4_0/2; ++k) {
        out->qs[k] = in_q[k];
    }
}
|
|
103
|
+
|
|
104
|
+
//------------------------------------------------------------------------------
|
|
105
|
+
// kernel_convert_block_q4_0_noshuffle
|
|
106
|
+
// Flatten q4_0 weights and unshuffle the bits
|
|
107
|
+
//------------------------------------------------------------------------------
|
|
108
|
+
|
|
109
|
+
// One work-item per block. Besides splitting the block into dst_q / dst_d,
// the nibbles are regrouped: for every source byte pair (2*i, 2*i+1) the two
// low nibbles are packed into dst byte i and the two high nibbles into dst
// byte i + QK4_0/4.
//
// NOTE: the expressions below (including the convert_uchar calls) and the
// Adreno-only printf block are intentionally kept exactly as written; see the
// workaround comment inside the loop.
kernel void kernel_convert_block_q4_0_noshuffle(
    global struct block_q4_0 * src0,
    global uchar * dst_q,
    global half  * dst_d
) {
    global struct block_q4_0 * blk = (global struct block_q4_0 *) src0 + get_global_id(0);
    global uchar * out_q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
    global half  * out_d = (global half *) dst_d + get_global_id(0);

    *out_d = blk->d;
    for (int i = 0; i < QK4_0/4; ++i) {
        uchar x0 = blk->qs[2*i + 0];
        uchar x1 = blk->qs[2*i + 1];

        out_q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        out_q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);

#ifdef ADRENO_GPU
        // Workaround for adreno - must have the following printf statement for
        // the kernel to work properly. Otherwise it produces incorrect result.
        // convert_uchar above also seems necessary.
        // Compare against a large number so that it does not print anything.
        // get_sub_group_local_id() also works.
        if (get_global_id(0) == 65536*4096) {
            printf("%04x - %02x\n", *(global ushort*)out_d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
        }
#endif
    }
}
|
|
138
|
+
|
|
139
|
+
// Inverse of kernel_convert_block_q4_0_noshuffle: re-interleaves the nibbles
// and rebuilds block_q4_0 structs. One work-item per block.
//
// mask_0F / mask_F0 are supplied by the caller and are used to select the low
// and high nibble respectively (presumably 0x0F and 0xF0 -- confirm at the
// host call site).
kernel void kernel_restore_block_q4_0_noshuffle(
    global uchar * src_q,
    global half  * src_d,
    global struct block_q4_0 * dst,
    uchar mask_0F,
    uchar mask_F0
) {
    const size_t blk = get_global_id(0);

    global struct block_q4_0 * out  = dst + blk;
    global uchar             * in_q = src_q + (QK4_0/2)*blk;

    out->d = src_d[blk];

    for (int i = 0; i < QK4_0/4; ++i) {
        // lo: byte from the first half of the flat block, hi: matching byte
        // from the second half.
        uchar lo = in_q[i];
        uchar hi = in_q[i + QK4_0/4];

        out->qs[2*i + 0] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
        out->qs[2*i + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
    }
}
|
|
159
|
+
|
|
160
|
+
//------------------------------------------------------------------------------
|
|
161
|
+
// kernel_convert_block_q4_1
|
|
162
|
+
// Convert the block_q4_1 format to 2 separate arrays (AOS -> SOA).
|
|
163
|
+
// This kernel does not deshuffle the bits.
|
|
164
|
+
//------------------------------------------------------------------------------
|
|
165
|
+
kernel void kernel_convert_block_q4_1(
    global struct block_q4_1 * src0,
    global uchar * dst_q,
    global half  * dst_d,
    global half  * dst_m
) {
    // One work-item per block; global id 0 selects the block.
    const size_t blk = get_global_id(0);

    global struct block_q4_1 * in    = src0 + blk;
    global uchar             * out_q = dst_q + (QK4_1/2)*blk;

    // Delta and min each go to their own flat array.
    dst_d[blk] = in->d;
    dst_m[blk] = in->m;

    // Packed quant bytes are copied unchanged.
    for (int k = 0; k < QK4_1/2; ++k) {
        out_q[k] = in->qs[k];
    }
}
|
|
183
|
+
|
|
184
|
+
// Inverse of kernel_convert_block_q4_1: rebuilds block_q4_1 structs from the
// flat quant, delta and min arrays. One work-item per block.
kernel void kernel_restore_block_q4_1(
    global uchar * src_q,
    global half  * src_d,
    global half  * src_m,
    global struct block_q4_1 * dst
) {
    const size_t blk = get_global_id(0);

    global struct block_q4_1 * out  = dst + blk;
    global uchar             * in_q = src_q + (QK4_1/2)*blk;

    out->d = src_d[blk];
    out->m = src_m[blk];

    for (int k = 0; k < QK4_1/2; ++k) {
        out->qs[k] = in_q[k];
    }
}
|
|
201
|
+
|
|
202
|
+
//------------------------------------------------------------------------------
|
|
203
|
+
// block_mxfp4
|
|
204
|
+
//------------------------------------------------------------------------------
|
|
205
|
+
#define QK_MXFP4 32
|
|
206
|
+
// One MXFP4 block: a single E8M0 byte followed by QK_MXFP4/2 bytes of quants.
// All members are bytes, so the struct is QK_MXFP4/2 + 1 bytes long and `qs`
// starts at byte offset 1.
struct block_mxfp4
{
    uint8_t e;                 // E8M0
    uint8_t qs[QK_MXFP4 / 2];  // packed quants
};
|
|
210
|
+
|
|
211
|
+
//------------------------------------------------------------------------------
|
|
212
|
+
// kernel_convert_block_mxfp4
|
|
213
|
+
// Convert the block_mxfp4 format to 2 separate arrays (AOS -> SOA).
|
|
214
|
+
// This kernel does not deshuffle the bits.
|
|
215
|
+
//------------------------------------------------------------------------------
|
|
216
|
+
kernel void kernel_convert_block_mxfp4(
    global struct block_mxfp4 * src0,
    global uchar * dst_q,
    global uchar * dst_e
) {
    // One work-item per block; global id 0 selects the block.
    const size_t blk = get_global_id(0);

    global struct block_mxfp4 * in    = src0 + blk;
    global uchar              * out_q = dst_q + (QK_MXFP4 / 2) * blk;

    // The E8M0 byte goes to the flat dst_e array ...
    dst_e[blk] = in->e;

    // ... and the quant bytes are copied unchanged into dst_q.
    for (int k = 0; k < QK_MXFP4 / 2; ++k) {
        out_q[k] = in->qs[k];
    }
}
|
|
231
|
+
|
|
232
|
+
// Splits block_mxfp4 structs into a quant array (one uint4 per block) and an
// E8M0 array, swapping the two innermost block indices on the way:
//   source block index: i00 + i01*ne00_blk + i02*ne00_blk*ne01
//   dest   block index: i01 + i00*ne01     + i02*ne00_blk*ne01
// where i00 = global id 1, i01 = global id 0, i02 = global id 2 and
// ne00_blk = ne00 / QK_MXFP4.
kernel void kernel_convert_block_mxfp4_trans(
    global struct block_mxfp4 * src0,
    __global uint4 * dst_q,
    __global uchar * dst_e,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk       = ne00 / QK_MXFP4;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = src0 + src_blk_offset;

    // b->qs starts at byte offset 1 of a 17-byte struct, so it is never
    // 16-byte aligned; dereferencing it through a uint4 pointer is a
    // misaligned access. vload16 on uchar only requires byte alignment and
    // as_uint4 reinterprets the same 16 bytes.
    dst_q[dst_blk_offset] = as_uint4(vload16(0, b->qs));
    dst_e[dst_blk_offset] = b->e;
}
|
|
252
|
+
|
|
253
|
+
// Inverse of kernel_convert_block_mxfp4: rebuilds block_mxfp4 structs from the
// flat quant array (src_q) and the flat E8M0 array (src_e).
// One work-item per block.
//
// src_e is declared as a byte pointer because it holds one E8M0 byte per
// block; it is indexed in bytes, matching dst_e of kernel_convert_block_mxfp4.
kernel void kernel_restore_block_mxfp4(
    global uchar * src_q,
    global uchar * src_e,
    global struct block_mxfp4 * dst
) {
    global struct block_mxfp4 * b = (global struct block_mxfp4 *) dst + get_global_id(0);
    global uchar * q = (global uchar *) src_q + QK_MXFP4 / 2 * get_global_id(0);
    global uchar * e = (global uchar *) src_e + get_global_id(0);

    b->e = *e;
    for (int i = 0; i < QK_MXFP4 / 2; ++i) {
        b->qs[i] = q[i];
    }
}
|
|
267
|
+
|
|
268
|
+
// Inverse of kernel_convert_block_mxfp4_trans: reads one uint4 of quants and
// one E8M0 byte per block from the index-swapped arrays and writes them back
// into block_mxfp4 structs:
//   source block index: i01 + i00*ne01     + i02*ne00_blk*ne01
//   dest   block index: i00 + i01*ne00_blk + i02*ne00_blk*ne01
// where i00 = global id 1, i01 = global id 0, i02 = global id 2 and
// ne00_blk = ne00 / QK_MXFP4.
kernel void kernel_restore_block_mxfp4_trans(
    __global uint4 * src_q,
    __global uchar * src_e,
    global struct block_mxfp4 * dst,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk       = ne00 / QK_MXFP4;
    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;

    global struct block_mxfp4 * b = dst + dst_blk_offset;

    // b->qs starts at byte offset 1 of a 17-byte struct, so it is never
    // 16-byte aligned; storing through a uint4 pointer is a misaligned access.
    // vstore16 on uchar only requires byte alignment and as_uchar16
    // reinterprets the same 16 bytes.
    vstore16(as_uchar16(src_q[src_blk_offset]), 0, b->qs);
    b->e = src_e[src_blk_offset];
}
|
|
288
|
+
|
|
289
|
+
//------------------------------------------------------------------------------
|
|
290
|
+
// block_q8_0
|
|
291
|
+
//------------------------------------------------------------------------------
|
|
292
|
+
// One Q8_0 block: a half-precision delta followed by QK8_0 signed 8-bit quants.
typedef struct
{
    half   d;          // delta
    int8_t qs[QK8_0];  // quants, one byte each
} block_q8_0;
|
|
296
|
+
|
|
297
|
+
// Splits block_q8_0 structs into a flat quant array (dst_q) and a flat delta
// array (dst_d). One work-item per block.
kernel void kernel_convert_block_q8_0(
    global block_q8_0 * src0,
    global uchar * dst_q,
    global half  * dst_d
) {
    const size_t blk = get_global_id(0);

    global block_q8_0 * in    = src0 + blk;
    global uchar      * out_q = dst_q + QK8_0*blk;

    dst_d[blk] = in->d;

    // Quants are stored as char in the block and written out as uchar.
    for (int k = 0; k < QK8_0; ++k) {
        out_q[k] = in->qs[k];
    }
}
|
|
312
|
+
|
|
313
|
+
// Inverse of kernel_convert_block_q8_0: rebuilds block_q8_0 structs from the
// flat quant and delta arrays. One work-item per block.
kernel void kernel_restore_block_q8_0(
    global uchar * src_q,
    global half  * src_d,
    global block_q8_0 * dst
) {
    const size_t blk = get_global_id(0);

    global block_q8_0 * out  = dst + blk;
    global uchar      * in_q = src_q + QK8_0*blk;

    out->d = src_d[blk];

    for (int k = 0; k < QK8_0; ++k) {
        out->qs[k] = in_q[k];
    }
}
|
|
327
|
+
|
|
328
|
+
// Rebuilds block_q8_0 structs from strided quant / delta arrays.
//
// Work-item `gid` (global id 0) writes ne00/QK8_0 consecutive destination
// blocks starting at dst[gid * (ne00/QK8_0)]. Quants are read in groups of
// 4 bytes: the first group sits at src_q + 4*gid and every following group is
// 4*ne01 bytes further on. Deltas start at src_d[gid] and advance by ne01
// elements per block.
kernel void kernel_restore_block_q8_0_trans(
    global uchar * src_q,
    global half  * src_d,
    global block_q8_0 * dst,
    uint ne00,
    uint ne01
) {
    const uint blocks_per_row = ne00 / QK8_0;

    global block_q8_0 * out  = (global block_q8_0 *) dst + get_global_id(0) * blocks_per_row;
    global uchar      * in_q = (global uchar *) src_q + get_global_id(0) * 4; // 4 packed 8-bit values
    global half       * in_d = (global half *) src_d + get_global_id(0);

    for (uint blk = 0; blk < blocks_per_row; blk++) {
        out->d = *in_d;

        for (uint base = 0; base < QK8_0; base += 4) {
            // Copy one 4-byte group, then step to the next group.
            for (uint j = 0; j < 4; ++j) {
                out->qs[base + j] = in_q[j];
            }
            in_q += 4 * ne01; // M stride
        }

        in_d += ne01;
        out++;
    }
}
|
|
358
|
+
|
|
359
|
+
//------------------------------------------------------------------------------
|
|
360
|
+
// kernel_convert_block_q6_K
|
|
361
|
+
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
|
|
362
|
+
// This kernel does not deshuffle the bits.
|
|
363
|
+
// Each thread processes a super block.
|
|
364
|
+
//------------------------------------------------------------------------------
|
|
365
|
+
kernel void kernel_convert_block_q6_K(
    global struct block_q6_K * src0,
    global uchar * dst_ql,
    global uchar * dst_qh,
    global char  * dst_s,
    global half  * dst_d
) {
    // One work-item per super block; global id 0 selects it.
    const size_t blk = get_global_id(0);

    global struct block_q6_K * in = src0 + blk;

    global uchar * out_ql = dst_ql + (QK_K/2)*blk;
    global uchar * out_qh = dst_qh + (QK_K/4)*blk;
    global char  * out_s  = dst_s  + (QK_K/16)*blk;

    dst_d[blk] = in->d;

    // Each member array is copied byte for byte into its own flat array.
    for (int k = 0; k < QK_K/2; ++k) {
        out_ql[k] = in->ql[k];
    }
    for (int k = 0; k < QK_K/4; ++k) {
        out_qh[k] = in->qh[k];
    }
    for (int k = 0; k < QK_K/16; ++k) {
        out_s[k] = in->scales[k];
    }
}
|
|
390
|
+
|
|
391
|
+
// Restore block_q6_K from flattened arrays.
|
|
392
|
+
// Each thread processes a super block.
|
|
393
|
+
// NOTE: despite the `dst_` prefix, dst_ql / dst_qh / dst_s / dst_d are the
// flat arrays this kernel READS; the only buffer written is `dst`.
kernel void kernel_restore_block_q6_K(
    global uchar * dst_ql,
    global uchar * dst_qh,
    global char  * dst_s,
    global half  * dst_d,
    global struct block_q6_K * dst
) {
    // One work-item per super block; global id 0 selects it.
    const size_t blk = get_global_id(0);

    global struct block_q6_K * out = dst + blk;

    global uchar * in_ql = dst_ql + (QK_K/2)*blk;
    global uchar * in_qh = dst_qh + (QK_K/4)*blk;
    global char  * in_s  = dst_s  + (QK_K/16)*blk;

    out->d = dst_d[blk];

    for (int k = 0; k < QK_K/2; ++k) {
        out->ql[k] = in_ql[k];
    }
    for (int k = 0; k < QK_K/4; ++k) {
        out->qh[k] = in_qh[k];
    }
    for (int k = 0; k < QK_K/16; ++k) {
        out->scales[k] = in_s[k];
    }
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
//------------------------------------------------------------------------------
|
|
4
|
+
// diag_mask_inf kernels
|
|
5
|
+
//------------------------------------------------------------------------------
|
|
6
|
+
// One work-item per element. Element (i00, i01, i02) of dst becomes -INFINITY
// when i00 > n_past + i01; otherwise it is copied from src0.
// offset0 / offsetd are byte offsets applied to src0 / dst.
kernel void kernel_diag_mask_inf(
    global float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int n_past
) {
    global float * in  = (global float *)((global char *)src0 + offset0);
    global float * out = (global float *)((global char *)dst + offsetd);

    const int i00 = get_global_id(0);
    const int i01 = get_global_id(1);
    const int i02 = get_global_id(2);

    // Flat index into the contiguous ne00 x ne01 x ... tensor.
    const int idx = i02*ne01*ne00 + i01*ne00 + i00;

    if (i00 <= n_past + i01) {
        out[idx] = in[idx];
    } else {
        out[idx] = -INFINITY;
    }
}
|
|
28
|
+
|
|
29
|
+
// Variant of kernel_diag_mask_inf in which each work-item handles 8
// consecutive floats (two float4 vectors). The 8 values are copied from src0
// and every element whose column index exceeds n_past + row index is then
// overwritten with -INFINITY.
//
// offset0 / offsetd are byte offsets applied to src0 / dst.
// Assumes the 8 elements of a work-item lie in the same row (i.e. ne00 is a
// multiple of 8) -- confirm at the host dispatch site.
kernel void kernel_diag_mask_inf_8(
    global float4 * src0,
    ulong offset0,
    global float4 * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int n_past
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    dst  = (global float4*)((global char*)dst + offsetd);

    // i indexes float4 vectors; this work-item owns dst[i] and dst[i+1].
    int i = 2*get_global_id(0);

    dst[i+0] = src0[i+0];
    dst[i+1] = src0[i+1];

    // Recover (i02, i01, i00) of the first of the 8 scalar elements.
    int i4 = 4*i;
    int i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
    int i01 = i4/(ne00);      i4 -= i01*ne00;
    int i00 = i4;

    // Scalar views of the two vectors so that [k] selects component k.
    // Indexing the float4 pointer itself, as in (&dst[i+1])[k], addresses the
    // whole vector dst[i+1+k] and would clobber other work-items' output.
    global float * lo = (global float *)(dst + i);
    global float * hi = (global float *)(dst + i + 1);

    for (int k = 3; k >= 0; --k) {
        // Element 4+k (and therefore every lower one) is not masked: done.
        if (i00 + 4 + k <= n_past + i01) {
            break;
        }
        hi[k] = -INFINITY;
        if (i00 + k > n_past + i01) {
            lo[k] = -INFINITY;
        }
    }
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
2
|
+
|
|
3
|
+
//------------------------------------------------------------------------------
|
|
4
|
+
// div
|
|
5
|
+
//------------------------------------------------------------------------------
|
|
6
|
+
// Element-wise float division dst = src0 / src1 with src1 indices wrapped
// (modulo) in every dimension.
//
// Group ids 0/1/2 select i01/i02/i03; the work-items of a group stride over
// dimension 0 in steps of the local size. All nb* arguments are byte strides
// and offset0/offset1/offsetd are byte offsets into the respective buffers.
kernel void kernel_div(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * dst,
    ulong offsetd,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    int i01 = get_group_id(0);
    int i02 = get_group_id(1);
    int i03 = get_group_id(2);

    // src1 indices wrap around its own extents.
    int i11 = i01 % ne11;
    int i12 = i02 % ne12;
    int i13 = i03 % ne13;

    global char * src0_row = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_row = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_row  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;

        global float * num = (global float *)(src0_row + i0*nb00);
        global float * den = (global float *)(src1_row + i10*nb10);
        global float * out = (global float *)(dst_row + i0*nb0);

        *out = *num / *den;
    }
}
|
|
52
|
+
|
|
53
|
+
// assumption: src1 is a row
|
|
54
|
+
// broadcast src1 into src0
|
|
55
|
+
// float4 division where src1 is indexed modulo `ne`:
//   dst[gid] = src0[gid] / src1[gid % ne]
// One work-item per float4. offset0/offset1/offsetd are byte offsets.
kernel void kernel_div_row(
    global float4 * src0,
    ulong offset0,
    global float4 * src1,
    ulong offset1,
    global float4 * dst,
    ulong offsetd,
    int ne
) {
    global float4 * a   = (global float4 *)((global char *)src0 + offset0);
    global float4 * b   = (global float4 *)((global char *)src1 + offset1);
    global float4 * out = (global float4 *)((global char *)dst + offsetd);

    uint gid = get_global_id(0);

    // Same value as gid % ne; this form performs better than using %.
    uint col = gid - (gid/ne)*ne;

    out[gid] = a[gid] / b[col];
}
|
|
73
|
+
|
|
74
|
+
// Half-precision counterpart of kernel_div: element-wise dst = src0 / src1
// with src1 indices wrapped (modulo) in every dimension.
//
// Group ids 0/1/2 select i01/i02/i03; the work-items of a group stride over
// dimension 0 in steps of the local size. All nb* arguments are byte strides
// and offset0/offset1/offsetd are byte offsets into the respective buffers.
kernel void kernel_div_f16(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * dst,
    ulong offsetd,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    int i01 = get_group_id(0);
    int i02 = get_group_id(1);
    int i03 = get_group_id(2);

    // src1 indices wrap around its own extents.
    int i11 = i01 % ne11;
    int i12 = i02 % ne12;
    int i13 = i03 % ne13;

    global char * src0_row = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_row = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_row  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;

        global half * num = (global half *)(src0_row + i0*nb00);
        global half * den = (global half *)(src1_row + i10*nb10);
        global half * out = (global half *)(dst_row + i0*nb0);

        *out = *num / *den;
    }
}
|
|
120
|
+
|
|
121
|
+
// half4 division where src1 is indexed modulo `ne`:
//   dst[gid] = src0[gid] / src1[gid % ne]
// One work-item per half4. offset0/offset1/offsetd are byte offsets.
kernel void kernel_div_row_f16(
    global half4 * src0,
    ulong offset0,
    global half4 * src1,
    ulong offset1,
    global half4 * dst,
    ulong offsetd,
    int ne
) {
    global half4 * a   = (global half4 *)((global char *)src0 + offset0);
    global half4 * b   = (global half4 *)((global char *)src1 + offset1);
    global half4 * out = (global half4 *)((global char *)dst + offsetd);

    uint gid = get_global_id(0);

    // Same value as gid % ne; this form performs better than using %.
    uint col = gid - (gid/ne)*ne;

    out[gid] = a[gid] / b[col];
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import logging
|
|
5
|
+
logger = logging.getLogger("opencl-embed-kernel")
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main():
    """Wrap every line of an input file in ``R"(`` ... ``)"`` and write it out.

    Expects exactly two command-line arguments: the input path and the output
    path. Each input line, including its trailing newline, is written to the
    output as ``R"(<line>)"`` followed by a newline. When the argument count is
    wrong, a usage message is logged and the process exits with status 1.
    """
    logging.basicConfig(level=logging.INFO)

    if len(sys.argv) != 3:
        logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
        sys.exit(1)

    # Context managers close both files even if reading or writing raises
    # part-way through.
    with open(sys.argv[1], "r") as ifile, open(sys.argv[2], "w") as ofile:
        for line in ifile:
            ofile.write('R"({})"\n'.format(line))
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
if __name__ == "__main__":
|
|
26
|
+
main()
|