local-llm-rn 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/CMakeLists.txt +285 -0
- package/cpp/common/CMakeLists.txt +149 -0
- package/cpp/common/arg.cpp +3799 -0
- package/cpp/common/arg.h +131 -0
- package/cpp/common/base64.hpp +392 -0
- package/cpp/common/build-info.cpp.in +4 -0
- package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
- package/cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/common/chat-parser.cpp +1649 -0
- package/cpp/common/chat-parser.h +133 -0
- package/cpp/common/chat-peg-parser.cpp +124 -0
- package/cpp/common/chat-peg-parser.h +105 -0
- package/cpp/common/chat.cpp +3355 -0
- package/cpp/common/chat.h +252 -0
- package/cpp/common/common.cpp +1824 -0
- package/cpp/common/common.h +930 -0
- package/cpp/common/console.cpp +1137 -0
- package/cpp/common/console.h +41 -0
- package/cpp/common/debug.cpp +167 -0
- package/cpp/common/debug.h +43 -0
- package/cpp/common/download.cpp +792 -0
- package/cpp/common/download.h +84 -0
- package/cpp/common/http.h +84 -0
- package/cpp/common/jinja/README.md +88 -0
- package/cpp/common/jinja/caps.cpp +285 -0
- package/cpp/common/jinja/caps.h +30 -0
- package/cpp/common/jinja/lexer.cpp +341 -0
- package/cpp/common/jinja/lexer.h +157 -0
- package/cpp/common/jinja/parser.cpp +591 -0
- package/cpp/common/jinja/parser.h +21 -0
- package/cpp/common/jinja/runtime.cpp +867 -0
- package/cpp/common/jinja/runtime.h +638 -0
- package/cpp/common/jinja/string.cpp +213 -0
- package/cpp/common/jinja/string.h +61 -0
- package/cpp/common/jinja/utils.h +149 -0
- package/cpp/common/jinja/value.cpp +1393 -0
- package/cpp/common/jinja/value.h +756 -0
- package/cpp/common/json-partial.cpp +324 -0
- package/cpp/common/json-partial.h +39 -0
- package/cpp/common/json-schema-to-grammar.cpp +1153 -0
- package/cpp/common/json-schema-to-grammar.h +43 -0
- package/cpp/common/llguidance.cpp +258 -0
- package/cpp/common/log.cpp +446 -0
- package/cpp/common/log.h +119 -0
- package/cpp/common/ngram-cache.cpp +285 -0
- package/cpp/common/ngram-cache.h +101 -0
- package/cpp/common/ngram-map.cpp +530 -0
- package/cpp/common/ngram-map.h +115 -0
- package/cpp/common/ngram-mod.cpp +60 -0
- package/cpp/common/ngram-mod.h +38 -0
- package/cpp/common/peg-parser.cpp +1712 -0
- package/cpp/common/peg-parser.h +459 -0
- package/cpp/common/preset.cpp +483 -0
- package/cpp/common/preset.h +83 -0
- package/cpp/common/regex-partial.cpp +204 -0
- package/cpp/common/regex-partial.h +56 -0
- package/cpp/common/sampling.cpp +745 -0
- package/cpp/common/sampling.h +119 -0
- package/cpp/common/speculative.cpp +1074 -0
- package/cpp/common/speculative.h +41 -0
- package/cpp/common/unicode.cpp +64 -0
- package/cpp/common/unicode.h +22 -0
- package/cpp/ggml/CMakeLists.txt +494 -0
- package/cpp/ggml/cmake/GitVars.cmake +22 -0
- package/cpp/ggml/cmake/common.cmake +50 -0
- package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
- package/cpp/ggml/include/ggml-alloc.h +85 -0
- package/cpp/ggml/include/ggml-backend.h +373 -0
- package/cpp/ggml/include/ggml-blas.h +25 -0
- package/cpp/ggml/include/ggml-cann.h +123 -0
- package/cpp/ggml/include/ggml-cpp.h +39 -0
- package/cpp/ggml/include/ggml-cpu.h +151 -0
- package/cpp/ggml/include/ggml-cuda.h +47 -0
- package/cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/ggml/include/ggml-metal.h +61 -0
- package/cpp/ggml/include/ggml-opencl.h +26 -0
- package/cpp/ggml/include/ggml-opt.h +256 -0
- package/cpp/ggml/include/ggml-rpc.h +30 -0
- package/cpp/ggml/include/ggml-sycl.h +49 -0
- package/cpp/ggml/include/ggml-virtgpu.h +14 -0
- package/cpp/ggml/include/ggml-vulkan.h +29 -0
- package/cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/ggml/include/ggml-zdnn.h +17 -0
- package/cpp/ggml/include/ggml-zendnn.h +22 -0
- package/cpp/ggml/include/ggml.h +2753 -0
- package/cpp/ggml/include/gguf.h +204 -0
- package/cpp/ggml/src/CMakeLists.txt +492 -0
- package/cpp/ggml/src/ggml-alloc.c +1244 -0
- package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
- package/cpp/ggml/src/ggml-backend-dl.h +45 -0
- package/cpp/ggml/src/ggml-backend-impl.h +255 -0
- package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
- package/cpp/ggml/src/ggml-backend.cpp +2270 -0
- package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
- package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
- package/cpp/ggml/src/ggml-common.h +1878 -0
- package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
- package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
- package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
- package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
- package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
- package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
- package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
- package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
- package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
- package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- package/cpp/ggml/src/ggml-cpu/common.h +95 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
- package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
- package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
- package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
- package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
- package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
- package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
- package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
- package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
- package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
- package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
- package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
- package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
- package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
- package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
- package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
- package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
- package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
- package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
- package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
- package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
- package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
- package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
- package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
- package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
- package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
- package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
- package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
- package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
- package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
- package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
- package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
- package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
- package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- package/cpp/ggml/src/ggml-impl.h +724 -0
- package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
- package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
- package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
- package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
- package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
- package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
- package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
- package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
- package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
- package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
- package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
- package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
- package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
- package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
- package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
- package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
- package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
- package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
- package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
- package/cpp/ggml/src/ggml-opt.cpp +1093 -0
- package/cpp/ggml/src/ggml-quants.c +5325 -0
- package/cpp/ggml/src/ggml-quants.h +106 -0
- package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
- package/cpp/ggml/src/ggml-threading.cpp +12 -0
- package/cpp/ggml/src/ggml-threading.h +14 -0
- package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
- package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
- package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
- package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
- package/cpp/ggml/src/ggml.c +7669 -0
- package/cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/ggml/src/gguf.cpp +1699 -0
- package/cpp/include/llama-cpp.h +32 -0
- package/cpp/include/llama.h +1568 -0
- package/cpp/mtmd/CMakeLists.txt +98 -0
- package/cpp/mtmd/README.md +63 -0
- package/cpp/mtmd/clip-graph.h +117 -0
- package/cpp/mtmd/clip-impl.h +586 -0
- package/cpp/mtmd/clip-model.h +390 -0
- package/cpp/mtmd/clip.cpp +4154 -0
- package/cpp/mtmd/clip.h +121 -0
- package/cpp/mtmd/deprecation-warning.cpp +22 -0
- package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
- package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
- package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
- package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
- package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
- package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
- package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
- package/cpp/mtmd/models/cogvlm.cpp +98 -0
- package/cpp/mtmd/models/conformer.cpp +216 -0
- package/cpp/mtmd/models/glm4v.cpp +122 -0
- package/cpp/mtmd/models/internvl.cpp +69 -0
- package/cpp/mtmd/models/kimik25.cpp +101 -0
- package/cpp/mtmd/models/kimivl.cpp +63 -0
- package/cpp/mtmd/models/llama4.cpp +96 -0
- package/cpp/mtmd/models/llava.cpp +374 -0
- package/cpp/mtmd/models/minicpmv.cpp +114 -0
- package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
- package/cpp/mtmd/models/models.h +128 -0
- package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
- package/cpp/mtmd/models/paddleocr.cpp +52 -0
- package/cpp/mtmd/models/pixtral.cpp +86 -0
- package/cpp/mtmd/models/qwen2vl.cpp +183 -0
- package/cpp/mtmd/models/qwen3vl.cpp +193 -0
- package/cpp/mtmd/models/siglip.cpp +86 -0
- package/cpp/mtmd/models/whisper-enc.cpp +115 -0
- package/cpp/mtmd/models/youtuvl.cpp +179 -0
- package/cpp/mtmd/mtmd-audio.cpp +730 -0
- package/cpp/mtmd/mtmd-audio.h +113 -0
- package/cpp/mtmd/mtmd-cli.cpp +437 -0
- package/cpp/mtmd/mtmd-helper.cpp +521 -0
- package/cpp/mtmd/mtmd-helper.h +96 -0
- package/cpp/mtmd/mtmd.cpp +1156 -0
- package/cpp/mtmd/mtmd.h +319 -0
- package/cpp/mtmd/requirements.txt +5 -0
- package/cpp/mtmd/test-1.jpeg +0 -0
- package/cpp/mtmd/test-2.mp3 +0 -0
- package/cpp/mtmd/tests.sh +192 -0
- package/cpp/src/CMakeLists.txt +169 -0
- package/cpp/src/llama-adapter.cpp +488 -0
- package/cpp/src/llama-adapter.h +89 -0
- package/cpp/src/llama-arch.cpp +2855 -0
- package/cpp/src/llama-arch.h +619 -0
- package/cpp/src/llama-batch.cpp +917 -0
- package/cpp/src/llama-batch.h +173 -0
- package/cpp/src/llama-chat.cpp +896 -0
- package/cpp/src/llama-chat.h +71 -0
- package/cpp/src/llama-context.cpp +3512 -0
- package/cpp/src/llama-context.h +359 -0
- package/cpp/src/llama-cparams.cpp +5 -0
- package/cpp/src/llama-cparams.h +44 -0
- package/cpp/src/llama-grammar.cpp +1464 -0
- package/cpp/src/llama-grammar.h +194 -0
- package/cpp/src/llama-graph.cpp +2685 -0
- package/cpp/src/llama-graph.h +1026 -0
- package/cpp/src/llama-hparams.cpp +234 -0
- package/cpp/src/llama-hparams.h +339 -0
- package/cpp/src/llama-impl.cpp +171 -0
- package/cpp/src/llama-impl.h +73 -0
- package/cpp/src/llama-io.cpp +15 -0
- package/cpp/src/llama-io.h +35 -0
- package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
- package/cpp/src/llama-kv-cache-iswa.h +137 -0
- package/cpp/src/llama-kv-cache.cpp +2271 -0
- package/cpp/src/llama-kv-cache.h +388 -0
- package/cpp/src/llama-kv-cells.h +533 -0
- package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
- package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
- package/cpp/src/llama-memory-hybrid.cpp +268 -0
- package/cpp/src/llama-memory-hybrid.h +139 -0
- package/cpp/src/llama-memory-recurrent.cpp +1165 -0
- package/cpp/src/llama-memory-recurrent.h +182 -0
- package/cpp/src/llama-memory.cpp +59 -0
- package/cpp/src/llama-memory.h +122 -0
- package/cpp/src/llama-mmap.cpp +785 -0
- package/cpp/src/llama-mmap.h +92 -0
- package/cpp/src/llama-model-loader.cpp +1414 -0
- package/cpp/src/llama-model-loader.h +203 -0
- package/cpp/src/llama-model-saver.cpp +286 -0
- package/cpp/src/llama-model-saver.h +37 -0
- package/cpp/src/llama-model.cpp +9253 -0
- package/cpp/src/llama-model.h +576 -0
- package/cpp/src/llama-quant.cpp +1119 -0
- package/cpp/src/llama-quant.h +1 -0
- package/cpp/src/llama-sampler.cpp +3885 -0
- package/cpp/src/llama-sampler.h +42 -0
- package/cpp/src/llama-vocab.cpp +3970 -0
- package/cpp/src/llama-vocab.h +187 -0
- package/cpp/src/llama.cpp +1313 -0
- package/cpp/src/models/afmoe.cpp +191 -0
- package/cpp/src/models/apertus.cpp +125 -0
- package/cpp/src/models/arcee.cpp +135 -0
- package/cpp/src/models/arctic.cpp +138 -0
- package/cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/src/models/baichuan.cpp +122 -0
- package/cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/src/models/bert.cpp +178 -0
- package/cpp/src/models/bitnet.cpp +160 -0
- package/cpp/src/models/bloom.cpp +101 -0
- package/cpp/src/models/chameleon.cpp +178 -0
- package/cpp/src/models/chatglm.cpp +132 -0
- package/cpp/src/models/codeshell.cpp +111 -0
- package/cpp/src/models/cogvlm.cpp +102 -0
- package/cpp/src/models/cohere2-iswa.cpp +134 -0
- package/cpp/src/models/command-r.cpp +122 -0
- package/cpp/src/models/dbrx.cpp +123 -0
- package/cpp/src/models/deci.cpp +135 -0
- package/cpp/src/models/deepseek.cpp +144 -0
- package/cpp/src/models/deepseek2.cpp +262 -0
- package/cpp/src/models/delta-net-base.cpp +376 -0
- package/cpp/src/models/dots1.cpp +134 -0
- package/cpp/src/models/dream.cpp +105 -0
- package/cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/src/models/eurobert.cpp +97 -0
- package/cpp/src/models/exaone-moe.cpp +146 -0
- package/cpp/src/models/exaone.cpp +114 -0
- package/cpp/src/models/exaone4.cpp +123 -0
- package/cpp/src/models/falcon-h1.cpp +111 -0
- package/cpp/src/models/falcon.cpp +120 -0
- package/cpp/src/models/gemma-embedding.cpp +116 -0
- package/cpp/src/models/gemma.cpp +112 -0
- package/cpp/src/models/gemma2-iswa.cpp +128 -0
- package/cpp/src/models/gemma3.cpp +155 -0
- package/cpp/src/models/gemma3n-iswa.cpp +384 -0
- package/cpp/src/models/glm4-moe.cpp +170 -0
- package/cpp/src/models/glm4.cpp +157 -0
- package/cpp/src/models/gpt2.cpp +105 -0
- package/cpp/src/models/gptneox.cpp +144 -0
- package/cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/src/models/granite.cpp +211 -0
- package/cpp/src/models/grok.cpp +159 -0
- package/cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/src/models/internlm2.cpp +120 -0
- package/cpp/src/models/jais.cpp +86 -0
- package/cpp/src/models/jais2.cpp +123 -0
- package/cpp/src/models/jamba.cpp +106 -0
- package/cpp/src/models/kimi-linear.cpp +392 -0
- package/cpp/src/models/lfm2.cpp +190 -0
- package/cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/src/models/llada.cpp +99 -0
- package/cpp/src/models/llama-iswa.cpp +178 -0
- package/cpp/src/models/llama.cpp +168 -0
- package/cpp/src/models/maincoder.cpp +117 -0
- package/cpp/src/models/mamba-base.cpp +285 -0
- package/cpp/src/models/mamba.cpp +54 -0
- package/cpp/src/models/mimo2-iswa.cpp +123 -0
- package/cpp/src/models/minicpm3.cpp +200 -0
- package/cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/src/models/mistral3.cpp +160 -0
- package/cpp/src/models/models.h +684 -0
- package/cpp/src/models/modern-bert.cpp +109 -0
- package/cpp/src/models/mpt.cpp +126 -0
- package/cpp/src/models/nemotron-h.cpp +148 -0
- package/cpp/src/models/nemotron.cpp +122 -0
- package/cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/src/models/olmo.cpp +121 -0
- package/cpp/src/models/olmo2.cpp +150 -0
- package/cpp/src/models/olmoe.cpp +124 -0
- package/cpp/src/models/openai-moe-iswa.cpp +127 -0
- package/cpp/src/models/openelm.cpp +124 -0
- package/cpp/src/models/orion.cpp +123 -0
- package/cpp/src/models/paddleocr.cpp +122 -0
- package/cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/src/models/phi2.cpp +121 -0
- package/cpp/src/models/phi3.cpp +152 -0
- package/cpp/src/models/plamo.cpp +110 -0
- package/cpp/src/models/plamo2.cpp +318 -0
- package/cpp/src/models/plamo3.cpp +128 -0
- package/cpp/src/models/plm.cpp +169 -0
- package/cpp/src/models/qwen.cpp +108 -0
- package/cpp/src/models/qwen2.cpp +126 -0
- package/cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/src/models/qwen3.cpp +117 -0
- package/cpp/src/models/qwen35.cpp +386 -0
- package/cpp/src/models/qwen35moe.cpp +420 -0
- package/cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/src/models/qwen3next.cpp +525 -0
- package/cpp/src/models/qwen3vl-moe.cpp +140 -0
- package/cpp/src/models/qwen3vl.cpp +132 -0
- package/cpp/src/models/refact.cpp +94 -0
- package/cpp/src/models/rnd1.cpp +126 -0
- package/cpp/src/models/rwkv6-base.cpp +164 -0
- package/cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/src/models/rwkv7-base.cpp +137 -0
- package/cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/src/models/smallthinker.cpp +126 -0
- package/cpp/src/models/smollm3.cpp +128 -0
- package/cpp/src/models/stablelm.cpp +146 -0
- package/cpp/src/models/starcoder.cpp +100 -0
- package/cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/src/models/step35-iswa.cpp +168 -0
- package/cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/src/models/xverse.cpp +108 -0
- package/cpp/src/unicode-data.cpp +7034 -0
- package/cpp/src/unicode-data.h +20 -0
- package/cpp/src/unicode.cpp +1103 -0
- package/cpp/src/unicode.h +111 -0
- package/cpp/vendor/nlohmann/json.hpp +25526 -0
- package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/vendor/stb/stb_image.h +7988 -0
- package/ios/LocalLLM-Bridging-Header.h +2 -0
- package/ios/LocalLLM.h +5 -0
- package/ios/LocalLLM.mm +1267 -0
- package/local-llm-rn.podspec +60 -0
- package/package.json +35 -0
- package/src/NativeLocalLLM.ts +73 -0
- package/src/device.ts +50 -0
- package/src/download-adapter.ts +17 -0
- package/src/index.ts +21 -0
- package/src/native-bridge.ts +142 -0
- package/src/rn-downloader.ts +37 -0
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
#include "llama-impl.h"
|
|
2
|
+
|
|
3
|
+
#include "gguf.h"
|
|
4
|
+
#include "llama.h"
|
|
5
|
+
|
|
6
|
+
#include <cinttypes>
|
|
7
|
+
#include <climits>
|
|
8
|
+
#include <cstdarg>
|
|
9
|
+
#include <cstring>
|
|
10
|
+
#include <vector>
|
|
11
|
+
#include <sstream>
|
|
12
|
+
|
|
13
|
+
struct llama_logger_state {
|
|
14
|
+
ggml_log_callback log_callback = llama_log_callback_default;
|
|
15
|
+
void * log_callback_user_data = nullptr;
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
static llama_logger_state g_logger_state;
|
|
19
|
+
|
|
20
|
+
// Starts a scoped timing measurement.
//   t_acc   - accumulator that the destructor adds the elapsed microseconds to
//   disable - when true, t_start_us is set to -1 and the destructor adds nothing
time_meas::time_meas(int64_t & t_acc, bool disable)
    : t_start_us(disable ? -1 : ggml_time_us()),
      t_acc(t_acc) {
}
|
|
21
|
+
|
|
22
|
+
// Adds the time elapsed since construction to the accumulator, unless the
// measurement was disabled (signalled by a negative start timestamp).
time_meas::~time_meas() {
    if (t_start_us < 0) {
        return;
    }

    t_acc += ggml_time_us() - t_start_us;
}
|
|
27
|
+
|
|
28
|
+
// Retrieves the current log callback and its user data by forwarding both
// output pointers to ggml_log_get().
void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
    ggml_log_get(log_callback, user_data);
}
|
|
31
|
+
|
|
32
|
+
// Installs a log callback for both ggml and llama.
//   log_callback - new sink; when null, llama falls back to llama_log_callback_default
//   user_data    - opaque pointer stored and later passed to the callback
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    // ggml receives the callback exactly as given (including null)
    ggml_log_set(log_callback, user_data);

    if (log_callback) {
        g_logger_state.log_callback = log_callback;
    } else {
        g_logger_state.log_callback = llama_log_callback_default;
    }
    g_logger_state.log_callback_user_data = user_data;
}
|
|
37
|
+
|
|
38
|
+
// Formats a printf-style message and hands it to the registered log callback.
//   level  - severity forwarded unchanged to the callback
//   format - printf format string
//   args   - arguments for `format`; consumed by the first formatting pass
//
// Short messages are formatted into a stack buffer; longer ones are formatted
// a second time into a heap buffer sized from the first pass.
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
    // the first vsnprintf consumes `args`, so keep a copy for a possible second pass
    va_list args_copy;
    va_copy(args_copy, args);

    char buffer[128];
    const int len = vsnprintf(buffer, sizeof(buffer), format, args);

    if (len < 0) {
        // formatting failed: the buffer contents are unspecified, so emit nothing
        va_end(args_copy);
        return;
    }

    if (static_cast<size_t>(len) < sizeof(buffer)) {
        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
    } else {
        // message did not fit: format again into a buffer of the exact size;
        // std::vector releases the memory even if the callback throws
        std::vector<char> big(static_cast<size_t>(len) + 1);
        vsnprintf(big.data(), big.size(), format, args_copy);
        big[len] = 0;
        g_logger_state.log_callback(level, big.data(), g_logger_state.log_callback_user_data);
    }

    va_end(args_copy);
}
|
|
54
|
+
|
|
55
|
+
// Variadic entry point behind the LLAMA_LOG_* macros: packs the arguments
// into a va_list and delegates to llama_log_internal_v().
void llama_log_internal(ggml_log_level level, const char * format, ...) {
    va_list varargs;
    va_start(varargs, format);

    llama_log_internal_v(level, format, varargs);

    va_end(varargs);
}
|
|
61
|
+
|
|
62
|
+
// Default log sink: writes the text to stderr and flushes immediately.
// The level and user data are ignored.
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
    static_cast<void>(level);
    static_cast<void>(user_data);

    fputs(text, stderr);
    fflush(stderr);
}
|
|
68
|
+
|
|
69
|
+
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
// scanning left to right. An empty `search` leaves `s` untouched.
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    // build the result in a separate string so text inserted by `replace`
    // is never scanned again
    std::string result;
    result.reserve(s.length());

    size_t cursor = 0;
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        result.append(s, cursor, hit - cursor); // untouched text before the match
        result.append(replace);
        cursor = hit + search.length();
    }

    // tail after the last match
    result.append(s, cursor, std::string::npos);
    s = std::move(result);
}
|
|
85
|
+
|
|
86
|
+
// printf-style formatting into a std::string.
// The arguments are walked twice: once to measure the output, once to write it.
std::string format(const char * fmt, ...) {
    va_list args_measure;
    va_list args_write;
    va_start(args_measure, fmt);
    va_copy(args_write, args_measure);

    // pass 1: a null destination only reports the required length
    const int size = vsnprintf(NULL, 0, fmt, args_measure);
    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT

    // pass 2: write into a buffer with room for the terminating NUL
    std::vector<char> buf(size + 1);
    const int size2 = vsnprintf(buf.data(), size + 1, fmt, args_write);
    GGML_ASSERT(size2 == size);

    va_end(args_write);
    va_end(args_measure);

    return std::string(buf.data(), size);
}
|
|
100
|
+
|
|
101
|
+
// Formats a list of dimensions as a comma-separated string, each value
// right-aligned to a width of at least 5 (e.g. "    1,     2").
//   ne - dimensions; must contain at least one element
// Throws std::out_of_range (from vector::at) when `ne` is empty.
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
    // large enough for ", " plus any 64-bit value, so a single item never truncates
    char item[32];

    snprintf(item, sizeof(item), "%5" PRId64, ne.at(0));
    std::string out = item;

    // append item by item: no fixed-size output buffer, so long shapes are not cut off
    for (size_t i = 1; i < ne.size(); i++) {
        snprintf(item, sizeof(item), ", %5" PRId64, ne[i]);
        out += item;
    }

    return out;
}
|
|
109
|
+
|
|
110
|
+
// Formats all GGML_MAX_DIMS dimensions of a tensor as a comma-separated
// string, each value right-aligned to a width of at least 6.
std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
    char buf[256];

    snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]);

    for (int dim = 1; dim < GGML_MAX_DIMS; dim++) {
        // continue writing right after the text produced so far
        const size_t used = strlen(buf);
        snprintf(buf + used, sizeof(buf) - used, ", %6" PRId64, t->ne[dim]);
    }

    return buf;
}
|
|
118
|
+
|
|
119
|
+
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
|
120
|
+
switch (type) {
|
|
121
|
+
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
|
122
|
+
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
|
123
|
+
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
|
124
|
+
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
|
125
|
+
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
|
126
|
+
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
|
127
|
+
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
|
128
|
+
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
|
129
|
+
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
|
130
|
+
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
|
131
|
+
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
|
132
|
+
default: return format("unknown type %d", type);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|
137
|
+
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
|
138
|
+
|
|
139
|
+
switch (type) {
|
|
140
|
+
case GGUF_TYPE_STRING:
|
|
141
|
+
return gguf_get_val_str(ctx_gguf, i);
|
|
142
|
+
case GGUF_TYPE_ARRAY:
|
|
143
|
+
{
|
|
144
|
+
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
|
145
|
+
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
|
146
|
+
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
|
147
|
+
std::stringstream ss;
|
|
148
|
+
ss << "[";
|
|
149
|
+
for (int j = 0; j < arr_n; j++) {
|
|
150
|
+
if (arr_type == GGUF_TYPE_STRING) {
|
|
151
|
+
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
|
152
|
+
// escape quotes
|
|
153
|
+
replace_all(val, "\\", "\\\\");
|
|
154
|
+
replace_all(val, "\"", "\\\"");
|
|
155
|
+
ss << '"' << val << '"';
|
|
156
|
+
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
|
157
|
+
ss << "???";
|
|
158
|
+
} else {
|
|
159
|
+
ss << gguf_data_to_str(arr_type, data, j);
|
|
160
|
+
}
|
|
161
|
+
if (j < arr_n - 1) {
|
|
162
|
+
ss << ", ";
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
ss << "]";
|
|
166
|
+
return ss.str();
|
|
167
|
+
}
|
|
168
|
+
default:
|
|
169
|
+
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
|
170
|
+
}
|
|
171
|
+
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "ggml.h" // for ggml_log_level
|
|
4
|
+
|
|
5
|
+
#include <string>
|
|
6
|
+
#include <vector>
|
|
7
|
+
|
|
8
|
+
#ifdef __GNUC__
|
|
9
|
+
# if defined(__MINGW32__) && !defined(__clang__)
|
|
10
|
+
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
|
11
|
+
# else
|
|
12
|
+
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
|
13
|
+
# endif
|
|
14
|
+
#else
|
|
15
|
+
# define LLAMA_ATTRIBUTE_FORMAT(...)
|
|
16
|
+
#endif
|
|
17
|
+
|
|
18
|
+
//
|
|
19
|
+
// logging
|
|
20
|
+
//
|
|
21
|
+
|
|
22
|
+
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|
23
|
+
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
|
24
|
+
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
|
25
|
+
|
|
26
|
+
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
|
27
|
+
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
|
28
|
+
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
|
29
|
+
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
|
30
|
+
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
|
31
|
+
#define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
|
32
|
+
|
|
33
|
+
//
|
|
34
|
+
// helpers
|
|
35
|
+
//
|
|
36
|
+
|
|
37
|
+
// Thin wrapper around a single T with a defaulted constructor, so that
// default-initializing the wrapper does not assign a value to `value`.
template <typename T>
struct no_init {
    T value;

    no_init() = default;
};
|
|
42
|
+
|
|
43
|
+
// Scoped timer: records a start timestamp on construction and adds the
// elapsed microseconds to an external accumulator on destruction.
struct time_meas {
    // t_acc   - accumulator updated by the destructor
    // disable - when true, nothing is measured
    time_meas(int64_t & t_acc, bool disable = false);
    ~time_meas();

    // start timestamp in microseconds; negative when the measurement is disabled
    const int64_t t_start_us;

    // reference to the caller-owned accumulator
    int64_t & t_acc;
};
|
|
51
|
+
|
|
52
|
+
// Pointer + element count pair describing a buffer. The struct has no
// destructor, so it never frees the memory it points to.
template <typename T>
struct buffer_view {
    // null by default so that has_data() is well-defined on a
    // default-initialized view instead of reading an indeterminate pointer
    T * data = nullptr;
    size_t size = 0;

    // true only when the view points at something and is non-empty
    bool has_data() const {
        return data != nullptr && size > 0;
    }
};
|
|
61
|
+
|
|
62
|
+
void replace_all(std::string & s, const std::string & search, const std::string & replace);
|
|
63
|
+
|
|
64
|
+
// TODO: rename to llama_format ?
|
|
65
|
+
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
|
66
|
+
std::string format(const char * fmt, ...);
|
|
67
|
+
|
|
68
|
+
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
|
|
69
|
+
std::string llama_format_tensor_shape(const struct ggml_tensor * t);
|
|
70
|
+
|
|
71
|
+
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
|
|
72
|
+
|
|
73
|
+
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#include "llama-io.h"
|
|
2
|
+
|
|
3
|
+
// Serializes a string as a 32-bit length prefix followed by the raw bytes
// (no terminating NUL).
// NOTE(review): the length is stored in a uint32_t, so a string longer than
// that range would have its length narrowed - confirm callers never pass one.
void llama_io_write_i::write_string(const std::string & str) {
    const uint32_t n_chars = str.size();

    write(&n_chars, sizeof(n_chars));
    write(str.data(), n_chars);
}
|
|
9
|
+
|
|
10
|
+
// Deserializes a string written as a 32-bit length prefix followed by the
// raw bytes, replacing the contents of `str`.
void llama_io_read_i::read_string(std::string & str) {
    // length prefix is filled in by read_to()
    uint32_t n_chars;
    read_to(&n_chars, sizeof(n_chars));

    const char * bytes = (const char *) read(n_chars);
    str.assign(bytes, n_chars);
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <cstddef>
|
|
4
|
+
#include <cstdint>
|
|
5
|
+
#include <string>
|
|
6
|
+
|
|
7
|
+
struct ggml_tensor;
|
|
8
|
+
|
|
9
|
+
// Abstract output sink. Implementations provide the raw write primitives;
// write_string() is a non-virtual helper declared here and defined out of line.
class llama_io_write_i {
public:
    llama_io_write_i() = default;
    virtual ~llama_io_write_i() = default;

    // write `size` bytes starting at `src`
    virtual void write(const void * src, size_t size) = 0;

    // write `size` bytes of `tensor`, starting at byte `offset`
    // NOTE(review): offset/size semantics are defined by the implementations - confirm there
    virtual void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) = 0;

    // bytes written so far
    virtual size_t n_bytes() = 0;

    // write a string using the write() primitive
    void write_string(const std::string & str);
};
|
|
22
|
+
|
|
23
|
+
// Abstract input source. Implementations provide the raw read primitives;
// read_string() is a non-virtual helper declared here and defined out of line.
class llama_io_read_i {
public:
    llama_io_read_i() = default;
    virtual ~llama_io_read_i() = default;

    // consume `size` bytes and return a pointer to them
    // NOTE(review): lifetime of the returned pointer is implementation-defined - confirm there
    virtual const uint8_t * read(size_t size) = 0;

    // consume `size` bytes and copy them into `dst`
    virtual void read_to(void * dst, size_t size) = 0;

    // bytes read so far
    virtual size_t n_bytes() = 0;

    // read a string using the read primitives
    void read_string(std::string & str);
};
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
#include "llama-kv-cache-iswa.h"
|
|
2
|
+
|
|
3
|
+
#include "llama-impl.h"
|
|
4
|
+
#include "llama-batch.h"
|
|
5
|
+
#include "llama-model.h"
|
|
6
|
+
|
|
7
|
+
#include <algorithm>
|
|
8
|
+
#include <cassert>
|
|
9
|
+
|
|
10
|
+
//
|
|
11
|
+
// llama_kv_cache_iswa
|
|
12
|
+
//
|
|
13
|
+
|
|
14
|
+
// Builds the two underlying caches: `kv_base` for the layers that do not use
// sliding-window attention and `kv_swa` for the layers that do.
//   filter - optional per-layer predicate; a layer rejected by it goes into neither cache
//   reuse  - forwarded unchanged to both caches
// The remaining parameters are forwarded to the llama_kv_cache constructors,
// except that the SWA cache gets its own cell count (see below).
llama_kv_cache_iswa::llama_kv_cache_iswa(
        const llama_model & model,
        ggml_type type_k,
        ggml_type type_v,
        bool v_trans,
        bool offload,
        bool swa_full,
        bool unified,
        uint32_t kv_size,
        uint32_t n_seq_max,
        uint32_t n_ubatch,
        uint32_t n_pad,
        const layer_filter_cb & filter,
        const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {

    // chain filters: the user filter (if any) must accept the layer, then the
    // layer is routed by whether it is a SWA layer
    const layer_filter_cb filter_base = [&](int32_t il) {
        return (!filter || filter(il)) && !model.hparams.is_swa(il);
    };

    const layer_filter_cb filter_swa = [&](int32_t il) {
        return (!filter || filter(il)) && model.hparams.is_swa(il);
    };

    const uint32_t size_base = kv_size;

    // note: the SWA cache is always padded to 256 for performance
    //       https://github.com/ggml-org/llama.cpp/issues/17037
    const uint32_t n_swa_seqs = unified ? n_seq_max : 1;
    uint32_t size_swa = GGML_PAD(std::min(size_base, hparams.n_swa*n_swa_seqs + n_ubatch), 256);

    // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
    if (swa_full) {
        LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
                __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");

        size_swa = size_base;
    }

    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);

    kv_base = std::make_unique<llama_kv_cache>(
            model, type_k, type_v,
            v_trans, offload, unified, size_base, n_seq_max, n_pad,
            0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);

    LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);

    kv_swa = std::make_unique<llama_kv_cache>(
            model, type_k, type_v,
            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
            hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}
|
|
74
|
+
|
|
75
|
+
// Clears both underlying caches, forwarding the `data` flag to each.
void llama_kv_cache_iswa::clear(bool data) {
    kv_base->clear(data);
    kv_swa->clear(data);
}
|
|
79
|
+
|
|
80
|
+
// Removes positions [p0, p1) of a sequence from both caches.
// Both removals are always attempted; the result is true only if both succeed.
bool llama_kv_cache_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    const bool ok_base = kv_base->seq_rm(seq_id, p0, p1);
    const bool ok_swa  = kv_swa->seq_rm(seq_id, p0, p1);

    return ok_base && ok_swa;
}
|
|
88
|
+
|
|
89
|
+
// Forwards a sequence copy (src -> dst over [p0, p1)) to both caches.
void llama_kv_cache_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
    kv_swa->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}
|
|
93
|
+
|
|
94
|
+
// Forwards seq_keep for the given sequence to both caches.
void llama_kv_cache_iswa::seq_keep(llama_seq_id seq_id) {
    kv_base->seq_keep(seq_id);
    kv_swa->seq_keep(seq_id);
}
|
|
98
|
+
|
|
99
|
+
// Forwards a position shift of [p0, p1) by `shift` to both caches.
void llama_kv_cache_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
    kv_base->seq_add(seq_id, p0, p1, shift);
    kv_swa->seq_add(seq_id, p0, p1, shift);
}
|
|
103
|
+
|
|
104
|
+
// Forwards a position division of [p0, p1) by `d` to both caches.
void llama_kv_cache_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
    kv_base->seq_div(seq_id, p0, p1, d);
    kv_swa->seq_div(seq_id, p0, p1, d);
}
|
|
108
|
+
|
|
109
|
+
// Smallest position stored for the sequence, as reported by the SWA cache.
llama_pos llama_kv_cache_iswa::seq_pos_min(llama_seq_id seq_id) const {
    // the base cache is a superset of the SWA cache, so we can just check the SWA cache
    const llama_pos pos_min = kv_swa->seq_pos_min(seq_id);
    return pos_min;
}
|
|
113
|
+
|
|
114
|
+
// Largest position stored for the sequence, as reported by the SWA cache.
llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
    const llama_pos pos_max = kv_swa->seq_pos_max(seq_id);
    return pos_max;
}
|
|
117
|
+
|
|
118
|
+
std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
|
|
119
|
+
std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
|
|
120
|
+
for (const auto & buft_size : kv_swa->memory_breakdown()) {
|
|
121
|
+
mb[buft_size.first] += buft_size.second;
|
|
122
|
+
}
|
|
123
|
+
return mb;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// Splits the batch into ubatches and reserves slots for them in both caches.
//   balloc   - batch allocator that produces the ubatches
//   n_ubatch - ubatch size passed to the split functions
//   embd_all - unused
// Returns a context holding the ubatches and slot infos on success, or a
// context with status LLAMA_MEMORY_STATUS_FAILED_PREPARE if no split could
// be prepared in both caches.
llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
    GGML_UNUSED(embd_all);

    // Runs one splitting strategy: collect ubatches from `next_ubatch` until it
    // yields an empty one, then prepare them in both caches.
    // Returns nullptr when the strategy does not work out.
    const auto try_split = [&](const auto & next_ubatch) -> llama_memory_context_ptr {
        balloc.split_reset();

        std::vector<llama_ubatch> ubatches;
        while (true) {
            auto ubatch = next_ubatch();

            if (ubatch.n_tokens == 0) {
                break;
            }

            ubatches.push_back(std::move(ubatch)); // NOLINT
        }

        if (balloc.get_n_used() < balloc.get_n_tokens()) {
            // failed to find a suitable split
            return nullptr;
        }

        auto sinfos_base = kv_base->prepare(ubatches);
        if (sinfos_base.empty()) {
            return nullptr;
        }

        auto sinfos_swa = kv_swa->prepare(ubatches);
        if (sinfos_swa.empty()) {
            return nullptr;
        }

        assert(sinfos_base.size() == sinfos_swa.size());

        return std::make_unique<llama_kv_cache_iswa_context>(
                this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
    };

    // first try simple split
    // (non-unified mode requires equal splits, so the simple split is skipped there)
    if (unified) {
        if (auto ctx = try_split([&] { return balloc.split_simple(n_ubatch); })) {
            return ctx;
        }
    }

    // if it fails, try equal split
    if (auto ctx = try_split([&] { return balloc.split_equal(n_ubatch, !unified); })) {
        return ctx;
    }

    // TODO: if we fail again, we should attempt different splitting strategies
    //       but to do that properly, we first have to refactor the batches to be more flexible

    return std::make_unique<llama_kv_cache_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
|
|
211
|
+
|
|
212
|
+
// Creates a context via the constructor that calls init_full() on both caches.
llama_memory_context_ptr llama_kv_cache_iswa::init_full() {
    auto ctx = std::make_unique<llama_kv_cache_iswa_context>(this);
    return ctx;
}
|
|
215
|
+
|
|
216
|
+
// Creates a context via the constructor that calls init_update() on both
// caches, forwarding `lctx` and `optimize`.
llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, bool optimize) {
    auto ctx = std::make_unique<llama_kv_cache_iswa_context>(this, lctx, optimize);
    return ctx;
}
|
|
219
|
+
|
|
220
|
+
bool llama_kv_cache_iswa::get_can_shift() const {
|
|
221
|
+
return kv_base->get_can_shift() &&
|
|
222
|
+
kv_swa->get_can_shift() &&
|
|
223
|
+
kv_base->get_size() == kv_swa->get_size();
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
// Serializes the cache state: the base cache first (skipped when the
// PARTIAL_ONLY flag is set), then the SWA cache.
void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
    const bool partial_only = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) != 0;

    if (!partial_only) {
        kv_base->state_write(io, seq_id, flags);
    }

    kv_swa->state_write(io, seq_id, flags);
}
|
|
233
|
+
|
|
234
|
+
// Restores the cache state in the same order state_write() emits it: the
// base cache first (skipped when the PARTIAL_ONLY flag is set), then the SWA cache.
void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
    const bool partial_only = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) != 0;

    if (!partial_only) {
        kv_base->state_read(io, seq_id, flags);
    }

    kv_swa->state_read(io, seq_id, flags);
}
|
|
241
|
+
|
|
242
|
+
// Raw pointer to the non-SWA cache; ownership stays with this object.
llama_kv_cache * llama_kv_cache_iswa::get_base() const {
    llama_kv_cache * base = kv_base.get();
    return base;
}
|
|
245
|
+
|
|
246
|
+
// Raw pointer to the SWA cache; ownership stays with this object.
llama_kv_cache * llama_kv_cache_iswa::get_swa() const {
    llama_kv_cache * swa = kv_swa.get();
    return swa;
}
|
|
249
|
+
|
|
250
|
+
//
|
|
251
|
+
// llama_kv_cache_iswa_context
|
|
252
|
+
//
|
|
253
|
+
|
|
254
|
+
// Status-only context: stores `status` and sets up no sub-contexts.
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_memory_status status)
    : status(status) {
}
|
|
255
|
+
|
|
256
|
+
// Full-cache context: wraps the init_full() contexts of both caches and
// combines their statuses into one.
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(llama_kv_cache_iswa * kv)
    : ctx_base(kv->get_base()->init_full()),
      ctx_swa(kv->get_swa()->init_full()),
      status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
}
|
|
262
|
+
|
|
263
|
+
// Update context: wraps the init_update() contexts of both caches (with
// `lctx` and `optimize` forwarded) and combines their statuses into one.
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
        llama_kv_cache_iswa * kv,
        llama_context * lctx,
        bool optimize)
    : ctx_base(kv->get_base()->init_update(lctx, optimize)),
      ctx_swa(kv->get_swa()->init_update(lctx, optimize)),
      status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
}
|
|
271
|
+
|
|
272
|
+
// Batch context: takes ownership of the ubatches, creates one
// llama_kv_cache_context per underlying cache from its slot infos, and
// combines their statuses into one.
llama_kv_cache_iswa_context::llama_kv_cache_iswa_context(
        llama_kv_cache_iswa * kv,
        slot_info_vec_t sinfos_base,
        slot_info_vec_t sinfos_swa,
        std::vector<llama_ubatch> ubatches)
    : ubatches(std::move(ubatches)),
      // note: here we copy the ubatches. not sure if this is ideal
      // (`this->ubatches` is the member; the parameter of the same name was moved from above)
      ctx_base(std::make_unique<llama_kv_cache_context>(kv->get_base(), std::move(sinfos_base), this->ubatches)),
      ctx_swa(std::make_unique<llama_kv_cache_context>(kv->get_swa(), std::move(sinfos_swa), this->ubatches)),
      status(llama_memory_status_combine(ctx_base->get_status(), ctx_swa->get_status())) {
}
|
|
283
|
+
|
|
284
|
+
// Defaulted destructor, defined out of line in this translation unit.
llama_kv_cache_iswa_context::~llama_kv_cache_iswa_context() = default;
|
|
285
|
+
|
|
286
|
+
bool llama_kv_cache_iswa_context::next() {
|
|
287
|
+
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
|
288
|
+
|
|
289
|
+
ctx_base->next();
|
|
290
|
+
ctx_swa ->next();
|
|
291
|
+
|
|
292
|
+
if (++i_next >= ubatches.size()) {
|
|
293
|
+
return false;
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
return true;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
// Applies both sub-contexts. Both are always applied; the result is true
// only if both report success.
bool llama_kv_cache_iswa_context::apply() {
    assert(!llama_memory_status_is_fail(status));

    const bool ok_base = ctx_base->apply();
    const bool ok_swa  = ctx_swa->apply();

    return ok_base && ok_swa;
}
|
|
309
|
+
|
|
310
|
+
// Status stored at construction time.
llama_memory_status llama_kv_cache_iswa_context::get_status() const {
    const llama_memory_status current = status;
    return current;
}
|
|
313
|
+
|
|
314
|
+
// Reference to the ubatch at the current iteration index.
// Asserts that the context is in the SUCCESS state.
const llama_ubatch & llama_kv_cache_iswa_context::get_ubatch() const {
    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

    const llama_ubatch & current = ubatches[i_next];
    return current;
}
|
|
319
|
+
|
|
320
|
+
// Sub-context of the non-SWA cache, downcast to its concrete type.
// Asserts that the context is in the SUCCESS state.
const llama_kv_cache_context * llama_kv_cache_iswa_context::get_base() const {
    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

    const auto * base = static_cast<const llama_kv_cache_context *>(ctx_base.get());
    return base;
}
|
|
325
|
+
|
|
326
|
+
// Sub-context of the SWA cache, downcast to its concrete type.
// Asserts that the context is in the SUCCESS state.
const llama_kv_cache_context * llama_kv_cache_iswa_context::get_swa() const {
    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

    const auto * swa = static_cast<const llama_kv_cache_context *>(ctx_swa.get());
    return swa;
}
|