@novastera-oss/llamarn 0.2.9 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/proguard-rules.pro +12 -0
- package/android/src/main/cpp/include/llama.h +15 -47
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/CMakePresets.json +11 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -0
- package/cpp/llama.cpp/README.md +8 -8
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +62 -1
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +22 -6
- package/cpp/llama.cpp/common/common.h +22 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1250 -43
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +21 -13
- package/cpp/llama.cpp/ggml/CMakeLists.txt +13 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +44 -38
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +126 -8
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +138 -18
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +11 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1206 -163
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +36 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +86 -17
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +47 -60
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +29 -42
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +46 -59
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +38 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +23 -36
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +255 -99
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -695
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +104 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +27 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +48 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +572 -106
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +599 -105
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +800 -42
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +191 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +991 -307
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +59 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +18 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +84 -9
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +386 -67
- package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +307 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
- package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -47
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
- package/cpp/llama.cpp/include/llama.h +15 -47
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
- package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +316 -3
- package/cpp/llama.cpp/src/llama-arch.h +23 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +103 -71
- package/cpp/llama.cpp/src/llama-batch.h +31 -18
- package/cpp/llama.cpp/src/llama-chat.cpp +58 -1
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +180 -106
- package/cpp/llama.cpp/src/llama-context.h +26 -16
- package/cpp/llama.cpp/src/llama-cparams.h +3 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +310 -211
- package/cpp/llama.cpp/src/llama-graph.h +184 -122
- package/cpp/llama.cpp/src/llama-hparams.cpp +47 -1
- package/cpp/llama.cpp/src/llama-hparams.h +13 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +143 -47
- package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +36 -11
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model.cpp +3545 -719
- package/cpp/llama.cpp/src/llama-model.h +21 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +376 -10
- package/cpp/llama.cpp/src/llama-vocab.h +43 -0
- package/cpp/llama.cpp/src/unicode.cpp +207 -0
- package/cpp/llama.cpp/src/unicode.h +2 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +22 -4
- package/ios/include/llama.h +15 -47
- package/ios/libs/llama.xcframework/Info.plist +13 -13
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3766
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -4926
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -4897
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3794
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +4 -4
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -98,10 +98,20 @@ llama_context::llama_context(
|
|
|
98
98
|
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
|
|
99
99
|
cparams.n_batch = GGML_KQ_MASK_PAD;
|
|
100
100
|
}
|
|
101
|
-
|
|
102
101
|
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
|
|
103
102
|
|
|
104
103
|
cparams.op_offload = params.op_offload;
|
|
104
|
+
cparams.kv_unified = params.kv_unified;
|
|
105
|
+
|
|
106
|
+
{
|
|
107
|
+
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
|
|
108
|
+
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
|
|
109
|
+
|
|
110
|
+
if (!supports_set_rows && !cparams.kv_unified) {
|
|
111
|
+
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
|
|
112
|
+
cparams.kv_unified = true;
|
|
113
|
+
}
|
|
114
|
+
}
|
|
105
115
|
|
|
106
116
|
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
|
107
117
|
|
|
@@ -112,6 +122,7 @@ llama_context::llama_context(
|
|
|
112
122
|
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
|
|
113
123
|
LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
|
|
114
124
|
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
|
|
125
|
+
LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
|
|
115
126
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
|
116
127
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
|
117
128
|
|
|
@@ -227,8 +238,8 @@ llama_context::llama_context(
|
|
|
227
238
|
|
|
228
239
|
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
|
229
240
|
|
|
230
|
-
|
|
231
|
-
|
|
241
|
+
gf_res_prev.reset(new llm_graph_result(max_nodes));
|
|
242
|
+
gf_res_reserve.reset(new llm_graph_result(max_nodes));
|
|
232
243
|
|
|
233
244
|
// TODO: move these checks to ggml_backend_sched
|
|
234
245
|
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
|
@@ -267,7 +278,7 @@ llama_context::llama_context(
|
|
|
267
278
|
|
|
268
279
|
// reserve worst-case graph
|
|
269
280
|
if (!hparams.vocab_only && memory) {
|
|
270
|
-
const uint32_t n_seqs = cparams.n_seq_max;
|
|
281
|
+
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
|
271
282
|
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
|
272
283
|
|
|
273
284
|
LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
|
|
@@ -300,7 +311,7 @@ llama_context::llama_context(
|
|
|
300
311
|
|
|
301
312
|
// reserve with tg graph to get the number of splits and nodes
|
|
302
313
|
{
|
|
303
|
-
auto * gf = graph_reserve(
|
|
314
|
+
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
|
|
304
315
|
if (!gf) {
|
|
305
316
|
throw std::runtime_error("failed to allocate compute tg buffers");
|
|
306
317
|
}
|
|
@@ -311,6 +322,10 @@ llama_context::llama_context(
|
|
|
311
322
|
|
|
312
323
|
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
|
|
313
324
|
{
|
|
325
|
+
// TODO: not sure if the following graph would be a worse case for multi-stream KV caches:
|
|
326
|
+
//
|
|
327
|
+
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
|
328
|
+
//
|
|
314
329
|
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
|
315
330
|
if (!gf) {
|
|
316
331
|
throw std::runtime_error("failed to allocate compute pp buffers");
|
|
@@ -388,10 +403,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
|
|
|
388
403
|
return sched.get();
|
|
389
404
|
}
|
|
390
405
|
|
|
391
|
-
ggml_context * llama_context::get_ctx_compute() const {
|
|
392
|
-
return ctx_compute.get();
|
|
393
|
-
}
|
|
394
|
-
|
|
395
406
|
uint32_t llama_context::n_ctx() const {
|
|
396
407
|
return cparams.n_ctx;
|
|
397
408
|
}
|
|
@@ -463,6 +474,11 @@ bool llama_context::kv_self_update(bool optimize) {
|
|
|
463
474
|
}
|
|
464
475
|
}
|
|
465
476
|
|
|
477
|
+
// reset the previous graph result to make sure that it won't be reused
|
|
478
|
+
// TODO: change the mctx->apply() to return information if a graph reserve is needed
|
|
479
|
+
// reset the graph result only if the memory module did reset the scheduler
|
|
480
|
+
gf_res_prev->reset();
|
|
481
|
+
|
|
466
482
|
if (!mctx->apply()) {
|
|
467
483
|
LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
|
|
468
484
|
}
|
|
@@ -475,7 +491,7 @@ bool llama_context::kv_self_update(bool optimize) {
|
|
|
475
491
|
throw std::runtime_error("failed to initialize memory context");
|
|
476
492
|
}
|
|
477
493
|
|
|
478
|
-
const uint32_t n_seqs
|
|
494
|
+
const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
|
|
479
495
|
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
|
480
496
|
|
|
481
497
|
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
|
@@ -492,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
|
|
|
492
508
|
}
|
|
493
509
|
|
|
494
510
|
float * llama_context::get_logits() {
|
|
511
|
+
output_reorder();
|
|
512
|
+
|
|
495
513
|
return logits;
|
|
496
514
|
}
|
|
497
515
|
|
|
498
516
|
float * llama_context::get_logits_ith(int32_t i) {
|
|
499
517
|
int64_t j = -1;
|
|
500
518
|
|
|
519
|
+
output_reorder();
|
|
520
|
+
|
|
501
521
|
try {
|
|
502
522
|
if (logits == nullptr) {
|
|
503
523
|
throw std::runtime_error("no logits");
|
|
@@ -534,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
|
|
|
534
554
|
}
|
|
535
555
|
|
|
536
556
|
float * llama_context::get_embeddings() {
|
|
557
|
+
output_reorder();
|
|
558
|
+
|
|
537
559
|
return embd;
|
|
538
560
|
}
|
|
539
561
|
|
|
540
562
|
float * llama_context::get_embeddings_ith(int32_t i) {
|
|
541
563
|
int64_t j = -1;
|
|
542
564
|
|
|
565
|
+
output_reorder();
|
|
566
|
+
|
|
543
567
|
try {
|
|
544
568
|
if (embd == nullptr) {
|
|
545
569
|
throw std::runtime_error("no embeddings");
|
|
@@ -678,38 +702,59 @@ bool llama_context::apply_adapter_cvec(
|
|
|
678
702
|
return cvec.apply(model, data, len, n_embd, il_start, il_end);
|
|
679
703
|
}
|
|
680
704
|
|
|
681
|
-
|
|
705
|
+
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
|
682
706
|
if (mctx && !mctx->apply()) {
|
|
683
707
|
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
|
|
684
708
|
ret = GGML_STATUS_FAILED;
|
|
685
709
|
return nullptr;
|
|
686
710
|
}
|
|
687
711
|
|
|
688
|
-
auto *
|
|
689
|
-
|
|
690
|
-
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
|
|
691
|
-
ret = GGML_STATUS_FAILED;
|
|
692
|
-
return nullptr;
|
|
693
|
-
}
|
|
712
|
+
auto * res = gf_res_prev.get();
|
|
713
|
+
auto * gf = res->get_gf();
|
|
694
714
|
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
ret = GGML_STATUS_FAILED;
|
|
699
|
-
return nullptr;
|
|
700
|
-
}
|
|
715
|
+
// the new graph parameters
|
|
716
|
+
// in order to correctly reuse a graph, its full topology has to be uniquely determined by these parameters
|
|
717
|
+
const auto gparams = graph_params(res, ubatch, mctx, gtype);
|
|
701
718
|
|
|
702
|
-
|
|
719
|
+
if (res->can_reuse(gparams)) {
|
|
720
|
+
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
|
|
703
721
|
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
722
|
+
n_reused++;
|
|
723
|
+
} else {
|
|
724
|
+
res->reset();
|
|
725
|
+
|
|
726
|
+
ggml_backend_sched_reset(sched.get());
|
|
727
|
+
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
|
728
|
+
|
|
729
|
+
//const auto t_start_us = ggml_time_us();
|
|
730
|
+
|
|
731
|
+
gf = model.build_graph(gparams);
|
|
732
|
+
|
|
733
|
+
//LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
|
|
734
|
+
|
|
735
|
+
if (!gf) {
|
|
736
|
+
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
|
|
737
|
+
ret = GGML_STATUS_FAILED;
|
|
738
|
+
return nullptr;
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
|
|
742
|
+
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
|
743
|
+
ret = GGML_STATUS_ALLOC_FAILED;
|
|
744
|
+
return nullptr;
|
|
745
|
+
}
|
|
708
746
|
}
|
|
709
747
|
|
|
710
|
-
|
|
748
|
+
// set the input data for the input tensors
|
|
749
|
+
{
|
|
750
|
+
//const auto t_start_us = ggml_time_us();
|
|
751
|
+
|
|
752
|
+
res->set_inputs(&ubatch);
|
|
753
|
+
|
|
754
|
+
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
|
|
755
|
+
}
|
|
711
756
|
|
|
712
|
-
const auto status = graph_compute(
|
|
757
|
+
const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
|
|
713
758
|
if (status != GGML_STATUS_SUCCESS) {
|
|
714
759
|
LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
|
|
715
760
|
ret = status;
|
|
@@ -731,16 +776,19 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|
|
731
776
|
|
|
732
777
|
const auto & hparams = model.hparams;
|
|
733
778
|
|
|
734
|
-
const int64_t n_embd
|
|
779
|
+
const int64_t n_embd = hparams.n_embd;
|
|
780
|
+
const int32_t n_vocab = model.vocab.n_tokens();
|
|
735
781
|
|
|
736
782
|
// note: during encode, we always pass the full sequence starting from pos = 0
|
|
737
|
-
if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
|
|
783
|
+
if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
|
|
738
784
|
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
|
739
785
|
return -1;
|
|
740
786
|
}
|
|
741
787
|
|
|
742
788
|
const uint32_t n_tokens = balloc->get_n_tokens();
|
|
743
789
|
|
|
790
|
+
// [TAG_NO_CACHE_PAD]
|
|
791
|
+
// TODO: add new split mode where we pad the input sequences so that ubatch.equal_seqs == true
|
|
744
792
|
const llama_ubatch ubatch = balloc->split_simple(n_tokens);
|
|
745
793
|
|
|
746
794
|
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
|
|
@@ -767,9 +815,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|
|
767
815
|
|
|
768
816
|
n_outputs = n_tokens;
|
|
769
817
|
|
|
770
|
-
ggml_backend_sched_reset(sched.get());
|
|
771
|
-
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
|
772
|
-
|
|
773
818
|
const auto causal_attn_org = cparams.causal_attn;
|
|
774
819
|
|
|
775
820
|
// always use non-causal attention for encoder graphs
|
|
@@ -778,7 +823,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|
|
778
823
|
cparams.causal_attn = false;
|
|
779
824
|
|
|
780
825
|
ggml_status status;
|
|
781
|
-
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
|
|
826
|
+
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
|
|
782
827
|
|
|
783
828
|
cparams.causal_attn = causal_attn_org;
|
|
784
829
|
|
|
@@ -791,10 +836,20 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|
|
791
836
|
}
|
|
792
837
|
}
|
|
793
838
|
|
|
839
|
+
auto * t_logits = res->get_logits();
|
|
794
840
|
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
|
|
795
841
|
|
|
842
|
+
// extract logits
|
|
843
|
+
if (logits && t_logits) {
|
|
844
|
+
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
|
|
845
|
+
GGML_ASSERT(backend_res != nullptr);
|
|
846
|
+
GGML_ASSERT(logits != nullptr);
|
|
847
|
+
|
|
848
|
+
ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float));
|
|
849
|
+
}
|
|
850
|
+
|
|
796
851
|
// extract embeddings
|
|
797
|
-
if (t_embd) {
|
|
852
|
+
if (embd && t_embd) {
|
|
798
853
|
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
|
|
799
854
|
GGML_ASSERT(backend_embd != nullptr);
|
|
800
855
|
|
|
@@ -844,9 +899,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|
|
844
899
|
}
|
|
845
900
|
}
|
|
846
901
|
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
902
|
+
if (!supports_set_rows) {
|
|
903
|
+
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
|
904
|
+
// overlap with device computation.
|
|
905
|
+
ggml_backend_sched_reset(sched.get());
|
|
906
|
+
}
|
|
850
907
|
|
|
851
908
|
// TODO: hacky solution
|
|
852
909
|
if (model.arch == LLM_ARCH_T5 && t_embd) {
|
|
@@ -899,7 +956,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|
|
899
956
|
// when computing embeddings, all tokens are output
|
|
900
957
|
const bool output_all = cparams.embeddings;
|
|
901
958
|
|
|
902
|
-
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, output_all)) {
|
|
959
|
+
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
|
|
903
960
|
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
|
904
961
|
return -1;
|
|
905
962
|
}
|
|
@@ -927,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|
|
927
984
|
|
|
928
985
|
// TODO: this clear of the buffer can easily be forgotten - need something better
|
|
929
986
|
embd_seq.clear();
|
|
987
|
+
output_swaps.clear();
|
|
930
988
|
|
|
931
989
|
bool did_optimize = false;
|
|
932
990
|
|
|
@@ -1005,11 +1063,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|
|
1005
1063
|
n_outputs = n_outputs_new;
|
|
1006
1064
|
}
|
|
1007
1065
|
|
|
1008
|
-
ggml_backend_sched_reset(sched.get());
|
|
1009
|
-
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
|
1010
|
-
|
|
1011
1066
|
ggml_status status;
|
|
1012
|
-
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
|
|
1067
|
+
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
|
|
1013
1068
|
|
|
1014
1069
|
if (!res) {
|
|
1015
1070
|
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
|
|
@@ -1149,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|
|
1149
1204
|
// make the outputs have the same order they had in the user-provided batch
|
|
1150
1205
|
// note: this is mostly relevant for recurrent models atm
|
|
1151
1206
|
if (!sorted_output) {
|
|
1152
|
-
const uint32_t n_vocab = model.vocab.n_tokens();
|
|
1153
|
-
const uint64_t n_embd = model.hparams.n_embd;
|
|
1154
|
-
|
|
1155
1207
|
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
|
1156
1208
|
|
|
1157
1209
|
// TODO: is there something more efficient which also minimizes swaps?
|
|
@@ -1167,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|
|
1167
1219
|
continue;
|
|
1168
1220
|
}
|
|
1169
1221
|
std::swap(out_ids[i], out_ids[j_min]);
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
}
|
|
1174
|
-
}
|
|
1175
|
-
if (embd_size > 0) {
|
|
1176
|
-
for (uint32_t k = 0; k < n_embd; k++) {
|
|
1177
|
-
std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
|
|
1178
|
-
}
|
|
1179
|
-
}
|
|
1222
|
+
|
|
1223
|
+
// remember the swaps and apply them lazily upon logits/embeddings access
|
|
1224
|
+
output_swaps.push_back({ i, j_min });
|
|
1180
1225
|
}
|
|
1181
1226
|
|
|
1182
1227
|
std::fill(output_ids.begin(), output_ids.end(), -1);
|
|
@@ -1190,9 +1235,11 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|
|
1190
1235
|
// wait for the computation to finish (automatically done when obtaining the model output)
|
|
1191
1236
|
//synchronize();
|
|
1192
1237
|
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1238
|
+
if (!supports_set_rows) {
|
|
1239
|
+
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
|
1240
|
+
// overlap with device computation.
|
|
1241
|
+
ggml_backend_sched_reset(sched.get());
|
|
1242
|
+
}
|
|
1196
1243
|
|
|
1197
1244
|
return 0;
|
|
1198
1245
|
}
|
|
@@ -1271,24 +1318,40 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
|
|
1271
1318
|
return n_outputs_max;
|
|
1272
1319
|
}
|
|
1273
1320
|
|
|
1321
|
+
void llama_context::output_reorder() {
|
|
1322
|
+
const uint32_t n_vocab = model.vocab.n_tokens();
|
|
1323
|
+
const uint64_t n_embd = model.hparams.n_embd;
|
|
1324
|
+
|
|
1325
|
+
for (uint32_t s = 0; s < output_swaps.size(); ++s) {
|
|
1326
|
+
const uint32_t i0 = output_swaps[s].i0;
|
|
1327
|
+
const uint32_t i1 = output_swaps[s].i1;
|
|
1328
|
+
|
|
1329
|
+
if (logits_size > 0) {
|
|
1330
|
+
for (uint32_t k = 0; k < n_vocab; k++) {
|
|
1331
|
+
std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
|
|
1332
|
+
}
|
|
1333
|
+
}
|
|
1334
|
+
|
|
1335
|
+
if (embd_size > 0) {
|
|
1336
|
+
for (uint32_t k = 0; k < n_embd; k++) {
|
|
1337
|
+
std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
|
|
1338
|
+
}
|
|
1339
|
+
}
|
|
1340
|
+
}
|
|
1341
|
+
|
|
1342
|
+
output_swaps.clear();
|
|
1343
|
+
}
|
|
1344
|
+
|
|
1274
1345
|
//
|
|
1275
1346
|
// graph
|
|
1276
1347
|
//
|
|
1277
1348
|
|
|
1278
|
-
|
|
1279
|
-
return std::max<
|
|
1349
|
+
uint32_t llama_context::graph_max_nodes() const {
|
|
1350
|
+
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
|
1280
1351
|
}
|
|
1281
1352
|
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
/*.mem_size =*/ buf_compute_meta.size(),
|
|
1285
|
-
/*.mem_buffer =*/ buf_compute_meta.data(),
|
|
1286
|
-
/*.no_alloc =*/ true,
|
|
1287
|
-
};
|
|
1288
|
-
|
|
1289
|
-
ctx_compute.reset(ggml_init(params));
|
|
1290
|
-
|
|
1291
|
-
return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
|
|
1353
|
+
llm_graph_result * llama_context::get_gf_res_reserve() const {
|
|
1354
|
+
return static_cast<llm_graph_result *>(gf_res_reserve.get());
|
|
1292
1355
|
}
|
|
1293
1356
|
|
|
1294
1357
|
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
|
|
@@ -1301,6 +1364,11 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
|
|
1301
1364
|
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
|
|
1302
1365
|
}
|
|
1303
1366
|
|
|
1367
|
+
ggml_backend_sched_reset(sched.get());
|
|
1368
|
+
|
|
1369
|
+
// when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
|
|
1370
|
+
gf_res_prev->reset();
|
|
1371
|
+
|
|
1304
1372
|
// store the n_outputs as it is, and restore it afterwards
|
|
1305
1373
|
// TODO: not sure if needed, might simplify in the future by removing this
|
|
1306
1374
|
const auto save_n_outputs = this->n_outputs;
|
|
@@ -1310,17 +1378,15 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
|
|
1310
1378
|
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
|
|
1311
1379
|
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
|
|
1312
1380
|
|
|
1313
|
-
auto *
|
|
1314
|
-
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
|
|
1381
|
+
auto * res = gf_res_reserve.get();
|
|
1315
1382
|
|
|
1316
|
-
|
|
1383
|
+
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
|
|
1317
1384
|
|
|
1318
|
-
|
|
1319
|
-
LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
|
|
1320
|
-
return nullptr;
|
|
1321
|
-
}
|
|
1385
|
+
res->reset();
|
|
1322
1386
|
|
|
1323
|
-
|
|
1387
|
+
auto * gf = model.build_graph(gparams);
|
|
1388
|
+
|
|
1389
|
+
this->n_outputs = save_n_outputs;
|
|
1324
1390
|
|
|
1325
1391
|
// initialize scheduler with the specified graph
|
|
1326
1392
|
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
|
@@ -1331,28 +1397,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
|
|
1331
1397
|
return gf;
|
|
1332
1398
|
}
|
|
1333
1399
|
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
}, gf, gtype);
|
|
1400
|
+
llm_graph_params llama_context::graph_params(
|
|
1401
|
+
llm_graph_result * res,
|
|
1402
|
+
const llama_ubatch & ubatch,
|
|
1403
|
+
const llama_memory_context_i * mctx,
|
|
1404
|
+
llm_graph_type gtype) const {
|
|
1405
|
+
return {
|
|
1406
|
+
/*.arch =*/ model.arch,
|
|
1407
|
+
/*.hparams =*/ model.hparams,
|
|
1408
|
+
/*.cparams =*/ cparams,
|
|
1409
|
+
/*.ubatch =*/ ubatch,
|
|
1410
|
+
/*.gtype =*/ gtype,
|
|
1411
|
+
/*.sched =*/ sched.get(),
|
|
1412
|
+
/*.backend_cpu =*/ backend_cpu,
|
|
1413
|
+
/*.cvec =*/ &cvec,
|
|
1414
|
+
/*.loras =*/ &loras,
|
|
1415
|
+
/*.mctx =*/ mctx,
|
|
1416
|
+
/*.cross =*/ &cross,
|
|
1417
|
+
/*.n_outputs =*/ n_outputs,
|
|
1418
|
+
/*.cb =*/ graph_get_cb(),
|
|
1419
|
+
/*.res =*/ res,
|
|
1420
|
+
};
|
|
1356
1421
|
}
|
|
1357
1422
|
|
|
1358
1423
|
ggml_status llama_context::graph_compute(
|
|
@@ -1930,6 +1995,7 @@ llama_perf_context_data llama_context::perf_get_data() const {
|
|
|
1930
1995
|
data.t_eval_ms = 1e-3 * t_eval_us;
|
|
1931
1996
|
data.n_p_eval = std::max(1, n_p_eval);
|
|
1932
1997
|
data.n_eval = std::max(1, n_eval);
|
|
1998
|
+
data.n_reused = std::max(0, n_reused);
|
|
1933
1999
|
|
|
1934
2000
|
return data;
|
|
1935
2001
|
}
|
|
@@ -1938,6 +2004,7 @@ void llama_context::perf_reset() {
|
|
|
1938
2004
|
t_start_us = ggml_time_us();
|
|
1939
2005
|
t_eval_us = n_eval = 0;
|
|
1940
2006
|
t_p_eval_us = n_p_eval = 0;
|
|
2007
|
+
n_reused = 0;
|
|
1941
2008
|
}
|
|
1942
2009
|
|
|
1943
2010
|
//
|
|
@@ -2028,7 +2095,7 @@ void llama_context::opt_epoch_iter(
|
|
|
2028
2095
|
batch.logits [pos_batch] = true;
|
|
2029
2096
|
}
|
|
2030
2097
|
|
|
2031
|
-
if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, true)) {
|
|
2098
|
+
if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
|
|
2032
2099
|
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
|
2033
2100
|
return;
|
|
2034
2101
|
}
|
|
@@ -2064,8 +2131,13 @@ void llama_context::opt_epoch_iter(
|
|
|
2064
2131
|
break;
|
|
2065
2132
|
}
|
|
2066
2133
|
|
|
2067
|
-
auto *
|
|
2068
|
-
|
|
2134
|
+
auto * res = gf_res_prev.get();
|
|
2135
|
+
|
|
2136
|
+
const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
|
|
2137
|
+
|
|
2138
|
+
res->reset();
|
|
2139
|
+
|
|
2140
|
+
auto * gf = model.build_graph(gparams);
|
|
2069
2141
|
|
|
2070
2142
|
struct ggml_context * ctx_compute_opt;
|
|
2071
2143
|
{
|
|
@@ -2187,6 +2259,7 @@ llama_context_params llama_context_default_params() {
|
|
|
2187
2259
|
/*.no_perf =*/ true,
|
|
2188
2260
|
/*.op_offload =*/ true,
|
|
2189
2261
|
/*.swa_full =*/ true,
|
|
2262
|
+
/*.kv_unified =*/ false,
|
|
2190
2263
|
};
|
|
2191
2264
|
|
|
2192
2265
|
return result;
|
|
@@ -2807,6 +2880,7 @@ void llama_perf_context_print(const llama_context * ctx) {
|
|
|
2807
2880
|
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
|
2808
2881
|
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
|
|
2809
2882
|
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
|
|
2883
|
+
LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
|
|
2810
2884
|
}
|
|
2811
2885
|
|
|
2812
2886
|
void llama_perf_context_reset(llama_context * ctx) {
|
|
@@ -35,8 +35,6 @@ struct llama_context {
|
|
|
35
35
|
|
|
36
36
|
ggml_backend_sched_t get_sched() const;
|
|
37
37
|
|
|
38
|
-
ggml_context * get_ctx_compute() const;
|
|
39
|
-
|
|
40
38
|
uint32_t n_ctx() const;
|
|
41
39
|
uint32_t n_ctx_per_seq() const;
|
|
42
40
|
uint32_t n_batch() const;
|
|
@@ -96,7 +94,7 @@ struct llama_context {
|
|
|
96
94
|
// if memory_context is provided, it will be applied first to the context's memory
|
|
97
95
|
// ret contains the status of the graph computation
|
|
98
96
|
// returns nullptr only if ret != GGML_STATUS_SUCCESS
|
|
99
|
-
|
|
97
|
+
llm_graph_result * process_ubatch(
|
|
100
98
|
const llama_ubatch & ubatch,
|
|
101
99
|
llm_graph_type gtype,
|
|
102
100
|
llama_memory_context_i * mctx,
|
|
@@ -183,15 +181,17 @@ private:
|
|
|
183
181
|
// Returns max number of outputs for which space was reserved.
|
|
184
182
|
uint32_t output_reserve(int32_t n_outputs);
|
|
185
183
|
|
|
184
|
+
void output_reorder();
|
|
185
|
+
|
|
186
186
|
//
|
|
187
187
|
// graph
|
|
188
188
|
//
|
|
189
189
|
|
|
190
190
|
public:
|
|
191
|
-
|
|
191
|
+
uint32_t graph_max_nodes() const;
|
|
192
192
|
|
|
193
|
-
//
|
|
194
|
-
|
|
193
|
+
// can reuse the llm_graph_result instance of the context (for example to update a memory module)
|
|
194
|
+
llm_graph_result * get_gf_res_reserve() const;
|
|
195
195
|
|
|
196
196
|
// returns the result of ggml_backend_sched_graph_compute_async execution
|
|
197
197
|
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
|
|
@@ -200,12 +200,11 @@ public:
|
|
|
200
200
|
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
|
|
201
201
|
|
|
202
202
|
private:
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
const llama_memory_context_i * mctx);
|
|
203
|
+
llm_graph_params graph_params(
|
|
204
|
+
llm_graph_result * res,
|
|
205
|
+
const llama_ubatch & ubatch,
|
|
206
|
+
const llama_memory_context_i * mctx,
|
|
207
|
+
llm_graph_type gtype) const;
|
|
209
208
|
|
|
210
209
|
llm_graph_cb graph_get_cb() const;
|
|
211
210
|
|
|
@@ -253,13 +252,18 @@ private:
|
|
|
253
252
|
|
|
254
253
|
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
|
255
254
|
|
|
255
|
+
struct swap_info {
|
|
256
|
+
uint32_t i0;
|
|
257
|
+
uint32_t i1;
|
|
258
|
+
};
|
|
259
|
+
|
|
260
|
+
std::vector<swap_info> output_swaps;
|
|
261
|
+
|
|
256
262
|
ggml_backend_sched_ptr sched;
|
|
257
263
|
|
|
258
264
|
ggml_backend_t backend_cpu = nullptr;
|
|
259
265
|
std::vector<ggml_backend_ptr> backends;
|
|
260
266
|
|
|
261
|
-
ggml_context_ptr ctx_compute;
|
|
262
|
-
|
|
263
267
|
// training
|
|
264
268
|
ggml_opt_context_t opt_ctx = nullptr;
|
|
265
269
|
|
|
@@ -275,14 +279,18 @@ private:
|
|
|
275
279
|
std::vector<ggml_backend_t> backend_ptrs;
|
|
276
280
|
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
|
277
281
|
|
|
278
|
-
|
|
279
|
-
|
|
282
|
+
llm_graph_result_ptr gf_res_prev;
|
|
283
|
+
llm_graph_result_ptr gf_res_reserve;
|
|
280
284
|
|
|
281
285
|
// host buffer for the model output (logits and embeddings)
|
|
282
286
|
ggml_backend_buffer_ptr buf_output;
|
|
283
287
|
|
|
284
288
|
bool has_evaluated_once = false;
|
|
285
289
|
|
|
290
|
+
// env: LLAMA_SET_ROWS (temporary)
|
|
291
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
|
292
|
+
bool supports_set_rows = false;
|
|
293
|
+
|
|
286
294
|
// perf
|
|
287
295
|
mutable int64_t t_start_us = 0;
|
|
288
296
|
mutable int64_t t_load_us = 0;
|
|
@@ -294,4 +302,6 @@ private:
|
|
|
294
302
|
|
|
295
303
|
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
|
296
304
|
mutable int32_t n_eval = 0; // number of eval calls
|
|
305
|
+
|
|
306
|
+
mutable int32_t n_reused = 0; // number of times the previous graph was reused
|
|
297
307
|
};
|
|
@@ -11,8 +11,8 @@ struct llama_cparams {
|
|
|
11
11
|
uint32_t n_batch;
|
|
12
12
|
uint32_t n_ubatch;
|
|
13
13
|
uint32_t n_seq_max;
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
int32_t n_threads; // number of threads to use for generation
|
|
15
|
+
int32_t n_threads_batch; // number of threads to use for batch processing
|
|
16
16
|
|
|
17
17
|
float rope_freq_base;
|
|
18
18
|
float rope_freq_scale;
|
|
@@ -33,6 +33,7 @@ struct llama_cparams {
|
|
|
33
33
|
bool no_perf;
|
|
34
34
|
bool warmup;
|
|
35
35
|
bool op_offload;
|
|
36
|
+
bool kv_unified;
|
|
36
37
|
|
|
37
38
|
enum llama_pooling_type pooling_type;
|
|
38
39
|
|