@novastera-oss/llamarn 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +4 -2
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
|
@@ -23,32 +23,21 @@ uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) {
|
|
|
23
23
|
}
|
|
24
24
|
|
|
25
25
|
llama_kv_cache_unified::llama_kv_cache_unified(
|
|
26
|
-
const llama_model &
|
|
27
|
-
|
|
28
|
-
ggml_type
|
|
29
|
-
|
|
30
|
-
bool
|
|
31
|
-
|
|
32
|
-
uint32_t
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
|
|
42
|
-
|
|
43
|
-
head = 0;
|
|
44
|
-
size = kv_size;
|
|
45
|
-
used = 0;
|
|
46
|
-
|
|
47
|
-
this->type_k = type_k;
|
|
48
|
-
this->type_v = type_v;
|
|
49
|
-
|
|
50
|
-
cells.clear();
|
|
51
|
-
cells.resize(kv_size);
|
|
26
|
+
const llama_model & model,
|
|
27
|
+
layer_filter_cb && filter,
|
|
28
|
+
ggml_type type_k,
|
|
29
|
+
ggml_type type_v,
|
|
30
|
+
bool v_trans,
|
|
31
|
+
bool offload,
|
|
32
|
+
uint32_t kv_size,
|
|
33
|
+
uint32_t n_seq_max,
|
|
34
|
+
uint32_t n_pad,
|
|
35
|
+
uint32_t n_swa,
|
|
36
|
+
llama_swa_type swa_type) :
|
|
37
|
+
model(model), hparams(model.hparams), v_trans(v_trans),
|
|
38
|
+
n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
|
|
39
|
+
|
|
40
|
+
GGML_ASSERT(kv_size % n_pad == 0);
|
|
52
41
|
|
|
53
42
|
// create a context for each buffer type
|
|
54
43
|
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
|
@@ -56,7 +45,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
|
|
56
45
|
auto it = ctx_map.find(buft);
|
|
57
46
|
if (it == ctx_map.end()) {
|
|
58
47
|
ggml_init_params params = {
|
|
59
|
-
/*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
|
|
48
|
+
/*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()),
|
|
60
49
|
/*.mem_buffer =*/ NULL,
|
|
61
50
|
/*.no_alloc =*/ true,
|
|
62
51
|
};
|
|
@@ -75,37 +64,48 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
|
|
75
64
|
return it->second;
|
|
76
65
|
};
|
|
77
66
|
|
|
78
|
-
|
|
79
|
-
v_l.reserve(n_layer);
|
|
67
|
+
head = 0;
|
|
80
68
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
69
|
+
cells.resize(kv_size);
|
|
70
|
+
|
|
71
|
+
for (uint32_t il = 0; il < hparams.n_layer; il++) {
|
|
72
|
+
if (filter && !filter(il)) {
|
|
73
|
+
LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
|
|
74
|
+
continue;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
|
|
78
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
|
84
79
|
|
|
85
80
|
const char * dev_name = "CPU";
|
|
86
81
|
|
|
87
82
|
ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
|
|
88
83
|
|
|
89
84
|
if (offload) {
|
|
90
|
-
auto * dev = model.dev_layer(
|
|
85
|
+
auto * dev = model.dev_layer(il);
|
|
91
86
|
buft = ggml_backend_dev_buffer_type(dev);
|
|
92
87
|
|
|
93
88
|
dev_name = ggml_backend_dev_name(dev);
|
|
94
89
|
}
|
|
95
90
|
|
|
96
|
-
LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__,
|
|
91
|
+
LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name);
|
|
97
92
|
|
|
98
93
|
ggml_context * ctx = ctx_for_buft(buft);
|
|
99
94
|
if (!ctx) {
|
|
100
95
|
throw std::runtime_error("failed to create ggml context for kv cache");
|
|
101
96
|
}
|
|
102
97
|
|
|
103
|
-
ggml_tensor * k
|
|
104
|
-
ggml_tensor * v
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
98
|
+
ggml_tensor * k;
|
|
99
|
+
ggml_tensor * v;
|
|
100
|
+
|
|
101
|
+
k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size);
|
|
102
|
+
v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size);
|
|
103
|
+
|
|
104
|
+
ggml_format_name(k, "cache_k_l%d", il);
|
|
105
|
+
ggml_format_name(v, "cache_v_l%d", il);
|
|
106
|
+
|
|
107
|
+
map_layer_ids[il] = layers.size();
|
|
108
|
+
layers.push_back({ il, k, v });
|
|
109
109
|
}
|
|
110
110
|
|
|
111
111
|
// allocate tensors and initialize the buffers to avoid NaNs in the padding
|
|
@@ -117,8 +117,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
|
|
117
117
|
if (!buf) {
|
|
118
118
|
throw std::runtime_error("failed to allocate buffer for kv cache");
|
|
119
119
|
}
|
|
120
|
-
|
|
120
|
+
|
|
121
121
|
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
|
122
|
+
|
|
123
|
+
ggml_backend_buffer_clear(buf, 0);
|
|
122
124
|
bufs.emplace_back(buf);
|
|
123
125
|
}
|
|
124
126
|
|
|
@@ -126,20 +128,17 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
|
|
126
128
|
const size_t memory_size_k = size_k_bytes();
|
|
127
129
|
const size_t memory_size_v = size_v_bytes();
|
|
128
130
|
|
|
129
|
-
LLAMA_LOG_INFO("%s:
|
|
130
|
-
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
|
|
131
|
+
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
|
|
132
|
+
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
|
|
131
133
|
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
|
|
132
134
|
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
|
133
135
|
}
|
|
134
136
|
}
|
|
135
137
|
|
|
136
138
|
void llama_kv_cache_unified::clear() {
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
cells[i].seq_id.clear();
|
|
140
|
-
}
|
|
139
|
+
cells.reset();
|
|
140
|
+
|
|
141
141
|
head = 0;
|
|
142
|
-
used = 0;
|
|
143
142
|
|
|
144
143
|
for (auto & buf : bufs) {
|
|
145
144
|
ggml_backend_buffer_clear(buf.get(), 0);
|
|
@@ -147,7 +146,7 @@ void llama_kv_cache_unified::clear() {
|
|
|
147
146
|
}
|
|
148
147
|
|
|
149
148
|
bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
|
150
|
-
uint32_t new_head = size;
|
|
149
|
+
uint32_t new_head = cells.size();
|
|
151
150
|
|
|
152
151
|
if (p0 < 0) {
|
|
153
152
|
p0 = 0;
|
|
@@ -157,32 +156,20 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|
|
157
156
|
p1 = std::numeric_limits<llama_pos>::max();
|
|
158
157
|
}
|
|
159
158
|
|
|
160
|
-
for (uint32_t i = 0; i < size; ++i) {
|
|
161
|
-
if (cells
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
} else if (cells[i].has_seq_id(seq_id)) {
|
|
165
|
-
cells[i].seq_id.erase(seq_id);
|
|
166
|
-
} else {
|
|
167
|
-
continue;
|
|
168
|
-
}
|
|
169
|
-
if (cells[i].is_empty()) {
|
|
170
|
-
// keep count of the number of used cells
|
|
171
|
-
if (cells[i].pos >= 0) {
|
|
172
|
-
used--;
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
cells[i].pos = -1;
|
|
159
|
+
for (uint32_t i = 0; i < cells.size(); ++i) {
|
|
160
|
+
if (!cells.pos_in(i, p0, p1)) {
|
|
161
|
+
continue;
|
|
162
|
+
}
|
|
176
163
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
164
|
+
if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
|
|
165
|
+
if (new_head == cells.size()) {
|
|
166
|
+
new_head = i;
|
|
180
167
|
}
|
|
181
168
|
}
|
|
182
169
|
}
|
|
183
170
|
|
|
184
171
|
// If we freed up a slot, set head to it so searching can start there.
|
|
185
|
-
if (new_head != size && new_head < head) {
|
|
172
|
+
if (new_head != cells.size() && new_head < head) {
|
|
186
173
|
head = new_head;
|
|
187
174
|
}
|
|
188
175
|
|
|
@@ -202,49 +189,40 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id
|
|
|
202
189
|
p1 = std::numeric_limits<llama_pos>::max();
|
|
203
190
|
}
|
|
204
191
|
|
|
205
|
-
|
|
206
|
-
|
|
192
|
+
for (uint32_t i = 0; i < cells.size(); ++i) {
|
|
193
|
+
if (!cells.pos_in(i, p0, p1)) {
|
|
194
|
+
continue;
|
|
195
|
+
}
|
|
207
196
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
cells[i].seq_id.insert(seq_id_dst);
|
|
197
|
+
if (cells.seq_has(i, seq_id_src)) {
|
|
198
|
+
cells.seq_add(i, seq_id_dst);
|
|
211
199
|
}
|
|
212
200
|
}
|
|
213
201
|
}
|
|
214
202
|
|
|
215
203
|
void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
|
|
216
|
-
uint32_t new_head = size;
|
|
204
|
+
uint32_t new_head = cells.size();
|
|
217
205
|
|
|
218
|
-
for (uint32_t i = 0; i < size; ++i) {
|
|
219
|
-
if (
|
|
220
|
-
if (cells
|
|
221
|
-
used--;
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
cells[i].pos = -1;
|
|
225
|
-
cells[i].seq_id.clear();
|
|
226
|
-
|
|
227
|
-
if (new_head == size){
|
|
206
|
+
for (uint32_t i = 0; i < cells.size(); ++i) {
|
|
207
|
+
if (cells.seq_keep(i, seq_id)) {
|
|
208
|
+
if (new_head == cells.size()) {
|
|
228
209
|
new_head = i;
|
|
229
210
|
}
|
|
230
|
-
} else {
|
|
231
|
-
cells[i].seq_id.clear();
|
|
232
|
-
cells[i].seq_id.insert(seq_id);
|
|
233
211
|
}
|
|
234
212
|
}
|
|
235
213
|
|
|
236
214
|
// If we freed up a slot, set head to it so searching can start there.
|
|
237
|
-
if (new_head != size && new_head < head) {
|
|
215
|
+
if (new_head != cells.size() && new_head < head) {
|
|
238
216
|
head = new_head;
|
|
239
217
|
}
|
|
240
218
|
}
|
|
241
219
|
|
|
242
|
-
void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos
|
|
243
|
-
if (
|
|
220
|
+
void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
|
|
221
|
+
if (shift == 0) {
|
|
244
222
|
return;
|
|
245
223
|
}
|
|
246
224
|
|
|
247
|
-
uint32_t new_head = size;
|
|
225
|
+
uint32_t new_head = cells.size();
|
|
248
226
|
|
|
249
227
|
if (p0 < 0) {
|
|
250
228
|
p0 = 0;
|
|
@@ -254,24 +232,19 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
|
|
|
254
232
|
p1 = std::numeric_limits<llama_pos>::max();
|
|
255
233
|
}
|
|
256
234
|
|
|
257
|
-
// If there is no range then return early to avoid looping over
|
|
235
|
+
// If there is no range then return early to avoid looping over all cells.
|
|
258
236
|
if (p0 == p1) {
|
|
259
237
|
return;
|
|
260
238
|
}
|
|
261
239
|
|
|
262
|
-
for (uint32_t i = 0; i < size; ++i) {
|
|
263
|
-
if (cells
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
cells[i].delta += delta;
|
|
240
|
+
for (uint32_t i = 0; i < cells.size(); ++i) {
|
|
241
|
+
if (!cells.pos_in(i, p0, p1)) {
|
|
242
|
+
continue;
|
|
243
|
+
}
|
|
267
244
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
}
|
|
272
|
-
cells[i].pos = -1;
|
|
273
|
-
cells[i].seq_id.clear();
|
|
274
|
-
if (new_head == size) {
|
|
245
|
+
if (cells.seq_has(i, seq_id)) {
|
|
246
|
+
if (cells.pos_add(i, shift)) {
|
|
247
|
+
if (new_head == cells.size()) {
|
|
275
248
|
new_head = i;
|
|
276
249
|
}
|
|
277
250
|
}
|
|
@@ -280,7 +253,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po
|
|
|
280
253
|
|
|
281
254
|
// If we freed up a slot, set head to it so searching can start there.
|
|
282
255
|
// Otherwise we just start the next search from the beginning.
|
|
283
|
-
head = new_head != size ? new_head : 0;
|
|
256
|
+
head = new_head != cells.size() ? new_head : 0;
|
|
284
257
|
}
|
|
285
258
|
|
|
286
259
|
void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
|
@@ -301,66 +274,41 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
|
|
|
301
274
|
return;
|
|
302
275
|
}
|
|
303
276
|
|
|
304
|
-
for (uint32_t i = 0; i < size; ++i) {
|
|
305
|
-
if (cells
|
|
306
|
-
|
|
277
|
+
for (uint32_t i = 0; i < cells.size(); ++i) {
|
|
278
|
+
if (!cells.pos_in(i, p0, p1)) {
|
|
279
|
+
continue;
|
|
280
|
+
}
|
|
307
281
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
cells[i].pos /= d;
|
|
311
|
-
cells[i].delta += cells[i].pos - p_old;
|
|
312
|
-
}
|
|
282
|
+
if (cells.seq_has(i, seq_id)) {
|
|
283
|
+
cells.pos_div(i, d);
|
|
313
284
|
}
|
|
314
285
|
}
|
|
315
286
|
}
|
|
316
287
|
|
|
317
|
-
llama_pos llama_kv_cache_unified::
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
for (uint32_t i = 0; i < size; ++i) {
|
|
321
|
-
if (cells[i].has_seq_id(seq_id)) {
|
|
322
|
-
result = std::max(result, cells[i].pos);
|
|
323
|
-
}
|
|
324
|
-
}
|
|
288
|
+
llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
|
|
289
|
+
return cells.seq_pos_min(seq_id);
|
|
290
|
+
}
|
|
325
291
|
|
|
326
|
-
|
|
292
|
+
llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
|
|
293
|
+
return cells.seq_pos_max(seq_id);
|
|
327
294
|
}
|
|
328
295
|
|
|
329
296
|
void llama_kv_cache_unified::restore() {
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
uint32_t new_head = size;
|
|
335
|
-
|
|
336
|
-
for (auto & range : pending.ranges) {
|
|
337
|
-
for (uint32_t i = range.c0; i < range.c1; ++i) {
|
|
338
|
-
cells[i].seq_id.clear();
|
|
339
|
-
|
|
340
|
-
// keep count of the number of used cells
|
|
341
|
-
if (cells[i].pos >= 0) {
|
|
342
|
-
used--;
|
|
343
|
-
}
|
|
344
|
-
|
|
345
|
-
cells[i].pos = -1;
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
new_head = std::min(new_head, range.c0);
|
|
297
|
+
for (auto & state : recovery.states) {
|
|
298
|
+
cells.set(state.i, state.cells);
|
|
349
299
|
}
|
|
350
300
|
|
|
351
|
-
|
|
352
|
-
head = new_head;
|
|
353
|
-
}
|
|
301
|
+
recovery.clear();
|
|
354
302
|
}
|
|
355
303
|
|
|
356
304
|
void llama_kv_cache_unified::commit() {
|
|
357
|
-
if (
|
|
358
|
-
LLAMA_LOG_WARN("%s:
|
|
359
|
-
__func__, "https://github.com/ggml-org/llama.cpp/pull/
|
|
305
|
+
if (recovery.states.empty()) {
|
|
306
|
+
LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n",
|
|
307
|
+
__func__, "https://github.com/ggml-org/llama.cpp/pull/13194");
|
|
360
308
|
return;
|
|
361
309
|
}
|
|
362
310
|
|
|
363
|
-
|
|
311
|
+
recovery.clear();
|
|
364
312
|
}
|
|
365
313
|
|
|
366
314
|
bool llama_kv_cache_unified::update(llama_context & lctx) {
|
|
@@ -368,7 +316,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
|
|
368
316
|
|
|
369
317
|
auto * sched = lctx.get_sched();
|
|
370
318
|
|
|
371
|
-
if (
|
|
319
|
+
if (cells.get_has_shift()) {
|
|
372
320
|
if (!get_can_shift()) {
|
|
373
321
|
GGML_ABORT("The current KV cache / model configuration does not support K-shift");
|
|
374
322
|
}
|
|
@@ -392,13 +340,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
|
|
392
340
|
need_reserve = true;
|
|
393
341
|
}
|
|
394
342
|
|
|
395
|
-
|
|
396
|
-
has_shift = false;
|
|
397
|
-
|
|
398
|
-
for (uint32_t i = 0; i < size; ++i) {
|
|
399
|
-
cells[i].delta = 0;
|
|
400
|
-
}
|
|
401
|
-
}
|
|
343
|
+
cells.reset_shift();
|
|
402
344
|
}
|
|
403
345
|
|
|
404
346
|
if (do_defrag) {
|
|
@@ -429,7 +371,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
|
|
429
371
|
void llama_kv_cache_unified::defrag_sched(float thold) {
|
|
430
372
|
// - do not defrag small contexts (i.e. < 2048 tokens)
|
|
431
373
|
// - count the padding towards the number of used tokens
|
|
432
|
-
const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(
|
|
374
|
+
const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n)) : 0.0f;
|
|
433
375
|
|
|
434
376
|
// queue defragmentation for next llama_kv_cache_update
|
|
435
377
|
if (fragmentation > thold) {
|
|
@@ -440,54 +382,77 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
|
|
|
440
382
|
}
|
|
441
383
|
|
|
442
384
|
void llama_kv_cache_unified::set_full() {
|
|
443
|
-
n = size;
|
|
385
|
+
n = cells.size();
|
|
386
|
+
|
|
387
|
+
// when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
|
|
388
|
+
// affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
|
|
389
|
+
// we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
|
|
390
|
+
// setting it to 0 is the simplest way to achieve that
|
|
391
|
+
// ref: https://github.com/ggml-org/llama.cpp/issues/13359
|
|
392
|
+
head = 0;
|
|
444
393
|
}
|
|
445
394
|
|
|
446
|
-
llama_sbatch llama_kv_cache_unified::sbatch_init(
|
|
447
|
-
const llama_batch & batch,
|
|
448
|
-
bool logits_all) {
|
|
395
|
+
llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) {
|
|
449
396
|
return llama_sbatch(batch, hparams.n_embd, true, logits_all);
|
|
450
397
|
}
|
|
451
398
|
|
|
452
|
-
llama_ubatch llama_kv_cache_unified::ubatch_next(
|
|
453
|
-
llama_sbatch & sbatch,
|
|
454
|
-
uint32_t n_ubatch,
|
|
455
|
-
bool embd_pooled) const {
|
|
399
|
+
llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
|
|
456
400
|
GGML_UNUSED(embd_pooled);
|
|
457
401
|
return sbatch.split_simple(n_ubatch);
|
|
458
402
|
}
|
|
459
403
|
|
|
460
|
-
bool llama_kv_cache_unified::find_slot(
|
|
461
|
-
const llama_ubatch & ubatch) {
|
|
404
|
+
bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
|
|
462
405
|
const uint32_t n_tokens = ubatch.n_tokens;
|
|
463
|
-
const uint32_t n_seqs = ubatch.n_seqs;
|
|
464
|
-
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
|
|
465
406
|
|
|
466
407
|
// if we have enough unused cells before the current head ->
|
|
467
408
|
// better to start searching from the beginning of the cache, hoping to fill it
|
|
468
|
-
if (head >
|
|
409
|
+
if (head > cells.get_used() + 2*ubatch.n_tokens) {
|
|
469
410
|
head = 0;
|
|
470
411
|
}
|
|
471
412
|
|
|
472
413
|
// otherwise, one cell per token.
|
|
473
414
|
|
|
474
|
-
if (n_tokens > size) {
|
|
475
|
-
LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %
|
|
415
|
+
if (n_tokens > cells.size()) {
|
|
416
|
+
LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
|
|
476
417
|
return false;
|
|
477
418
|
}
|
|
478
419
|
|
|
420
|
+
//#define FIND_SLOT_DEBUG 1
|
|
421
|
+
#if FIND_SLOT_DEBUG
|
|
422
|
+
LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
|
|
423
|
+
|
|
424
|
+
// for debugging
|
|
425
|
+
{
|
|
426
|
+
std::string ss;
|
|
427
|
+
if (n_swa > 0) {
|
|
428
|
+
for (uint32_t i = 0; i < size; ++i) {
|
|
429
|
+
if (cells.is_empty(i)) {
|
|
430
|
+
ss += '.';
|
|
431
|
+
} else {
|
|
432
|
+
ss += 'x';
|
|
433
|
+
}
|
|
434
|
+
if (i%256 == 255) {
|
|
435
|
+
ss += '\n';
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
LLAMA_LOG_WARN("\n%s\n", ss.c_str());
|
|
440
|
+
}
|
|
441
|
+
#endif
|
|
442
|
+
|
|
479
443
|
uint32_t n_tested = 0;
|
|
480
444
|
|
|
481
445
|
while (true) {
|
|
482
|
-
if (head + n_tokens > size) {
|
|
483
|
-
n_tested += size - head;
|
|
446
|
+
if (head + n_tokens > cells.size()) {
|
|
447
|
+
n_tested += cells.size() - head;
|
|
484
448
|
head = 0;
|
|
485
449
|
continue;
|
|
486
450
|
}
|
|
487
451
|
|
|
488
452
|
bool found = true;
|
|
489
453
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
490
|
-
|
|
454
|
+
// TODO: improve to accept cells that are masked by the SWA
|
|
455
|
+
if (!cells.is_empty(head + i)) {
|
|
491
456
|
found = false;
|
|
492
457
|
head += i + 1;
|
|
493
458
|
n_tested += i + 1;
|
|
@@ -499,66 +464,257 @@ bool llama_kv_cache_unified::find_slot(
|
|
|
499
464
|
break;
|
|
500
465
|
}
|
|
501
466
|
|
|
502
|
-
if (n_tested >= size) {
|
|
467
|
+
if (n_tested >= cells.size()) {
|
|
503
468
|
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
|
|
504
469
|
return false;
|
|
505
470
|
}
|
|
506
471
|
}
|
|
507
472
|
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
uint32_t k = s*n_seq_tokens + i;
|
|
511
|
-
cells[head + k].pos = ubatch.pos[k];
|
|
473
|
+
// store the old state of the cells in the recovery stack
|
|
474
|
+
recovery.states.push_back({head, cells.cp(head, n_tokens)});
|
|
512
475
|
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
476
|
+
for (uint32_t i = 0; i < n_tokens; ++i) {
|
|
477
|
+
cells.pos_set(head + i, ubatch.pos[i]);
|
|
478
|
+
|
|
479
|
+
for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
|
|
480
|
+
cells.seq_add(head + i, ubatch.seq_id[i][j]);
|
|
516
481
|
}
|
|
517
482
|
}
|
|
518
483
|
|
|
519
|
-
used += n_tokens;
|
|
520
|
-
|
|
521
|
-
pending.ranges.push_back({head, head + n_tokens});
|
|
522
|
-
|
|
523
484
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
|
524
485
|
// after enough generations, the benefit from this heuristic disappears
|
|
525
486
|
// if we start defragmenting the cache, the benefit from this will be more important
|
|
526
|
-
n = std::min(size, std::max(
|
|
487
|
+
n = std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
|
|
527
488
|
|
|
528
|
-
|
|
489
|
+
#ifdef FIND_SLOT_DEBUG
|
|
490
|
+
LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
|
|
491
|
+
#endif
|
|
529
492
|
|
|
530
493
|
return true;
|
|
531
494
|
}
|
|
532
495
|
|
|
533
|
-
|
|
534
|
-
|
|
496
|
+
bool llama_kv_cache_unified::get_can_shift() const {
|
|
497
|
+
return true;
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
uint32_t llama_kv_cache_unified::get_n() const {
|
|
501
|
+
return n;
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
uint32_t llama_kv_cache_unified::get_size() const {
|
|
505
|
+
return cells.size();
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il) const {
|
|
509
|
+
const int32_t ikv = map_layer_ids.at(il);
|
|
510
|
+
|
|
511
|
+
auto * k = layers[ikv].k;
|
|
512
|
+
|
|
513
|
+
return ggml_view_3d(ctx, k,
|
|
514
|
+
hparams.n_embd_head_k, hparams.n_head_kv(il), n,
|
|
515
|
+
ggml_row_size(k->type, hparams.n_embd_head_k),
|
|
516
|
+
ggml_row_size(k->type, hparams.n_embd_k_gqa(il)),
|
|
517
|
+
0);
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il) const {
|
|
521
|
+
const int32_t ikv = map_layer_ids.at(il);
|
|
522
|
+
|
|
523
|
+
auto * v = layers[ikv].v;
|
|
535
524
|
|
|
536
|
-
|
|
537
|
-
|
|
525
|
+
if (!v_trans) {
|
|
526
|
+
// note: v->nb[1] <= v->nb[2]
|
|
527
|
+
return ggml_view_3d(ctx, v,
|
|
528
|
+
hparams.n_embd_head_v, hparams.n_head_kv(il), n,
|
|
529
|
+
ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
|
|
530
|
+
ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2]
|
|
531
|
+
0);
|
|
538
532
|
}
|
|
539
533
|
|
|
540
|
-
|
|
534
|
+
// note: v->nb[1] > v->nb[2]
|
|
535
|
+
return ggml_view_3d(ctx, v,
|
|
536
|
+
n, hparams.n_head_kv(il), hparams.n_embd_head_v,
|
|
537
|
+
ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1]
|
|
538
|
+
ggml_row_size(v->type, v->ne[1]), // v->nb[2]
|
|
539
|
+
0);
|
|
541
540
|
}
|
|
542
541
|
|
|
543
|
-
|
|
544
|
-
|
|
542
|
+
ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
|
|
543
|
+
const int32_t ikv = map_layer_ids.at(il);
|
|
544
|
+
|
|
545
|
+
auto * k = layers[ikv].k;
|
|
546
|
+
|
|
547
|
+
const int64_t n_tokens = k_cur->ne[2];
|
|
548
|
+
|
|
549
|
+
ggml_tensor * k_view = ggml_view_1d(ctx, k,
|
|
550
|
+
n_tokens*hparams.n_embd_k_gqa(il),
|
|
551
|
+
ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head);
|
|
552
|
+
|
|
553
|
+
return ggml_cpy(ctx, k_cur, k_view);
|
|
545
554
|
}
|
|
546
555
|
|
|
547
|
-
|
|
548
|
-
|
|
556
|
+
ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
|
|
557
|
+
const int32_t ikv = map_layer_ids.at(il);
|
|
558
|
+
|
|
559
|
+
auto * v = layers[ikv].v;
|
|
560
|
+
|
|
561
|
+
const int64_t n_tokens = v_cur->ne[2];
|
|
562
|
+
|
|
563
|
+
v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens);
|
|
564
|
+
|
|
565
|
+
ggml_tensor * v_view = nullptr;
|
|
566
|
+
|
|
567
|
+
if (!v_trans) {
|
|
568
|
+
v_view = ggml_view_1d(ctx, v,
|
|
569
|
+
n_tokens*hparams.n_embd_v_gqa(il),
|
|
570
|
+
ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head);
|
|
571
|
+
} else {
|
|
572
|
+
// note: the V cache is transposed when not using flash attention
|
|
573
|
+
v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il),
|
|
574
|
+
(v->ne[1])*ggml_element_size(v),
|
|
575
|
+
( head)*ggml_element_size(v));
|
|
576
|
+
|
|
577
|
+
v_cur = ggml_transpose(ctx, v_cur);
|
|
578
|
+
}
|
|
579
|
+
|
|
580
|
+
return ggml_cpy(ctx, v_cur, v_view);
|
|
549
581
|
}
|
|
550
582
|
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
583
|
+
void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax) {
|
|
584
|
+
// no pruning is needed when the cache does not use SWA
|
|
585
|
+
GGML_ASSERT(swa_type != LLAMA_SWA_TYPE_NONE && "do not prune non-SWA cache");
|
|
586
|
+
|
|
587
|
+
int n_attended = 0;
|
|
588
|
+
|
|
589
|
+
for (uint32_t i = 0; i < cells.size(); ++i) {
|
|
590
|
+
if (!cells.seq_has(i, seq_id)) {
|
|
591
|
+
continue;
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
const llama_pos p0 = cells.pos_get(i);
|
|
595
|
+
|
|
596
|
+
if (p0 <= pmin && !is_masked_swa(p0, pmin)) {
|
|
597
|
+
n_attended++;
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
if (is_masked_swa(p0, pmax)) {
|
|
601
|
+
cells.seq_rm(i, seq_id);
|
|
602
|
+
}
|
|
555
603
|
}
|
|
556
604
|
|
|
557
|
-
|
|
605
|
+
if (n_attended < std::min<int>(n_swa, pmin)) {
|
|
606
|
+
LLAMA_LOG_WARN("%s: partial SWA cache detected - possible loss of information, pmin = %d, n_attended = %d, n_swa = %d\n", __func__, pmin, n_attended, n_swa);
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
|
|
611
|
+
const int64_t n_tokens = ubatch->n_tokens;
|
|
612
|
+
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
|
|
613
|
+
const int64_t n_seqs = ubatch->n_seqs;
|
|
614
|
+
|
|
615
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
|
616
|
+
float * data = (float *) dst->data;
|
|
617
|
+
|
|
618
|
+
const int64_t n_kv = n;
|
|
619
|
+
|
|
620
|
+
// Use only the previous KV cells of the correct sequence for each token of the ubatch.
|
|
621
|
+
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
|
|
622
|
+
// Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
|
|
623
|
+
// Causal mask:
|
|
624
|
+
// xxx-------
|
|
625
|
+
// xxxx------
|
|
626
|
+
// xxxxx-----
|
|
627
|
+
// Non-causal mask:
|
|
628
|
+
// xxxxx-----
|
|
629
|
+
// xxxxx-----
|
|
630
|
+
// xxxxx-----
|
|
631
|
+
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
|
|
632
|
+
for (int h = 0; h < 1; ++h) {
|
|
633
|
+
for (int s = 0; s < n_seqs; ++s) {
|
|
634
|
+
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
|
635
|
+
|
|
636
|
+
for (int j = 0; j < n_seq_tokens; ++j) {
|
|
637
|
+
const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
|
|
638
|
+
|
|
639
|
+
for (int i = 0; i < n_kv; ++i) {
|
|
640
|
+
float f = 0.0f;
|
|
641
|
+
|
|
642
|
+
bool masked = false;
|
|
643
|
+
|
|
644
|
+
if (cells.is_empty(i)) {
|
|
645
|
+
masked = true;
|
|
646
|
+
} else {
|
|
647
|
+
const llama_pos p0 = cells.pos_get(i);
|
|
648
|
+
|
|
649
|
+
// mask the token if not the same sequence
|
|
650
|
+
masked = masked || (!cells.seq_has(i, seq_id));
|
|
651
|
+
|
|
652
|
+
// mask future tokens
|
|
653
|
+
masked = masked || (causal_attn && p0 > p1);
|
|
654
|
+
|
|
655
|
+
// apply SWA if any
|
|
656
|
+
masked = masked || (is_masked_swa(p0, p1));
|
|
657
|
+
|
|
658
|
+
if (!masked && hparams.use_alibi) {
|
|
659
|
+
f = -std::abs(p0 - p1);
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
if (masked) {
|
|
664
|
+
f = -INFINITY;
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
// mask padded tokens
|
|
673
|
+
if (data) {
|
|
674
|
+
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
|
675
|
+
for (int j = 0; j < n_kv; ++j) {
|
|
676
|
+
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
|
677
|
+
}
|
|
678
|
+
}
|
|
679
|
+
}
|
|
680
|
+
}
|
|
681
|
+
}
|
|
682
|
+
|
|
683
|
+
void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const {
|
|
684
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
|
685
|
+
|
|
686
|
+
int32_t * data = (int32_t *) dst->data;
|
|
687
|
+
|
|
688
|
+
for (uint32_t i = 0; i < cells.size(); ++i) {
|
|
689
|
+
data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i);
|
|
690
|
+
}
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
|
|
694
|
+
const int64_t n_tokens = ubatch->n_tokens;
|
|
695
|
+
|
|
696
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
|
697
|
+
GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
|
|
698
|
+
|
|
699
|
+
int32_t * data = (int32_t *) dst->data;
|
|
700
|
+
|
|
701
|
+
const int64_t n_kv = n;
|
|
702
|
+
|
|
703
|
+
for (int h = 0; h < 1; ++h) {
|
|
704
|
+
for (int j = 0; j < n_tokens; ++j) {
|
|
705
|
+
for (int i = 0; i < n_kv; ++i) {
|
|
706
|
+
// the position when the cells is empty is irrelevant - it will be masked out later in the attention
|
|
707
|
+
const llama_pos p0 = cells.is_empty(i) ? -1 : cells.pos_get(i);
|
|
708
|
+
|
|
709
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(p0, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
|
|
710
|
+
}
|
|
711
|
+
}
|
|
712
|
+
}
|
|
558
713
|
}
|
|
559
714
|
|
|
560
715
|
size_t llama_kv_cache_unified::total_size() const {
|
|
561
716
|
size_t size = 0;
|
|
717
|
+
|
|
562
718
|
for (const auto & buf : bufs) {
|
|
563
719
|
size += ggml_backend_buffer_get_size(buf.get());
|
|
564
720
|
}
|
|
@@ -569,8 +725,8 @@ size_t llama_kv_cache_unified::total_size() const {
|
|
|
569
725
|
size_t llama_kv_cache_unified::size_k_bytes() const {
|
|
570
726
|
size_t size_k_bytes = 0;
|
|
571
727
|
|
|
572
|
-
for (const auto &
|
|
573
|
-
size_k_bytes += ggml_nbytes(k);
|
|
728
|
+
for (const auto & layer : layers) {
|
|
729
|
+
size_k_bytes += ggml_nbytes(layer.k);
|
|
574
730
|
}
|
|
575
731
|
|
|
576
732
|
return size_k_bytes;
|
|
@@ -579,8 +735,8 @@ size_t llama_kv_cache_unified::size_k_bytes() const {
|
|
|
579
735
|
size_t llama_kv_cache_unified::size_v_bytes() const {
|
|
580
736
|
size_t size_v_bytes = 0;
|
|
581
737
|
|
|
582
|
-
for (const auto &
|
|
583
|
-
size_v_bytes += ggml_nbytes(v);
|
|
738
|
+
for (const auto & layer : layers) {
|
|
739
|
+
size_v_bytes += ggml_nbytes(layer.v);
|
|
584
740
|
}
|
|
585
741
|
|
|
586
742
|
return size_v_bytes;
|
|
@@ -601,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
|
|
|
601
757
|
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
|
602
758
|
|
|
603
759
|
const auto & n_rot = hparams.n_rot;
|
|
604
|
-
const auto & rope_type = hparams.rope_type
|
|
760
|
+
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
|
|
761
|
+
// @ngxson : this is a workaround
|
|
762
|
+
// for M-RoPE, we want to rotate the whole vector when doing KV shift
|
|
763
|
+
// a normal RoPE should work, we just need to use the correct ordering
|
|
764
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
|
|
765
|
+
? LLAMA_ROPE_TYPE_NEOX
|
|
766
|
+
: hparams.rope_type;
|
|
605
767
|
|
|
606
768
|
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
|
|
607
769
|
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
|
608
|
-
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
|
|
770
|
+
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
|
|
771
|
+
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
|
|
772
|
+
: cparams.yarn_attn_factor;
|
|
609
773
|
|
|
610
774
|
ggml_tensor * tmp;
|
|
611
775
|
|
|
@@ -644,13 +808,7 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
|
|
|
644
808
|
GGML_UNUSED(ubatch);
|
|
645
809
|
|
|
646
810
|
if (k_shift) {
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
int32_t * data = (int32_t *) k_shift->data;
|
|
650
|
-
|
|
651
|
-
for (uint32_t i = 0; i < kv_self->size; ++i) {
|
|
652
|
-
data[i] = kv_self->cells[i].delta;
|
|
653
|
-
}
|
|
811
|
+
kv_self->set_input_k_shift(k_shift);
|
|
654
812
|
}
|
|
655
813
|
}
|
|
656
814
|
|
|
@@ -660,13 +818,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
|
|
660
818
|
ggml_cgraph * gf) const {
|
|
661
819
|
auto res = std::make_unique<llm_graph_result>();
|
|
662
820
|
|
|
663
|
-
const auto & n_layer = hparams.n_layer;
|
|
664
|
-
|
|
665
821
|
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
|
666
822
|
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
|
667
823
|
|
|
668
|
-
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
|
|
669
|
-
|
|
670
824
|
//GGML_ASSERT(kv_self->size == n_ctx);
|
|
671
825
|
|
|
672
826
|
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
|
|
@@ -674,24 +828,22 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
|
|
674
828
|
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
|
|
675
829
|
ggml_set_input(inp->k_shift);
|
|
676
830
|
|
|
677
|
-
for (
|
|
831
|
+
for (const auto & layer : layers) {
|
|
832
|
+
const uint32_t il = layer.il;
|
|
833
|
+
|
|
678
834
|
const int64_t n_head_kv = hparams.n_head_kv(il);
|
|
679
835
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
|
680
836
|
|
|
681
|
-
const
|
|
682
|
-
|
|
683
|
-
// note: the swa rope params could become part of the cparams in the future
|
|
684
|
-
// if we decide to make them configurable, like the non-sliding ones
|
|
685
|
-
const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
|
|
686
|
-
const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
|
|
837
|
+
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
|
838
|
+
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
687
839
|
|
|
688
|
-
ggml_tensor * rope_factors = model.get_rope_factors(
|
|
840
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
689
841
|
|
|
690
842
|
ggml_tensor * k =
|
|
691
|
-
ggml_view_3d(ctx,
|
|
692
|
-
n_embd_head_k, n_head_kv, size,
|
|
693
|
-
ggml_row_size(
|
|
694
|
-
ggml_row_size(
|
|
843
|
+
ggml_view_3d(ctx, layer.k,
|
|
844
|
+
n_embd_head_k, n_head_kv, cells.size(),
|
|
845
|
+
ggml_row_size(layer.k->type, n_embd_head_k),
|
|
846
|
+
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
|
695
847
|
0);
|
|
696
848
|
|
|
697
849
|
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
|
|
@@ -796,44 +948,46 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
|
|
796
948
|
nm++;
|
|
797
949
|
}
|
|
798
950
|
|
|
799
|
-
for (
|
|
951
|
+
for (const auto & layer : layers) {
|
|
952
|
+
const uint32_t il = layer.il;
|
|
953
|
+
|
|
800
954
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
|
801
955
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
|
802
956
|
|
|
803
|
-
ggml_tensor * view_k_src = ggml_view_2d(ctx,
|
|
957
|
+
ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
|
|
804
958
|
n_embd_k_gqa, nm,
|
|
805
|
-
ggml_row_size(
|
|
806
|
-
ggml_row_size(
|
|
959
|
+
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
|
960
|
+
ggml_row_size(layer.k->type, n_embd_k_gqa*i));
|
|
807
961
|
|
|
808
|
-
ggml_tensor * view_k_dst = ggml_view_2d(ctx,
|
|
962
|
+
ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
|
|
809
963
|
n_embd_k_gqa, nm,
|
|
810
|
-
ggml_row_size(
|
|
811
|
-
ggml_row_size(
|
|
964
|
+
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
|
965
|
+
ggml_row_size(layer.k->type, n_embd_k_gqa*id));
|
|
812
966
|
|
|
813
967
|
ggml_tensor * view_v_src;
|
|
814
968
|
ggml_tensor * view_v_dst;
|
|
815
969
|
|
|
816
970
|
if (cparams.flash_attn) {
|
|
817
971
|
// NOTE: the V cache is not transposed when using flash attention
|
|
818
|
-
view_v_src = ggml_view_2d(ctx,
|
|
972
|
+
view_v_src = ggml_view_2d(ctx, layer.v,
|
|
819
973
|
n_embd_v_gqa, nm,
|
|
820
|
-
ggml_row_size(
|
|
821
|
-
ggml_row_size(
|
|
974
|
+
ggml_row_size(layer.v->type, n_embd_v_gqa),
|
|
975
|
+
ggml_row_size(layer.v->type, n_embd_v_gqa*i));
|
|
822
976
|
|
|
823
|
-
view_v_dst = ggml_view_2d(ctx,
|
|
977
|
+
view_v_dst = ggml_view_2d(ctx, layer.v,
|
|
824
978
|
n_embd_v_gqa, nm,
|
|
825
|
-
ggml_row_size(
|
|
826
|
-
ggml_row_size(
|
|
979
|
+
ggml_row_size(layer.v->type, n_embd_v_gqa),
|
|
980
|
+
ggml_row_size(layer.v->type, n_embd_v_gqa*id));
|
|
827
981
|
} else {
|
|
828
|
-
view_v_src = ggml_view_2d(ctx,
|
|
982
|
+
view_v_src = ggml_view_2d(ctx, layer.v,
|
|
829
983
|
nm, n_embd_v_gqa,
|
|
830
|
-
ggml_row_size(
|
|
831
|
-
ggml_row_size(
|
|
984
|
+
ggml_row_size(layer.v->type, cells.size()),
|
|
985
|
+
ggml_row_size(layer.v->type, i));
|
|
832
986
|
|
|
833
|
-
view_v_dst = ggml_view_2d(ctx,
|
|
987
|
+
view_v_dst = ggml_view_2d(ctx, layer.v,
|
|
834
988
|
nm, n_embd_v_gqa,
|
|
835
|
-
ggml_row_size(
|
|
836
|
-
ggml_row_size(
|
|
989
|
+
ggml_row_size(layer.v->type, cells.size()),
|
|
990
|
+
ggml_row_size(layer.v->type, id));
|
|
837
991
|
}
|
|
838
992
|
|
|
839
993
|
ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
|
|
@@ -850,10 +1004,10 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
|
|
850
1004
|
}
|
|
851
1005
|
|
|
852
1006
|
bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
|
853
|
-
const uint32_t n_layer =
|
|
1007
|
+
const uint32_t n_layer = layers.size();
|
|
854
1008
|
|
|
855
|
-
const uint32_t n_kv =
|
|
856
|
-
const uint32_t n_used =
|
|
1009
|
+
const uint32_t n_kv = cells.used_max_p1();
|
|
1010
|
+
const uint32_t n_used = cells.get_used();
|
|
857
1011
|
|
|
858
1012
|
assert(n_used <= n_kv);
|
|
859
1013
|
|
|
@@ -881,9 +1035,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
|
|
881
1035
|
ids.resize(n_kv, n_kv);
|
|
882
1036
|
|
|
883
1037
|
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
if (!cell0.is_empty()) {
|
|
1038
|
+
if (!cells.is_empty(i0)) {
|
|
887
1039
|
ids[i0] = i0;
|
|
888
1040
|
|
|
889
1041
|
continue;
|
|
@@ -894,7 +1046,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
|
|
894
1046
|
uint32_t nh = 1;
|
|
895
1047
|
|
|
896
1048
|
// determine the size of the hole
|
|
897
|
-
while (i0 + nh < n_used && cells
|
|
1049
|
+
while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
|
|
898
1050
|
nh++;
|
|
899
1051
|
}
|
|
900
1052
|
|
|
@@ -903,9 +1055,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
|
|
903
1055
|
|
|
904
1056
|
// starting from the end, find nh non-empty cells
|
|
905
1057
|
for (; is > i0; --is) {
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
if (cell1.is_empty() || ids[is] != n_kv) {
|
|
1058
|
+
if (cells.is_empty(is) || ids[is] != n_kv) {
|
|
909
1059
|
continue;
|
|
910
1060
|
}
|
|
911
1061
|
|
|
@@ -932,9 +1082,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
|
|
932
1082
|
|
|
933
1083
|
// go back and move the nf cells to the hole
|
|
934
1084
|
for (; i1 < n_kv; ++i1) {
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
if (cell1.is_empty() || ids[i1] != n_kv) {
|
|
1085
|
+
if (cells.is_empty(i1) || ids[i1] != n_kv) {
|
|
938
1086
|
if (n_moves == max_moves) {
|
|
939
1087
|
stop = true;
|
|
940
1088
|
break;
|
|
@@ -948,10 +1096,8 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
|
|
948
1096
|
ids[i1] = i0 + nf;
|
|
949
1097
|
|
|
950
1098
|
// move the cell meta data
|
|
951
|
-
cells
|
|
1099
|
+
cells.mv(i1, i0 + nf);
|
|
952
1100
|
|
|
953
|
-
// clear the old cell and move the head there
|
|
954
|
-
cell1 = kv_cell();
|
|
955
1101
|
head = n_used;
|
|
956
1102
|
|
|
957
1103
|
if (!cont) {
|
|
@@ -986,16 +1132,30 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
|
|
986
1132
|
return true;
|
|
987
1133
|
}
|
|
988
1134
|
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
const kv_cell & cell = cells[i - 1];
|
|
1135
|
+
bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
|
|
1136
|
+
assert(p0 >= 0 && p1 >= 0);
|
|
992
1137
|
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
1138
|
+
switch (swa_type) {
|
|
1139
|
+
case LLAMA_SWA_TYPE_NONE:
|
|
1140
|
+
{
|
|
1141
|
+
} break;
|
|
1142
|
+
case LLAMA_SWA_TYPE_STANDARD:
|
|
1143
|
+
{
|
|
1144
|
+
if (p1 - p0 >= (int32_t) n_swa) {
|
|
1145
|
+
return true;
|
|
1146
|
+
}
|
|
1147
|
+
} break;
|
|
1148
|
+
case LLAMA_SWA_TYPE_CHUNKED:
|
|
1149
|
+
{
|
|
1150
|
+
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
|
1151
|
+
|
|
1152
|
+
if (p0 < pos_chunk_start) {
|
|
1153
|
+
return true;
|
|
1154
|
+
}
|
|
1155
|
+
} break;
|
|
996
1156
|
}
|
|
997
1157
|
|
|
998
|
-
return
|
|
1158
|
+
return false;
|
|
999
1159
|
}
|
|
1000
1160
|
|
|
1001
1161
|
void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
|
|
@@ -1004,23 +1164,24 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
|
|
|
1004
1164
|
|
|
1005
1165
|
// Count the number of cells with the specified seq_id
|
|
1006
1166
|
// Find all the ranges of cells with this seq id (or all, when -1)
|
|
1007
|
-
uint32_t cell_range_begin = size;
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
if ((seq_id == -1
|
|
1167
|
+
uint32_t cell_range_begin = cells.size();
|
|
1168
|
+
|
|
1169
|
+
for (uint32_t i = 0; i < cells.size(); ++i) {
|
|
1170
|
+
if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
|
|
1011
1171
|
++cell_count;
|
|
1012
|
-
if (cell_range_begin == size) {
|
|
1172
|
+
if (cell_range_begin == cells.size()) {
|
|
1013
1173
|
cell_range_begin = i;
|
|
1014
1174
|
}
|
|
1015
1175
|
} else {
|
|
1016
|
-
if (cell_range_begin != size) {
|
|
1176
|
+
if (cell_range_begin != cells.size()) {
|
|
1017
1177
|
cell_ranges.emplace_back(cell_range_begin, i);
|
|
1018
|
-
cell_range_begin = size;
|
|
1178
|
+
cell_range_begin = cells.size();
|
|
1019
1179
|
}
|
|
1020
1180
|
}
|
|
1021
1181
|
}
|
|
1022
|
-
|
|
1023
|
-
|
|
1182
|
+
|
|
1183
|
+
if (cell_range_begin != cells.size()) {
|
|
1184
|
+
cell_ranges.emplace_back(cell_range_begin, cells.size());
|
|
1024
1185
|
}
|
|
1025
1186
|
|
|
1026
1187
|
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
|
@@ -1057,17 +1218,24 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i
|
|
|
1057
1218
|
void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
|
|
1058
1219
|
for (const auto & range : cell_ranges) {
|
|
1059
1220
|
for (uint32_t i = range.first; i < range.second; ++i) {
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1221
|
+
std::vector<llama_seq_id> seq_ids;
|
|
1222
|
+
|
|
1223
|
+
for (llama_seq_id cur = 0; cur < (int) n_seq_max; ++cur) {
|
|
1224
|
+
if (cur == seq_id || seq_id == -1) {
|
|
1225
|
+
if (cells.seq_has(i, cur)) {
|
|
1226
|
+
seq_ids.push_back(cur);
|
|
1227
|
+
}
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
const llama_pos pos = cells.pos_get(i);
|
|
1232
|
+
const uint32_t n_seq_id = seq_ids.size();
|
|
1063
1233
|
|
|
1064
1234
|
io.write(&pos, sizeof(pos));
|
|
1065
1235
|
io.write(&n_seq_id, sizeof(n_seq_id));
|
|
1066
1236
|
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
io.write(&seq_id, sizeof(seq_id));
|
|
1070
|
-
}
|
|
1237
|
+
for (const auto & seq_id : seq_ids) {
|
|
1238
|
+
io.write(&seq_id, sizeof(seq_id));
|
|
1071
1239
|
}
|
|
1072
1240
|
}
|
|
1073
1241
|
}
|
|
@@ -1075,7 +1243,7 @@ void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::
|
|
|
1075
1243
|
|
|
1076
1244
|
void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
|
|
1077
1245
|
const uint32_t v_trans = this->v_trans ? 1 : 0;
|
|
1078
|
-
const uint32_t n_layer =
|
|
1246
|
+
const uint32_t n_layer = layers.size();
|
|
1079
1247
|
|
|
1080
1248
|
io.write(&v_trans, sizeof(v_trans));
|
|
1081
1249
|
io.write(&n_layer, sizeof(n_layer));
|
|
@@ -1084,56 +1252,63 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
|
|
|
1084
1252
|
|
|
1085
1253
|
// Iterate and write all the keys first, each row is a cell
|
|
1086
1254
|
// Get whole range at a time
|
|
1087
|
-
for (
|
|
1255
|
+
for (const auto & layer : layers) {
|
|
1256
|
+
const uint32_t il = layer.il;
|
|
1257
|
+
|
|
1088
1258
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
|
|
1089
1259
|
|
|
1090
1260
|
// Write key type
|
|
1091
|
-
const int32_t k_type_i = (int32_t)
|
|
1261
|
+
const int32_t k_type_i = (int32_t)layer.k->type;
|
|
1092
1262
|
io.write(&k_type_i, sizeof(k_type_i));
|
|
1093
1263
|
|
|
1094
1264
|
// Write row size of key
|
|
1095
|
-
const uint64_t k_size_row = ggml_row_size(
|
|
1265
|
+
const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
|
|
1096
1266
|
io.write(&k_size_row, sizeof(k_size_row));
|
|
1097
1267
|
|
|
1098
1268
|
// Read each range of cells of k_size length each into tmp_buf and write out
|
|
1099
1269
|
for (const auto & range : cell_ranges) {
|
|
1100
1270
|
const size_t range_size = range.second - range.first;
|
|
1101
1271
|
const size_t buf_size = range_size * k_size_row;
|
|
1102
|
-
io.write_tensor(
|
|
1272
|
+
io.write_tensor(layer.k, range.first * k_size_row, buf_size);
|
|
1103
1273
|
}
|
|
1104
1274
|
}
|
|
1105
1275
|
|
|
1106
1276
|
if (!v_trans) {
|
|
1107
|
-
for (
|
|
1277
|
+
for (const auto & layer : layers) {
|
|
1278
|
+
const uint32_t il = layer.il;
|
|
1279
|
+
|
|
1108
1280
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
|
1109
1281
|
|
|
1110
1282
|
// Write value type
|
|
1111
|
-
const int32_t v_type_i = (int32_t)
|
|
1283
|
+
const int32_t v_type_i = (int32_t)layer.v->type;
|
|
1112
1284
|
io.write(&v_type_i, sizeof(v_type_i));
|
|
1113
1285
|
|
|
1114
1286
|
// Write row size of value
|
|
1115
|
-
const uint64_t v_size_row = ggml_row_size(
|
|
1287
|
+
const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
|
|
1116
1288
|
io.write(&v_size_row, sizeof(v_size_row));
|
|
1117
1289
|
|
|
1118
1290
|
// Read each range of cells of v_size length each into tmp_buf and write out
|
|
1119
1291
|
for (const auto & range : cell_ranges) {
|
|
1120
1292
|
const size_t range_size = range.second - range.first;
|
|
1121
1293
|
const size_t buf_size = range_size * v_size_row;
|
|
1122
|
-
io.write_tensor(
|
|
1294
|
+
io.write_tensor(layer.v, range.first * v_size_row, buf_size);
|
|
1123
1295
|
}
|
|
1124
1296
|
}
|
|
1125
1297
|
} else {
|
|
1126
1298
|
// When v is transposed, we also need the element size and get the element ranges from each row
|
|
1127
|
-
const uint32_t kv_size = size;
|
|
1128
|
-
|
|
1299
|
+
const uint32_t kv_size = cells.size();
|
|
1300
|
+
|
|
1301
|
+
for (const auto & layer : layers) {
|
|
1302
|
+
const uint32_t il = layer.il;
|
|
1303
|
+
|
|
1129
1304
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
|
1130
1305
|
|
|
1131
1306
|
// Write value type
|
|
1132
|
-
const int32_t v_type_i = (int32_t)
|
|
1307
|
+
const int32_t v_type_i = (int32_t)layer.v->type;
|
|
1133
1308
|
io.write(&v_type_i, sizeof(v_type_i));
|
|
1134
1309
|
|
|
1135
1310
|
// Write element size
|
|
1136
|
-
const uint32_t v_size_el = ggml_type_size(
|
|
1311
|
+
const uint32_t v_size_el = ggml_type_size(layer.v->type);
|
|
1137
1312
|
io.write(&v_size_el, sizeof(v_size_el));
|
|
1138
1313
|
|
|
1139
1314
|
// Write GQA embedding size
|
|
@@ -1146,7 +1321,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
|
|
|
1146
1321
|
const size_t range_size = range.second - range.first;
|
|
1147
1322
|
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
|
1148
1323
|
const size_t buf_size = range_size * v_size_el;
|
|
1149
|
-
io.write_tensor(
|
|
1324
|
+
io.write_tensor(layer.v, src_offset, buf_size);
|
|
1150
1325
|
}
|
|
1151
1326
|
}
|
|
1152
1327
|
}
|
|
@@ -1163,8 +1338,6 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
|
|
1163
1338
|
llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
|
|
1164
1339
|
|
|
1165
1340
|
batch.n_tokens = cell_count;
|
|
1166
|
-
batch.n_seq_tokens = cell_count;
|
|
1167
|
-
batch.n_seqs = 1;
|
|
1168
1341
|
|
|
1169
1342
|
for (uint32_t i = 0; i < cell_count; ++i) {
|
|
1170
1343
|
llama_pos pos;
|
|
@@ -1173,32 +1346,40 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
|
|
1173
1346
|
io.read_to(&pos, sizeof(pos));
|
|
1174
1347
|
io.read_to(&n_seq_id, sizeof(n_seq_id));
|
|
1175
1348
|
|
|
1176
|
-
if (n_seq_id !=
|
|
1349
|
+
if (n_seq_id != 1) {
|
|
1177
1350
|
LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
|
|
1178
1351
|
return false;
|
|
1179
1352
|
}
|
|
1180
1353
|
|
|
1181
|
-
|
|
1354
|
+
// read the sequence id, but directly discard it - we will use dest_seq_id instead
|
|
1355
|
+
{
|
|
1356
|
+
llama_seq_id seq_id;
|
|
1357
|
+
io.read_to(&seq_id, sizeof(seq_id));
|
|
1358
|
+
}
|
|
1359
|
+
|
|
1360
|
+
batch.pos[i] = pos;
|
|
1361
|
+
batch.n_seq_id[i] = n_seq_id;
|
|
1362
|
+
batch.seq_id[i] = &dest_seq_id;
|
|
1182
1363
|
}
|
|
1183
|
-
|
|
1184
|
-
batch.seq_id[0] = &dest_seq_id;
|
|
1364
|
+
|
|
1185
1365
|
if (!find_slot(batch)) {
|
|
1186
1366
|
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
|
1187
1367
|
return false;
|
|
1188
1368
|
}
|
|
1369
|
+
|
|
1189
1370
|
commit();
|
|
1190
1371
|
|
|
1191
1372
|
// DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
|
1192
1373
|
// Assume that this is one contiguous block of cells
|
|
1193
|
-
GGML_ASSERT(head + cell_count <= size);
|
|
1194
|
-
GGML_ASSERT(cells
|
|
1195
|
-
GGML_ASSERT(cells
|
|
1196
|
-
GGML_ASSERT(cells
|
|
1197
|
-
GGML_ASSERT(cells
|
|
1374
|
+
GGML_ASSERT(head + cell_count <= cells.size());
|
|
1375
|
+
GGML_ASSERT(cells.pos_get(head) == batch.pos[0]);
|
|
1376
|
+
GGML_ASSERT(cells.pos_get(head + cell_count - 1) == batch.pos[cell_count - 1]);
|
|
1377
|
+
GGML_ASSERT(cells.seq_has(head, dest_seq_id));
|
|
1378
|
+
GGML_ASSERT(cells.seq_has(head + cell_count - 1, dest_seq_id));
|
|
1198
1379
|
} else {
|
|
1199
1380
|
// whole KV cache restore
|
|
1200
1381
|
|
|
1201
|
-
if (cell_count > size) {
|
|
1382
|
+
if (cell_count > cells.size()) {
|
|
1202
1383
|
LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
|
|
1203
1384
|
return false;
|
|
1204
1385
|
}
|
|
@@ -1206,34 +1387,28 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
|
|
1206
1387
|
clear();
|
|
1207
1388
|
|
|
1208
1389
|
for (uint32_t i = 0; i < cell_count; ++i) {
|
|
1209
|
-
kv_cell & cell = cells[i];
|
|
1210
|
-
|
|
1211
1390
|
llama_pos pos;
|
|
1212
1391
|
uint32_t n_seq_id;
|
|
1213
1392
|
|
|
1214
1393
|
io.read_to(&pos, sizeof(pos));
|
|
1215
1394
|
io.read_to(&n_seq_id, sizeof(n_seq_id));
|
|
1216
1395
|
|
|
1217
|
-
|
|
1396
|
+
cells.pos_set(i, pos);
|
|
1218
1397
|
|
|
1219
1398
|
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
|
1220
1399
|
llama_seq_id seq_id;
|
|
1221
1400
|
io.read_to(&seq_id, sizeof(seq_id));
|
|
1222
1401
|
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
if (seq_id < 0) {
|
|
1226
|
-
//LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
|
|
1227
|
-
LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
|
|
1402
|
+
if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
|
|
1403
|
+
LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
|
|
1228
1404
|
return false;
|
|
1229
1405
|
}
|
|
1230
1406
|
|
|
1231
|
-
|
|
1407
|
+
cells.seq_add(i, seq_id);
|
|
1232
1408
|
}
|
|
1233
1409
|
}
|
|
1234
1410
|
|
|
1235
1411
|
head = 0;
|
|
1236
|
-
used = cell_count;
|
|
1237
1412
|
}
|
|
1238
1413
|
|
|
1239
1414
|
return true;
|
|
@@ -1242,15 +1417,16 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
|
|
1242
1417
|
bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
|
|
1243
1418
|
uint32_t v_trans;
|
|
1244
1419
|
uint32_t n_layer;
|
|
1420
|
+
|
|
1245
1421
|
io.read_to(&v_trans, sizeof(v_trans));
|
|
1246
1422
|
io.read_to(&n_layer, sizeof(n_layer));
|
|
1247
1423
|
|
|
1248
|
-
if (n_layer !=
|
|
1249
|
-
LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer,
|
|
1424
|
+
if (n_layer != layers.size()) {
|
|
1425
|
+
LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size());
|
|
1250
1426
|
return false;
|
|
1251
1427
|
}
|
|
1252
|
-
if (cell_count > size) {
|
|
1253
|
-
LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
|
|
1428
|
+
if (cell_count > cells.size()) {
|
|
1429
|
+
LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, cells.size());
|
|
1254
1430
|
return false;
|
|
1255
1431
|
}
|
|
1256
1432
|
if (this->v_trans != (bool) v_trans) {
|
|
@@ -1259,13 +1435,15 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
|
|
1259
1435
|
}
|
|
1260
1436
|
|
|
1261
1437
|
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
|
|
1262
|
-
for (
|
|
1438
|
+
for (const auto & layer : layers) {
|
|
1439
|
+
const uint32_t il = layer.il;
|
|
1440
|
+
|
|
1263
1441
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
|
|
1264
1442
|
|
|
1265
1443
|
// Read type of key
|
|
1266
1444
|
int32_t k_type_i_ref;
|
|
1267
1445
|
io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
|
|
1268
|
-
const int32_t k_type_i = (int32_t)
|
|
1446
|
+
const int32_t k_type_i = (int32_t) layer.k->type;
|
|
1269
1447
|
if (k_type_i != k_type_i_ref) {
|
|
1270
1448
|
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
|
|
1271
1449
|
return false;
|
|
@@ -1274,7 +1452,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
|
|
1274
1452
|
// Read row size of key
|
|
1275
1453
|
uint64_t k_size_row_ref;
|
|
1276
1454
|
io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
|
|
1277
|
-
const size_t k_size_row = ggml_row_size(
|
|
1455
|
+
const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa);
|
|
1278
1456
|
if (k_size_row != k_size_row_ref) {
|
|
1279
1457
|
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
|
|
1280
1458
|
return false;
|
|
@@ -1282,18 +1460,20 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
|
|
1282
1460
|
|
|
1283
1461
|
if (cell_count) {
|
|
1284
1462
|
// Read and set the keys for the whole cell range
|
|
1285
|
-
ggml_backend_tensor_set(
|
|
1463
|
+
ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
|
|
1286
1464
|
}
|
|
1287
1465
|
}
|
|
1288
1466
|
|
|
1289
1467
|
if (!this->v_trans) {
|
|
1290
|
-
for (
|
|
1468
|
+
for (const auto & layer : layers) {
|
|
1469
|
+
const uint32_t il = layer.il;
|
|
1470
|
+
|
|
1291
1471
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
|
1292
1472
|
|
|
1293
1473
|
// Read type of value
|
|
1294
1474
|
int32_t v_type_i_ref;
|
|
1295
1475
|
io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
|
|
1296
|
-
const int32_t v_type_i = (int32_t)
|
|
1476
|
+
const int32_t v_type_i = (int32_t)layer.v->type;
|
|
1297
1477
|
if (v_type_i != v_type_i_ref) {
|
|
1298
1478
|
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
|
1299
1479
|
return false;
|
|
@@ -1302,7 +1482,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
|
|
1302
1482
|
// Read row size of value
|
|
1303
1483
|
uint64_t v_size_row_ref;
|
|
1304
1484
|
io.read_to(&v_size_row_ref, sizeof(v_size_row_ref));
|
|
1305
|
-
const size_t v_size_row = ggml_row_size(
|
|
1485
|
+
const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa);
|
|
1306
1486
|
if (v_size_row != v_size_row_ref) {
|
|
1307
1487
|
LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
|
|
1308
1488
|
return false;
|
|
@@ -1310,18 +1490,20 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
|
|
1310
1490
|
|
|
1311
1491
|
if (cell_count) {
|
|
1312
1492
|
// Read and set the values for the whole cell range
|
|
1313
|
-
ggml_backend_tensor_set(
|
|
1493
|
+
ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
|
|
1314
1494
|
}
|
|
1315
1495
|
}
|
|
1316
1496
|
} else {
|
|
1317
1497
|
// For each layer, read the values for each cell (transposed)
|
|
1318
|
-
for (
|
|
1498
|
+
for (const auto & layer : layers) {
|
|
1499
|
+
const uint32_t il = layer.il;
|
|
1500
|
+
|
|
1319
1501
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
|
|
1320
1502
|
|
|
1321
1503
|
// Read type of value
|
|
1322
1504
|
int32_t v_type_i_ref;
|
|
1323
1505
|
io.read_to(&v_type_i_ref, sizeof(v_type_i_ref));
|
|
1324
|
-
const int32_t v_type_i = (int32_t)
|
|
1506
|
+
const int32_t v_type_i = (int32_t)layer.v->type;
|
|
1325
1507
|
if (v_type_i != v_type_i_ref) {
|
|
1326
1508
|
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
|
1327
1509
|
return false;
|
|
@@ -1330,7 +1512,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
|
|
1330
1512
|
// Read element size of value
|
|
1331
1513
|
uint32_t v_size_el_ref;
|
|
1332
1514
|
io.read_to(&v_size_el_ref, sizeof(v_size_el_ref));
|
|
1333
|
-
const size_t v_size_el = ggml_type_size(
|
|
1515
|
+
const size_t v_size_el = ggml_type_size(layer.v->type);
|
|
1334
1516
|
if (v_size_el != v_size_el_ref) {
|
|
1335
1517
|
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
|
|
1336
1518
|
return false;
|
|
@@ -1347,8 +1529,8 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
|
|
1347
1529
|
if (cell_count) {
|
|
1348
1530
|
// For each row in the transposed matrix, read the values for the whole cell range
|
|
1349
1531
|
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
|
1350
|
-
const size_t dst_offset = (head + j * size) * v_size_el;
|
|
1351
|
-
ggml_backend_tensor_set(
|
|
1532
|
+
const size_t dst_offset = (head + j * cells.size()) * v_size_el;
|
|
1533
|
+
ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
|
|
1352
1534
|
}
|
|
1353
1535
|
}
|
|
1354
1536
|
}
|
|
@@ -1357,6 +1539,193 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
|
|
1357
1539
|
return true;
|
|
1358
1540
|
}
|
|
1359
1541
|
|
|
1542
|
+
//
|
|
1543
|
+
// llama_kv_cache_unified_iswa
|
|
1544
|
+
//
|
|
1545
|
+
|
|
1546
|
+
llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
|
|
1547
|
+
const llama_model & model,
|
|
1548
|
+
ggml_type type_k,
|
|
1549
|
+
ggml_type type_v,
|
|
1550
|
+
bool v_trans,
|
|
1551
|
+
bool offload,
|
|
1552
|
+
bool swa_full,
|
|
1553
|
+
uint32_t kv_size,
|
|
1554
|
+
uint32_t n_seq_max,
|
|
1555
|
+
uint32_t n_batch,
|
|
1556
|
+
uint32_t n_pad) : hparams(model.hparams) {
|
|
1557
|
+
llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
|
|
1558
|
+
llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
|
|
1559
|
+
|
|
1560
|
+
const uint32_t size_base = kv_size;
|
|
1561
|
+
|
|
1562
|
+
uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
|
|
1563
|
+
|
|
1564
|
+
// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
|
|
1565
|
+
if (swa_full) {
|
|
1566
|
+
LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
|
|
1567
|
+
__func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
|
|
1568
|
+
|
|
1569
|
+
size_swa = size_base;
|
|
1570
|
+
do_prune = false;
|
|
1571
|
+
}
|
|
1572
|
+
|
|
1573
|
+
LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
|
|
1574
|
+
|
|
1575
|
+
kv_base = std::make_unique<llama_kv_cache_unified>(
|
|
1576
|
+
model, std::move(filter_base), type_k, type_v,
|
|
1577
|
+
v_trans, offload, size_base, n_seq_max, n_pad,
|
|
1578
|
+
0, LLAMA_SWA_TYPE_NONE);
|
|
1579
|
+
|
|
1580
|
+
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
|
|
1581
|
+
|
|
1582
|
+
kv_swa = std::make_unique<llama_kv_cache_unified>(
|
|
1583
|
+
model, std::move(filter_swa), type_k, type_v,
|
|
1584
|
+
v_trans, offload, size_swa, n_seq_max, n_pad,
|
|
1585
|
+
hparams.n_swa, hparams.swa_type);
|
|
1586
|
+
}
|
|
1587
|
+
|
|
1588
|
+
void llama_kv_cache_unified_iswa::clear() {
|
|
1589
|
+
kv_base->clear();
|
|
1590
|
+
kv_swa ->clear();
|
|
1591
|
+
}
|
|
1592
|
+
|
|
1593
|
+
bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
|
1594
|
+
bool res = true;
|
|
1595
|
+
|
|
1596
|
+
res = res & kv_base->seq_rm(seq_id, p0, p1);
|
|
1597
|
+
res = res & kv_swa ->seq_rm(seq_id, p0, p1);
|
|
1598
|
+
|
|
1599
|
+
return res;
|
|
1600
|
+
}
|
|
1601
|
+
|
|
1602
|
+
void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
|
|
1603
|
+
kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
|
|
1604
|
+
kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
|
|
1605
|
+
}
|
|
1606
|
+
|
|
1607
|
+
void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
|
|
1608
|
+
kv_base->seq_keep(seq_id);
|
|
1609
|
+
kv_swa ->seq_keep(seq_id);
|
|
1610
|
+
}
|
|
1611
|
+
|
|
1612
|
+
void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
|
|
1613
|
+
kv_base->seq_add(seq_id, p0, p1, shift);
|
|
1614
|
+
kv_swa ->seq_add(seq_id, p0, p1, shift);
|
|
1615
|
+
}
|
|
1616
|
+
|
|
1617
|
+
void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
|
1618
|
+
kv_base->seq_div(seq_id, p0, p1, d);
|
|
1619
|
+
kv_swa ->seq_div(seq_id, p0, p1, d);
|
|
1620
|
+
}
|
|
1621
|
+
|
|
1622
|
+
llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
|
|
1623
|
+
// the base cache is a superset of the SWA cache, so we can just check the SWA cache
|
|
1624
|
+
return kv_swa->seq_pos_min(seq_id);
|
|
1625
|
+
}
|
|
1626
|
+
|
|
1627
|
+
llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
|
|
1628
|
+
return kv_swa->seq_pos_max(seq_id);
|
|
1629
|
+
}
|
|
1630
|
+
|
|
1631
|
+
void llama_kv_cache_unified_iswa::restore() {
|
|
1632
|
+
kv_base->restore();
|
|
1633
|
+
kv_swa ->restore();
|
|
1634
|
+
}
|
|
1635
|
+
|
|
1636
|
+
void llama_kv_cache_unified_iswa::commit() {
|
|
1637
|
+
kv_base->commit();
|
|
1638
|
+
kv_swa ->commit();
|
|
1639
|
+
|
|
1640
|
+
// slide the attention window, forgetting/pruning old tokens that are outside the window
|
|
1641
|
+
if (do_prune) {
|
|
1642
|
+
for (const auto & [seq_id, entry] : pending.pos) {
|
|
1643
|
+
kv_swa->prune_swa(seq_id, entry.pmin, entry.pmax);
|
|
1644
|
+
}
|
|
1645
|
+
|
|
1646
|
+
}
|
|
1647
|
+
|
|
1648
|
+
pending.clear();
|
|
1649
|
+
}
|
|
1650
|
+
|
|
1651
|
+
bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
|
|
1652
|
+
bool res = true;
|
|
1653
|
+
|
|
1654
|
+
res = res & kv_base->update(lctx);
|
|
1655
|
+
res = res & kv_swa ->update(lctx);
|
|
1656
|
+
|
|
1657
|
+
return res;
|
|
1658
|
+
}
|
|
1659
|
+
|
|
1660
|
+
void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
|
|
1661
|
+
kv_base->defrag_sched(thold);
|
|
1662
|
+
kv_swa ->defrag_sched(thold);
|
|
1663
|
+
}
|
|
1664
|
+
|
|
1665
|
+
void llama_kv_cache_unified_iswa::set_full() {
|
|
1666
|
+
kv_base->set_full();
|
|
1667
|
+
kv_swa ->set_full();
|
|
1668
|
+
}
|
|
1669
|
+
|
|
1670
|
+
llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) {
|
|
1671
|
+
pending.clear();
|
|
1672
|
+
|
|
1673
|
+
if (do_prune) {
|
|
1674
|
+
for (int i = 0; i < batch.n_tokens; ++i) {
|
|
1675
|
+
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
|
1676
|
+
const llama_seq_id seq_id = batch.seq_id[i][s];
|
|
1677
|
+
const llama_pos pos = batch.pos[i];
|
|
1678
|
+
|
|
1679
|
+
if (pending.pos.find(seq_id) == pending.pos.end()) {
|
|
1680
|
+
pending.pos[seq_id].pmin = pos;
|
|
1681
|
+
pending.pos[seq_id].pmax = pos;
|
|
1682
|
+
} else {
|
|
1683
|
+
pending.pos[seq_id].pmin = std::min(pending.pos[seq_id].pmin, pos);
|
|
1684
|
+
pending.pos[seq_id].pmax = std::max(pending.pos[seq_id].pmax, pos);
|
|
1685
|
+
}
|
|
1686
|
+
}
|
|
1687
|
+
}
|
|
1688
|
+
}
|
|
1689
|
+
|
|
1690
|
+
return llama_sbatch(batch, hparams.n_embd, true, logits_all);
|
|
1691
|
+
}
|
|
1692
|
+
|
|
1693
|
+
llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
|
|
1694
|
+
GGML_UNUSED(embd_pooled);
|
|
1695
|
+
return sbatch.split_simple(n_ubatch);
|
|
1696
|
+
}
|
|
1697
|
+
|
|
1698
|
+
bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
|
|
1699
|
+
bool res = true;
|
|
1700
|
+
|
|
1701
|
+
res = res & kv_base->find_slot(batch);
|
|
1702
|
+
res = res & kv_swa ->find_slot(batch);
|
|
1703
|
+
|
|
1704
|
+
return res;
|
|
1705
|
+
}
|
|
1706
|
+
|
|
1707
|
+
bool llama_kv_cache_unified_iswa::get_can_shift() const {
|
|
1708
|
+
return kv_base->get_size() == kv_swa->get_size();
|
|
1709
|
+
}
|
|
1710
|
+
|
|
1711
|
+
void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
|
|
1712
|
+
kv_base->state_write(io, seq_id);
|
|
1713
|
+
kv_swa ->state_write(io, seq_id);
|
|
1714
|
+
}
|
|
1715
|
+
|
|
1716
|
+
void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
|
|
1717
|
+
kv_base->state_read(io, seq_id);
|
|
1718
|
+
kv_swa ->state_read(io, seq_id);
|
|
1719
|
+
}
|
|
1720
|
+
|
|
1721
|
+
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_base() const {
|
|
1722
|
+
return kv_base.get();
|
|
1723
|
+
}
|
|
1724
|
+
|
|
1725
|
+
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_swa() const {
|
|
1726
|
+
return kv_swa.get();
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1360
1729
|
//
|
|
1361
1730
|
// llama_kv_cache_recurrent
|
|
1362
1731
|
//
|
|
@@ -1366,19 +1735,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
|
|
|
1366
1735
|
ggml_type type_k,
|
|
1367
1736
|
ggml_type type_v,
|
|
1368
1737
|
bool offload,
|
|
1369
|
-
uint32_t kv_size
|
|
1738
|
+
uint32_t kv_size,
|
|
1739
|
+
uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
|
|
1370
1740
|
const int32_t n_layer = hparams.n_layer;
|
|
1371
1741
|
|
|
1372
|
-
LLAMA_LOG_INFO("%s: kv_size = %
|
|
1373
|
-
__func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
|
|
1742
|
+
LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
|
|
1743
|
+
__func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
|
|
1374
1744
|
|
|
1375
1745
|
head = 0;
|
|
1376
1746
|
size = kv_size;
|
|
1377
1747
|
used = 0;
|
|
1378
1748
|
|
|
1379
|
-
this->type_k = type_k;
|
|
1380
|
-
this->type_v = type_v;
|
|
1381
|
-
|
|
1382
1749
|
cells.clear();
|
|
1383
1750
|
cells.resize(kv_size);
|
|
1384
1751
|
|
|
@@ -1616,8 +1983,8 @@ void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) {
|
|
|
1616
1983
|
}
|
|
1617
1984
|
}
|
|
1618
1985
|
|
|
1619
|
-
void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos
|
|
1620
|
-
if (
|
|
1986
|
+
void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
|
|
1987
|
+
if (shift == 0) {
|
|
1621
1988
|
return;
|
|
1622
1989
|
}
|
|
1623
1990
|
|
|
@@ -1640,7 +2007,7 @@ void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_
|
|
|
1640
2007
|
if (tail_id >= 0) {
|
|
1641
2008
|
kv_cell & cell = cells[tail_id];
|
|
1642
2009
|
if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
|
|
1643
|
-
cell.pos +=
|
|
2010
|
+
cell.pos += shift;
|
|
1644
2011
|
}
|
|
1645
2012
|
}
|
|
1646
2013
|
}
|
|
@@ -1676,8 +2043,24 @@ void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_
|
|
|
1676
2043
|
}
|
|
1677
2044
|
}
|
|
1678
2045
|
|
|
2046
|
+
llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const {
|
|
2047
|
+
llama_pos result = std::numeric_limits<llama_pos>::max();
|
|
2048
|
+
|
|
2049
|
+
for (uint32_t i = 0; i < size; ++i) {
|
|
2050
|
+
if (cells[i].has_seq_id(seq_id)) {
|
|
2051
|
+
result = std::min(result, cells[i].pos);
|
|
2052
|
+
}
|
|
2053
|
+
}
|
|
2054
|
+
|
|
2055
|
+
if (result == std::numeric_limits<llama_pos>::max()) {
|
|
2056
|
+
result = -1;
|
|
2057
|
+
}
|
|
2058
|
+
|
|
2059
|
+
return result;
|
|
2060
|
+
}
|
|
2061
|
+
|
|
1679
2062
|
llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
|
|
1680
|
-
llama_pos result =
|
|
2063
|
+
llama_pos result = -1;
|
|
1681
2064
|
|
|
1682
2065
|
for (uint32_t i = 0; i < size; ++i) {
|
|
1683
2066
|
if (cells[i].has_seq_id(seq_id)) {
|
|
@@ -1700,8 +2083,8 @@ void llama_kv_cache_recurrent::commit() {
|
|
|
1700
2083
|
pending.ranges.clear();
|
|
1701
2084
|
}
|
|
1702
2085
|
|
|
1703
|
-
bool llama_kv_cache_recurrent::update(llama_context &
|
|
1704
|
-
GGML_UNUSED(
|
|
2086
|
+
bool llama_kv_cache_recurrent::update(llama_context & ctx) {
|
|
2087
|
+
GGML_UNUSED(ctx);
|
|
1705
2088
|
return false;
|
|
1706
2089
|
}
|
|
1707
2090
|
|
|
@@ -1712,6 +2095,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
|
|
|
1712
2095
|
|
|
1713
2096
|
void llama_kv_cache_recurrent::set_full() {
|
|
1714
2097
|
n = size;
|
|
2098
|
+
head = 0;
|
|
1715
2099
|
}
|
|
1716
2100
|
|
|
1717
2101
|
llama_sbatch llama_kv_cache_recurrent::sbatch_init(
|
|
@@ -1761,7 +2145,7 @@ bool llama_kv_cache_recurrent::find_slot(
|
|
|
1761
2145
|
if (seq_id < 0 || (uint32_t) seq_id >= size) {
|
|
1762
2146
|
// too big seq_id
|
|
1763
2147
|
// TODO: would it be possible to resize the cache instead?
|
|
1764
|
-
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%
|
|
2148
|
+
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
|
|
1765
2149
|
return false;
|
|
1766
2150
|
}
|
|
1767
2151
|
if (j > 0) {
|
|
@@ -1904,29 +2288,6 @@ bool llama_kv_cache_recurrent::find_slot(
|
|
|
1904
2288
|
return n >= n_seqs;
|
|
1905
2289
|
}
|
|
1906
2290
|
|
|
1907
|
-
int32_t llama_kv_cache_recurrent::get_n_tokens() const {
|
|
1908
|
-
int32_t result = 0;
|
|
1909
|
-
|
|
1910
|
-
for (uint32_t i = 0; i < size; i++) {
|
|
1911
|
-
result += cells[i].seq_id.size();
|
|
1912
|
-
}
|
|
1913
|
-
|
|
1914
|
-
return result;
|
|
1915
|
-
}
|
|
1916
|
-
|
|
1917
|
-
int32_t llama_kv_cache_recurrent::get_used_cells() const {
|
|
1918
|
-
return used;
|
|
1919
|
-
}
|
|
1920
|
-
|
|
1921
|
-
llama_pos llama_kv_cache_recurrent::get_pos_max() const {
|
|
1922
|
-
llama_pos pos_max = -1;
|
|
1923
|
-
for (const auto & cell : cells) {
|
|
1924
|
-
pos_max = std::max(pos_max, cell.pos);
|
|
1925
|
-
}
|
|
1926
|
-
|
|
1927
|
-
return pos_max;
|
|
1928
|
-
}
|
|
1929
|
-
|
|
1930
2291
|
bool llama_kv_cache_recurrent::get_can_shift() const {
|
|
1931
2292
|
return false;
|
|
1932
2293
|
}
|
|
@@ -2055,6 +2416,7 @@ void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq
|
|
|
2055
2416
|
io.read_to(&cell_count, sizeof(cell_count));
|
|
2056
2417
|
|
|
2057
2418
|
bool res = true;
|
|
2419
|
+
|
|
2058
2420
|
res = res && state_read_meta(io, cell_count, seq_id);
|
|
2059
2421
|
res = res && state_read_data(io, cell_count);
|
|
2060
2422
|
|
|
@@ -2383,104 +2745,3 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce
|
|
|
2383
2745
|
|
|
2384
2746
|
return true;
|
|
2385
2747
|
}
|
|
2386
|
-
|
|
2387
|
-
//
|
|
2388
|
-
// kv cache view
|
|
2389
|
-
//
|
|
2390
|
-
|
|
2391
|
-
llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) {
|
|
2392
|
-
llama_kv_cache_view result = {
|
|
2393
|
-
/*.n_cells = */ 0,
|
|
2394
|
-
/*.n_seq_max = */ n_seq_max,
|
|
2395
|
-
/*.token_count = */ 0,
|
|
2396
|
-
/*.used_cells = */ kv.get_used_cells(),
|
|
2397
|
-
/*.max_contiguous = */ 0,
|
|
2398
|
-
/*.max_contiguous_idx = */ -1,
|
|
2399
|
-
/*.cells = */ nullptr,
|
|
2400
|
-
/*.cells_sequences = */ nullptr,
|
|
2401
|
-
};
|
|
2402
|
-
|
|
2403
|
-
return result;
|
|
2404
|
-
}
|
|
2405
|
-
|
|
2406
|
-
void llama_kv_cache_view_free(llama_kv_cache_view * view) {
|
|
2407
|
-
if (view->cells != nullptr) {
|
|
2408
|
-
free(view->cells);
|
|
2409
|
-
view->cells = nullptr;
|
|
2410
|
-
}
|
|
2411
|
-
if (view->cells_sequences != nullptr) {
|
|
2412
|
-
free(view->cells_sequences);
|
|
2413
|
-
view->cells_sequences = nullptr;
|
|
2414
|
-
}
|
|
2415
|
-
}
|
|
2416
|
-
|
|
2417
|
-
void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) {
|
|
2418
|
-
// TODO: rework this in the future, for now quick hack
|
|
2419
|
-
const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
|
|
2420
|
-
if (kvu == nullptr) {
|
|
2421
|
-
LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
|
|
2422
|
-
return;
|
|
2423
|
-
}
|
|
2424
|
-
|
|
2425
|
-
if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
|
|
2426
|
-
view->n_cells = int32_t(kvu->size);
|
|
2427
|
-
void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
|
|
2428
|
-
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
|
|
2429
|
-
view->cells = (llama_kv_cache_view_cell *)p;
|
|
2430
|
-
p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
|
|
2431
|
-
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
|
|
2432
|
-
view->cells_sequences = (llama_seq_id *)p;
|
|
2433
|
-
}
|
|
2434
|
-
|
|
2435
|
-
const std::vector<llama_kv_cache_unified::kv_cell> & kv_cells = kvu->cells;
|
|
2436
|
-
llama_kv_cache_view_cell * c_curr = view->cells;
|
|
2437
|
-
llama_seq_id * cs_curr = view->cells_sequences;
|
|
2438
|
-
int32_t used_cells = 0;
|
|
2439
|
-
int32_t token_count = 0;
|
|
2440
|
-
int32_t curr_contig_idx = -1;
|
|
2441
|
-
uint32_t max_contig = 0;
|
|
2442
|
-
int32_t max_contig_idx = -1;
|
|
2443
|
-
|
|
2444
|
-
for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
|
|
2445
|
-
const size_t curr_size = kv_cells[i].seq_id.size();
|
|
2446
|
-
token_count += curr_size;
|
|
2447
|
-
c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
|
|
2448
|
-
|
|
2449
|
-
if (curr_size > 0) {
|
|
2450
|
-
if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
|
|
2451
|
-
max_contig = i - curr_contig_idx;
|
|
2452
|
-
max_contig_idx = curr_contig_idx;
|
|
2453
|
-
}
|
|
2454
|
-
curr_contig_idx = -1;
|
|
2455
|
-
} else if (curr_contig_idx < 0) {
|
|
2456
|
-
curr_contig_idx = i;
|
|
2457
|
-
}
|
|
2458
|
-
|
|
2459
|
-
int seq_idx = 0;
|
|
2460
|
-
for (const llama_seq_id it : kv_cells[i].seq_id) {
|
|
2461
|
-
if (seq_idx >= view->n_seq_max) {
|
|
2462
|
-
break;
|
|
2463
|
-
}
|
|
2464
|
-
cs_curr[seq_idx] = it;
|
|
2465
|
-
seq_idx++;
|
|
2466
|
-
}
|
|
2467
|
-
if (seq_idx != 0) {
|
|
2468
|
-
used_cells++;
|
|
2469
|
-
}
|
|
2470
|
-
for (; seq_idx < view->n_seq_max; seq_idx++) {
|
|
2471
|
-
cs_curr[seq_idx] = -1;
|
|
2472
|
-
}
|
|
2473
|
-
}
|
|
2474
|
-
if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
|
|
2475
|
-
max_contig_idx = curr_contig_idx;
|
|
2476
|
-
max_contig = kv_cells.size() - curr_contig_idx;
|
|
2477
|
-
}
|
|
2478
|
-
view->max_contiguous = max_contig;
|
|
2479
|
-
view->max_contiguous_idx = max_contig_idx;
|
|
2480
|
-
view->token_count = token_count;
|
|
2481
|
-
view->used_cells = used_cells;
|
|
2482
|
-
if (uint32_t(used_cells) != kvu->used) {
|
|
2483
|
-
LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
|
|
2484
|
-
__func__, kvu->used, used_cells);
|
|
2485
|
-
}
|
|
2486
|
-
}
|