@novastera-oss/llamarn 0.2.9 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +17 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.h +4 -0
- package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
- package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
- package/cpp/llama.cpp/include/llama.h +0 -40
- package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
- package/cpp/llama.cpp/src/llama-arch.h +18 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
- package/cpp/llama.cpp/src/llama-batch.h +8 -1
- package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
- package/cpp/llama.cpp/src/llama-graph.h +47 -60
- package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
- package/cpp/llama.cpp/src/llama-hparams.h +3 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
- package/cpp/llama.cpp/src/llama-model.h +18 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
- package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
- package/cpp/llama.cpp/src/llama-vocab.h +41 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +4 -0
- package/ios/include/llama.h +0 -40
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -105,10 +105,30 @@ public:
|
|
|
105
105
|
res.resize(n);
|
|
106
106
|
|
|
107
107
|
for (uint32_t j = 0; j < n; ++j) {
|
|
108
|
-
|
|
109
|
-
res.seq[j] = seq[i + j];
|
|
108
|
+
const auto idx = i + j;
|
|
110
109
|
|
|
111
|
-
|
|
110
|
+
res.pos[j] = pos[idx];
|
|
111
|
+
res.seq[j] = seq[idx];
|
|
112
|
+
|
|
113
|
+
assert(shift[idx] == 0);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
return res;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
|
|
120
|
+
llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
|
|
121
|
+
llama_kv_cells_unified res;
|
|
122
|
+
|
|
123
|
+
res.resize(idxs.size());
|
|
124
|
+
|
|
125
|
+
for (uint32_t j = 0; j < idxs.size(); ++j) {
|
|
126
|
+
const auto idx = idxs[j];
|
|
127
|
+
|
|
128
|
+
res.pos[j] = pos[idx];
|
|
129
|
+
res.seq[j] = seq[idx];
|
|
130
|
+
|
|
131
|
+
assert(shift[idx] == 0);
|
|
112
132
|
}
|
|
113
133
|
|
|
114
134
|
return res;
|
|
@@ -119,26 +139,58 @@ public:
|
|
|
119
139
|
assert(i + other.pos.size() <= pos.size());
|
|
120
140
|
|
|
121
141
|
for (uint32_t j = 0; j < other.pos.size(); ++j) {
|
|
122
|
-
|
|
142
|
+
const auto idx = i + j;
|
|
143
|
+
|
|
144
|
+
if (pos[idx] == -1 && other.pos[j] != -1) {
|
|
123
145
|
used.insert(i + j);
|
|
124
146
|
}
|
|
125
147
|
|
|
126
|
-
if (pos[
|
|
148
|
+
if (pos[idx] != -1 && other.pos[j] == -1) {
|
|
127
149
|
used.erase(i + j);
|
|
128
150
|
}
|
|
129
151
|
|
|
130
|
-
if (pos[
|
|
152
|
+
if (pos[idx] != -1) {
|
|
131
153
|
seq_pos_rm(i + j);
|
|
132
154
|
}
|
|
133
155
|
|
|
134
|
-
pos[
|
|
135
|
-
seq[
|
|
156
|
+
pos[idx] = other.pos[j];
|
|
157
|
+
seq[idx] = other.seq[j];
|
|
136
158
|
|
|
137
|
-
if (pos[
|
|
159
|
+
if (pos[idx] != -1) {
|
|
138
160
|
seq_pos_add(i + j);
|
|
139
161
|
}
|
|
140
162
|
|
|
141
|
-
assert(shift[
|
|
163
|
+
assert(shift[idx] == 0);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
|
|
168
|
+
void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
|
|
169
|
+
assert(idxs.size() == other.pos.size());
|
|
170
|
+
|
|
171
|
+
for (uint32_t j = 0; j < other.pos.size(); ++j) {
|
|
172
|
+
const auto idx = idxs[j];
|
|
173
|
+
|
|
174
|
+
if (pos[idx] == -1 && other.pos[j] != -1) {
|
|
175
|
+
used.insert(idx);
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
if (pos[idx] != -1 && other.pos[j] == -1) {
|
|
179
|
+
used.erase(idx);
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
if (pos[idx] != -1) {
|
|
183
|
+
seq_pos_rm(idx);
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
pos[idx] = other.pos[j];
|
|
187
|
+
seq[idx] = other.seq[j];
|
|
188
|
+
|
|
189
|
+
if (pos[idx] != -1) {
|
|
190
|
+
seq_pos_add(idx);
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
assert(shift[idx] == 0);
|
|
142
194
|
}
|
|
143
195
|
}
|
|
144
196
|
|
|
@@ -70,7 +70,7 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
|
|
|
70
70
|
// if all tokens are output, split by sequence
|
|
71
71
|
ubatch = balloc.split_seq(n_ubatch);
|
|
72
72
|
} else {
|
|
73
|
-
ubatch = balloc.split_equal(n_ubatch);
|
|
73
|
+
ubatch = balloc.split_equal(n_ubatch, false);
|
|
74
74
|
}
|
|
75
75
|
|
|
76
76
|
if (ubatch.n_tokens == 0) {
|
|
@@ -80,6 +80,11 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
|
|
|
80
80
|
ubatches.push_back(std::move(ubatch)); // NOLINT
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
+
if (balloc.get_n_used() < balloc.get_n_tokens()) {
|
|
84
|
+
// failed to find a suitable split
|
|
85
|
+
break;
|
|
86
|
+
}
|
|
87
|
+
|
|
83
88
|
// prepare the recurrent batches first
|
|
84
89
|
if (!mem_recr->prepare(ubatches)) {
|
|
85
90
|
// TODO: will the recurrent cache be in an undefined context at this point?
|
|
@@ -195,11 +200,11 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
|
|
|
195
200
|
|
|
196
201
|
llama_memory_hybrid_context::llama_memory_hybrid_context(
|
|
197
202
|
llama_memory_hybrid * mem,
|
|
198
|
-
|
|
203
|
+
slot_info_vec_t sinfos_attn,
|
|
199
204
|
std::vector<llama_ubatch> ubatches) :
|
|
200
205
|
ubatches(std::move(ubatches)),
|
|
201
206
|
// note: here we copy the ubatches. not sure if this is ideal
|
|
202
|
-
ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(
|
|
207
|
+
ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
|
|
203
208
|
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
|
|
204
209
|
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
|
|
205
210
|
}
|
|
@@ -218,7 +223,7 @@ bool llama_memory_hybrid_context::next() {
|
|
|
218
223
|
}
|
|
219
224
|
|
|
220
225
|
bool llama_memory_hybrid_context::apply() {
|
|
221
|
-
assert(status
|
|
226
|
+
assert(!llama_memory_status_is_fail(status));
|
|
222
227
|
|
|
223
228
|
bool res = true;
|
|
224
229
|
|
|
@@ -92,6 +92,8 @@ private:
|
|
|
92
92
|
|
|
93
93
|
class llama_memory_hybrid_context : public llama_memory_context_i {
|
|
94
94
|
public:
|
|
95
|
+
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
|
|
96
|
+
|
|
95
97
|
// init failure
|
|
96
98
|
explicit llama_memory_hybrid_context(llama_memory_status status);
|
|
97
99
|
|
|
@@ -107,7 +109,7 @@ public:
|
|
|
107
109
|
// init success
|
|
108
110
|
llama_memory_hybrid_context(
|
|
109
111
|
llama_memory_hybrid * mem,
|
|
110
|
-
|
|
112
|
+
slot_info_vec_t sinfos_attn,
|
|
111
113
|
std::vector<llama_ubatch> ubatches);
|
|
112
114
|
|
|
113
115
|
~llama_memory_hybrid_context() = default;
|
|
@@ -25,9 +25,6 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|
|
25
25
|
uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
|
|
26
26
|
const int32_t n_layer = hparams.n_layer;
|
|
27
27
|
|
|
28
|
-
LLAMA_LOG_INFO("%s: mem_size = %u, n_seq_max = %u, type_r = '%s', type_s = '%s', n_layer = %d\n",
|
|
29
|
-
__func__, mem_size, n_seq_max, ggml_type_name(type_r), ggml_type_name(type_s), n_layer);
|
|
30
|
-
|
|
31
28
|
head = 0;
|
|
32
29
|
size = mem_size;
|
|
33
30
|
used = 0;
|
|
@@ -84,7 +81,7 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|
|
84
81
|
|
|
85
82
|
ggml_context * ctx = ctx_for_buft(buft);
|
|
86
83
|
if (!ctx) {
|
|
87
|
-
throw std::runtime_error("failed to create ggml context for
|
|
84
|
+
throw std::runtime_error("failed to create ggml context for rs cache");
|
|
88
85
|
}
|
|
89
86
|
|
|
90
87
|
ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
|
|
@@ -102,10 +99,10 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|
|
102
99
|
|
|
103
100
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
|
104
101
|
if (!buf) {
|
|
105
|
-
throw std::runtime_error("failed to allocate buffer for
|
|
102
|
+
throw std::runtime_error("failed to allocate buffer for rs cache");
|
|
106
103
|
}
|
|
107
104
|
ggml_backend_buffer_clear(buf, 0);
|
|
108
|
-
LLAMA_LOG_INFO("%s: %10s
|
|
105
|
+
LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
|
109
106
|
bufs.emplace_back(buf);
|
|
110
107
|
}
|
|
111
108
|
|
|
@@ -113,8 +110,8 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|
|
113
110
|
const size_t memory_size_r = size_r_bytes();
|
|
114
111
|
const size_t memory_size_s = size_s_bytes();
|
|
115
112
|
|
|
116
|
-
LLAMA_LOG_INFO("%s:
|
|
117
|
-
(float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f),
|
|
113
|
+
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
|
|
114
|
+
(float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
|
|
118
115
|
ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
|
|
119
116
|
ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
|
|
120
117
|
}
|
|
@@ -374,7 +371,7 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
|
|
|
374
371
|
// if all tokens are output, split by sequence
|
|
375
372
|
ubatch = balloc.split_seq(n_ubatch);
|
|
376
373
|
} else {
|
|
377
|
-
ubatch = balloc.split_equal(n_ubatch);
|
|
374
|
+
ubatch = balloc.split_equal(n_ubatch, false);
|
|
378
375
|
}
|
|
379
376
|
|
|
380
377
|
if (ubatch.n_tokens == 0) {
|
|
@@ -384,6 +381,11 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
|
|
|
384
381
|
ubatches.push_back(std::move(ubatch)); // NOLINT
|
|
385
382
|
}
|
|
386
383
|
|
|
384
|
+
if (balloc.get_n_used() < balloc.get_n_tokens()) {
|
|
385
|
+
// failed to find a suitable split
|
|
386
|
+
break;
|
|
387
|
+
}
|
|
388
|
+
|
|
387
389
|
if (!prepare(ubatches)) {
|
|
388
390
|
break;
|
|
389
391
|
}
|
|
@@ -1071,7 +1073,15 @@ bool llama_memory_recurrent_context::next() {
|
|
|
1071
1073
|
}
|
|
1072
1074
|
|
|
1073
1075
|
bool llama_memory_recurrent_context::apply() {
|
|
1074
|
-
assert(status
|
|
1076
|
+
assert(!llama_memory_status_is_fail(status));
|
|
1077
|
+
|
|
1078
|
+
// no ubatches -> this is an update
|
|
1079
|
+
if (ubatches.empty()) {
|
|
1080
|
+
// recurrent cache never performs updates
|
|
1081
|
+
assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE);
|
|
1082
|
+
|
|
1083
|
+
return true;
|
|
1084
|
+
}
|
|
1075
1085
|
|
|
1076
1086
|
mem->find_slot(ubatches[i_next]);
|
|
1077
1087
|
|
|
@@ -40,3 +40,20 @@ llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_me
|
|
|
40
40
|
// if either status has an update, then the combined status has an update
|
|
41
41
|
return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
|
|
42
42
|
}
|
|
43
|
+
|
|
44
|
+
bool llama_memory_status_is_fail(llama_memory_status status) {
|
|
45
|
+
switch (status) {
|
|
46
|
+
case LLAMA_MEMORY_STATUS_SUCCESS:
|
|
47
|
+
case LLAMA_MEMORY_STATUS_NO_UPDATE:
|
|
48
|
+
{
|
|
49
|
+
return false;
|
|
50
|
+
}
|
|
51
|
+
case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
|
|
52
|
+
case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
|
|
53
|
+
{
|
|
54
|
+
return true;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
return false;
|
|
59
|
+
}
|
|
@@ -31,6 +31,9 @@ enum llama_memory_status {
|
|
|
31
31
|
// useful for implementing hybrid memory types (e.g. iSWA)
|
|
32
32
|
llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);
|
|
33
33
|
|
|
34
|
+
// helper function for checking if a memory status indicates a failure
|
|
35
|
+
bool llama_memory_status_is_fail(llama_memory_status status);
|
|
36
|
+
|
|
34
37
|
// the interface for managing the memory context during batch processing
|
|
35
38
|
// this interface is implemented per memory type. see:
|
|
36
39
|
// - llama_kv_cache_unified_context
|