@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -470,6 +470,7 @@ extern "C" {
|
|
|
470
470
|
GGML_OP_TRANSPOSE,
|
|
471
471
|
GGML_OP_GET_ROWS,
|
|
472
472
|
GGML_OP_GET_ROWS_BACK,
|
|
473
|
+
GGML_OP_SET_ROWS,
|
|
473
474
|
GGML_OP_DIAG,
|
|
474
475
|
GGML_OP_DIAG_MASK_INF,
|
|
475
476
|
GGML_OP_DIAG_MASK_ZERO,
|
|
@@ -489,6 +490,7 @@ extern "C" {
|
|
|
489
490
|
GGML_OP_UPSCALE, // nearest interpolate
|
|
490
491
|
GGML_OP_PAD,
|
|
491
492
|
GGML_OP_PAD_REFLECT_1D,
|
|
493
|
+
GGML_OP_ROLL,
|
|
492
494
|
GGML_OP_ARANGE,
|
|
493
495
|
GGML_OP_TIMESTEP_EMBEDDING,
|
|
494
496
|
GGML_OP_ARGSORT,
|
|
@@ -686,6 +688,9 @@ extern "C" {
|
|
|
686
688
|
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
|
|
687
689
|
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
|
|
688
690
|
|
|
691
|
+
// true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
|
|
692
|
+
GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
|
|
693
|
+
|
|
689
694
|
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
|
690
695
|
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
|
691
696
|
|
|
@@ -1374,6 +1379,23 @@ extern "C" {
|
|
|
1374
1379
|
struct ggml_tensor * b, // row indices
|
|
1375
1380
|
struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
|
|
1376
1381
|
|
|
1382
|
+
// a TD [n_embd, ne1, ne2, ne3]
|
|
1383
|
+
// b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
|
|
1384
|
+
// c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
|
|
1385
|
+
//
|
|
1386
|
+
// undefined behavior if destination rows overlap
|
|
1387
|
+
//
|
|
1388
|
+
// broadcast:
|
|
1389
|
+
// ne2 % ne11 == 0
|
|
1390
|
+
// ne3 % ne12 == 0
|
|
1391
|
+
//
|
|
1392
|
+
// return view(a)
|
|
1393
|
+
GGML_API struct ggml_tensor * ggml_set_rows(
|
|
1394
|
+
struct ggml_context * ctx,
|
|
1395
|
+
struct ggml_tensor * a, // destination
|
|
1396
|
+
struct ggml_tensor * b, // source
|
|
1397
|
+
struct ggml_tensor * c); // row indices
|
|
1398
|
+
|
|
1377
1399
|
GGML_API struct ggml_tensor * ggml_diag(
|
|
1378
1400
|
struct ggml_context * ctx,
|
|
1379
1401
|
struct ggml_tensor * a);
|
|
@@ -1801,6 +1823,17 @@ extern "C" {
|
|
|
1801
1823
|
int p0,
|
|
1802
1824
|
int p1);
|
|
1803
1825
|
|
|
1826
|
+
// Move tensor elements by an offset given for each dimension. Elements that
|
|
1827
|
+
// are shifted beyond the last position are wrapped around to the beginning.
|
|
1828
|
+
GGML_API struct ggml_tensor * ggml_roll(
|
|
1829
|
+
struct ggml_context * ctx,
|
|
1830
|
+
struct ggml_tensor * a,
|
|
1831
|
+
int shift0,
|
|
1832
|
+
int shift1,
|
|
1833
|
+
int shift2,
|
|
1834
|
+
int shift3);
|
|
1835
|
+
|
|
1836
|
+
|
|
1804
1837
|
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
|
1805
1838
|
// timesteps: [N,]
|
|
1806
1839
|
// return: [N, dim]
|
|
@@ -125,7 +125,6 @@ if (NOT MSVC)
|
|
|
125
125
|
endif()
|
|
126
126
|
|
|
127
127
|
if (MINGW)
|
|
128
|
-
# Target Windows 8 for PrefetchVirtualMemory
|
|
129
128
|
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
|
130
129
|
endif()
|
|
131
130
|
|
|
@@ -213,6 +212,7 @@ endif()
|
|
|
213
212
|
|
|
214
213
|
add_library(ggml
|
|
215
214
|
ggml-backend-reg.cpp)
|
|
215
|
+
add_library(ggml::ggml ALIAS ggml)
|
|
216
216
|
|
|
217
217
|
target_link_libraries(ggml PUBLIC ggml-base)
|
|
218
218
|
|
|
@@ -270,17 +270,27 @@ endfunction()
|
|
|
270
270
|
function(ggml_add_cpu_backend_variant tag_name)
|
|
271
271
|
set(GGML_CPU_TAG_NAME ${tag_name})
|
|
272
272
|
# other: OPENMP LLAMAFILE CPU_HBM
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
273
|
+
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
|
274
|
+
foreach (feat NATIVE
|
|
275
|
+
SSE42
|
|
276
|
+
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
|
277
|
+
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
|
278
|
+
AMX_TILE AMX_INT8 AMX_BF16)
|
|
279
|
+
set(GGML_${feat} OFF)
|
|
280
|
+
endforeach()
|
|
281
|
+
|
|
282
|
+
foreach (feat ${ARGN})
|
|
283
|
+
set(GGML_${feat} ON)
|
|
284
|
+
endforeach()
|
|
285
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
|
286
|
+
foreach (feat ${ARGN})
|
|
287
|
+
set(GGML_INTERNAL_${feat} ON)
|
|
288
|
+
endforeach()
|
|
289
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
|
290
|
+
foreach (feat ${ARGN})
|
|
291
|
+
set(GGML_INTERNAL_${feat} ON)
|
|
292
|
+
endforeach()
|
|
293
|
+
endif()
|
|
284
294
|
|
|
285
295
|
ggml_add_cpu_backend_variant_impl(${tag_name})
|
|
286
296
|
endfunction()
|
|
@@ -290,6 +300,8 @@ ggml_add_backend(CPU)
|
|
|
290
300
|
if (GGML_CPU_ALL_VARIANTS)
|
|
291
301
|
if (NOT GGML_BACKEND_DL)
|
|
292
302
|
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
|
303
|
+
elseif (GGML_CPU_ARM_ARCH)
|
|
304
|
+
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
|
293
305
|
endif()
|
|
294
306
|
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
|
295
307
|
ggml_add_cpu_backend_variant(x64)
|
|
@@ -303,8 +315,47 @@ if (GGML_CPU_ALL_VARIANTS)
|
|
|
303
315
|
# MSVC doesn't support AMX
|
|
304
316
|
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
|
305
317
|
endif()
|
|
318
|
+
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
|
|
319
|
+
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
|
320
|
+
# Many of these features are optional so we build versions with popular
|
|
321
|
+
# combinations and name the backends based on the version they were
|
|
322
|
+
# first released with
|
|
323
|
+
ggml_add_cpu_backend_variant(armv8.0_1)
|
|
324
|
+
ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
|
|
325
|
+
ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
|
326
|
+
ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
|
|
327
|
+
ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
|
|
328
|
+
ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
|
|
329
|
+
ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
|
|
330
|
+
ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
|
|
331
|
+
elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
|
|
332
|
+
# Android-specific backends with SoC-compatible feature sets
|
|
333
|
+
ggml_add_cpu_backend_variant(android_armv8.0_1)
|
|
334
|
+
ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
|
|
335
|
+
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
|
336
|
+
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
|
|
337
|
+
elseif (APPLE)
|
|
338
|
+
ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
|
|
339
|
+
ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
|
|
340
|
+
ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
|
|
341
|
+
else()
|
|
342
|
+
message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
|
|
343
|
+
endif()
|
|
344
|
+
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
|
345
|
+
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
|
346
|
+
ggml_add_cpu_backend_variant(power0)
|
|
347
|
+
ggml_add_cpu_backend_variant(power7_1 POWER7)
|
|
348
|
+
ggml_add_cpu_backend_variant(power7_2 POWER7 VSX)
|
|
349
|
+
ggml_add_cpu_backend_variant(power8_1 POWER8)
|
|
350
|
+
ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
|
|
351
|
+
ggml_add_cpu_backend_variant(power9 POWER9 VSX)
|
|
352
|
+
ggml_add_cpu_backend_variant(power10 POWER10 VSX)
|
|
353
|
+
ggml_add_cpu_backend_variant(power11 POWER11 VSX)
|
|
354
|
+
else()
|
|
355
|
+
message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
|
|
356
|
+
endif()
|
|
306
357
|
else()
|
|
307
|
-
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported
|
|
358
|
+
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
|
|
308
359
|
endif()
|
|
309
360
|
elseif (GGML_CPU)
|
|
310
361
|
ggml_add_cpu_backend_variant_impl("")
|
|
@@ -69,6 +69,9 @@
|
|
|
69
69
|
#if defined(__clang__)
|
|
70
70
|
# pragma clang diagnostic push
|
|
71
71
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
|
72
|
+
#elif defined(__GNUC__)
|
|
73
|
+
# pragma GCC diagnostic push
|
|
74
|
+
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
72
75
|
#endif
|
|
73
76
|
|
|
74
77
|
namespace fs = std::filesystem;
|
|
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
|
|
|
91
94
|
|
|
92
95
|
#if defined(__clang__)
|
|
93
96
|
# pragma clang diagnostic pop
|
|
97
|
+
#elif defined(__GNUC__)
|
|
98
|
+
# pragma GCC diagnostic pop
|
|
94
99
|
#endif
|
|
95
100
|
|
|
96
101
|
#ifdef _WIN32
|
|
@@ -37,6 +37,7 @@
|
|
|
37
37
|
#include <thread>
|
|
38
38
|
#include <unistd.h>
|
|
39
39
|
#include <functional>
|
|
40
|
+
#include <optional>
|
|
40
41
|
|
|
41
42
|
#include "../include/ggml-cann.h"
|
|
42
43
|
#include "../include/ggml.h"
|
|
@@ -103,6 +104,9 @@ const ggml_cann_device_info& ggml_cann_info();
|
|
|
103
104
|
void ggml_cann_set_device(int32_t device);
|
|
104
105
|
int32_t ggml_cann_get_device();
|
|
105
106
|
|
|
107
|
+
std::optional<std::string> get_env(const std::string& name);
|
|
108
|
+
bool parse_bool(const std::string& value);
|
|
109
|
+
|
|
106
110
|
/**
|
|
107
111
|
* @brief Abstract base class for memory pools used by CANN.
|
|
108
112
|
*/
|
|
@@ -354,7 +358,8 @@ struct ggml_backend_cann_context {
|
|
|
354
358
|
: device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
|
|
355
359
|
ggml_cann_set_device(device);
|
|
356
360
|
description = aclrtGetSocName();
|
|
357
|
-
|
|
361
|
+
|
|
362
|
+
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
|
|
358
363
|
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
|
|
359
364
|
device, async_mode ? "ON" : "OFF");
|
|
360
365
|
}
|
|
@@ -31,6 +31,8 @@
|
|
|
31
31
|
#include <mutex>
|
|
32
32
|
#include <queue>
|
|
33
33
|
#include <chrono>
|
|
34
|
+
#include <unordered_set>
|
|
35
|
+
#include <optional>
|
|
34
36
|
|
|
35
37
|
#include "ggml-impl.h"
|
|
36
38
|
#include "ggml-backend-impl.h"
|
|
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
|
|
|
93
95
|
return id;
|
|
94
96
|
}
|
|
95
97
|
|
|
98
|
+
/**
|
|
99
|
+
* @brief Get the value of the specified environment variable (name).
|
|
100
|
+
* if not empty, return a std::string object
|
|
101
|
+
*/
|
|
102
|
+
std::optional<std::string> get_env(const std::string& name) {
|
|
103
|
+
const char* val = std::getenv(name.c_str());
|
|
104
|
+
if (!val) return std::nullopt;
|
|
105
|
+
std::string res = std::string(val);
|
|
106
|
+
std::transform(res.begin(), res.end(), res.begin(), ::tolower);
|
|
107
|
+
return res;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* @brief Verify whether the environment variable is a valid value.
|
|
112
|
+
*/
|
|
113
|
+
bool parse_bool(const std::string& value) {
|
|
114
|
+
std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
|
|
115
|
+
return valid_values.find(value) != valid_values.end();
|
|
116
|
+
}
|
|
117
|
+
|
|
96
118
|
/**
|
|
97
119
|
* @brief Initialize the CANN device information.
|
|
98
120
|
*
|
|
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
214
236
|
* @param device The device ID to associate with this buffer pool.
|
|
215
237
|
*/
|
|
216
238
|
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
|
|
217
|
-
disable_clean =
|
|
239
|
+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
|
218
240
|
}
|
|
219
241
|
|
|
220
242
|
/**
|
|
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
410
432
|
* @param device The device ID to associate with this buffer pool.
|
|
411
433
|
*/
|
|
412
434
|
explicit ggml_cann_pool_buf(int device) : device(device) {
|
|
413
|
-
disable_clean =
|
|
435
|
+
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
|
|
414
436
|
}
|
|
415
437
|
|
|
416
438
|
/**
|
|
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
731
753
|
*/
|
|
732
754
|
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
|
733
755
|
int device) {
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
|
738
|
-
}
|
|
739
|
-
bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
|
|
740
|
-
if (enable_buf_prio) {
|
|
756
|
+
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
|
|
757
|
+
|
|
758
|
+
if (mem_pool_type == "prio") {
|
|
741
759
|
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
|
|
742
760
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
|
|
743
761
|
}
|
|
762
|
+
|
|
763
|
+
if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
|
|
764
|
+
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
|
|
765
|
+
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
|
766
|
+
}
|
|
767
|
+
|
|
744
768
|
GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
|
|
745
769
|
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
|
|
746
770
|
}
|
|
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
|
|
|
1074
1074
|
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
|
|
1075
1075
|
GGML_TABLE_END()
|
|
1076
1076
|
|
|
1077
|
+
GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
|
|
1078
|
+
-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
|
|
1079
|
+
GGML_TABLE_END()
|
|
1080
|
+
|
|
1077
1081
|
#define NGRID_IQ1S 2048
|
|
1078
1082
|
#define IQ1S_DELTA 0.125f
|
|
1079
1083
|
#define IQ1M_DELTA 0.125f
|
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
function(ggml_add_cpu_backend_features cpu_name arch)
|
|
2
|
+
# The feature detection code is compiled as a separate target so that
|
|
3
|
+
# it can be built without the architecture flags
|
|
4
|
+
# Since multiple variants of the CPU backend may be included in the same
|
|
5
|
+
# build, using set_source_files_properties() to set the arch flags is not possible
|
|
6
|
+
set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
|
|
7
|
+
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
|
|
8
|
+
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
|
9
|
+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
|
|
10
|
+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
|
11
|
+
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
12
|
+
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
|
|
13
|
+
endfunction()
|
|
14
|
+
|
|
1
15
|
function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
2
16
|
if (tag_name)
|
|
3
17
|
set(GGML_CPU_NAME ggml-cpu-${tag_name})
|
|
@@ -10,14 +24,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
10
24
|
list (APPEND GGML_CPU_SOURCES
|
|
11
25
|
ggml-cpu/ggml-cpu.c
|
|
12
26
|
ggml-cpu/ggml-cpu.cpp
|
|
13
|
-
ggml-cpu/
|
|
14
|
-
ggml-cpu/
|
|
15
|
-
ggml-cpu/
|
|
16
|
-
ggml-cpu/
|
|
17
|
-
ggml-cpu/
|
|
18
|
-
ggml-cpu/
|
|
19
|
-
ggml-cpu/
|
|
20
|
-
ggml-cpu/
|
|
27
|
+
ggml-cpu/repack.cpp
|
|
28
|
+
ggml-cpu/repack.h
|
|
29
|
+
ggml-cpu/hbm.cpp
|
|
30
|
+
ggml-cpu/hbm.h
|
|
31
|
+
ggml-cpu/quants.c
|
|
32
|
+
ggml-cpu/quants.h
|
|
33
|
+
ggml-cpu/traits.cpp
|
|
34
|
+
ggml-cpu/traits.h
|
|
21
35
|
ggml-cpu/amx/amx.cpp
|
|
22
36
|
ggml-cpu/amx/amx.h
|
|
23
37
|
ggml-cpu/amx/mmq.cpp
|
|
@@ -84,6 +98,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
84
98
|
|
|
85
99
|
if (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
|
86
100
|
message(STATUS "ARM detected")
|
|
101
|
+
list(APPEND GGML_CPU_SOURCES
|
|
102
|
+
ggml-cpu/arch/arm/quants.c
|
|
103
|
+
ggml-cpu/arch/arm/repack.cpp
|
|
104
|
+
)
|
|
105
|
+
|
|
87
106
|
if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
|
88
107
|
message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
|
|
89
108
|
else()
|
|
@@ -138,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
138
157
|
else()
|
|
139
158
|
if (GGML_CPU_ARM_ARCH)
|
|
140
159
|
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
|
|
160
|
+
elseif(GGML_CPU_ALL_VARIANTS)
|
|
161
|
+
# Begin with the lowest baseline
|
|
162
|
+
set(ARM_MCPU "armv8-a")
|
|
163
|
+
set(ARCH_TAGS "")
|
|
164
|
+
set(ARCH_DEFINITIONS "")
|
|
165
|
+
|
|
166
|
+
# When a feature is selected, bump the MCPU to the first
|
|
167
|
+
# version that supported it
|
|
168
|
+
if (GGML_INTERNAL_DOTPROD)
|
|
169
|
+
set(ARM_MCPU "armv8.2-a")
|
|
170
|
+
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
|
|
171
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
|
|
172
|
+
endif()
|
|
173
|
+
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
|
|
174
|
+
set(ARM_MCPU "armv8.2-a")
|
|
175
|
+
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
|
|
176
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
|
|
177
|
+
endif()
|
|
178
|
+
if (GGML_INTERNAL_SVE)
|
|
179
|
+
set(ARM_MCPU "armv8.2-a")
|
|
180
|
+
set(ARCH_TAGS "${ARCH_TAGS}+sve")
|
|
181
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
|
|
182
|
+
endif()
|
|
183
|
+
if (GGML_INTERNAL_MATMUL_INT8)
|
|
184
|
+
set(ARM_MCPU "armv8.6-a")
|
|
185
|
+
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
|
|
186
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
|
|
187
|
+
endif()
|
|
188
|
+
if (GGML_INTERNAL_SVE2)
|
|
189
|
+
set(ARM_MCPU "armv8.6-a")
|
|
190
|
+
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
|
|
191
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
|
|
192
|
+
endif()
|
|
193
|
+
if (GGML_INTERNAL_NOSVE)
|
|
194
|
+
set(ARCH_TAGS "${ARCH_TAGS}+nosve")
|
|
195
|
+
endif()
|
|
196
|
+
if (GGML_INTERNAL_SME)
|
|
197
|
+
set(ARM_MCPU "armv9.2-a")
|
|
198
|
+
set(ARCH_TAGS "${ARCH_TAGS}+sme")
|
|
199
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
|
|
200
|
+
endif()
|
|
201
|
+
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
|
|
202
|
+
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
|
|
141
203
|
endif()
|
|
142
204
|
endif()
|
|
143
205
|
|
|
@@ -167,6 +229,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
167
229
|
endif()
|
|
168
230
|
elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
|
|
169
231
|
message(STATUS "x86 detected")
|
|
232
|
+
list(APPEND GGML_CPU_SOURCES
|
|
233
|
+
ggml-cpu/arch/x86/quants.c
|
|
234
|
+
ggml-cpu/arch/x86/repack.cpp
|
|
235
|
+
)
|
|
236
|
+
|
|
170
237
|
if (MSVC)
|
|
171
238
|
# instruction set detection for MSVC only
|
|
172
239
|
if (GGML_NATIVE)
|
|
@@ -296,21 +363,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
296
363
|
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
|
297
364
|
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
|
298
365
|
endif()
|
|
299
|
-
|
|
300
|
-
# The feature detection code is compiled as a separate target so that
|
|
301
|
-
# it can be built without the architecture flags
|
|
302
|
-
# Since multiple variants of the CPU backend may be included in the same
|
|
303
|
-
# build, using set_source_files_properties() to set the arch flags is not possible
|
|
304
|
-
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
|
305
|
-
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
|
|
306
|
-
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
|
307
|
-
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
|
308
|
-
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
|
309
|
-
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
310
|
-
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
|
366
|
+
ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
|
|
311
367
|
endif()
|
|
312
368
|
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
|
313
369
|
message(STATUS "PowerPC detected")
|
|
370
|
+
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
|
|
314
371
|
if (GGML_NATIVE)
|
|
315
372
|
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
|
316
373
|
file(READ "/proc/cpuinfo" POWER10_M)
|
|
@@ -318,7 +375,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
318
375
|
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
|
319
376
|
endif()
|
|
320
377
|
|
|
321
|
-
string(
|
|
378
|
+
string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
|
|
379
|
+
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
|
|
322
380
|
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
|
|
323
381
|
|
|
324
382
|
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
|
|
@@ -330,6 +388,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
330
388
|
else()
|
|
331
389
|
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
|
|
332
390
|
endif()
|
|
391
|
+
elseif(GGML_CPU_ALL_VARIANTS)
|
|
392
|
+
# Begin with the lowest baseline
|
|
393
|
+
set(ARCH_DEFINITIONS "")
|
|
394
|
+
|
|
395
|
+
# When a feature is selected, bump the MCPU to the first
|
|
396
|
+
# version that supported it
|
|
397
|
+
foreach(PVER RANGE 7 11)
|
|
398
|
+
if(DEFINED GGML_INTERNAL_POWER${PVER})
|
|
399
|
+
set(POWERPC_MCPU "power${PVER}")
|
|
400
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
|
|
401
|
+
endif()
|
|
402
|
+
endforeach()
|
|
403
|
+
if (GGML_INTERNAL_VSX)
|
|
404
|
+
list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
|
|
405
|
+
list(APPEND ARCH_FLAGS -mvsx)
|
|
406
|
+
endif()
|
|
407
|
+
|
|
408
|
+
if (DEFINED POWERPC_MCPU)
|
|
409
|
+
list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
|
|
410
|
+
endif()
|
|
411
|
+
ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
|
|
333
412
|
else()
|
|
334
413
|
if (GGML_CPU_POWERPC_CPUTYPE)
|
|
335
414
|
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
|
|
@@ -337,6 +416,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
337
416
|
endif()
|
|
338
417
|
elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
|
|
339
418
|
message(STATUS "loongarch64 detected")
|
|
419
|
+
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
|
|
420
|
+
|
|
340
421
|
list(APPEND ARCH_FLAGS -march=loongarch64)
|
|
341
422
|
if (GGML_LASX)
|
|
342
423
|
list(APPEND ARCH_FLAGS -mlasx)
|
|
@@ -346,6 +427,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
346
427
|
endif()
|
|
347
428
|
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
|
|
348
429
|
message(STATUS "riscv64 detected")
|
|
430
|
+
list(APPEND GGML_CPU_SOURCES
|
|
431
|
+
ggml-cpu/arch/riscv/quants.c
|
|
432
|
+
ggml-cpu/arch/riscv/repack.cpp
|
|
433
|
+
)
|
|
349
434
|
if (GGML_RVV)
|
|
350
435
|
if (GGML_XTHEADVECTOR)
|
|
351
436
|
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
|
|
@@ -357,11 +442,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
357
442
|
endif()
|
|
358
443
|
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
|
|
359
444
|
message(STATUS "s390x detected")
|
|
445
|
+
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
|
|
360
446
|
file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
|
|
361
447
|
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
|
|
362
448
|
|
|
363
449
|
# TODO: Separation to determine activation of VX/VXE/VXE2
|
|
364
450
|
if (${S390X_M} MATCHES "8561|8562")
|
|
451
|
+
set(GGML_NNPA OFF)
|
|
365
452
|
message(STATUS "z15 target")
|
|
366
453
|
list(APPEND ARCH_FLAGS -march=z15)
|
|
367
454
|
elseif (${S390X_M} MATCHES "3931")
|
|
@@ -378,14 +465,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
378
465
|
endif()
|
|
379
466
|
|
|
380
467
|
if (GGML_VXE)
|
|
468
|
+
message(STATUS "VX/VXE/VXE2 enabled")
|
|
381
469
|
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
|
470
|
+
list(APPEND ARCH_DEFINITIONS GGML_VXE)
|
|
471
|
+
endif()
|
|
472
|
+
|
|
473
|
+
if (GGML_NNPA)
|
|
474
|
+
message(STATUS "NNPA enabled")
|
|
475
|
+
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
|
|
382
476
|
endif()
|
|
477
|
+
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
|
|
478
|
+
message(STATUS "Wasm detected")
|
|
479
|
+
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
|
|
383
480
|
else()
|
|
384
|
-
message(
|
|
481
|
+
message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
|
|
482
|
+
list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
|
|
385
483
|
endif()
|
|
386
484
|
|
|
387
|
-
if (
|
|
388
|
-
target_compile_definitions(${GGML_CPU_NAME} PRIVATE
|
|
485
|
+
if (GGML_CPU_REPACK)
|
|
486
|
+
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
|
|
389
487
|
endif()
|
|
390
488
|
|
|
391
489
|
if (GGML_CPU_KLEIDIAI)
|
|
@@ -396,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
396
494
|
|
|
397
495
|
# Fetch KleidiAI sources:
|
|
398
496
|
include(FetchContent)
|
|
399
|
-
set(KLEIDIAI_COMMIT_TAG "v1.
|
|
497
|
+
set(KLEIDIAI_COMMIT_TAG "v1.9.0")
|
|
400
498
|
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
|
401
|
-
set(KLEIDIAI_ARCHIVE_MD5 "
|
|
499
|
+
set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
|
|
402
500
|
|
|
403
501
|
if (POLICY CMP0135)
|
|
404
502
|
cmake_policy(SET CMP0135 NEW)
|
|
@@ -8,7 +8,8 @@
|
|
|
8
8
|
#include "mmq.h"
|
|
9
9
|
#include "ggml-impl.h"
|
|
10
10
|
#include "ggml-cpu-impl.h"
|
|
11
|
-
#include "
|
|
11
|
+
#include "simd-mappings.h"
|
|
12
|
+
#include "quants.h"
|
|
12
13
|
#include "ggml-quants.h"
|
|
13
14
|
#include <algorithm>
|
|
14
15
|
#include <type_traits>
|
|
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
|
|
|
453
454
|
|
|
454
455
|
// Quantize these floats
|
|
455
456
|
const float iscale = 127.f / amax;
|
|
456
|
-
y[i].d =
|
|
457
|
+
y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
|
|
457
458
|
const float id = ( amax != 0.0f ) ? iscale : 0.f;
|
|
458
459
|
const __m512 vscale = _mm512_set1_ps(id);
|
|
459
460
|
|
|
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
|
|
|
1090
1091
|
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
|
|
1091
1092
|
|
|
1092
1093
|
for (int m = 0; m < nr; ++m) {
|
|
1093
|
-
const __m512 vd1 = _mm512_set1_ps(
|
|
1094
|
+
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
|
1094
1095
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
|
1095
1096
|
|
|
1096
1097
|
__m512 vsum;
|
|
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
|
|
|
1113
1114
|
const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
|
|
1114
1115
|
|
|
1115
1116
|
for (int m = 0; m < nr; ++m) {
|
|
1116
|
-
const __m512 vd1 = _mm512_set1_ps(
|
|
1117
|
-
const __m512 vs1 = _mm512_set1_ps(
|
|
1117
|
+
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
|
1118
|
+
const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
|
|
1118
1119
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
|
1119
1120
|
|
|
1120
1121
|
__m512 vsum;
|
|
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
|
|
|
1137
1138
|
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
|
|
1138
1139
|
|
|
1139
1140
|
for (int m = 0; m < nr; ++m) {
|
|
1140
|
-
const __m512 vd1 = _mm512_set1_ps(
|
|
1141
|
+
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
|
1141
1142
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
|
1142
1143
|
|
|
1143
1144
|
__m512 vsum;
|
|
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
|
|
|
1437
1438
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
|
1438
1439
|
vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
|
|
1439
1440
|
}
|
|
1440
|
-
vd1 = _mm512_set1_ps(
|
|
1441
|
+
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1441
1442
|
}
|
|
1442
1443
|
|
|
1443
1444
|
// load b
|
|
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
|
|
|
1498
1499
|
for (int k = 0; k < 8; ++k) {
|
|
1499
1500
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
|
1500
1501
|
}
|
|
1501
|
-
vd1 = _mm512_set1_ps(
|
|
1502
|
-
vs1 = _mm512_set1_ps(
|
|
1502
|
+
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1503
|
+
vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
|
|
1503
1504
|
}
|
|
1504
1505
|
|
|
1505
1506
|
// load b
|
|
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
|
|
|
1571
1572
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
|
1572
1573
|
va[k] = _mm512_add_epi8(va[k], off);
|
|
1573
1574
|
}
|
|
1574
|
-
vd1 = _mm512_set1_ps(
|
|
1575
|
+
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1575
1576
|
}
|
|
1576
1577
|
|
|
1577
1578
|
// load b
|