@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#define GGML_COMMON_DECL_CPP
|
|
4
|
+
#include "ggml-common.h"
|
|
5
|
+
|
|
6
|
+
#include "traits.h"
|
|
7
|
+
#include "ggml.h"
|
|
8
|
+
|
|
9
|
+
// GGML internal header
|
|
10
|
+
|
|
11
|
+
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);
|
|
12
|
+
|
|
13
|
+
template <int K> constexpr int QK_0() {
|
|
14
|
+
if constexpr (K == 4) {
|
|
15
|
+
return QK4_0;
|
|
16
|
+
}
|
|
17
|
+
if constexpr (K == 8) {
|
|
18
|
+
return QK8_0;
|
|
19
|
+
}
|
|
20
|
+
return -1;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
template <int K, int N> struct block {
|
|
24
|
+
ggml_half d[N]; // deltas for N qK_0 blocks
|
|
25
|
+
int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
// compile-time checks of the block sizes (no unexpected padding)
|
|
29
|
+
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
|
|
30
|
+
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
|
|
31
|
+
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
|
|
32
|
+
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
|
|
33
|
+
|
|
34
|
+
using block_q4_0x4 = block<4, 4>;
|
|
35
|
+
using block_q4_0x8 = block<4, 8>;
|
|
36
|
+
using block_q8_0x4 = block<8, 4>;
|
|
37
|
+
using block_q8_0x8 = block<8, 8>;
|
|
38
|
+
|
|
39
|
+
struct block_q4_Kx8 {
|
|
40
|
+
ggml_half d[8]; // super-block scale for quantized scales
|
|
41
|
+
ggml_half dmin[8]; // super-block scale for quantized mins
|
|
42
|
+
uint8_t scales[96]; // scales and mins, quantized with 6 bits
|
|
43
|
+
uint8_t qs[1024]; // 4-bit quants
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
|
|
47
|
+
|
|
48
|
+
struct block_q8_Kx4 {
|
|
49
|
+
float d[4]; // delta
|
|
50
|
+
int8_t qs[QK_K * 4]; // quants
|
|
51
|
+
int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
|
|
52
|
+
};
|
|
53
|
+
|
|
54
|
+
static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");
|
|
55
|
+
|
|
56
|
+
struct block_iq4_nlx4 {
|
|
57
|
+
ggml_half d[4]; // deltas for 4 iq4_nl blocks
|
|
58
|
+
uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
|
|
62
|
+
|
|
63
|
+
#if defined(__cplusplus)
|
|
64
|
+
extern "C" {
|
|
65
|
+
#endif
|
|
66
|
+
|
|
67
|
+
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
|
68
|
+
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
|
69
|
+
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
|
70
|
+
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
71
|
+
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
72
|
+
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
73
|
+
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
74
|
+
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
75
|
+
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
76
|
+
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
77
|
+
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
78
|
+
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
79
|
+
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
80
|
+
|
|
81
|
+
// Native implementations
|
|
82
|
+
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
|
83
|
+
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
|
84
|
+
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
|
85
|
+
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
86
|
+
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
87
|
+
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
88
|
+
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
89
|
+
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
90
|
+
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
91
|
+
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
92
|
+
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
93
|
+
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
94
|
+
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
|
95
|
+
|
|
96
|
+
#if defined(__cplusplus)
|
|
97
|
+
} // extern "C"
|
|
98
|
+
#endif
|
|
@@ -2,10 +2,167 @@
|
|
|
2
2
|
|
|
3
3
|
#include "ggml-cpu-impl.h"
|
|
4
4
|
|
|
5
|
+
#ifdef __ARM_FEATURE_SVE
|
|
6
|
+
#include <arm_sve.h>
|
|
7
|
+
#endif // __ARM_FEATURE_SVE
|
|
8
|
+
|
|
9
|
+
#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
|
|
10
|
+
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
|
11
|
+
//
|
|
12
|
+
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
|
13
|
+
//
|
|
14
|
+
#include <arm_neon.h>
|
|
15
|
+
#endif
|
|
16
|
+
|
|
17
|
+
#if defined(__F16C__)
|
|
18
|
+
#include <immintrin.h>
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#ifdef __cplusplus
|
|
22
|
+
extern "C" {
|
|
23
|
+
#endif
|
|
24
|
+
|
|
5
25
|
//
|
|
6
26
|
// simd mappings
|
|
7
27
|
//
|
|
8
28
|
|
|
29
|
+
// FP16 to FP32 conversion
|
|
30
|
+
|
|
31
|
+
// 16-bit float
|
|
32
|
+
// on Arm, we use __fp16
|
|
33
|
+
// on x86, we use uint16_t
|
|
34
|
+
//
|
|
35
|
+
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
|
|
36
|
+
// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
|
|
37
|
+
//
|
|
38
|
+
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
|
|
39
|
+
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
|
|
40
|
+
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
|
|
41
|
+
|
|
42
|
+
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
|
|
43
|
+
|
|
44
|
+
static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
45
|
+
__fp16 tmp;
|
|
46
|
+
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
|
47
|
+
return (float)tmp;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
|
|
51
|
+
ggml_fp16_t res;
|
|
52
|
+
__fp16 tmp = f;
|
|
53
|
+
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
|
54
|
+
return res;
|
|
55
|
+
}
|
|
56
|
+
#elif defined(__F16C__)
|
|
57
|
+
#ifdef _MSC_VER
|
|
58
|
+
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
|
59
|
+
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
|
60
|
+
#else
|
|
61
|
+
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
|
62
|
+
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
|
63
|
+
#endif
|
|
64
|
+
#elif defined(__POWER9_VECTOR__)
|
|
65
|
+
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
|
|
66
|
+
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
|
|
67
|
+
/* the inline asm below is about 12% faster than the lookup method */
|
|
68
|
+
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
|
|
69
|
+
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
|
|
70
|
+
|
|
71
|
+
static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
72
|
+
float f;
|
|
73
|
+
double d;
|
|
74
|
+
__asm__(
|
|
75
|
+
"mtfprd %0,%2\n"
|
|
76
|
+
"xscvhpdp %0,%0\n"
|
|
77
|
+
"frsp %1,%0\n" :
|
|
78
|
+
/* temp */ "=d"(d),
|
|
79
|
+
/* out */ "=f"(f):
|
|
80
|
+
/* in */ "r"(h));
|
|
81
|
+
return f;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
|
|
85
|
+
double d;
|
|
86
|
+
ggml_fp16_t r;
|
|
87
|
+
__asm__( /* xscvdphp can work on double or single precision */
|
|
88
|
+
"xscvdphp %0,%2\n"
|
|
89
|
+
"mffprd %1,%0\n" :
|
|
90
|
+
/* temp */ "=d"(d),
|
|
91
|
+
/* out */ "=r"(r):
|
|
92
|
+
/* in */ "f"(f));
|
|
93
|
+
return r;
|
|
94
|
+
}
|
|
95
|
+
#elif defined(__riscv) && defined(__riscv_zfhmin)
|
|
96
|
+
static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
97
|
+
float f;
|
|
98
|
+
__asm__(
|
|
99
|
+
"fmv.h.x %[f], %[h]\n\t"
|
|
100
|
+
"fcvt.s.h %[f], %[f]"
|
|
101
|
+
: [f] "=&f" (f)
|
|
102
|
+
: [h] "r" (h)
|
|
103
|
+
);
|
|
104
|
+
return f;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
|
|
108
|
+
ggml_fp16_t res;
|
|
109
|
+
__asm__(
|
|
110
|
+
"fcvt.h.s %[f], %[f]\n\t"
|
|
111
|
+
"fmv.x.h %[h], %[f]"
|
|
112
|
+
: [h] "=&r" (res)
|
|
113
|
+
: [f] "f" (f)
|
|
114
|
+
);
|
|
115
|
+
return res;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
|
|
119
|
+
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
|
|
120
|
+
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
|
|
121
|
+
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
|
|
122
|
+
#elif defined(__NNPA__)
|
|
123
|
+
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
|
|
124
|
+
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
|
|
125
|
+
|
|
126
|
+
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
|
|
127
|
+
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
|
|
128
|
+
|
|
129
|
+
static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
130
|
+
uint16x8_t v_h = vec_splats(h);
|
|
131
|
+
uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
|
|
132
|
+
return vec_extend_to_fp32_hi(v_hd, 0)[0];
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
|
|
136
|
+
float32x4_t v_f = vec_splats(f);
|
|
137
|
+
float32x4_t v_zero = vec_splats(0.0f);
|
|
138
|
+
uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
|
|
139
|
+
uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
|
|
140
|
+
return vec_extract(v_h, 0);
|
|
141
|
+
}
|
|
142
|
+
#endif
|
|
143
|
+
|
|
144
|
+
// precomputed f32 table for f16 (256 KB)
|
|
145
|
+
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
|
|
146
|
+
extern float ggml_table_f32_f16[1 << 16];
|
|
147
|
+
|
|
148
|
+
// On ARM NEON, it's quicker to convert FP16 -> FP32 directly instead of calling into ggml_lookup_fp16_to_fp32,
|
|
149
|
+
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
|
|
150
|
+
// This is also true for POWER9.
|
|
151
|
+
#if !defined(GGML_CPU_FP16_TO_FP32)
|
|
152
|
+
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
153
|
+
uint16_t s;
|
|
154
|
+
memcpy(&s, &f, sizeof(uint16_t));
|
|
155
|
+
return ggml_table_f32_f16[s];
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
|
159
|
+
#endif
|
|
160
|
+
|
|
161
|
+
#if !defined(GGML_CPU_FP32_TO_FP16)
|
|
162
|
+
#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
163
|
+
#endif
|
|
164
|
+
|
|
165
|
+
|
|
9
166
|
// we define a common set of C macros which map to specific intrinsics based on the current architecture
|
|
10
167
|
// we then implement the fundamental computation operations below using only these macros
|
|
11
168
|
// adding support for new architectures requires to define the corresponding SIMD macros
|
|
@@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
|
|
|
415
572
|
float tmp[8];
|
|
416
573
|
|
|
417
574
|
for (int i = 0; i < 8; i++) {
|
|
418
|
-
tmp[i] =
|
|
575
|
+
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
419
576
|
}
|
|
420
577
|
|
|
421
578
|
return _mm256_loadu_ps(tmp);
|
|
@@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
|
426
583
|
_mm256_storeu_ps(arr, y);
|
|
427
584
|
|
|
428
585
|
for (int i = 0; i < 8; i++)
|
|
429
|
-
x[i] =
|
|
586
|
+
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
|
|
430
587
|
}
|
|
431
588
|
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
|
|
432
589
|
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
|
|
@@ -574,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) {
|
|
|
574
731
|
inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
|
|
575
732
|
float tmp[4];
|
|
576
733
|
|
|
577
|
-
tmp[0] =
|
|
578
|
-
tmp[1] =
|
|
579
|
-
tmp[2] =
|
|
580
|
-
tmp[3] =
|
|
734
|
+
tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
|
|
735
|
+
tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
|
|
736
|
+
tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
|
|
737
|
+
tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);
|
|
581
738
|
|
|
582
739
|
return wasm_v128_load(tmp);
|
|
583
740
|
}
|
|
@@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
|
|
587
744
|
|
|
588
745
|
wasm_v128_store(tmp, x);
|
|
589
746
|
|
|
590
|
-
p[0] =
|
|
591
|
-
p[1] =
|
|
592
|
-
p[2] =
|
|
593
|
-
p[3] =
|
|
747
|
+
p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
|
|
748
|
+
p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
|
|
749
|
+
p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
|
|
750
|
+
p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
|
|
594
751
|
}
|
|
595
752
|
|
|
596
753
|
#define GGML_F16x4 v128_t
|
|
@@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
|
|
690
847
|
static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
|
|
691
848
|
float tmp[4];
|
|
692
849
|
|
|
693
|
-
tmp[0] =
|
|
694
|
-
tmp[1] =
|
|
695
|
-
tmp[2] =
|
|
696
|
-
tmp[3] =
|
|
850
|
+
tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
|
|
851
|
+
tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
|
|
852
|
+
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
|
|
853
|
+
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
|
|
697
854
|
|
|
698
855
|
return _mm_loadu_ps(tmp);
|
|
699
856
|
}
|
|
@@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
|
|
703
860
|
|
|
704
861
|
_mm_storeu_ps(arr, y);
|
|
705
862
|
|
|
706
|
-
x[0] =
|
|
707
|
-
x[1] =
|
|
708
|
-
x[2] =
|
|
709
|
-
x[3] =
|
|
863
|
+
x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
|
|
864
|
+
x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
|
|
865
|
+
x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
|
|
866
|
+
x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
|
|
710
867
|
}
|
|
711
868
|
|
|
712
869
|
#define GGML_F32Cx4 __m128
|
|
@@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
|
|
828
985
|
#define GGML_F32x4_ZERO __lsx_vldi(0)
|
|
829
986
|
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
|
830
987
|
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
|
|
831
|
-
#define GGML_F32x4_STORE(
|
|
988
|
+
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
|
|
832
989
|
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
|
|
833
990
|
#define GGML_F32x4_ADD __lsx_vfadd_s
|
|
834
991
|
#define GGML_F32x4_MUL __lsx_vfmul_s
|
|
@@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
|
|
874
1031
|
static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
|
|
875
1032
|
float tmp[4];
|
|
876
1033
|
|
|
877
|
-
tmp[0] =
|
|
878
|
-
tmp[1] =
|
|
879
|
-
tmp[2] =
|
|
880
|
-
tmp[3] =
|
|
1034
|
+
tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
|
|
1035
|
+
tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
|
|
1036
|
+
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
|
|
1037
|
+
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
|
|
881
1038
|
|
|
882
1039
|
return __lsx_vld(tmp, 0);
|
|
883
1040
|
}
|
|
@@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
|
|
887
1044
|
|
|
888
1045
|
__lsx_vst(y, arr, 0);
|
|
889
1046
|
|
|
890
|
-
x[0] =
|
|
891
|
-
x[1] =
|
|
892
|
-
x[2] =
|
|
893
|
-
x[3] =
|
|
1047
|
+
x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
|
|
1048
|
+
x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
|
|
1049
|
+
x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
|
|
1050
|
+
x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
|
|
894
1051
|
}
|
|
895
1052
|
|
|
896
1053
|
#define GGML_F32Cx4 __m128
|
|
@@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
|
|
922
1079
|
#define GGML_F32_STEP 32
|
|
923
1080
|
#define GGML_F32_EPR 4
|
|
924
1081
|
|
|
925
|
-
#define GGML_F32x4
|
|
1082
|
+
#define GGML_F32x4 float32x4_t
|
|
926
1083
|
#define GGML_F32x4_ZERO vec_splats(0.0f)
|
|
927
1084
|
#define GGML_F32x4_SET1 vec_splats
|
|
928
1085
|
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
|
|
@@ -944,10 +1101,8 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
|
|
944
1101
|
for (int i = 0; i < offset; ++i) { \
|
|
945
1102
|
x[i] = vec_add(x[i], x[offset + i]); \
|
|
946
1103
|
} \
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
vec_extract(x[0], 2) + \
|
|
950
|
-
vec_extract(x[0], 3); \
|
|
1104
|
+
float32x4_t tmp = x[0] + vec_reve(x[0]); \
|
|
1105
|
+
res = tmp[0] + tmp[1]; \
|
|
951
1106
|
}
|
|
952
1107
|
|
|
953
1108
|
#define GGML_F32_VEC GGML_F32x4
|
|
@@ -964,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
|
|
964
1119
|
#define GGML_F16_STEP GGML_F32_STEP
|
|
965
1120
|
#define GGML_F16_EPR GGML_F32_EPR
|
|
966
1121
|
|
|
967
|
-
static inline
|
|
1122
|
+
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
|
|
1123
|
+
#if defined(__NNPA__)
|
|
1124
|
+
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
|
|
1125
|
+
uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
|
|
1126
|
+
return vec_extend_to_fp32_hi(v_xd, 0);
|
|
1127
|
+
#else
|
|
968
1128
|
float tmp[4];
|
|
969
1129
|
|
|
970
1130
|
for (int i = 0; i < 4; i++) {
|
|
971
|
-
tmp[i] =
|
|
1131
|
+
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
972
1132
|
}
|
|
973
1133
|
|
|
974
1134
|
// note: keep type-cast here to prevent compiler bugs
|
|
975
1135
|
// see: https://github.com/ggml-org/llama.cpp/issues/12846
|
|
976
1136
|
return vec_xl(0, (const float *)(tmp));
|
|
1137
|
+
#endif
|
|
977
1138
|
}
|
|
978
1139
|
|
|
979
|
-
static inline void __lzs_f16cx4_store(ggml_fp16_t * x,
|
|
1140
|
+
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
|
|
1141
|
+
#if defined(__NNPA__)
|
|
1142
|
+
float32x4_t v_zero = vec_splats(0.0f);
|
|
1143
|
+
uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
|
|
1144
|
+
uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
|
|
1145
|
+
|
|
1146
|
+
x[0] = vec_extract(v_x, 0);
|
|
1147
|
+
x[1] = vec_extract(v_x, 1);
|
|
1148
|
+
x[2] = vec_extract(v_x, 2);
|
|
1149
|
+
x[3] = vec_extract(v_x, 3);
|
|
1150
|
+
#else
|
|
980
1151
|
float arr[4];
|
|
981
1152
|
|
|
982
1153
|
// note: keep type-cast here to prevent compiler bugs
|
|
983
1154
|
// see: https://github.com/ggml-org/llama.cpp/issues/12846
|
|
984
|
-
vec_xst(
|
|
1155
|
+
vec_xst(v_y, 0, (float *)(arr));
|
|
985
1156
|
|
|
986
1157
|
for (int i = 0; i < 4; i++) {
|
|
987
|
-
x[i] =
|
|
1158
|
+
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
|
|
988
1159
|
}
|
|
1160
|
+
#endif
|
|
989
1161
|
}
|
|
990
1162
|
|
|
991
1163
|
#define GGML_F16_VEC GGML_F32x4
|
|
@@ -1006,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
|
|
|
1006
1178
|
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
|
|
1007
1179
|
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
|
1008
1180
|
#endif
|
|
1181
|
+
|
|
1182
|
+
#ifdef __cplusplus
|
|
1183
|
+
}
|
|
1184
|
+
#endif
|
|
@@ -219,11 +219,11 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
|
|
|
219
219
|
|
|
220
220
|
// leftovers
|
|
221
221
|
for (int i = np; i < n; ++i) {
|
|
222
|
-
sumf += (ggml_float)(
|
|
222
|
+
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
|
223
223
|
}
|
|
224
224
|
#else
|
|
225
225
|
for (int i = 0; i < n; ++i) {
|
|
226
|
-
sumf += (ggml_float)(
|
|
226
|
+
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
|
227
227
|
}
|
|
228
228
|
#endif
|
|
229
229
|
|