@novastera-oss/llamarn 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -3
- package/RNLlamaCpp.podspec +1 -1
- package/android/CMakeLists.txt +11 -3
- package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
- package/android/src/main/cpp/include/llama.h +53 -114
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -10
- package/cpp/PureCppImpl.cpp +71 -4
- package/cpp/SystemUtils.cpp +3 -7
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -1
- package/cpp/llama.cpp/Makefile +6 -1605
- package/cpp/llama.cpp/README.md +5 -1
- package/cpp/llama.cpp/common/arg.cpp +230 -51
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
- package/cpp/llama.cpp/common/chat.cpp +539 -8
- package/cpp/llama.cpp/common/chat.h +8 -1
- package/cpp/llama.cpp/common/common.cpp +60 -15
- package/cpp/llama.cpp/common/common.h +64 -15
- package/cpp/llama.cpp/common/speculative.cpp +135 -54
- package/cpp/llama.cpp/common/speculative.h +8 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
- package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
- package/cpp/llama.cpp/flake.nix +0 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
- package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
- package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
- package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
- package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
- package/cpp/llama.cpp/include/llama.h +53 -114
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -1
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
- package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +61 -252
- package/cpp/llama.cpp/src/llama-context.h +10 -15
- package/cpp/llama.cpp/src/llama-cparams.h +0 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
- package/cpp/llama.cpp/src/llama-graph.h +90 -51
- package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
- package/cpp/llama.cpp/src/llama-hparams.h +21 -6
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
- package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
- package/cpp/llama.cpp/src/llama-memory.h +13 -10
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
- package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
- package/cpp/llama.cpp/src/llama-model.h +28 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
- package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
- package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
- package/cpp/rn-completion.cpp +3 -27
- package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
- package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
- package/ios/include/chat.h +8 -1
- package/ios/include/common/minja/chat-template.hpp +16 -7
- package/ios/include/common/minja/minja.hpp +47 -12
- package/ios/include/common.h +64 -15
- package/ios/include/llama.h +53 -114
- package/ios/include/speculative.h +8 -1
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/lib/module/NativeRNLlamaCpp.js.map +1 -1
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
- package/package.json +1 -2
- package/src/NativeRNLlamaCpp.ts +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
|
@@ -201,24 +201,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
201
201
|
|
|
202
202
|
sumf = vec_extract(vsumf0, 0);
|
|
203
203
|
|
|
204
|
-
#endif
|
|
205
|
-
for (; ib < nb; ++ib) {
|
|
206
|
-
int sumi0 = 0;
|
|
207
|
-
int sumi1 = 0;
|
|
208
|
-
|
|
209
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
210
|
-
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
|
|
211
|
-
const int v1 = (x[ib].qs[j] >> 4) - 8;
|
|
212
|
-
|
|
213
|
-
sumi0 += (v0 * y[ib].qs[j]);
|
|
214
|
-
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
int sumi = sumi0 + sumi1;
|
|
218
|
-
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
219
|
-
}
|
|
220
|
-
|
|
221
204
|
*s = sumf;
|
|
205
|
+
#else
|
|
206
|
+
UNUSED(x);
|
|
207
|
+
UNUSED(y);
|
|
208
|
+
UNUSED(ib);
|
|
209
|
+
UNUSED(sumf);
|
|
210
|
+
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
211
|
+
#endif
|
|
222
212
|
}
|
|
223
213
|
|
|
224
214
|
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -278,24 +268,80 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
278
268
|
|
|
279
269
|
sumf = vec_extract(vsumf0, 0);
|
|
280
270
|
|
|
271
|
+
*s = sumf;
|
|
272
|
+
#else
|
|
273
|
+
UNUSED(x);
|
|
274
|
+
UNUSED(y);
|
|
275
|
+
UNUSED(ib);
|
|
276
|
+
UNUSED(sumf);
|
|
277
|
+
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
281
278
|
#endif
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
282
|
+
assert(nrc == 1);
|
|
283
|
+
UNUSED(nrc);
|
|
284
|
+
UNUSED(bx);
|
|
285
|
+
UNUSED(by);
|
|
286
|
+
UNUSED(bs);
|
|
287
|
+
assert(n % QK_MXFP4 == 0);
|
|
288
|
+
static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
|
|
289
|
+
|
|
290
|
+
const block_mxfp4 * GGML_RESTRICT x = vx;
|
|
291
|
+
const block_q8_0 * GGML_RESTRICT y = vy;
|
|
292
|
+
|
|
293
|
+
const int nb = n / QK_MXFP4;
|
|
294
|
+
|
|
295
|
+
int ib = 0;
|
|
296
|
+
float sumf = 0;
|
|
297
|
+
|
|
298
|
+
#if defined(__POWER9_VECTOR__)
|
|
299
|
+
const vector signed char lowMask = vec_splats((signed char)0xF);
|
|
300
|
+
const vector unsigned char vshift4 = vec_splats((unsigned char)4);
|
|
301
|
+
vector float vsumf0 = vec_splats(0.0f);
|
|
302
|
+
|
|
303
|
+
vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
|
|
304
|
+
|
|
305
|
+
#pragma GCC unroll 8
|
|
282
306
|
for (; ib < nb; ++ib) {
|
|
283
|
-
|
|
284
|
-
|
|
307
|
+
__builtin_prefetch(x[ib].qs, 0, 1);
|
|
308
|
+
__builtin_prefetch(y[ib].qs, 0, 1);
|
|
285
309
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
const int v1 = (x[ib].qs[j] >> 4);
|
|
310
|
+
vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
|
|
311
|
+
GGML_E8M0_TO_FP32_HALF(x[ib].e));
|
|
289
312
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
313
|
+
vector signed char q8y0 = vec_xl( 0, y[ib].qs);
|
|
314
|
+
vector signed char q8y1 = vec_xl(16, y[ib].qs);
|
|
315
|
+
|
|
316
|
+
vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
|
|
317
|
+
|
|
318
|
+
vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
|
|
319
|
+
vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
|
|
320
|
+
|
|
321
|
+
vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
|
|
322
|
+
vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
|
|
293
323
|
|
|
294
|
-
|
|
295
|
-
|
|
324
|
+
vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
|
|
325
|
+
vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
|
|
326
|
+
|
|
327
|
+
vector signed int vsumi0 = vec_splats((int32_t)0);
|
|
328
|
+
vsumi0 = vec_sum4s(qv0, vsumi0);
|
|
329
|
+
vsumi0 = vec_sum4s(qv1, vsumi0);
|
|
330
|
+
|
|
331
|
+
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
|
|
296
332
|
}
|
|
297
333
|
|
|
334
|
+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
|
|
335
|
+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
|
|
336
|
+
sumf = vec_extract(vsumf0, 0);
|
|
298
337
|
*s = sumf;
|
|
338
|
+
#else
|
|
339
|
+
UNUSED(x);
|
|
340
|
+
UNUSED(y);
|
|
341
|
+
UNUSED(ib);
|
|
342
|
+
UNUSED(sumf);
|
|
343
|
+
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
344
|
+
#endif
|
|
299
345
|
}
|
|
300
346
|
|
|
301
347
|
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -360,30 +406,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
360
406
|
|
|
361
407
|
sumf = vec_extract(vsumf0, 0);
|
|
362
408
|
|
|
363
|
-
#endif
|
|
364
|
-
for (; ib < nb; ++ib) {
|
|
365
|
-
uint32_t qh;
|
|
366
|
-
memcpy(&qh, x[ib].qh, sizeof(qh));
|
|
367
|
-
|
|
368
|
-
int sumi0 = 0;
|
|
369
|
-
int sumi1 = 0;
|
|
370
|
-
|
|
371
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
372
|
-
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
|
373
|
-
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
|
374
|
-
|
|
375
|
-
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
|
|
376
|
-
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
|
|
377
|
-
|
|
378
|
-
sumi0 += (x0 * y[ib].qs[j]);
|
|
379
|
-
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
|
380
|
-
}
|
|
381
|
-
|
|
382
|
-
int sumi = sumi0 + sumi1;
|
|
383
|
-
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
|
384
|
-
}
|
|
385
|
-
|
|
386
409
|
*s = sumf;
|
|
410
|
+
#else
|
|
411
|
+
UNUSED(ib);
|
|
412
|
+
UNUSED(sumf);
|
|
413
|
+
UNUSED(x);
|
|
414
|
+
UNUSED(y);
|
|
415
|
+
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
416
|
+
#endif
|
|
387
417
|
}
|
|
388
418
|
|
|
389
419
|
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -451,30 +481,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
451
481
|
|
|
452
482
|
sumf = vec_extract(vsumf0, 0);
|
|
453
483
|
|
|
454
|
-
#endif
|
|
455
|
-
for (; ib < nb; ++ib) {
|
|
456
|
-
uint32_t qh;
|
|
457
|
-
memcpy(&qh, x[ib].qh, sizeof(qh));
|
|
458
|
-
|
|
459
|
-
int sumi0 = 0;
|
|
460
|
-
int sumi1 = 0;
|
|
461
|
-
|
|
462
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
463
|
-
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
|
464
|
-
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
|
465
|
-
|
|
466
|
-
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
|
|
467
|
-
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
|
|
468
|
-
|
|
469
|
-
sumi0 += (x0 * y[ib].qs[j]);
|
|
470
|
-
sumi1 += (x1 * y[ib].qs[j + qk/2]);
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
int sumi = sumi0 + sumi1;
|
|
474
|
-
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
475
|
-
}
|
|
476
|
-
|
|
477
484
|
*s = sumf;
|
|
485
|
+
#else
|
|
486
|
+
UNUSED(nb);
|
|
487
|
+
UNUSED(ib);
|
|
488
|
+
UNUSED(sumf);
|
|
489
|
+
UNUSED(x);
|
|
490
|
+
UNUSED(y);
|
|
491
|
+
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
492
|
+
#endif
|
|
478
493
|
}
|
|
479
494
|
|
|
480
495
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -535,18 +550,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
535
550
|
|
|
536
551
|
sumf = vec_extract(vsumf0, 0);
|
|
537
552
|
|
|
538
|
-
#endif
|
|
539
|
-
for (; ib < nb; ++ib) {
|
|
540
|
-
int sumi = 0;
|
|
541
|
-
|
|
542
|
-
for (int j = 0; j < qk; j++) {
|
|
543
|
-
sumi += x[ib].qs[j]*y[ib].qs[j];
|
|
544
|
-
}
|
|
545
|
-
|
|
546
|
-
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
547
|
-
}
|
|
548
|
-
|
|
549
553
|
*s = sumf;
|
|
554
|
+
#else
|
|
555
|
+
UNUSED(nb);
|
|
556
|
+
UNUSED(x);
|
|
557
|
+
UNUSED(y);
|
|
558
|
+
UNUSED(ib);
|
|
559
|
+
UNUSED(sumf);
|
|
560
|
+
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
561
|
+
#endif
|
|
550
562
|
}
|
|
551
563
|
|
|
552
564
|
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -695,45 +707,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
695
707
|
*s = vec_extract(vsumf0, 0);
|
|
696
708
|
|
|
697
709
|
#else
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
const uint8_t * q2 = x[i].qs;
|
|
704
|
-
const int8_t * q8 = y[i].qs;
|
|
705
|
-
const uint8_t * sc = x[i].scales;
|
|
706
|
-
|
|
707
|
-
int summs = 0;
|
|
708
|
-
for (int j = 0; j < 16; ++j) {
|
|
709
|
-
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
710
|
-
}
|
|
711
|
-
|
|
712
|
-
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
713
|
-
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
714
|
-
|
|
715
|
-
int isum = 0;
|
|
716
|
-
int is = 0;
|
|
717
|
-
int d;
|
|
718
|
-
for (int k = 0; k < QK_K/128; ++k) {
|
|
719
|
-
int shift = 0;
|
|
720
|
-
for (int j = 0; j < 4; ++j) {
|
|
721
|
-
d = sc[is++] & 0xF;
|
|
722
|
-
int isuml = 0;
|
|
723
|
-
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
724
|
-
isum += d * isuml;
|
|
725
|
-
d = sc[is++] & 0xF;
|
|
726
|
-
isuml = 0;
|
|
727
|
-
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
728
|
-
isum += d * isuml;
|
|
729
|
-
shift += 2;
|
|
730
|
-
q8 += 32;
|
|
731
|
-
}
|
|
732
|
-
q2 += 32;
|
|
733
|
-
}
|
|
734
|
-
sumf += dall * isum - dmin * summs;
|
|
735
|
-
}
|
|
736
|
-
*s = sumf;
|
|
710
|
+
UNUSED(x);
|
|
711
|
+
UNUSED(y);
|
|
712
|
+
UNUSED(nb);
|
|
713
|
+
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
737
714
|
#endif
|
|
738
715
|
}
|
|
739
716
|
|
|
@@ -907,70 +884,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
907
884
|
*s = vec_extract(vsumf0, 0);
|
|
908
885
|
|
|
909
886
|
#else
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
// write vectorized versions for AVX, ARM_NEON, etc.
|
|
917
|
-
|
|
918
|
-
int8_t aux8[QK_K];
|
|
919
|
-
int16_t aux16[8];
|
|
920
|
-
float sums [8];
|
|
921
|
-
int32_t aux32[8];
|
|
922
|
-
memset(sums, 0, 8*sizeof(float));
|
|
923
|
-
|
|
924
|
-
uint32_t auxs[4];
|
|
925
|
-
const int8_t * scales = (const int8_t*)auxs;
|
|
926
|
-
|
|
927
|
-
float sumf = 0;
|
|
928
|
-
for (int i = 0; i < nb; ++i) {
|
|
929
|
-
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
930
|
-
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
|
931
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
932
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
933
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
934
|
-
uint8_t m = 1;
|
|
935
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
936
|
-
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
|
937
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
938
|
-
a += 32; m <<= 1;
|
|
939
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
|
940
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
941
|
-
a += 32; m <<= 1;
|
|
942
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
|
943
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
944
|
-
a += 32; m <<= 1;
|
|
945
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
|
946
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
947
|
-
a += 32; m <<= 1;
|
|
948
|
-
q3 += 32;
|
|
949
|
-
}
|
|
950
|
-
a = aux8;
|
|
951
|
-
|
|
952
|
-
memcpy(auxs, x[i].scales, 12);
|
|
953
|
-
uint32_t tmp = auxs[2];
|
|
954
|
-
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
|
955
|
-
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
|
956
|
-
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
|
957
|
-
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
|
958
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
959
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
960
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
961
|
-
q8 += 8; a += 8;
|
|
962
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
963
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
964
|
-
q8 += 8; a += 8;
|
|
965
|
-
}
|
|
966
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
967
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
968
|
-
}
|
|
969
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
970
|
-
*s = sumf;
|
|
971
|
-
|
|
887
|
+
UNUSED(kmask1);
|
|
888
|
+
UNUSED(kmask2);
|
|
889
|
+
UNUSED(x);
|
|
890
|
+
UNUSED(y);
|
|
891
|
+
UNUSED(nb);
|
|
892
|
+
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
972
893
|
#endif
|
|
973
|
-
|
|
974
894
|
}
|
|
975
895
|
|
|
976
896
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -1130,61 +1050,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1130
1050
|
*s = vec_extract(vsumf0, 0);
|
|
1131
1051
|
|
|
1132
1052
|
#else
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
memset(sums, 0, 8*sizeof(float));
|
|
1142
|
-
|
|
1143
|
-
float sumf = 0;
|
|
1144
|
-
for (int i = 0; i < nb; ++i) {
|
|
1145
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
1146
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1147
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
1148
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
1149
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
1150
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
1151
|
-
a += 32;
|
|
1152
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
1153
|
-
a += 32; q4 += 32;
|
|
1154
|
-
}
|
|
1155
|
-
memcpy(utmp, x[i].scales, 12);
|
|
1156
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
1157
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
1158
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
1159
|
-
utmp[2] = uaux;
|
|
1160
|
-
utmp[0] &= kmask1;
|
|
1161
|
-
|
|
1162
|
-
int sumi = 0;
|
|
1163
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
1164
|
-
a = aux8;
|
|
1165
|
-
int is = 0;
|
|
1166
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
1167
|
-
int32_t scale = scales[is++];
|
|
1168
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1169
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1170
|
-
q8 += 8; a += 8;
|
|
1171
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1172
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1173
|
-
q8 += 8; a += 8;
|
|
1174
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1175
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1176
|
-
q8 += 8; a += 8;
|
|
1177
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1178
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1179
|
-
q8 += 8; a += 8;
|
|
1180
|
-
}
|
|
1181
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1182
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1183
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1184
|
-
sumf -= dmin * sumi;
|
|
1185
|
-
}
|
|
1186
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1187
|
-
*s = sumf;
|
|
1053
|
+
UNUSED(x);
|
|
1054
|
+
UNUSED(y);
|
|
1055
|
+
UNUSED(nb);
|
|
1056
|
+
UNUSED(kmask1);
|
|
1057
|
+
UNUSED(kmask2);
|
|
1058
|
+
UNUSED(kmask3);
|
|
1059
|
+
UNUSED(utmp);
|
|
1060
|
+
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1188
1061
|
#endif
|
|
1189
1062
|
}
|
|
1190
1063
|
|
|
@@ -1342,66 +1215,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1342
1215
|
*s = vec_extract(vsumf0, 0);
|
|
1343
1216
|
|
|
1344
1217
|
#else
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
memset(sums, 0, 8*sizeof(float));
|
|
1354
|
-
|
|
1355
|
-
float sumf = 0;
|
|
1356
|
-
for (int i = 0; i < nb; ++i) {
|
|
1357
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
1358
|
-
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
|
1359
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1360
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
1361
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
1362
|
-
uint8_t m = 1;
|
|
1363
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
1364
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
1365
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
1366
|
-
a += 32; m <<= 1;
|
|
1367
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
1368
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
1369
|
-
a += 32; m <<= 1;
|
|
1370
|
-
q4 += 32;
|
|
1371
|
-
}
|
|
1372
|
-
memcpy(utmp, x[i].scales, 12);
|
|
1373
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
1374
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
1375
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
1376
|
-
utmp[2] = uaux;
|
|
1377
|
-
utmp[0] &= kmask1;
|
|
1378
|
-
|
|
1379
|
-
int sumi = 0;
|
|
1380
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
1381
|
-
a = aux8;
|
|
1382
|
-
int is = 0;
|
|
1383
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
1384
|
-
int32_t scale = scales[is++];
|
|
1385
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1386
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1387
|
-
q8 += 8; a += 8;
|
|
1388
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1389
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1390
|
-
q8 += 8; a += 8;
|
|
1391
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1392
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1393
|
-
q8 += 8; a += 8;
|
|
1394
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1395
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1396
|
-
q8 += 8; a += 8;
|
|
1397
|
-
}
|
|
1398
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1399
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1400
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
1401
|
-
sumf -= dmin * sumi;
|
|
1402
|
-
}
|
|
1403
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1404
|
-
*s = sumf;
|
|
1218
|
+
UNUSED(x);
|
|
1219
|
+
UNUSED(y);
|
|
1220
|
+
UNUSED(nb);
|
|
1221
|
+
UNUSED(kmask1);
|
|
1222
|
+
UNUSED(kmask2);
|
|
1223
|
+
UNUSED(kmask3);
|
|
1224
|
+
UNUSED(utmp);
|
|
1225
|
+
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1405
1226
|
#endif
|
|
1406
1227
|
}
|
|
1407
1228
|
|
|
@@ -1556,47 +1377,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1556
1377
|
*s = vec_extract(vsumf0, 0);
|
|
1557
1378
|
|
|
1558
1379
|
#else
|
|
1559
|
-
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
int32_t aux32[8];
|
|
1564
|
-
memset(sums, 0, 8*sizeof(float));
|
|
1565
|
-
|
|
1566
|
-
float sumf = 0;
|
|
1567
|
-
for (int i = 0; i < nb; ++i) {
|
|
1568
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
1569
|
-
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
1570
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1571
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
1572
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
1573
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
1574
|
-
for (int l = 0; l < 32; ++l) {
|
|
1575
|
-
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
1576
|
-
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
|
1577
|
-
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
|
1578
|
-
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
|
1579
|
-
}
|
|
1580
|
-
a += 128;
|
|
1581
|
-
q4 += 64;
|
|
1582
|
-
qh += 32;
|
|
1583
|
-
}
|
|
1584
|
-
a = aux8;
|
|
1585
|
-
int is = 0;
|
|
1586
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
1587
|
-
int scale = x[i].scales[is++];
|
|
1588
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1589
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1590
|
-
q8 += 8; a += 8;
|
|
1591
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1592
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1593
|
-
q8 += 8; a += 8;
|
|
1594
|
-
}
|
|
1595
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1596
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1597
|
-
}
|
|
1598
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1599
|
-
*s = sumf;
|
|
1380
|
+
UNUSED(x);
|
|
1381
|
+
UNUSED(y);
|
|
1382
|
+
UNUSED(nb);
|
|
1383
|
+
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1600
1384
|
#endif
|
|
1601
1385
|
}
|
|
1602
1386
|
|
|
@@ -1737,34 +1521,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
1737
1521
|
*s = 0.125f * vec_extract(vsumf0, 0);
|
|
1738
1522
|
|
|
1739
1523
|
#else
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
float sumf = 0.f;
|
|
1745
|
-
for (int i = 0; i < nb; ++i) {
|
|
1746
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1747
|
-
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1748
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1749
|
-
int32_t bsum = 0;
|
|
1750
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
1751
|
-
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
|
1752
|
-
q2 += 4;
|
|
1753
|
-
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
|
1754
|
-
int32_t sumi = 0;
|
|
1755
|
-
for (int l = 0; l < 4; ++l) {
|
|
1756
|
-
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
|
1757
|
-
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
|
1758
|
-
for (int j = 0; j < 8; ++j) {
|
|
1759
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
1760
|
-
}
|
|
1761
|
-
q8 += 8;
|
|
1762
|
-
}
|
|
1763
|
-
bsum += sumi * ls;
|
|
1764
|
-
}
|
|
1765
|
-
sumf += d * bsum;
|
|
1766
|
-
}
|
|
1767
|
-
*s = 0.125f * sumf;
|
|
1524
|
+
UNUSED(x);
|
|
1525
|
+
UNUSED(y);
|
|
1526
|
+
UNUSED(nb);
|
|
1527
|
+
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1768
1528
|
#endif
|
|
1769
1529
|
}
|
|
1770
1530
|
|
|
@@ -1869,42 +1629,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1869
1629
|
*s = 0.125f * vec_extract(vsumf0, 0);
|
|
1870
1630
|
|
|
1871
1631
|
#else
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
1877
|
-
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
|
1878
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
1879
|
-
int32_t bsum = 0;
|
|
1880
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
1881
|
-
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
|
1882
|
-
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
|
1883
|
-
int32_t sumi = 0;
|
|
1884
|
-
for (int l = 0; l < 2; ++l) {
|
|
1885
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
1886
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
1887
|
-
for (int j = 0; j < 8; ++j) {
|
|
1888
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
1889
|
-
}
|
|
1890
|
-
q8 += 8;
|
|
1891
|
-
}
|
|
1892
|
-
bsum += sumi * ls1;
|
|
1893
|
-
sumi = 0;
|
|
1894
|
-
for (int l = 2; l < 4; ++l) {
|
|
1895
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
1896
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
1897
|
-
for (int j = 0; j < 8; ++j) {
|
|
1898
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
1899
|
-
}
|
|
1900
|
-
q8 += 8;
|
|
1901
|
-
}
|
|
1902
|
-
bsum += sumi * ls2;
|
|
1903
|
-
q2 += 4;
|
|
1904
|
-
}
|
|
1905
|
-
sumf += d * bsum;
|
|
1906
|
-
}
|
|
1907
|
-
*s = 0.125f * sumf;
|
|
1632
|
+
UNUSED(x);
|
|
1633
|
+
UNUSED(y);
|
|
1634
|
+
UNUSED(nb);
|
|
1635
|
+
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1908
1636
|
#endif
|
|
1909
1637
|
}
|
|
1910
1638
|
|
|
@@ -2030,47 +1758,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2030
1758
|
*s = 0.125f * vec_extract(vsumf0, 0);
|
|
2031
1759
|
|
|
2032
1760
|
#else
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2038
|
-
const int8_t * q8 = y[i].qs;
|
|
2039
|
-
const uint8_t * qs = x[i].qs;
|
|
2040
|
-
const uint8_t * qh = x[i].qh;
|
|
2041
|
-
const uint8_t * signs = qs + QK_K/8;
|
|
2042
|
-
|
|
2043
|
-
int bsum = 0;
|
|
2044
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
2045
|
-
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
|
2046
|
-
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
|
2047
|
-
int sumi1 = 0, sumi2 = 0;
|
|
2048
|
-
for (int l = 0; l < 2; ++l) {
|
|
2049
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
2050
|
-
for (int j = 0; j < 8; ++j) {
|
|
2051
|
-
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
2052
|
-
}
|
|
2053
|
-
q8 += 8;
|
|
2054
|
-
}
|
|
2055
|
-
for (int l = 2; l < 4; ++l) {
|
|
2056
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
2057
|
-
for (int j = 0; j < 8; ++j) {
|
|
2058
|
-
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
2059
|
-
}
|
|
2060
|
-
q8 += 8;
|
|
2061
|
-
}
|
|
2062
|
-
bsum += ls1 * sumi1 + ls2 * sumi2;
|
|
2063
|
-
qs += 4;
|
|
2064
|
-
signs += 4;
|
|
2065
|
-
}
|
|
2066
|
-
|
|
2067
|
-
sumf += d * bsum;
|
|
2068
|
-
}
|
|
2069
|
-
|
|
2070
|
-
*s = 0.125f * sumf;
|
|
2071
|
-
|
|
1761
|
+
UNUSED(x);
|
|
1762
|
+
UNUSED(y);
|
|
1763
|
+
UNUSED(nb);
|
|
1764
|
+
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2072
1765
|
#endif
|
|
2073
|
-
|
|
2074
1766
|
}
|
|
2075
1767
|
|
|
2076
1768
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -2172,36 +1864,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
2172
1864
|
*s = 0.25f * vec_extract(vsumf0, 0);
|
|
2173
1865
|
|
|
2174
1866
|
#else
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
for (int i = 0; i < nb; ++i) {
|
|
2180
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2181
|
-
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
2182
|
-
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
2183
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2184
|
-
int32_t bsum = 0;
|
|
2185
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
2186
|
-
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
|
2187
|
-
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
|
2188
|
-
int32_t sumi = 0;
|
|
2189
|
-
for (int l = 0; l < 4; ++l) {
|
|
2190
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
|
2191
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
|
2192
|
-
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
|
2193
|
-
for (int j = 0; j < 4; ++j) {
|
|
2194
|
-
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
|
2195
|
-
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
|
2196
|
-
}
|
|
2197
|
-
q8 += 8;
|
|
2198
|
-
}
|
|
2199
|
-
q3 += 8;
|
|
2200
|
-
bsum += sumi * ls;
|
|
2201
|
-
}
|
|
2202
|
-
sumf += d * bsum;
|
|
2203
|
-
}
|
|
2204
|
-
*s = 0.25f * sumf;
|
|
1867
|
+
UNUSED(x);
|
|
1868
|
+
UNUSED(y);
|
|
1869
|
+
UNUSED(nb);
|
|
1870
|
+
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2205
1871
|
#endif
|
|
2206
1872
|
}
|
|
2207
1873
|
|
|
@@ -2327,48 +1993,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2327
1993
|
*s = vec_extract(vsumf0, 0);
|
|
2328
1994
|
|
|
2329
1995
|
#else
|
|
2330
|
-
|
|
2331
|
-
|
|
2332
|
-
|
|
2333
|
-
|
|
2334
|
-
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
2335
|
-
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
2336
|
-
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
|
2337
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2338
|
-
int32_t bsum = 0;
|
|
2339
|
-
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
2340
|
-
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
|
2341
|
-
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
|
2342
|
-
int32_t sumi = 0;
|
|
2343
|
-
for (int l = 0; l < 4; ++l) {
|
|
2344
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
|
2345
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
|
2346
|
-
for (int j = 0; j < 4; ++j) {
|
|
2347
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
2348
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
2349
|
-
}
|
|
2350
|
-
q8 += 8;
|
|
2351
|
-
}
|
|
2352
|
-
qs += 8;
|
|
2353
|
-
signs += 4;
|
|
2354
|
-
bsum += sumi * ls1;
|
|
2355
|
-
sumi = 0;
|
|
2356
|
-
for (int l = 0; l < 4; ++l) {
|
|
2357
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
|
2358
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
|
2359
|
-
for (int j = 0; j < 4; ++j) {
|
|
2360
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
2361
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
2362
|
-
}
|
|
2363
|
-
q8 += 8;
|
|
2364
|
-
}
|
|
2365
|
-
qs += 8;
|
|
2366
|
-
signs += 4;
|
|
2367
|
-
bsum += sumi * ls2;
|
|
2368
|
-
}
|
|
2369
|
-
sumf += d * bsum;
|
|
2370
|
-
}
|
|
2371
|
-
*s = sumf;
|
|
1996
|
+
UNUSED(x);
|
|
1997
|
+
UNUSED(y);
|
|
1998
|
+
UNUSED(nb);
|
|
1999
|
+
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2372
2000
|
#endif
|
|
2373
2001
|
}
|
|
2374
2002
|
|
|
@@ -2481,36 +2109,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
2481
2109
|
*s = vec_extract(vsumf0, 0);
|
|
2482
2110
|
|
|
2483
2111
|
#else
|
|
2484
|
-
|
|
2485
|
-
|
|
2486
|
-
|
|
2487
|
-
|
|
2488
|
-
const int8_t * q8 = y[i].qs;
|
|
2489
|
-
const uint8_t * qs = x[i].qs;
|
|
2490
|
-
const uint16_t * qh = x[i].qh;
|
|
2491
|
-
|
|
2492
|
-
int sumi = 0, sumi1 = 0;
|
|
2493
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
2494
|
-
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
|
2495
|
-
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
|
2496
|
-
int lsum = 0;
|
|
2497
|
-
for (int l = 0; l < 4; ++l) {
|
|
2498
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
|
2499
|
-
for (int j = 0; j < 8; ++j) {
|
|
2500
|
-
lsum += q8[j] * grid[j];
|
|
2501
|
-
}
|
|
2502
|
-
q8 += 8;
|
|
2503
|
-
}
|
|
2504
|
-
sumi += ls * lsum;
|
|
2505
|
-
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
|
2506
|
-
qs += 4;
|
|
2507
|
-
}
|
|
2508
|
-
|
|
2509
|
-
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
|
2510
|
-
}
|
|
2511
|
-
|
|
2512
|
-
*s = sumf;
|
|
2513
|
-
|
|
2112
|
+
UNUSED(x);
|
|
2113
|
+
UNUSED(y);
|
|
2114
|
+
UNUSED(nb);
|
|
2115
|
+
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2514
2116
|
#endif
|
|
2515
2117
|
}
|
|
2516
2118
|
|
|
@@ -2581,17 +2183,15 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2581
2183
|
|
|
2582
2184
|
sumf = vec_extract(vsumf0, 0);
|
|
2583
2185
|
|
|
2584
|
-
#endif
|
|
2585
|
-
for (; ib < nb; ++ib) {
|
|
2586
|
-
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
|
|
2587
|
-
int sumi1 = 0, sumi2 = 0;
|
|
2588
|
-
for (int j = 0; j < QK4_NL/2; ++j) {
|
|
2589
|
-
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
|
2590
|
-
sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
|
|
2591
|
-
}
|
|
2592
|
-
sumf += d * (sumi1 + sumi2);
|
|
2593
|
-
}
|
|
2594
2186
|
*s = sumf;
|
|
2187
|
+
#else
|
|
2188
|
+
UNUSED(x);
|
|
2189
|
+
UNUSED(y);
|
|
2190
|
+
UNUSED(nb);
|
|
2191
|
+
UNUSED(ib);
|
|
2192
|
+
UNUSED(sumf);
|
|
2193
|
+
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2194
|
+
#endif
|
|
2595
2195
|
}
|
|
2596
2196
|
|
|
2597
2197
|
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -2696,37 +2296,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
2696
2296
|
*s = vec_extract(vsumf0, 0);
|
|
2697
2297
|
|
|
2698
2298
|
#else
|
|
2699
|
-
|
|
2700
|
-
|
|
2701
|
-
|
|
2702
|
-
|
|
2703
|
-
const uint8_t * qs = x[ibl].qs;
|
|
2704
|
-
const int8_t * q8 = y[ibl].qs;
|
|
2705
|
-
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
|
2706
|
-
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
|
2707
|
-
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
|
2708
|
-
h >>= 4;
|
|
2709
|
-
const float d1 = d4d8*(ls1 - 32);
|
|
2710
|
-
const float d2 = d4d8*(ls2 - 32);
|
|
2711
|
-
int sumi1 = 0, sumi2 = 0;
|
|
2712
|
-
for (int j = 0; j < 16; ++j) {
|
|
2713
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
2714
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
2715
|
-
}
|
|
2716
|
-
sumf += d1 * (sumi1 + sumi2);
|
|
2717
|
-
qs += 16;
|
|
2718
|
-
q8 += 32;
|
|
2719
|
-
sumi1 = sumi2 = 0;
|
|
2720
|
-
for (int j = 0; j < 16; ++j) {
|
|
2721
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
2722
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
2723
|
-
}
|
|
2724
|
-
sumf += d2 * (sumi1 + sumi2);
|
|
2725
|
-
qs += 16;
|
|
2726
|
-
q8 += 32;
|
|
2727
|
-
}
|
|
2728
|
-
}
|
|
2729
|
-
*s = sumf;
|
|
2299
|
+
UNUSED(x);
|
|
2300
|
+
UNUSED(y);
|
|
2301
|
+
UNUSED(nb);
|
|
2302
|
+
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2730
2303
|
#endif
|
|
2731
2304
|
}
|
|
2732
2305
|
|