@novastera-oss/llamarn 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -3
- package/RNLlamaCpp.podspec +1 -1
- package/android/CMakeLists.txt +11 -3
- package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
- package/android/src/main/cpp/include/llama.h +53 -114
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -10
- package/cpp/PureCppImpl.cpp +71 -4
- package/cpp/SystemUtils.cpp +3 -7
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -1
- package/cpp/llama.cpp/Makefile +6 -1605
- package/cpp/llama.cpp/README.md +5 -1
- package/cpp/llama.cpp/common/arg.cpp +230 -51
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
- package/cpp/llama.cpp/common/chat.cpp +539 -8
- package/cpp/llama.cpp/common/chat.h +8 -1
- package/cpp/llama.cpp/common/common.cpp +60 -15
- package/cpp/llama.cpp/common/common.h +64 -15
- package/cpp/llama.cpp/common/speculative.cpp +135 -54
- package/cpp/llama.cpp/common/speculative.h +8 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
- package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
- package/cpp/llama.cpp/flake.nix +0 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
- package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
- package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
- package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
- package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
- package/cpp/llama.cpp/include/llama.h +53 -114
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -1
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
- package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +61 -252
- package/cpp/llama.cpp/src/llama-context.h +10 -15
- package/cpp/llama.cpp/src/llama-cparams.h +0 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
- package/cpp/llama.cpp/src/llama-graph.h +90 -51
- package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
- package/cpp/llama.cpp/src/llama-hparams.h +21 -6
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
- package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
- package/cpp/llama.cpp/src/llama-memory.h +13 -10
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
- package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
- package/cpp/llama.cpp/src/llama-model.h +28 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
- package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
- package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
- package/cpp/rn-completion.cpp +3 -27
- package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
- package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
- package/ios/include/chat.h +8 -1
- package/ios/include/common/minja/chat-template.hpp +16 -7
- package/ios/include/common/minja/minja.hpp +47 -12
- package/ios/include/common.h +64 -15
- package/ios/include/llama.h +53 -114
- package/ios/include/speculative.h +8 -1
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/lib/module/NativeRNLlamaCpp.js.map +1 -1
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
- package/package.json +1 -2
- package/src/NativeRNLlamaCpp.ts +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
|
@@ -104,12 +104,30 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
|
|
104
104
|
}
|
|
105
105
|
}
|
|
106
106
|
|
|
107
|
-
template <int block_size, bool do_multiply = false>
|
|
108
|
-
static __global__ void rms_norm_f32(
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
107
|
+
template <int block_size, bool do_multiply = false, bool do_add = false>
|
|
108
|
+
static __global__ void rms_norm_f32(const float * x, float * dst,
|
|
109
|
+
const int ncols,
|
|
110
|
+
const int64_t stride_row,
|
|
111
|
+
const int64_t stride_channel,
|
|
112
|
+
const int64_t stride_sample,
|
|
113
|
+
const float eps,
|
|
114
|
+
const float * mul = nullptr,
|
|
115
|
+
const int64_t mul_stride_row = 0,
|
|
116
|
+
const int64_t mul_stride_channel = 0,
|
|
117
|
+
const int64_t mul_stride_sample = 0,
|
|
118
|
+
const int mul_ncols = 0,
|
|
119
|
+
const int mul_nrows = 0,
|
|
120
|
+
const int mul_nchannels = 0,
|
|
121
|
+
const int mul_nsamples = 0,
|
|
122
|
+
const float * add = nullptr,
|
|
123
|
+
const int64_t add_stride_row = 0,
|
|
124
|
+
const int64_t add_stride_channel = 0,
|
|
125
|
+
const int64_t add_stride_sample = 0,
|
|
126
|
+
const int add_ncols = 0,
|
|
127
|
+
const int add_nrows = 0,
|
|
128
|
+
const int add_nchannels = 0,
|
|
129
|
+
const int add_nsamples = 0) {
|
|
130
|
+
|
|
113
131
|
const int nrows = gridDim.x;
|
|
114
132
|
const int nchannels = gridDim.y;
|
|
115
133
|
|
|
@@ -118,6 +136,8 @@ static __global__ void rms_norm_f32(
|
|
|
118
136
|
const int sample = blockIdx.z;
|
|
119
137
|
const int tid = threadIdx.x;
|
|
120
138
|
|
|
139
|
+
static_assert(!do_add || do_multiply, "fusing add is not supported without multiplying");
|
|
140
|
+
|
|
121
141
|
x += sample*stride_sample + channel*stride_channel + row*stride_row;
|
|
122
142
|
dst += ((sample*nchannels + channel)*nrows + row)*ncols;
|
|
123
143
|
|
|
@@ -128,6 +148,13 @@ static __global__ void rms_norm_f32(
|
|
|
128
148
|
mul += mul_sample*mul_stride_sample + mul_channel*mul_stride_channel + mul_row*mul_stride_row;
|
|
129
149
|
}
|
|
130
150
|
|
|
151
|
+
if constexpr (do_add) {
|
|
152
|
+
const int add_row = row % add_nrows;
|
|
153
|
+
const int add_channel = channel % add_nchannels;
|
|
154
|
+
const int add_sample = sample % add_nsamples;
|
|
155
|
+
add += add_sample * add_stride_sample + add_channel * add_stride_channel + add_row * add_stride_row;
|
|
156
|
+
}
|
|
157
|
+
|
|
131
158
|
float tmp = 0.0f; // partial sum for thread in warp
|
|
132
159
|
|
|
133
160
|
for (int col = tid; col < ncols; col += block_size) {
|
|
@@ -154,7 +181,11 @@ static __global__ void rms_norm_f32(
|
|
|
154
181
|
const float scale = rsqrtf(mean + eps);
|
|
155
182
|
|
|
156
183
|
for (int col = tid; col < ncols; col += block_size) {
|
|
157
|
-
if constexpr (do_multiply) {
|
|
184
|
+
if constexpr (do_multiply && do_add) {
|
|
185
|
+
const int mul_col = col % mul_ncols;
|
|
186
|
+
const int add_col = col % add_ncols;
|
|
187
|
+
dst[col] = scale * x[col] * mul[mul_col] + add[add_col];
|
|
188
|
+
} else if constexpr (do_multiply) {
|
|
158
189
|
const int mul_col = col % mul_ncols;
|
|
159
190
|
dst[col] = scale * x[col] * mul[mul_col];
|
|
160
191
|
} else {
|
|
@@ -331,23 +362,70 @@ static void rms_norm_f32_cuda(
|
|
|
331
362
|
}
|
|
332
363
|
}
|
|
333
364
|
|
|
334
|
-
static void rms_norm_mul_f32_cuda(
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
365
|
+
static void rms_norm_mul_f32_cuda(const float * x,
|
|
366
|
+
const float * mul,
|
|
367
|
+
const float * add,
|
|
368
|
+
float * dst,
|
|
369
|
+
const int ncols,
|
|
370
|
+
const int nrows,
|
|
371
|
+
const int nchannels,
|
|
372
|
+
const int nsamples,
|
|
373
|
+
const int64_t stride_row,
|
|
374
|
+
const int64_t stride_channel,
|
|
375
|
+
const int64_t stride_sample,
|
|
376
|
+
const int64_t mul_stride_row,
|
|
377
|
+
const int64_t mul_stride_channel,
|
|
378
|
+
const int64_t mul_stride_sample,
|
|
379
|
+
const int mul_ncols,
|
|
380
|
+
const int mul_nrows,
|
|
381
|
+
const int mul_nchannels,
|
|
382
|
+
const int mul_nsamples,
|
|
383
|
+
const int64_t add_stride_row,
|
|
384
|
+
const int64_t add_stride_channel,
|
|
385
|
+
const int64_t add_stride_sample,
|
|
386
|
+
const int add_ncols,
|
|
387
|
+
const int add_nrows,
|
|
388
|
+
const int add_nchannels,
|
|
389
|
+
const int add_nsamples,
|
|
390
|
+
const float eps,
|
|
391
|
+
cudaStream_t stream) {
|
|
340
392
|
const dim3 blocks_num(nrows, nchannels, nsamples);
|
|
341
393
|
if (mul == nullptr) {
|
|
342
394
|
rms_norm_f32_cuda(x, dst, ncols, nrows, nchannels, nsamples, stride_row, stride_channel, stride_sample, eps, stream);
|
|
343
395
|
return;
|
|
344
396
|
}
|
|
345
|
-
if (
|
|
346
|
-
|
|
347
|
-
|
|
397
|
+
if (add == nullptr) {
|
|
398
|
+
if (ncols < 1024) {
|
|
399
|
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
|
400
|
+
rms_norm_f32<WARP_SIZE, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
|
|
401
|
+
ncols, stride_row, stride_channel, stride_sample, eps,
|
|
402
|
+
mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
|
|
403
|
+
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples);
|
|
404
|
+
} else {
|
|
405
|
+
const dim3 block_dims(1024, 1, 1);
|
|
406
|
+
rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
|
|
407
|
+
ncols, stride_row, stride_channel, stride_sample, eps,
|
|
408
|
+
mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
|
|
409
|
+
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples);
|
|
410
|
+
}
|
|
348
411
|
} else {
|
|
349
|
-
|
|
350
|
-
|
|
412
|
+
if (ncols < 1024) {
|
|
413
|
+
const dim3 block_dims(WARP_SIZE, 1, 1);
|
|
414
|
+
rms_norm_f32<WARP_SIZE, true, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
|
|
415
|
+
ncols, stride_row, stride_channel, stride_sample, eps,
|
|
416
|
+
mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
|
|
417
|
+
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
|
|
418
|
+
add, add_stride_row, add_stride_channel, add_stride_sample,
|
|
419
|
+
add_ncols, add_nrows, add_nchannels, add_nsamples);
|
|
420
|
+
} else {
|
|
421
|
+
const dim3 block_dims(1024, 1, 1);
|
|
422
|
+
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
|
|
423
|
+
ncols, stride_row, stride_channel, stride_sample, eps,
|
|
424
|
+
mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
|
|
425
|
+
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
|
|
426
|
+
add, add_stride_row, add_stride_channel, add_stride_sample,
|
|
427
|
+
add_ncols, add_nrows, add_nchannels, add_nsamples);
|
|
428
|
+
}
|
|
351
429
|
}
|
|
352
430
|
}
|
|
353
431
|
|
|
@@ -491,7 +569,102 @@ void ggml_cuda_op_rms_norm_fused(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|
|
491
569
|
const int mul_nchannels = mul_src->ne[2];
|
|
492
570
|
const int mul_nsamples = mul_src->ne[3];
|
|
493
571
|
|
|
494
|
-
rms_norm_mul_f32_cuda(src0_d, mul_d,
|
|
572
|
+
rms_norm_mul_f32_cuda(src0_d, mul_d, nullptr, dst_d,
|
|
573
|
+
ne00, ne01, ne02, ne03,
|
|
574
|
+
/*s00*/ s01, s02, s03,
|
|
575
|
+
/*mul_s00*/ mul_s01, mul_s02, mul_s03,
|
|
576
|
+
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
|
|
577
|
+
/*add_s00*/ 0, 0, 0,
|
|
578
|
+
0, 0, 0, 0,
|
|
579
|
+
eps, stream);
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
// Fused RMS_NORM -> MUL -> ADD: normalizes the input of the RMS_NORM node,
// multiplies by the MUL operand and adds the ADD operand in a single kernel
// launch. `dst` is the RMS_NORM node; the final result is written into the
// data buffer of `add_tensor`. All tensors must be F32 with a contiguous
// innermost dimension.
void ggml_cuda_op_rms_norm_fused_add(ggml_backend_cuda_context & ctx,
                                     ggml_tensor * dst,
                                     ggml_tensor * mul_tensor,
                                     ggml_tensor * add_tensor) {
    const ggml_tensor * rms_norm_src = (ggml_tensor *) dst->src[0];

    // epsilon is stored in the RMS_NORM op parameters
    float eps = 0.0f;
    memcpy(&eps, dst->op_params, sizeof(float));

    const float * src0_d = (const float *) rms_norm_src->data;

    // The MUL node multiplies the norm output (== dst) by a second tensor;
    // figure out which of its operands is which.
    const float *       mul_d   = nullptr;
    const ggml_tensor * mul_src = nullptr;

    if (mul_tensor->src[0] == dst) {
        mul_src = mul_tensor->src[1];
    } else if (mul_tensor->src[1] == dst) {
        mul_src = mul_tensor->src[0];
    } else {
        GGML_ASSERT(false);
    }
    mul_d = (float *) mul_src->data;

    // Likewise for the ADD node: its other operand is the tensor to add.
    const float *       add_d   = nullptr;
    const ggml_tensor * add_src = nullptr;

    if (add_tensor->src[0] == mul_tensor) {
        add_src = add_tensor->src[1];
    } else if (add_tensor->src[1] == mul_tensor) {
        add_src = add_tensor->src[0];
    } else {
        GGML_ASSERT(false);
    }
    add_d = (float *) add_src->data;

    float *      dst_d  = (float *) add_tensor->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(rms_norm_src->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type         == GGML_TYPE_F32);
    GGML_ASSERT(mul_tensor->type  == GGML_TYPE_F32);
    GGML_ASSERT(add_tensor->type  == GGML_TYPE_F32);
    GGML_ASSERT(eps >= 0.0f);

    const int64_t ne00 = rms_norm_src->ne[0];
    const int64_t ne01 = rms_norm_src->ne[1];
    const int64_t ne02 = rms_norm_src->ne[2];
    const int64_t ne03 = rms_norm_src->ne[3];

    // per-dimension strides in elements; the innermost dim must be contiguous
    const size_t ts0 = ggml_type_size(rms_norm_src->type);
    GGML_ASSERT(rms_norm_src->nb[0] == ts0);
    const int64_t s01 = rms_norm_src->nb[1] / ts0;
    const int64_t s02 = rms_norm_src->nb[2] / ts0;
    const int64_t s03 = rms_norm_src->nb[3] / ts0;

    const size_t ts_mul = ggml_type_size(mul_src->type);
    GGML_ASSERT(mul_src->nb[0] == ts_mul);
    const int64_t mul_s01 = mul_src->nb[1] / ts_mul;
    const int64_t mul_s02 = mul_src->nb[2] / ts_mul;
    const int64_t mul_s03 = mul_src->nb[3] / ts_mul;

    // extents of the MUL/ADD operands, used by the kernel for broadcasting
    const int mul_ncols     = mul_src->ne[0];
    const int mul_nrows     = mul_src->ne[1];
    const int mul_nchannels = mul_src->ne[2];
    const int mul_nsamples  = mul_src->ne[3];

    const size_t ts_add = ggml_type_size(add_src->type);
    GGML_ASSERT(add_src->nb[0] == ts_add);
    const int64_t add_s01 = add_src->nb[1] / ts_add;
    const int64_t add_s02 = add_src->nb[2] / ts_add;
    const int64_t add_s03 = add_src->nb[3] / ts_add;

    const int add_ncols     = add_src->ne[0];
    const int add_nrows     = add_src->ne[1];
    const int add_nchannels = add_src->ne[2];
    const int add_nsamples  = add_src->ne[3];

    rms_norm_mul_f32_cuda(src0_d, mul_d, add_d, dst_d,
                          ne00, ne01, ne02, ne03,
                          /*s00*/ s01, s02, s03,
                          /*mul_s00*/ mul_s01, mul_s02, mul_s03,
                          mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
                          /*add_s00*/ add_s01, add_s02, add_s03,
                          add_ncols, add_nrows, add_nchannels, add_nsamples,
                          eps, stream);
}
|
|
496
669
|
|
|
497
670
|
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
@@ -8,6 +8,11 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
|
8
8
|
|
|
9
9
|
void ggml_cuda_op_rms_norm_fused(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * mul_tensor);
|
|
10
10
|
|
|
11
|
+
void ggml_cuda_op_rms_norm_fused_add(ggml_backend_cuda_context & ctx,
|
|
12
|
+
ggml_tensor * dst,
|
|
13
|
+
ggml_tensor * mul_tensor,
|
|
14
|
+
ggml_tensor * add_tensor);
|
|
15
|
+
|
|
11
16
|
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
12
17
|
|
|
13
18
|
void ggml_cuda_op_l2_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#include "ggml-impl.h"
|
|
2
|
+
#include "opt-step-sgd.cuh"
|
|
3
|
+
|
|
4
|
+
#include <cstdint>
|
|
5
|
+
|
|
6
|
+
// One SGD update step with decoupled weight decay, one thread per element:
//   x[i] <- x[i] * (1 - pars[0]*pars[1]) - pars[0] * g[i]
// pars is a 2-element device array; presumably {learning rate, weight decay}
// -- confirm against the host-side optimizer setup.
static __global__ void opt_step_sgd_f32(
    float * __restrict__ x, const float * __restrict__ g,
    const float * __restrict__ pars, const int64_t k) {

    // 64-bit flat index so tensors with more than 2^31 elements are safe
    const int64_t gid = blockDim.x * (int64_t) blockIdx.x + threadIdx.x;

    if (gid < k) {
        x[gid] = x[gid] * (1.0f - pars[0] * pars[1]) - pars[0] * g[gid];
    }
}
|
|
17
|
+
|
|
18
|
+
// Host-side launcher for opt_step_sgd_f32: one thread per element,
// ceil-div grid so the tail of the tensor is covered.
static void opt_step_sgd_f32_cuda(
    float * x, const float * g, const float * __restrict__ pars, const int64_t k, cudaStream_t stream) {

    const dim3 threads(CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
    const dim3 blocks((k + CUDA_OPT_STEP_SGD_BLOCK_SIZE - 1) / CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);

    opt_step_sgd_f32<<<blocks, threads, 0, stream>>>(x, g, pars, k);
}
|
|
25
|
+
|
|
26
|
+
// GGML_OP_OPT_STEP_SGD: apply one in-place SGD step to dst->src[0].
//   src[0] = parameters (updated in place)
//   src[1] = gradients (same shape as src[0])
//   src[2] = exactly two F32 hyper-parameters read by the kernel
void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0      = dst->src[0];
    const ggml_tensor * src0_grad = dst->src[1];
    const ggml_tensor * params    = dst->src[2];

    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type      == GGML_TYPE_F32);
    GGML_ASSERT(src0_grad->type == GGML_TYPE_F32);
    GGML_ASSERT(params->type    == GGML_TYPE_F32);
    // the kernel indexes flat arrays, so everything must be contiguous
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src0_grad));
    GGML_ASSERT(ggml_is_contiguous(params));
    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
    GGML_ASSERT(ggml_nelements(params) == 2);

    float *       src0_d      = (float *) src0->data;
    const float * src0_grad_d = (const float *) src0_grad->data;
    const float * params_d    = (const float *) params->data;

    const int64_t ne = ggml_nelements(src0);

    opt_step_sgd_f32_cuda(src0_d, src0_grad_d, params_d, ne, stream);
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#include "pad_reflect_1d.cuh"
|
|
2
|
+
|
|
3
|
+
// Reflect-pad the innermost dimension of a 4D F32 tensor.
// Grid layout: one block per (row, channel, sample) triple, i.e.
// grid = (ne01, ne02, ne03); threads of a block stride over the ne0 output
// columns. nb* are byte strides, so arbitrary (non-contiguous) layouts work.
// NOTE(review): reflect indexing assumes 0 <= p0,p1 < ne00 (source width);
// larger pads would read out of bounds -- confirm the caller validates this.
static __global__ void pad_reflect_1d_kernel_f32(
    const void * __restrict__ src0,
    void * __restrict__ dst,
    const int64_t ne0,
    const int64_t ne00,
    const int64_t ne01,
    const int64_t ne02,
    const int64_t ne03,
    const int64_t nb00,
    const int64_t nb01,
    const int64_t nb02,
    const int64_t nb03,
    const int64_t nb0,
    const int64_t nb1,
    const int64_t nb2,
    const int64_t nb3,
    const int p0,
    const int p1) {

    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;
    const int64_t i1 = blockIdx.x;

    if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
        return;
    }

    // byte pointers to the start of this row in src and dst
    const char * src0_ptr = (const char *)src0 + i3*nb03 + i2*nb02 + i1*nb01;
    char * dst_ptr = (char *)dst + i3*nb3 + i2*nb2 + i1*nb1;

    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        float value;

        if (i0 < p0) {
            // Left padding - reflect: dst[i0] = src[p0 - i0]
            value = *(const float *)(src0_ptr + (p0 - i0) * nb00);
        } else if (i0 < ne0 - p1) {
            // Middle - copy: dst[i0] = src[i0 - p0]
            value = *(const float *)(src0_ptr + (i0 - p0) * nb00);
        } else {
            // Right padding - reflect: with j = i0 - (ne0 - p1) this
            // simplifies to src[ne00 - 2 - j] (mirror about the last sample)
            int64_t src_idx = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
            value = *(const float *)(src0_ptr + src_idx * nb00);
        }

        *(float *)(dst_ptr + i0 * nb0) = value;
    }
}
|
|
51
|
+
|
|
52
|
+
// GGML_OP_PAD_REFLECT_1D: reflect-pad the innermost dimension of src0 by
// p0 elements on the left and p1 on the right (pads read from op_params).
// Launches one block per (row, channel, sample) of the input.
void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int32_t * opts = (const int32_t *) dst->op_params;
    const int p0 = opts[0];
    const int p1 = opts[1];

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];

    const int64_t ne0 = dst->ne[0];

    // output width must equal input width plus both pads
    GGML_ASSERT(ne0 == ne00 + p0 + p1);
    // reflect padding mirrors interior samples, so each pad must be
    // non-negative and strictly smaller than the input width; otherwise the
    // kernel would compute source indices outside [0, ne00)
    GGML_ASSERT(p0 >= 0 && p0 < ne00);
    GGML_ASSERT(p1 >= 0 && p1 < ne00);

    const dim3 block_dims(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1, 1);
    const dim3 grid_dims(ne01, ne02, ne03);

    pad_reflect_1d_kernel_f32<<<grid_dims, block_dims, 0, stream>>>(
        src0->data, dst->data,
        ne0, ne00, ne01, ne02, ne03,
        src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
        dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
        p0, p1
    );
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#include "common.cuh"
|
|
2
|
+
|
|
3
|
+
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true).
// One block per row (blockIdx.x selects the row); the block's threads stride
// over the row's ncols elements. blockDim.x may exceed WARP_SIZE, in which
// case it must be a multiple of WARP_SIZE and at most 1024 (asserted below).
// Only thread 0 writes the per-row result to dst[row].
template <bool norm>
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    float sum = 0.0f;
    // Manually unrolled strided accumulation: each outer iteration loads up
    // to num_unroll elements per thread (zero past the row end) and folds
    // them into num_unroll independent accumulators to expose ILP.
    const int num_unroll = 8;
    float temp[num_unroll];
    float sum_temp[num_unroll] = { 0.0f };
    for (int i = col; i < ncols;) {
        for (int j = 0; j < num_unroll; ++j) {
            if (i < ncols) {
                temp[j] = x[row * ncols + i];
            } else {
                temp[j] = 0;
            }
            i += blockDim.x;
        }
        for (int j = 0; j < num_unroll; ++j) {
            sum_temp[j] += temp[j];
        }
    }
    // collapse the independent accumulators into one per-thread partial sum
    for (int j = 0; j < num_unroll; ++j) {
        sum += sum_temp[j];
    }

    // sum up partial sums
    sum = warp_reduce_sum(sum);
    if (blockDim.x > WARP_SIZE) {
        // cross-warp reduction: lane 0 of each warp publishes its partial
        // sum to shared memory, then the first-warp lanes re-reduce them.
        // The branch condition is uniform across the block, so the barrier
        // below is reached by all threads.
        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
        __shared__ float s_sum[32];
        const int warp_id = threadIdx.x / WARP_SIZE;
        const int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = sum;
        }
        __syncthreads();
        sum = 0.0f;
        // only the first blockDim.x/WARP_SIZE lanes pick up a valid partial
        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
            sum = s_sum[lane_id];
        }
        sum = warp_reduce_sum(sum);
    }

    // after the reductions only thread 0 holds the complete row sum
    if (col != 0) {
        return;
    }

    dst[row] = norm ? sum / ncols : sum;
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
#include "ggml-cuda/common.cuh"
|
|
2
|
+
#include "roll.cuh"
|
|
3
|
+
|
|
4
|
+
// Wrap an index that is off by at most one period back into [0, ne).
static __forceinline__ __device__ int64_t wrap_index(const int64_t idx, const int64_t ne) {
    return idx < 0 ? idx + ne : (idx < ne ? idx : idx - ne);
}
|
|
13
|
+
|
|
14
|
+
// Circularly shift a contiguous 4D F32 tensor by (s0, s1, s2, s3) elements
// along each dimension; one thread per output element.
static __global__ void roll_f32_cuda(const float * __restrict__ src,
                                     float * __restrict__ dst,
                                     const int64_t ne00,
                                     const int64_t ne01,
                                     const int64_t ne02,
                                     const int64_t ne03,
                                     const int s0,
                                     const int s1,
                                     const int s2,
                                     const int s3) {
    const int64_t idx        = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
    const int64_t n_elements = ne00 * ne01 * ne02 * ne03;

    if (idx >= n_elements) {
        return;
    }

    // decompose the flat index into 4D coordinates (i0 is innermost)
    int64_t rem = idx;
    const int64_t i0 = rem % ne00; rem /= ne00;
    const int64_t i1 = rem % ne01; rem /= ne01;
    const int64_t i2 = rem % ne02; rem /= ne02;
    const int64_t i3 = rem % ne03;

    // source coordinates: shift back by s and wrap around each dimension
    const int64_t d0 = wrap_index(i0 - s0, ne00);
    const int64_t d1 = wrap_index(i1 - s1, ne01);
    const int64_t d2 = wrap_index(i2 - s2, ne02);
    const int64_t d3 = wrap_index(i3 - s3, ne03);

    dst[((i3 * ne02 + i2) * ne01 + i1) * ne00 + i0] =
        src[((d3 * ne02 + d2) * ne01 + d1) * ne00 + d0];
}
|
|
44
|
+
|
|
45
|
+
// GGML_OP_ROLL: circularly shift src0 by the per-dimension offsets stored in
// op_params and write the result to dst (same shape, F32 only, contiguous).
void ggml_cuda_op_roll(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    // per-dimension shift amounts come straight from the op parameters
    const int s0 = dst->op_params[0];
    const int s1 = dst->op_params[1];
    const int s2 = dst->op_params[2];
    const int s3 = dst->op_params[3];

    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *) dst->src[0]->data;
    float *       dst_d  = (float *) dst->data;

    // defines ne00..ne03 (and friends) from src0/dst
    GGML_TENSOR_UNARY_OP_LOCALS;

    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_are_same_shape(dst->src[0], dst));

    cudaStream_t stream = ctx.stream();

    const int64_t n_elements = ne00 * ne01 * ne02 * ne03;
    const int64_t num_blocks = (n_elements + CUDA_ROLL_BLOCK_SIZE - 1) / CUDA_ROLL_BLOCK_SIZE;

    roll_f32_cuda<<<num_blocks, CUDA_ROLL_BLOCK_SIZE, 0, stream>>>(
        src0_d, dst_d, ne00, ne01, ne02, ne03, s0, s1, s2, s3);
}
|
|
@@ -3,11 +3,6 @@
|
|
|
3
3
|
|
|
4
4
|
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
|
|
5
5
|
|
|
6
|
-
template<typename src_t, typename dst_t>
|
|
7
|
-
__device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
|
|
8
|
-
convert_flt(src_f, dst_f);
|
|
9
|
-
}
|
|
10
|
-
|
|
11
6
|
// Generic quantized set_rows kernel template
|
|
12
7
|
template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
|
|
13
8
|
static __global__ void k_set_rows_quant(
|
|
@@ -117,9 +112,7 @@ static __global__ void k_set_rows(
|
|
|
117
112
|
const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
|
118
113
|
dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
|
|
119
114
|
|
|
120
|
-
|
|
121
|
-
dst_t* dst_elem = dst_row_ptr + i00;
|
|
122
|
-
set_rows_1(src_elem, dst_elem);
|
|
115
|
+
dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
|
|
123
116
|
|
|
124
117
|
GGML_UNUSED(ne10);
|
|
125
118
|
GGML_UNUSED(ne13);
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#include "softcap.cuh"
|
|
2
|
+
|
|
3
|
+
// Elementwise soft-capping: dst[i] = softcap * tanh(scale * x[i]),
// one thread per element over k elements.
static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i < k) {
        dst[i] = tanhf(scale * x[i]) * softcap;
    }
}
|
|
12
|
+
|
|
13
|
+
// Host-side launcher for softcap_f32: ceil-div grid over k elements.
static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;

    softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scale, softcap, k);
}
|
|
17
|
+
|
|
18
|
+
// fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE
// `src` is the first SCALE node (its op_params hold the pre-tanh scale) and
// `dst` the second SCALE node (its op_params hold the cap); the actual input
// tensor is src->src[0].
void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src) {
    const ggml_tensor * src0 = src->src[0];

    const float * src0_d = (const float *) src0->data;
    float *       dst_d  = (float *) dst->data;

    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    // scale comes from the first SCALE op, the cap from the second one
    float scale;
    float softcap;
    memcpy(&scale, (float *) src->op_params + 0, sizeof(float));
    memcpy(&softcap, (float *) dst->op_params + 0, sizeof(float));

    softcap_f32_cuda(src0_d, dst_d, scale, softcap, ggml_nelements(src0), stream);
}
|