@novastera-oss/llamarn 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -3
- package/RNLlamaCpp.podspec +1 -1
- package/android/CMakeLists.txt +11 -3
- package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
- package/android/src/main/cpp/include/llama.h +53 -114
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -10
- package/cpp/PureCppImpl.cpp +71 -4
- package/cpp/SystemUtils.cpp +3 -7
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -1
- package/cpp/llama.cpp/Makefile +6 -1605
- package/cpp/llama.cpp/README.md +5 -1
- package/cpp/llama.cpp/common/arg.cpp +230 -51
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
- package/cpp/llama.cpp/common/chat.cpp +539 -8
- package/cpp/llama.cpp/common/chat.h +8 -1
- package/cpp/llama.cpp/common/common.cpp +60 -15
- package/cpp/llama.cpp/common/common.h +64 -15
- package/cpp/llama.cpp/common/speculative.cpp +135 -54
- package/cpp/llama.cpp/common/speculative.h +8 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
- package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
- package/cpp/llama.cpp/flake.nix +0 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
- package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
- package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
- package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
- package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
- package/cpp/llama.cpp/include/llama.h +53 -114
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -1
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
- package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +61 -252
- package/cpp/llama.cpp/src/llama-context.h +10 -15
- package/cpp/llama.cpp/src/llama-cparams.h +0 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
- package/cpp/llama.cpp/src/llama-graph.h +90 -51
- package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
- package/cpp/llama.cpp/src/llama-hparams.h +21 -6
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
- package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
- package/cpp/llama.cpp/src/llama-memory.h +13 -10
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
- package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
- package/cpp/llama.cpp/src/llama-model.h +28 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
- package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
- package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
- package/cpp/rn-completion.cpp +3 -27
- package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
- package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
- package/ios/include/chat.h +8 -1
- package/ios/include/common/minja/chat-template.hpp +16 -7
- package/ios/include/common/minja/minja.hpp +47 -12
- package/ios/include/common.h +64 -15
- package/ios/include/llama.h +53 -114
- package/ios/include/speculative.h +8 -1
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/lib/module/NativeRNLlamaCpp.js.map +1 -1
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
- package/package.json +1 -2
- package/src/NativeRNLlamaCpp.ts +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
|
@@ -582,9 +582,6 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
|
|
|
582
582
|
#endif
|
|
583
583
|
|
|
584
584
|
}
|
|
585
|
-
static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
|
|
586
|
-
static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
|
|
587
|
-
static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
|
|
588
585
|
|
|
589
586
|
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
|
590
587
|
[GGML_TYPE_I8] = {
|
|
@@ -690,6 +687,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
|
|
690
687
|
.is_quantized = true,
|
|
691
688
|
.from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
|
|
692
689
|
},
|
|
690
|
+
[GGML_TYPE_MXFP4] = {
|
|
691
|
+
.type_name = "mxfp4",
|
|
692
|
+
.blck_size = QK_MXFP4,
|
|
693
|
+
.type_size = sizeof(block_mxfp4),
|
|
694
|
+
.is_quantized = true,
|
|
695
|
+
.to_float = (ggml_to_float_t) dequantize_row_mxfp4,
|
|
696
|
+
.from_float_ref = (ggml_from_float_t)quantize_row_mxfp4_ref,
|
|
697
|
+
},
|
|
693
698
|
[GGML_TYPE_Q2_K] = {
|
|
694
699
|
.type_name = "q2_K",
|
|
695
700
|
.blck_size = QK_K,
|
|
@@ -917,6 +922,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
|
917
922
|
|
|
918
923
|
"DUP",
|
|
919
924
|
"ADD",
|
|
925
|
+
"ADD_ID",
|
|
920
926
|
"ADD1",
|
|
921
927
|
"ACC",
|
|
922
928
|
"SUB",
|
|
@@ -969,6 +975,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
|
969
975
|
"IM2COL",
|
|
970
976
|
"IM2COL_BACK",
|
|
971
977
|
"CONV_2D",
|
|
978
|
+
"CONV_3D",
|
|
972
979
|
"CONV_2D_DW",
|
|
973
980
|
"CONV_TRANSPOSE_2D",
|
|
974
981
|
"POOL_1D",
|
|
@@ -1006,17 +1013,19 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
|
1006
1013
|
"CROSS_ENTROPY_LOSS",
|
|
1007
1014
|
"CROSS_ENTROPY_LOSS_BACK",
|
|
1008
1015
|
"OPT_STEP_ADAMW",
|
|
1016
|
+
"OPT_STEP_SGD",
|
|
1009
1017
|
|
|
1010
1018
|
"GLU",
|
|
1011
1019
|
};
|
|
1012
1020
|
|
|
1013
|
-
static_assert(GGML_OP_COUNT ==
|
|
1021
|
+
static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89");
|
|
1014
1022
|
|
|
1015
1023
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
1016
1024
|
"none",
|
|
1017
1025
|
|
|
1018
1026
|
"x",
|
|
1019
1027
|
"x+y",
|
|
1028
|
+
"x[i]+y",
|
|
1020
1029
|
"x+y",
|
|
1021
1030
|
"view(x,nb,offset)+=y->x",
|
|
1022
1031
|
"x-y",
|
|
@@ -1069,6 +1078,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
|
1069
1078
|
"im2col(x)",
|
|
1070
1079
|
"im2col_back(x)",
|
|
1071
1080
|
"conv_2d(x)",
|
|
1081
|
+
"conv_3d(x)",
|
|
1072
1082
|
"conv_2d_dw(x)",
|
|
1073
1083
|
"conv_transpose_2d(x)",
|
|
1074
1084
|
"pool_1d(x)",
|
|
@@ -1106,15 +1116,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
|
1106
1116
|
"cross_entropy_loss(x,y)",
|
|
1107
1117
|
"cross_entropy_loss_back(x,y)",
|
|
1108
1118
|
"adamw(x)",
|
|
1119
|
+
"sgd(x)",
|
|
1109
1120
|
|
|
1110
1121
|
"glu(x)",
|
|
1111
1122
|
};
|
|
1112
1123
|
|
|
1113
|
-
static_assert(GGML_OP_COUNT ==
|
|
1124
|
+
static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89");
|
|
1114
1125
|
|
|
1115
1126
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
|
1116
1127
|
|
|
1117
|
-
|
|
1118
1128
|
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
|
1119
1129
|
"ABS",
|
|
1120
1130
|
"SGN",
|
|
@@ -1140,11 +1150,12 @@ static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
|
|
|
1140
1150
|
"REGLU",
|
|
1141
1151
|
"GEGLU",
|
|
1142
1152
|
"SWIGLU",
|
|
1153
|
+
"SWIGLU_OAI",
|
|
1143
1154
|
"GEGLU_ERF",
|
|
1144
1155
|
"GEGLU_QUICK",
|
|
1145
1156
|
};
|
|
1146
1157
|
|
|
1147
|
-
static_assert(GGML_GLU_OP_COUNT ==
|
|
1158
|
+
static_assert(GGML_GLU_OP_COUNT == 6, "GGML_GLU_OP_COUNT != 6");
|
|
1148
1159
|
|
|
1149
1160
|
|
|
1150
1161
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
|
@@ -1312,6 +1323,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
|
1312
1323
|
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
|
1313
1324
|
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
|
1314
1325
|
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
|
1326
|
+
case GGML_FTYPE_MOSTLY_MXFP4: wtype = GGML_TYPE_MXFP4; break;
|
|
1315
1327
|
case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
|
|
1316
1328
|
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
|
1317
1329
|
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
|
|
@@ -1962,6 +1974,27 @@ struct ggml_tensor * ggml_add_cast(
|
|
|
1962
1974
|
return ggml_add_cast_impl(ctx, a, b, type);
|
|
1963
1975
|
}
|
|
1964
1976
|
|
|
1977
|
+
struct ggml_tensor * ggml_add_id(
|
|
1978
|
+
struct ggml_context * ctx,
|
|
1979
|
+
struct ggml_tensor * a,
|
|
1980
|
+
struct ggml_tensor * b,
|
|
1981
|
+
struct ggml_tensor * ids) {
|
|
1982
|
+
|
|
1983
|
+
GGML_ASSERT(a->ne[0] == b->ne[0]);
|
|
1984
|
+
GGML_ASSERT(a->ne[1] == ids->ne[0]);
|
|
1985
|
+
GGML_ASSERT(a->ne[2] == ids->ne[1]);
|
|
1986
|
+
GGML_ASSERT(ids->type == GGML_TYPE_I32);
|
|
1987
|
+
|
|
1988
|
+
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
|
1989
|
+
|
|
1990
|
+
result->op = GGML_OP_ADD_ID;
|
|
1991
|
+
result->src[0] = a;
|
|
1992
|
+
result->src[1] = b;
|
|
1993
|
+
result->src[2] = ids;
|
|
1994
|
+
|
|
1995
|
+
return result;
|
|
1996
|
+
}
|
|
1997
|
+
|
|
1965
1998
|
// ggml_add1
|
|
1966
1999
|
|
|
1967
2000
|
static struct ggml_tensor * ggml_add1_impl(
|
|
@@ -2812,6 +2845,19 @@ struct ggml_tensor * ggml_geglu_quick_split(
|
|
|
2812
2845
|
return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_QUICK, false);
|
|
2813
2846
|
}
|
|
2814
2847
|
|
|
2848
|
+
struct ggml_tensor * ggml_swiglu_oai(
|
|
2849
|
+
struct ggml_context * ctx,
|
|
2850
|
+
struct ggml_tensor * a,
|
|
2851
|
+
struct ggml_tensor * b,
|
|
2852
|
+
float alpha,
|
|
2853
|
+
float limit) {
|
|
2854
|
+
struct ggml_tensor * result = ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU_OAI, false);
|
|
2855
|
+
ggml_set_op_params_f32(result, 2, alpha);
|
|
2856
|
+
ggml_set_op_params_f32(result, 3, limit);
|
|
2857
|
+
|
|
2858
|
+
return result;
|
|
2859
|
+
}
|
|
2860
|
+
|
|
2815
2861
|
// ggml_norm
|
|
2816
2862
|
|
|
2817
2863
|
static struct ggml_tensor * ggml_norm_impl(
|
|
@@ -3779,6 +3825,22 @@ struct ggml_tensor * ggml_soft_max_ext(
|
|
|
3779
3825
|
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
|
|
3780
3826
|
}
|
|
3781
3827
|
|
|
3828
|
+
void ggml_soft_max_add_sinks(
|
|
3829
|
+
struct ggml_tensor * a,
|
|
3830
|
+
struct ggml_tensor * sinks) {
|
|
3831
|
+
if (!sinks) {
|
|
3832
|
+
a->src[2] = NULL;
|
|
3833
|
+
return;
|
|
3834
|
+
}
|
|
3835
|
+
|
|
3836
|
+
GGML_ASSERT(a->op == GGML_OP_SOFT_MAX);
|
|
3837
|
+
GGML_ASSERT(a->src[2] == NULL);
|
|
3838
|
+
GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
|
|
3839
|
+
GGML_ASSERT(sinks->type == GGML_TYPE_F32);
|
|
3840
|
+
|
|
3841
|
+
a->src[2] = sinks;
|
|
3842
|
+
}
|
|
3843
|
+
|
|
3782
3844
|
// ggml_soft_max_ext_back
|
|
3783
3845
|
|
|
3784
3846
|
static struct ggml_tensor * ggml_soft_max_ext_back_impl(
|
|
@@ -3826,6 +3888,7 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
|
3826
3888
|
struct ggml_tensor * b,
|
|
3827
3889
|
struct ggml_tensor * c,
|
|
3828
3890
|
int n_dims,
|
|
3891
|
+
int sections[GGML_MROPE_SECTIONS],
|
|
3829
3892
|
int mode,
|
|
3830
3893
|
int n_ctx_orig,
|
|
3831
3894
|
float freq_base,
|
|
@@ -3839,15 +3902,19 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
|
3839
3902
|
|
|
3840
3903
|
GGML_ASSERT(ggml_is_vector(b));
|
|
3841
3904
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
|
3842
|
-
|
|
3905
|
+
|
|
3906
|
+
bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
|
|
3907
|
+
if (mrope_used) {
|
|
3908
|
+
GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
|
|
3909
|
+
} else {
|
|
3910
|
+
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
|
3911
|
+
}
|
|
3843
3912
|
|
|
3844
3913
|
if (c) {
|
|
3845
3914
|
GGML_ASSERT(c->type == GGML_TYPE_F32);
|
|
3846
3915
|
GGML_ASSERT(c->ne[0] >= n_dims / 2);
|
|
3847
3916
|
}
|
|
3848
3917
|
|
|
3849
|
-
int sections[4] = {0, 0, 0, 0};
|
|
3850
|
-
|
|
3851
3918
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
|
3852
3919
|
|
|
3853
3920
|
int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
|
@@ -3857,7 +3924,11 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
|
3857
3924
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
|
3858
3925
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
|
3859
3926
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
|
3860
|
-
|
|
3927
|
+
if (mrope_used) {
|
|
3928
|
+
memcpy(params + 11, sections, sizeof(int32_t) * GGML_MROPE_SECTIONS);
|
|
3929
|
+
} else {
|
|
3930
|
+
memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS);
|
|
3931
|
+
}
|
|
3861
3932
|
ggml_set_op_params(result, params, sizeof(params));
|
|
3862
3933
|
|
|
3863
3934
|
result->op = GGML_OP_ROPE;
|
|
@@ -3875,7 +3946,7 @@ struct ggml_tensor * ggml_rope(
|
|
|
3875
3946
|
int n_dims,
|
|
3876
3947
|
int mode) {
|
|
3877
3948
|
return ggml_rope_impl(
|
|
3878
|
-
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
|
|
3949
|
+
ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
|
|
3879
3950
|
);
|
|
3880
3951
|
}
|
|
3881
3952
|
|
|
@@ -3885,7 +3956,7 @@ struct ggml_tensor * ggml_rope_multi(
|
|
|
3885
3956
|
struct ggml_tensor * b,
|
|
3886
3957
|
struct ggml_tensor * c,
|
|
3887
3958
|
int n_dims,
|
|
3888
|
-
int sections[4],
|
|
3959
|
+
int sections[GGML_MROPE_SECTIONS],
|
|
3889
3960
|
int mode,
|
|
3890
3961
|
int n_ctx_orig,
|
|
3891
3962
|
float freq_base,
|
|
@@ -3894,36 +3965,31 @@ struct ggml_tensor * ggml_rope_multi(
|
|
|
3894
3965
|
float attn_factor,
|
|
3895
3966
|
float beta_fast,
|
|
3896
3967
|
float beta_slow) {
|
|
3897
|
-
|
|
3898
|
-
|
|
3899
|
-
|
|
3900
|
-
|
|
3901
|
-
|
|
3902
|
-
GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
|
|
3903
|
-
|
|
3904
|
-
if (c) {
|
|
3905
|
-
GGML_ASSERT(c->type == GGML_TYPE_F32);
|
|
3906
|
-
GGML_ASSERT(c->ne[0] >= n_dims / 2);
|
|
3907
|
-
}
|
|
3908
|
-
|
|
3909
|
-
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
|
3910
|
-
|
|
3911
|
-
int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
|
3912
|
-
memcpy(params + 5, &freq_base, sizeof(float));
|
|
3913
|
-
memcpy(params + 6, &freq_scale, sizeof(float));
|
|
3914
|
-
memcpy(params + 7, &ext_factor, sizeof(float));
|
|
3915
|
-
memcpy(params + 8, &attn_factor, sizeof(float));
|
|
3916
|
-
memcpy(params + 9, &beta_fast, sizeof(float));
|
|
3917
|
-
memcpy(params + 10, &beta_slow, sizeof(float));
|
|
3918
|
-
memcpy(&params[11], sections, sizeof(int)*4);
|
|
3919
|
-
ggml_set_op_params(result, params, sizeof(params));
|
|
3920
|
-
|
|
3921
|
-
result->op = GGML_OP_ROPE;
|
|
3922
|
-
result->src[0] = a;
|
|
3923
|
-
result->src[1] = b;
|
|
3924
|
-
result->src[2] = c;
|
|
3968
|
+
return ggml_rope_impl(
|
|
3969
|
+
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
|
|
3970
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
|
3971
|
+
);
|
|
3972
|
+
}
|
|
3925
3973
|
|
|
3926
|
-
|
|
3974
|
+
struct ggml_tensor * ggml_rope_multi_inplace(
|
|
3975
|
+
struct ggml_context * ctx,
|
|
3976
|
+
struct ggml_tensor * a,
|
|
3977
|
+
struct ggml_tensor * b,
|
|
3978
|
+
struct ggml_tensor * c,
|
|
3979
|
+
int n_dims,
|
|
3980
|
+
int sections[GGML_MROPE_SECTIONS],
|
|
3981
|
+
int mode,
|
|
3982
|
+
int n_ctx_orig,
|
|
3983
|
+
float freq_base,
|
|
3984
|
+
float freq_scale,
|
|
3985
|
+
float ext_factor,
|
|
3986
|
+
float attn_factor,
|
|
3987
|
+
float beta_fast,
|
|
3988
|
+
float beta_slow) {
|
|
3989
|
+
return ggml_rope_impl(
|
|
3990
|
+
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
|
|
3991
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
|
3992
|
+
);
|
|
3927
3993
|
}
|
|
3928
3994
|
|
|
3929
3995
|
struct ggml_tensor * ggml_rope_inplace(
|
|
@@ -3933,7 +3999,7 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
|
3933
3999
|
int n_dims,
|
|
3934
4000
|
int mode) {
|
|
3935
4001
|
return ggml_rope_impl(
|
|
3936
|
-
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
|
|
4002
|
+
ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
|
|
3937
4003
|
);
|
|
3938
4004
|
}
|
|
3939
4005
|
|
|
@@ -3952,7 +4018,7 @@ struct ggml_tensor * ggml_rope_ext(
|
|
|
3952
4018
|
float beta_fast,
|
|
3953
4019
|
float beta_slow) {
|
|
3954
4020
|
return ggml_rope_impl(
|
|
3955
|
-
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
4021
|
+
ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
|
|
3956
4022
|
ext_factor, attn_factor, beta_fast, beta_slow, false
|
|
3957
4023
|
);
|
|
3958
4024
|
}
|
|
@@ -3972,7 +4038,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
|
3972
4038
|
float beta_fast,
|
|
3973
4039
|
float beta_slow) {
|
|
3974
4040
|
return ggml_rope_impl(
|
|
3975
|
-
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
4041
|
+
ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
|
|
3976
4042
|
ext_factor, attn_factor, beta_fast, beta_slow, true
|
|
3977
4043
|
);
|
|
3978
4044
|
}
|
|
@@ -3991,7 +4057,7 @@ struct ggml_tensor * ggml_rope_custom(
|
|
|
3991
4057
|
float beta_fast,
|
|
3992
4058
|
float beta_slow) {
|
|
3993
4059
|
return ggml_rope_impl(
|
|
3994
|
-
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
4060
|
+
ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
|
|
3995
4061
|
ext_factor, attn_factor, beta_fast, beta_slow, false
|
|
3996
4062
|
);
|
|
3997
4063
|
}
|
|
@@ -4010,7 +4076,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
|
4010
4076
|
float beta_fast,
|
|
4011
4077
|
float beta_slow) {
|
|
4012
4078
|
return ggml_rope_impl(
|
|
4013
|
-
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
4079
|
+
ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
|
|
4014
4080
|
ext_factor, attn_factor, beta_fast, beta_slow, true
|
|
4015
4081
|
);
|
|
4016
4082
|
}
|
|
@@ -4208,14 +4274,13 @@ struct ggml_tensor * ggml_conv_1d_dw(
|
|
|
4208
4274
|
int s0,
|
|
4209
4275
|
int p0,
|
|
4210
4276
|
int d0) {
|
|
4211
|
-
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
|
|
4212
4277
|
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
|
|
4213
4278
|
|
|
4214
|
-
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
|
|
4279
|
+
struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
|
|
4215
4280
|
|
|
4216
4281
|
struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
|
|
4217
4282
|
|
|
4218
|
-
result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
|
|
4283
|
+
result = ggml_reshape_3d(ctx, result, result->ne[0], result->ne[2], 1);
|
|
4219
4284
|
|
|
4220
4285
|
return result;
|
|
4221
4286
|
}
|
|
@@ -4417,6 +4482,56 @@ struct ggml_tensor * ggml_conv_2d_direct(
|
|
|
4417
4482
|
return result;
|
|
4418
4483
|
}
|
|
4419
4484
|
|
|
4485
|
+
// ggml_conv_3d
|
|
4486
|
+
|
|
4487
|
+
struct ggml_tensor * ggml_conv_3d(
|
|
4488
|
+
struct ggml_context * ctx,
|
|
4489
|
+
struct ggml_tensor * a,
|
|
4490
|
+
struct ggml_tensor * b,
|
|
4491
|
+
int s0,
|
|
4492
|
+
int s1,
|
|
4493
|
+
int s2,
|
|
4494
|
+
int p0,
|
|
4495
|
+
int p1,
|
|
4496
|
+
int p2,
|
|
4497
|
+
int d0,
|
|
4498
|
+
int d1,
|
|
4499
|
+
int d2,
|
|
4500
|
+
int c,
|
|
4501
|
+
int n,
|
|
4502
|
+
int oc) {
|
|
4503
|
+
|
|
4504
|
+
GGML_ASSERT(a->ne[3] == (int64_t) c * oc);
|
|
4505
|
+
GGML_ASSERT(b->ne[3] == (int64_t) c * n);
|
|
4506
|
+
|
|
4507
|
+
int64_t ne[4];
|
|
4508
|
+
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
|
|
4509
|
+
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
|
|
4510
|
+
ne[2] = ggml_calc_conv_output_size(b->ne[2], a->ne[2], s2, p2, d2);
|
|
4511
|
+
ne[3] = (int64_t) oc * n;
|
|
4512
|
+
|
|
4513
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
|
4514
|
+
|
|
4515
|
+
ggml_set_op_params_i32(result, 0, s0);
|
|
4516
|
+
ggml_set_op_params_i32(result, 1, s1);
|
|
4517
|
+
ggml_set_op_params_i32(result, 2, s2);
|
|
4518
|
+
ggml_set_op_params_i32(result, 3, p0);
|
|
4519
|
+
ggml_set_op_params_i32(result, 4, p1);
|
|
4520
|
+
ggml_set_op_params_i32(result, 5, p2);
|
|
4521
|
+
ggml_set_op_params_i32(result, 6, d0);
|
|
4522
|
+
ggml_set_op_params_i32(result, 7, d1);
|
|
4523
|
+
ggml_set_op_params_i32(result, 8, d2);
|
|
4524
|
+
ggml_set_op_params_i32(result, 9, c);
|
|
4525
|
+
ggml_set_op_params_i32(result, 10, n);
|
|
4526
|
+
ggml_set_op_params_i32(result, 11, oc);
|
|
4527
|
+
|
|
4528
|
+
result->op = GGML_OP_CONV_3D;
|
|
4529
|
+
result->src[0] = a;
|
|
4530
|
+
result->src[1] = b;
|
|
4531
|
+
|
|
4532
|
+
return result;
|
|
4533
|
+
}
|
|
4534
|
+
|
|
4420
4535
|
// ggml_conv_transpose_2d_p0
|
|
4421
4536
|
|
|
4422
4537
|
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
|
|
@@ -4812,6 +4927,22 @@ enum ggml_prec ggml_flash_attn_ext_get_prec(
|
|
|
4812
4927
|
return (enum ggml_prec) prec_i32;
|
|
4813
4928
|
}
|
|
4814
4929
|
|
|
4930
|
+
void ggml_flash_attn_ext_add_sinks(
|
|
4931
|
+
struct ggml_tensor * a,
|
|
4932
|
+
struct ggml_tensor * sinks) {
|
|
4933
|
+
if (!sinks) {
|
|
4934
|
+
a->src[4] = NULL;
|
|
4935
|
+
return;
|
|
4936
|
+
}
|
|
4937
|
+
|
|
4938
|
+
GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
|
|
4939
|
+
GGML_ASSERT(a->src[4] == NULL);
|
|
4940
|
+
GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
|
|
4941
|
+
GGML_ASSERT(sinks->type == GGML_TYPE_F32);
|
|
4942
|
+
|
|
4943
|
+
a->src[4] = sinks;
|
|
4944
|
+
}
|
|
4945
|
+
|
|
4815
4946
|
// ggml_flash_attn_back
|
|
4816
4947
|
|
|
4817
4948
|
struct ggml_tensor * ggml_flash_attn_back(
|
|
@@ -5527,6 +5658,28 @@ struct ggml_tensor * ggml_opt_step_adamw(
|
|
|
5527
5658
|
return result;
|
|
5528
5659
|
}
|
|
5529
5660
|
|
|
5661
|
+
// opt_step_sgd
|
|
5662
|
+
|
|
5663
|
+
struct ggml_tensor * ggml_opt_step_sgd(
|
|
5664
|
+
struct ggml_context * ctx,
|
|
5665
|
+
struct ggml_tensor * a,
|
|
5666
|
+
struct ggml_tensor * grad,
|
|
5667
|
+
struct ggml_tensor * params) {
|
|
5668
|
+
GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
|
|
5669
|
+
GGML_ASSERT(ggml_are_same_shape(a, grad));
|
|
5670
|
+
GGML_ASSERT(params->type == GGML_TYPE_F32);
|
|
5671
|
+
GGML_ASSERT(ggml_nelements(params) == 2);
|
|
5672
|
+
|
|
5673
|
+
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
|
5674
|
+
|
|
5675
|
+
result->op = GGML_OP_OPT_STEP_SGD;
|
|
5676
|
+
result->src[0] = a;
|
|
5677
|
+
result->src[1] = grad;
|
|
5678
|
+
result->src[2] = params;
|
|
5679
|
+
|
|
5680
|
+
return result;
|
|
5681
|
+
}
|
|
5682
|
+
|
|
5530
5683
|
////////////////////////////////////////////////////////////////////////////////
|
|
5531
5684
|
|
|
5532
5685
|
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
|
@@ -6872,6 +7025,7 @@ size_t ggml_quantize_chunk(
|
|
|
6872
7025
|
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
|
6873
7026
|
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
|
6874
7027
|
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
|
7028
|
+
case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
|
6875
7029
|
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
|
6876
7030
|
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
|
6877
7031
|
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|