@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#include <algorithm>
|
|
2
|
+
|
|
3
|
+
#include "conv2d-transpose.cuh"
|
|
4
|
+
#include "ggml.h"
|
|
5
|
+
|
|
6
|
+
__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
|
|
7
|
+
float * __restrict__ output, const int in_w, const int in_h, const int out_w,
|
|
8
|
+
const int out_h, const int kernel_w, const int kernel_h, const int stride,
|
|
9
|
+
const int c_in, const int c_out, const int batches) {
|
|
10
|
+
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
11
|
+
|
|
12
|
+
const int total_elements = out_w * out_h * c_out * batches;
|
|
13
|
+
|
|
14
|
+
if (global_idx >= total_elements) {
|
|
15
|
+
return;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
const int out_x_idx = global_idx % out_w;
|
|
19
|
+
const int out_y_idx = (global_idx / out_w) % out_h;
|
|
20
|
+
const int c_idx = (global_idx / (out_w * out_h)) % c_out;
|
|
21
|
+
const int n_idx = global_idx / (out_w * out_h * c_out);
|
|
22
|
+
|
|
23
|
+
float accumulator = 0;
|
|
24
|
+
// For each output idx, find the inputs that contribute to it by checking stride alignment and bounds
|
|
25
|
+
|
|
26
|
+
for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
|
|
27
|
+
for (int kh = 0; kh < kernel_h; ++kh) {
|
|
28
|
+
int in_y = out_y_idx - kh;
|
|
29
|
+
if (in_y < 0 || in_y % stride) continue;
|
|
30
|
+
in_y /= stride;
|
|
31
|
+
if (in_y >= in_h) continue;
|
|
32
|
+
|
|
33
|
+
for (int kw = 0; kw < kernel_w; ++kw) {
|
|
34
|
+
int in_x = out_x_idx - kw;
|
|
35
|
+
if (in_x < 0 || in_x % stride) continue;
|
|
36
|
+
in_x /= stride;
|
|
37
|
+
if (in_x >= in_w) continue;
|
|
38
|
+
|
|
39
|
+
const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
|
|
40
|
+
const int kernel_idx =
|
|
41
|
+
(kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
|
|
42
|
+
|
|
43
|
+
float input_val = input[input_idx];
|
|
44
|
+
half kern_val = kernel[kernel_idx];
|
|
45
|
+
|
|
46
|
+
accumulator += input_val * (float) kern_val;
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) *out_y_idx + out_x_idx] = accumulator;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
//input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in)
|
|
55
|
+
void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
56
|
+
const ggml_tensor * kernel = dst->src[0];
|
|
57
|
+
const ggml_tensor * input = dst->src[1];
|
|
58
|
+
|
|
59
|
+
GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
|
60
|
+
|
|
61
|
+
const float * input_data = (const float *) input->data;
|
|
62
|
+
float * output_data = (float *) dst->data;
|
|
63
|
+
const half * kernel_data = (const half *) kernel->data;
|
|
64
|
+
|
|
65
|
+
const int input_w = input->ne[0];
|
|
66
|
+
const int input_h = input->ne[1];
|
|
67
|
+
const int output_w = dst->ne[0];
|
|
68
|
+
const int output_h = dst->ne[1];
|
|
69
|
+
const int channels_in = input->ne[2];
|
|
70
|
+
const int channels_out = kernel->ne[2];
|
|
71
|
+
const int kernel_w = kernel->ne[0];
|
|
72
|
+
const int kernel_h = kernel->ne[1];
|
|
73
|
+
const int stride = dst->op_params[0];
|
|
74
|
+
const int batches = input->ne[3];
|
|
75
|
+
|
|
76
|
+
GGML_ASSERT(channels_in == kernel->ne[3]);
|
|
77
|
+
GGML_ASSERT(stride > 0);
|
|
78
|
+
|
|
79
|
+
cudaStream_t st = ctx.stream();
|
|
80
|
+
|
|
81
|
+
GGML_ASSERT(ggml_is_contiguous(input));
|
|
82
|
+
GGML_ASSERT(ggml_is_contiguous(kernel));
|
|
83
|
+
GGML_ASSERT(ggml_is_contiguous(dst));
|
|
84
|
+
|
|
85
|
+
const int total = (output_w * output_h * channels_out * batches);
|
|
86
|
+
const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;
|
|
87
|
+
|
|
88
|
+
conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
|
|
89
|
+
input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
|
|
90
|
+
channels_in, channels_out, batches);
|
|
91
|
+
}
|
|
@@ -652,9 +652,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
|
|
652
652
|
float KQ_max_scale[cols_per_thread];
|
|
653
653
|
#pragma unroll
|
|
654
654
|
for (int col = 0; col < cols_per_thread; ++col) {
|
|
655
|
-
|
|
655
|
+
const float KQ_max_diff = KQ_max[col] - KQ_max_new[col];
|
|
656
|
+
KQ_max_scale[col] = expf(KQ_max_diff);
|
|
656
657
|
KQ_max[col] = KQ_max_new[col];
|
|
657
658
|
|
|
659
|
+
*((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD;
|
|
660
|
+
|
|
658
661
|
// Scale previous KQ_rowsum to account for a potential increase in KQ_max:
|
|
659
662
|
KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_rowsum_add[col];
|
|
660
663
|
}
|
|
@@ -9,7 +9,11 @@
|
|
|
9
9
|
#ifdef FP16_MMA_AVAILABLE
|
|
10
10
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
|
11
11
|
#include <mma.h>
|
|
12
|
+
#ifdef GGML_USE_MUSA
|
|
13
|
+
namespace wmma = mtmusa::wmma;
|
|
14
|
+
#else // GGML_USE_MUSA
|
|
12
15
|
namespace wmma = nvcuda::wmma;
|
|
16
|
+
#endif // GGML_USE_MUSA
|
|
13
17
|
#elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
|
|
14
18
|
#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
|
|
15
19
|
#include <rocwmma/rocwmma.hpp>
|
|
@@ -11,6 +11,8 @@
|
|
|
11
11
|
#include "ggml-cuda/clamp.cuh"
|
|
12
12
|
#include "ggml-cuda/concat.cuh"
|
|
13
13
|
#include "ggml-cuda/conv-transpose-1d.cuh"
|
|
14
|
+
#include "ggml-cuda/conv2d-dw.cuh"
|
|
15
|
+
#include "ggml-cuda/conv2d-transpose.cuh"
|
|
14
16
|
#include "ggml-cuda/convert.cuh"
|
|
15
17
|
#include "ggml-cuda/count-equal.cuh"
|
|
16
18
|
#include "ggml-cuda/cpy.cuh"
|
|
@@ -35,6 +37,7 @@
|
|
|
35
37
|
#include "ggml-cuda/ssm-scan.cuh"
|
|
36
38
|
#include "ggml-cuda/sum.cuh"
|
|
37
39
|
#include "ggml-cuda/sumrows.cuh"
|
|
40
|
+
#include "ggml-cuda/mean.cuh"
|
|
38
41
|
#include "ggml-cuda/tsembd.cuh"
|
|
39
42
|
#include "ggml-cuda/unary.cuh"
|
|
40
43
|
#include "ggml-cuda/upscale.cuh"
|
|
@@ -47,6 +50,7 @@
|
|
|
47
50
|
#include <atomic>
|
|
48
51
|
#include <charconv>
|
|
49
52
|
#include <cinttypes>
|
|
53
|
+
#include <condition_variable>
|
|
50
54
|
#include <cstddef>
|
|
51
55
|
#include <cstdint>
|
|
52
56
|
#include <float.h>
|
|
@@ -54,9 +58,8 @@
|
|
|
54
58
|
#include <map>
|
|
55
59
|
#include <memory>
|
|
56
60
|
#include <mutex>
|
|
57
|
-
#include <stdint.h>
|
|
58
|
-
#include <stdio.h>
|
|
59
61
|
#include <stdarg.h>
|
|
62
|
+
#include <stdio.h>
|
|
60
63
|
#include <stdlib.h>
|
|
61
64
|
#include <string>
|
|
62
65
|
#include <vector>
|
|
@@ -97,8 +100,7 @@ int ggml_cuda_get_device() {
|
|
|
97
100
|
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
|
98
101
|
ggml_cuda_set_device(device);
|
|
99
102
|
cudaError_t err;
|
|
100
|
-
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
|
101
|
-
{
|
|
103
|
+
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
|
|
102
104
|
err = cudaMallocManaged(ptr, size);
|
|
103
105
|
#if defined(GGML_USE_HIP)
|
|
104
106
|
if (err == hipSuccess) {
|
|
@@ -116,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|
|
116
118
|
err = cudaMalloc(ptr, size);
|
|
117
119
|
}
|
|
118
120
|
#endif // defined(GGML_USE_HIP)
|
|
119
|
-
}
|
|
120
|
-
else
|
|
121
|
-
{
|
|
121
|
+
} else {
|
|
122
122
|
err = cudaMalloc(ptr, size);
|
|
123
123
|
}
|
|
124
124
|
return err;
|
|
@@ -514,6 +514,33 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
|
|
|
514
514
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
|
515
515
|
}
|
|
516
516
|
|
|
517
|
+
// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
|
|
518
|
+
// this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured
|
|
519
|
+
|
|
520
|
+
static std::mutex ggml_cuda_lock;
|
|
521
|
+
static std::condition_variable ggml_cuda_lock_cv;
|
|
522
|
+
static std::atomic<int> ggml_cuda_lock_counter;
|
|
523
|
+
|
|
524
|
+
ggml_backend_cuda_context::~ggml_backend_cuda_context() {
|
|
525
|
+
std::unique_lock<std::mutex> lock(ggml_cuda_lock);
|
|
526
|
+
ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
|
|
527
|
+
|
|
528
|
+
if (copy_event != nullptr) {
|
|
529
|
+
CUDA_CHECK(cudaEventDestroy(copy_event));
|
|
530
|
+
}
|
|
531
|
+
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
|
|
532
|
+
for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
|
|
533
|
+
if (streams[i][j] != nullptr) {
|
|
534
|
+
CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
if (cublas_handles[i] != nullptr) {
|
|
538
|
+
CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
|
|
517
544
|
// cuda buffer
|
|
518
545
|
|
|
519
546
|
struct ggml_backend_cuda_buffer_context {
|
|
@@ -615,9 +642,8 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|
|
615
642
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
616
643
|
|
|
617
644
|
ggml_cuda_set_device(ctx->device);
|
|
618
|
-
CUDA_CHECK(
|
|
619
|
-
CUDA_CHECK(
|
|
620
|
-
CUDA_CHECK(cudaDeviceSynchronize());
|
|
645
|
+
CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread));
|
|
646
|
+
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
621
647
|
}
|
|
622
648
|
|
|
623
649
|
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
|
@@ -1144,7 +1170,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
|
|
|
1144
1170
|
static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
1145
1171
|
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
|
|
1146
1172
|
|
|
1147
|
-
GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
|
|
1148
1173
|
const char * src_ptr = (const char *) src->data;
|
|
1149
1174
|
char * dst_ptr = (char *) dst;
|
|
1150
1175
|
|
|
@@ -1202,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
|
1202
1227
|
|
|
1203
1228
|
const int cc = ggml_cuda_info().devices[id].cc;
|
|
1204
1229
|
|
|
1230
|
+
const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
|
|
1231
|
+
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
|
|
1232
|
+
|
|
1205
1233
|
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
|
|
1206
1234
|
|
|
1207
|
-
if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
|
1235
|
+
if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
|
1208
1236
|
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
|
|
1209
1237
|
if (src1->type != GGML_TYPE_BF16) {
|
|
1210
1238
|
const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
|
|
@@ -1232,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
|
1232
1260
|
|
|
1233
1261
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
|
|
1234
1262
|
to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
|
1235
|
-
} else if ((
|
|
1263
|
+
} else if (fast_fp16_hardware_available(cc) && use_fp16) {
|
|
1236
1264
|
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
|
1237
1265
|
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
|
|
1238
1266
|
if (src0->type != GGML_TYPE_F16) {
|
|
@@ -1427,8 +1455,6 @@ static void ggml_cuda_op_mul_mat(
|
|
|
1427
1455
|
const int64_t nb2 = dst->nb[2];
|
|
1428
1456
|
const int64_t nb3 = dst->nb[3];
|
|
1429
1457
|
|
|
1430
|
-
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
|
|
1431
|
-
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
|
|
1432
1458
|
ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
|
|
1433
1459
|
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context;
|
|
1434
1460
|
|
|
@@ -1750,7 +1776,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
|
|
1750
1776
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
|
1751
1777
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
1752
1778
|
|
|
1753
|
-
GGML_ASSERT(
|
|
1779
|
+
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft));
|
|
1754
1780
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
1755
1781
|
|
|
1756
1782
|
// Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
|
|
@@ -1920,16 +1946,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
|
1920
1946
|
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
|
|
1921
1947
|
|
|
1922
1948
|
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
|
|
1923
|
-
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
1924
|
-
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
|
|
1949
|
+
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
|
1925
1950
|
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
|
1926
1951
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
1927
1952
|
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
|
1928
1953
|
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
|
1929
1954
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
|
1930
1955
|
|
|
1931
|
-
bool any_gpus_with_slow_fp16
|
|
1932
|
-
bool any_gpus_without_fp16_mma = false;
|
|
1956
|
+
bool any_gpus_with_slow_fp16 = false;
|
|
1933
1957
|
|
|
1934
1958
|
if (split) {
|
|
1935
1959
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
|
@@ -1940,16 +1964,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
|
1940
1964
|
continue;
|
|
1941
1965
|
}
|
|
1942
1966
|
|
|
1943
|
-
const int cc
|
|
1944
|
-
use_mul_mat_q
|
|
1945
|
-
|
|
1946
|
-
|
|
1967
|
+
const int cc = ggml_cuda_info().devices[id].cc;
|
|
1968
|
+
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
|
1969
|
+
use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
|
|
1970
|
+
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
|
1947
1971
|
}
|
|
1948
1972
|
} else {
|
|
1949
|
-
const int cc
|
|
1950
|
-
use_mul_mat_q
|
|
1951
|
-
|
|
1952
|
-
|
|
1973
|
+
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
|
1974
|
+
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
|
1975
|
+
use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
|
|
1976
|
+
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
|
1953
1977
|
}
|
|
1954
1978
|
|
|
1955
1979
|
// debug helpers
|
|
@@ -1960,7 +1984,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
|
1960
1984
|
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
|
1961
1985
|
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
|
1962
1986
|
|
|
1963
|
-
if (!split && use_mul_mat_vec
|
|
1987
|
+
if (!split && use_mul_mat_vec) {
|
|
1964
1988
|
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
|
|
1965
1989
|
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
|
|
1966
1990
|
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
|
|
@@ -2314,6 +2338,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|
|
2314
2338
|
case GGML_OP_IM2COL:
|
|
2315
2339
|
ggml_cuda_op_im2col(ctx, dst);
|
|
2316
2340
|
break;
|
|
2341
|
+
case GGML_OP_CONV_2D_DW:
|
|
2342
|
+
ggml_cuda_op_conv2d_dw(ctx, dst);
|
|
2343
|
+
break;
|
|
2344
|
+
case GGML_OP_CONV_TRANSPOSE_2D:
|
|
2345
|
+
ggml_cuda_conv_2d_transpose_p0(ctx, dst);
|
|
2346
|
+
break;
|
|
2317
2347
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
2318
2348
|
ggml_cuda_op_conv_transpose_1d(ctx,dst);
|
|
2319
2349
|
break;
|
|
@@ -2326,6 +2356,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|
|
2326
2356
|
case GGML_OP_SUM_ROWS:
|
|
2327
2357
|
ggml_cuda_op_sum_rows(ctx, dst);
|
|
2328
2358
|
break;
|
|
2359
|
+
case GGML_OP_MEAN:
|
|
2360
|
+
ggml_cuda_op_mean(ctx, dst);
|
|
2361
|
+
break;
|
|
2329
2362
|
case GGML_OP_SSM_CONV:
|
|
2330
2363
|
ggml_cuda_op_ssm_conv(ctx, dst);
|
|
2331
2364
|
break;
|
|
@@ -2668,7 +2701,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
|
|
2668
2701
|
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
|
|
2669
2702
|
}
|
|
2670
2703
|
}
|
|
2671
|
-
#
|
|
2704
|
+
#else
|
|
2705
|
+
GGML_UNUSED(integrated);
|
|
2706
|
+
#endif // NDEBUG
|
|
2672
2707
|
|
|
2673
2708
|
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
|
|
2674
2709
|
if (!ok) {
|
|
@@ -2687,6 +2722,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
|
|
2687
2722
|
|
|
2688
2723
|
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
|
|
2689
2724
|
graph_evaluated_or_captured = true; // CUDA graph has been captured
|
|
2725
|
+
|
|
2726
|
+
std::lock_guard<std::mutex> lock(ggml_cuda_lock);
|
|
2727
|
+
if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
|
|
2728
|
+
ggml_cuda_lock_cv.notify_all();
|
|
2729
|
+
}
|
|
2690
2730
|
} else {
|
|
2691
2731
|
graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
|
|
2692
2732
|
}
|
|
@@ -2762,7 +2802,13 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|
|
2762
2802
|
}
|
|
2763
2803
|
}
|
|
2764
2804
|
|
|
2765
|
-
if (use_cuda_graph && cuda_graph_update_required) {
|
|
2805
|
+
if (use_cuda_graph && cuda_graph_update_required) {
|
|
2806
|
+
// Start CUDA graph capture
|
|
2807
|
+
{
|
|
2808
|
+
std::lock_guard<std::mutex> lock(ggml_cuda_lock);
|
|
2809
|
+
ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
|
|
2810
|
+
}
|
|
2811
|
+
|
|
2766
2812
|
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
|
|
2767
2813
|
}
|
|
2768
2814
|
|
|
@@ -3018,9 +3064,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3018
3064
|
return false;
|
|
3019
3065
|
}
|
|
3020
3066
|
#ifdef GGML_USE_MUSA
|
|
3021
|
-
|
|
3022
|
-
|
|
3023
|
-
|
|
3067
|
+
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
|
|
3068
|
+
if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
|
|
3069
|
+
if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
|
|
3070
|
+
a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
|
|
3071
|
+
return false;
|
|
3072
|
+
}
|
|
3073
|
+
if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
|
|
3074
|
+
a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
|
|
3075
|
+
return false;
|
|
3076
|
+
}
|
|
3024
3077
|
}
|
|
3025
3078
|
#endif // GGML_USE_MUSA
|
|
3026
3079
|
switch (a->type) {
|
|
@@ -3047,11 +3100,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3047
3100
|
case GGML_TYPE_IQ4_NL:
|
|
3048
3101
|
case GGML_TYPE_IQ4_XS:
|
|
3049
3102
|
case GGML_TYPE_BF16:
|
|
3050
|
-
#ifdef GGML_USE_MUSA
|
|
3051
|
-
if (a->type == GGML_TYPE_Q3_K) {
|
|
3052
|
-
return false;
|
|
3053
|
-
}
|
|
3054
|
-
#endif // GGML_USE_MUSA
|
|
3055
3103
|
return true;
|
|
3056
3104
|
default:
|
|
3057
3105
|
return false;
|
|
@@ -3211,9 +3259,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3211
3259
|
return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
|
|
3212
3260
|
}
|
|
3213
3261
|
case GGML_OP_IM2COL:
|
|
3262
|
+
case GGML_OP_CONV_2D_DW:
|
|
3263
|
+
case GGML_OP_CONV_TRANSPOSE_2D:
|
|
3214
3264
|
case GGML_OP_POOL_2D:
|
|
3215
3265
|
case GGML_OP_SUM:
|
|
3216
3266
|
case GGML_OP_SUM_ROWS:
|
|
3267
|
+
case GGML_OP_MEAN:
|
|
3217
3268
|
case GGML_OP_ARGSORT:
|
|
3218
3269
|
case GGML_OP_ACC:
|
|
3219
3270
|
return true;
|
|
#include "mean.cuh"

// Row-wise mean of a contiguous F32 tensor: dst[r] = sum(src0 row r) / ne[0].
//
// Launch layout: one thread block per row, each block a single warp
// (WARP_SIZE threads). The heavy lifting is done by the shared
// reduce_rows_f32 kernel; instantiating it with norm=true makes it divide
// the row sum by the column count, turning the reduction into a mean.
//
// Preconditions (asserted): src0 and dst are GGML_TYPE_F32, src0 contiguous.
void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const float * x = (const float *) src0->data;
    float       * y = (float *)        dst->data;

    const int64_t ncols = src0->ne[0];       // elements per row to average
    const int64_t nrows = ggml_nrows(src0);  // ne[1]*ne[2]*ne[3] rows total

    cudaStream_t stream = ctx.stream();

    const dim3 grid_dims (nrows,     1, 1);  // one block per row
    const dim3 block_dims(WARP_SIZE, 1, 1);  // one warp per block
    reduce_rows_f32</*norm=*/true><<<grid_dims, block_dims, 0, stream>>>(x, y, ncols);
}