@novastera-oss/llamarn 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -2
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +24 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +5 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
- package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -43
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
- package/cpp/llama.cpp/src/llama-arch.h +36 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
- package/cpp/llama.cpp/src/llama-batch.h +105 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
- package/cpp/llama.cpp/src/llama-graph.h +78 -79
- package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
- package/cpp/llama.cpp/src/llama-hparams.h +11 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
- package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +21 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
- package/cpp/llama.cpp/src/llama-model.h +40 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
- package/cpp/llama.cpp/src/llama-vocab.h +42 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +5 -0
- package/ios/include/llama.h +8 -43
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -61,8 +61,8 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele
|
|
|
61
61
|
// Rows index by Q's dimension 2, and the first N rows are valid.
|
|
62
62
|
D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
|
|
63
63
|
{
|
|
64
|
-
if (r < N && c <
|
|
65
|
-
uint32_t offset = (iq2 + r) *
|
|
64
|
+
if (r < N && c < HSV) {
|
|
65
|
+
uint32_t offset = (iq2 + r) * HSV + c;
|
|
66
66
|
data_o[o_offset + offset] = D_TYPE(elem);
|
|
67
67
|
}
|
|
68
68
|
return elem;
|
|
@@ -86,9 +86,9 @@ void main() {
|
|
|
86
86
|
tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE);
|
|
87
87
|
#endif
|
|
88
88
|
|
|
89
|
-
tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N,
|
|
90
|
-
tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV,
|
|
91
|
-
tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV,
|
|
89
|
+
tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, HSK);
|
|
90
|
+
tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, HSK);
|
|
91
|
+
tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, HSV);
|
|
92
92
|
|
|
93
93
|
// hint to the compiler that strides are aligned for the aligned variant of the shader
|
|
94
94
|
if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
|
|
@@ -104,16 +104,16 @@ void main() {
|
|
|
104
104
|
tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
|
|
105
105
|
tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
|
|
106
106
|
|
|
107
|
-
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br,
|
|
108
|
-
coopmat<float16_t, gl_ScopeWorkgroup, Br,
|
|
107
|
+
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseAccumulator> Q;
|
|
108
|
+
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA> Qf16;
|
|
109
109
|
|
|
110
110
|
uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
|
|
111
|
-
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0,
|
|
111
|
+
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK));
|
|
112
112
|
|
|
113
|
-
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br,
|
|
113
|
+
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK, gl_MatrixUseA>(Q);
|
|
114
114
|
Qf16 *= float16_t(p.scale);
|
|
115
115
|
|
|
116
|
-
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br,
|
|
116
|
+
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
|
|
117
117
|
|
|
118
118
|
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
|
|
119
119
|
|
|
@@ -130,15 +130,20 @@ void main() {
|
|
|
130
130
|
coopMatPerElementNV(slopeMat, slopeMat, perElemOpComputeSlope, iq2);
|
|
131
131
|
}
|
|
132
132
|
|
|
133
|
+
uint32_t m_offset = 0;
|
|
134
|
+
if (p.nem2 != 1 || p.nem3 != 1) {
|
|
135
|
+
m_offset = ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV * 2 /*sizeof(float16_t)*/;
|
|
136
|
+
}
|
|
137
|
+
|
|
133
138
|
[[dont_unroll]]
|
|
134
139
|
for (uint32_t j = start_j; j < end_j; ++j) {
|
|
135
140
|
|
|
136
141
|
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
|
|
137
142
|
|
|
138
|
-
coopmat<float16_t, gl_ScopeWorkgroup,
|
|
143
|
+
coopmat<float16_t, gl_ScopeWorkgroup, HSK, Bc, gl_MatrixUseB> K_T;
|
|
139
144
|
|
|
140
145
|
uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
|
|
141
|
-
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0,
|
|
146
|
+
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK), tensorViewTranspose DECODEFUNC);
|
|
142
147
|
S = coopMatMulAdd(Qf16, K_T, S);
|
|
143
148
|
|
|
144
149
|
if (p.logit_softcap != 0.0f) {
|
|
@@ -148,14 +153,14 @@ void main() {
|
|
|
148
153
|
}
|
|
149
154
|
}
|
|
150
155
|
|
|
151
|
-
if (p.
|
|
156
|
+
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
|
|
152
157
|
tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
|
|
153
158
|
tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
|
|
154
159
|
tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
|
|
155
160
|
|
|
156
161
|
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
|
|
157
162
|
|
|
158
|
-
coopMatLoadTensorNV(mv, data_m,
|
|
163
|
+
coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
|
|
159
164
|
|
|
160
165
|
S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
|
|
161
166
|
}
|
|
@@ -203,42 +208,42 @@ void main() {
|
|
|
203
208
|
rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
|
|
204
209
|
rowsum = coopMatMulAdd(P_A, One, rowsum);
|
|
205
210
|
|
|
206
|
-
coopmat<float16_t, gl_ScopeWorkgroup, Bc,
|
|
211
|
+
coopmat<float16_t, gl_ScopeWorkgroup, Bc, HSV, gl_MatrixUseB> V;
|
|
207
212
|
uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
|
|
208
|
-
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0,
|
|
213
|
+
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV) DECODEFUNC);
|
|
209
214
|
|
|
210
215
|
L = eM*L + rowsum;
|
|
211
216
|
|
|
212
217
|
// This is the "diagonal" matrix in the paper, but since we do componentwise
|
|
213
218
|
// multiply rather than matrix multiply it has the diagonal element smeared
|
|
214
219
|
// across the row
|
|
215
|
-
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br,
|
|
220
|
+
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> eMdiag;
|
|
216
221
|
|
|
217
222
|
// resize eM by using smear/reduce
|
|
218
223
|
coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
|
|
219
224
|
|
|
220
225
|
// multiply with fp16 accumulation, then add to O.
|
|
221
|
-
coopmat<float16_t, gl_ScopeWorkgroup, Br,
|
|
226
|
+
coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(0);
|
|
222
227
|
PV = coopMatMulAdd(P_A, V, PV);
|
|
223
228
|
|
|
224
|
-
O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br,
|
|
229
|
+
O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(PV);
|
|
225
230
|
}
|
|
226
231
|
|
|
227
232
|
// If there is split_k, then the split_k resolve shader does the final
|
|
228
233
|
// division by L. Store the intermediate O value and per-row m and L values.
|
|
229
234
|
if (p.k_num > 1) {
|
|
230
|
-
coopmat<D_TYPE, gl_ScopeWorkgroup, Br,
|
|
235
|
+
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
|
|
231
236
|
|
|
232
|
-
uint32_t o_offset =
|
|
237
|
+
uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num);
|
|
233
238
|
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
|
|
234
239
|
|
|
235
|
-
o_offset =
|
|
240
|
+
o_offset = HSV * p.ne1 * p.ne3 * p.k_num + p.ne1 * (split_k_index + iq3 * p.k_num) * 2;
|
|
236
241
|
coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
|
|
237
242
|
coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
|
|
238
243
|
return;
|
|
239
244
|
}
|
|
240
245
|
|
|
241
|
-
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br,
|
|
246
|
+
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> Ldiag;
|
|
242
247
|
|
|
243
248
|
// resize L by using smear/reduce
|
|
244
249
|
coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce);
|
|
@@ -250,18 +255,18 @@ void main() {
|
|
|
250
255
|
|
|
251
256
|
O = Ldiag*O;
|
|
252
257
|
|
|
253
|
-
uint32_t o_offset = iq3*p.ne2*p.ne1;
|
|
258
|
+
uint32_t o_offset = iq3*p.ne2*p.ne1*HSV;
|
|
254
259
|
|
|
255
|
-
coopmat<D_TYPE, gl_ScopeWorkgroup, Br,
|
|
260
|
+
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV, gl_MatrixUseAccumulator>(O);
|
|
256
261
|
if (p.gqa_ratio > 1) {
|
|
257
262
|
coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
|
|
258
263
|
} else {
|
|
259
264
|
tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV);
|
|
260
|
-
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1,
|
|
265
|
+
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, HSV);
|
|
261
266
|
|
|
262
267
|
// permute dimensions
|
|
263
268
|
tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
|
|
264
269
|
|
|
265
|
-
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0,
|
|
270
|
+
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV), tensorViewPermute);
|
|
266
271
|
}
|
|
267
272
|
}
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
#extension GL_EXT_control_flow_attributes : enable
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
|
6
6
|
|
|
7
|
-
layout(
|
|
7
|
+
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
|
8
8
|
|
|
9
9
|
layout (binding = 0) readonly buffer A {float data_a[];};
|
|
10
10
|
layout (binding = 1) writeonly buffer D {float data_d[];};
|
|
@@ -12,48 +12,80 @@ layout (binding = 1) writeonly buffer D {float data_d[];};
|
|
|
12
12
|
layout (push_constant) uniform parameter {
|
|
13
13
|
uint D;
|
|
14
14
|
uint N;
|
|
15
|
+
uint ne3;
|
|
15
16
|
uint k_num;
|
|
16
17
|
} p;
|
|
17
18
|
|
|
19
|
+
shared float tmpsh[BLOCK_SIZE];
|
|
20
|
+
|
|
18
21
|
void main() {
|
|
19
22
|
// Each workgroup handles a row
|
|
20
23
|
const uint n = gl_WorkGroupID.x;
|
|
21
24
|
const uint tid = gl_LocalInvocationID.x;
|
|
25
|
+
const uint iq3 = gl_WorkGroupID.z;
|
|
22
26
|
|
|
23
27
|
uint D = p.D;
|
|
24
28
|
uint N = p.N;
|
|
25
29
|
uint k_num = p.k_num;
|
|
26
30
|
|
|
27
|
-
uint l_offset = D * N * k_num + n;
|
|
28
|
-
uint m_offset = D * N * k_num + N + n;
|
|
31
|
+
uint l_offset = D * N * p.ne3 * k_num + N * iq3 * k_num * 2 + n;
|
|
32
|
+
uint m_offset = D * N * p.ne3 * k_num + N * iq3 * k_num * 2 + N + n;
|
|
29
33
|
uint lm_stride = N * 2;
|
|
30
34
|
|
|
31
35
|
// Compute the max m value for the row
|
|
32
36
|
float m_max = -1.0/0.0;
|
|
33
|
-
|
|
34
|
-
float m = data_a[m_offset + k * lm_stride];
|
|
37
|
+
for (uint k = 0; k + tid < k_num; k += BLOCK_SIZE) {
|
|
38
|
+
float m = data_a[m_offset + (k + tid) * lm_stride];
|
|
35
39
|
m_max = max(m_max, m);
|
|
36
40
|
}
|
|
37
41
|
|
|
42
|
+
// reduce across the workgroup
|
|
43
|
+
tmpsh[tid] = m_max;
|
|
44
|
+
barrier();
|
|
45
|
+
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
|
46
|
+
if (tid < s) {
|
|
47
|
+
m_max = max(m_max, tmpsh[tid + s]);
|
|
48
|
+
tmpsh[tid] = m_max;
|
|
49
|
+
}
|
|
50
|
+
barrier();
|
|
51
|
+
}
|
|
52
|
+
m_max = tmpsh[0];
|
|
53
|
+
|
|
54
|
+
barrier();
|
|
55
|
+
|
|
38
56
|
// Compute L based on m_max
|
|
39
57
|
float L = 0;
|
|
40
|
-
|
|
41
|
-
float l = data_a[l_offset + k * lm_stride];
|
|
42
|
-
float m = data_a[m_offset + k * lm_stride];
|
|
58
|
+
for (uint k = 0; k + tid < k_num; k += BLOCK_SIZE) {
|
|
59
|
+
float l = data_a[l_offset + (k + tid) * lm_stride];
|
|
60
|
+
float m = data_a[m_offset + (k + tid) * lm_stride];
|
|
43
61
|
L += exp(m - m_max) * l;
|
|
44
62
|
}
|
|
45
63
|
|
|
64
|
+
// reduce across the workgroup
|
|
65
|
+
tmpsh[tid] = L;
|
|
66
|
+
barrier();
|
|
67
|
+
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
|
68
|
+
if (tid < s) {
|
|
69
|
+
L += tmpsh[tid + s];
|
|
70
|
+
tmpsh[tid] = L;
|
|
71
|
+
}
|
|
72
|
+
barrier();
|
|
73
|
+
}
|
|
74
|
+
L = tmpsh[0];
|
|
75
|
+
|
|
46
76
|
L = 1.0 / L;
|
|
47
77
|
|
|
78
|
+
// D dimension is split across workgroups in the y dimension
|
|
79
|
+
uint d = tid + gl_WorkGroupID.y * BLOCK_SIZE;
|
|
48
80
|
// Scale and sum the O contributions based on m_max and store the result to memory
|
|
49
|
-
|
|
81
|
+
if (d < D) {
|
|
50
82
|
float O = 0.0;
|
|
51
83
|
[[unroll]] for (uint k = 0; k < k_num; ++k) {
|
|
52
|
-
uint o_offset = D * N * k + D * n + d;
|
|
84
|
+
uint o_offset = D * N * (k + iq3 * k_num) + D * n + d;
|
|
53
85
|
float m = data_a[m_offset + k * lm_stride];
|
|
54
86
|
O += exp(m - m_max) * data_a[o_offset];
|
|
55
87
|
}
|
|
56
88
|
O *= L;
|
|
57
|
-
data_d[D * n + d] = O;
|
|
89
|
+
data_d[iq3 * D * N + D * n + d] = O;
|
|
58
90
|
}
|
|
59
91
|
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#version 450
|
|
2
|
+
|
|
3
|
+
#include "glu_head.comp"
|
|
4
|
+
|
|
5
|
+
const float GELU_COEF_A = 0.044715f;
|
|
6
|
+
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
|
7
|
+
|
|
8
|
+
float op(float a, float b) {
|
|
9
|
+
const float val = SQRT_2_OVER_PI*a*(1.0f + GELU_COEF_A*a*a);
|
|
10
|
+
return 0.5f*a*(2.0f - 2.0f / (exp(2 * val) + 1)) * b;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
#include "glu_main.comp"
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#version 450
|
|
2
|
+
|
|
3
|
+
#include "glu_head.comp"
|
|
4
|
+
|
|
5
|
+
// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
|
|
6
|
+
// ref: https://www.johndcook.com/blog/python_erf/
|
|
7
|
+
const float p_erf = 0.3275911f;
|
|
8
|
+
const float a1_erf = 0.254829592f;
|
|
9
|
+
const float a2_erf = -0.284496736f;
|
|
10
|
+
const float a3_erf = 1.421413741f;
|
|
11
|
+
const float a4_erf = -1.453152027f;
|
|
12
|
+
const float a5_erf = 1.061405429f;
|
|
13
|
+
|
|
14
|
+
const float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
|
15
|
+
|
|
16
|
+
float op(float a, float b) {
|
|
17
|
+
const float a_div_sqr2 = a * SQRT_2_INV;
|
|
18
|
+
const float sign_x = sign(a_div_sqr2);
|
|
19
|
+
const float x = abs(a_div_sqr2);
|
|
20
|
+
const float t = 1.0f / (1.0f + p_erf * x);
|
|
21
|
+
const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
|
|
22
|
+
const float erf_approx = sign_x * y;
|
|
23
|
+
|
|
24
|
+
return 0.5f * a * (1.0f + erf_approx) * b;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
#include "glu_main.comp"
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
#version 450
|
|
2
|
+
|
|
3
|
+
#include "generic_head.comp"
|
|
4
|
+
#include "types.comp"
|
|
5
|
+
|
|
6
|
+
#extension GL_EXT_control_flow_attributes : enable
|
|
7
|
+
|
|
8
|
+
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
9
|
+
|
|
10
|
+
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
|
11
|
+
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
|
12
|
+
|
|
13
|
+
void main() {
|
|
14
|
+
// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
|
|
15
|
+
// ref: https://www.johndcook.com/blog/python_erf/
|
|
16
|
+
const float p_erf = 0.3275911f;
|
|
17
|
+
const float a1_erf = 0.254829592f;
|
|
18
|
+
const float a2_erf = -0.284496736f;
|
|
19
|
+
const float a3_erf = 1.421413741f;
|
|
20
|
+
const float a4_erf = -1.453152027f;
|
|
21
|
+
const float a5_erf = 1.061405429f;
|
|
22
|
+
|
|
23
|
+
const float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
|
24
|
+
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
|
25
|
+
|
|
26
|
+
if (i >= p.KX) {
|
|
27
|
+
return;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
const float a = float(data_a[i]);
|
|
31
|
+
const float a_div_sqr2 = a * SQRT_2_INV;
|
|
32
|
+
const float sign_x = sign(a_div_sqr2);
|
|
33
|
+
const float x = abs(a_div_sqr2);
|
|
34
|
+
const float t = 1.0f / (1.0f + p_erf * x);
|
|
35
|
+
const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
|
|
36
|
+
const float erf_approx = sign_x * y;
|
|
37
|
+
|
|
38
|
+
data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx));
|
|
39
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#extension GL_EXT_shader_16bit_storage : require
|
|
2
|
+
|
|
3
|
+
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
4
|
+
|
|
5
|
+
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
|
6
|
+
layout (binding = 1) readonly buffer B {A_TYPE data_b[];};
|
|
7
|
+
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
|
8
|
+
|
|
9
|
+
layout (push_constant) uniform parameter
|
|
10
|
+
{
|
|
11
|
+
uint N;
|
|
12
|
+
uint ne00;
|
|
13
|
+
uint ne20;
|
|
14
|
+
uint mode;
|
|
15
|
+
} p;
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
void main() {
|
|
2
|
+
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
|
3
|
+
|
|
4
|
+
if (i >= p.N) {
|
|
5
|
+
return;
|
|
6
|
+
}
|
|
7
|
+
|
|
8
|
+
const uint row = i / p.ne20;
|
|
9
|
+
const uint col = i - row * p.ne20;
|
|
10
|
+
|
|
11
|
+
if (p.mode == 0) {
|
|
12
|
+
// Default
|
|
13
|
+
const uint offset = p.ne00 / 2;
|
|
14
|
+
const uint idx = row * p.ne00 + col;
|
|
15
|
+
|
|
16
|
+
data_d[row * offset + col] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset])));
|
|
17
|
+
} else if (p.mode == 1) {
|
|
18
|
+
// Swapped
|
|
19
|
+
const uint offset = p.ne00 / 2;
|
|
20
|
+
const uint idx = row * p.ne00 + col;
|
|
21
|
+
|
|
22
|
+
data_d[row * offset + col] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx])));
|
|
23
|
+
} else {
|
|
24
|
+
// Split
|
|
25
|
+
const uint idx = row * p.ne00 + col;
|
|
26
|
+
|
|
27
|
+
data_d[idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx])));
|
|
28
|
+
}
|
|
29
|
+
}
|