@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -254,14 +254,13 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
|
|
|
254
254
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
|
255
255
|
if (ncols < 1024) {
|
|
256
256
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
257
|
-
stream
|
|
258
|
-
cgh
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
});
|
|
257
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
258
|
+
sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
|
259
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
260
|
+
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
|
261
|
+
nullptr, WARP_SIZE);
|
|
262
|
+
});
|
|
263
|
+
});
|
|
265
264
|
}
|
|
266
265
|
else {
|
|
267
266
|
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
|
@@ -272,16 +271,15 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
|
|
|
272
271
|
the limit. To get the device limit, query
|
|
273
272
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
274
273
|
*/
|
|
275
|
-
stream
|
|
274
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
276
275
|
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
|
277
276
|
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
|
|
278
|
-
cgh
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
});
|
|
277
|
+
sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
|
278
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
279
|
+
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
|
280
|
+
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
281
|
+
});
|
|
282
|
+
});
|
|
285
283
|
}
|
|
286
284
|
}
|
|
287
285
|
|
|
@@ -290,18 +288,14 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
|
|
290
288
|
const int ne_elements, queue_ptr stream, int device) {
|
|
291
289
|
if (group_size < 1024) {
|
|
292
290
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
293
|
-
stream
|
|
291
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
294
292
|
const float eps_ct4 = eps;
|
|
295
|
-
cgh
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
|
302
|
-
nullptr, WARP_SIZE);
|
|
303
|
-
});
|
|
304
|
-
});
|
|
293
|
+
sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
|
|
294
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
295
|
+
group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, nullptr,
|
|
296
|
+
WARP_SIZE);
|
|
297
|
+
});
|
|
298
|
+
});
|
|
305
299
|
}
|
|
306
300
|
else {
|
|
307
301
|
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
|
@@ -313,22 +307,18 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
|
|
313
307
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
314
308
|
*/
|
|
315
309
|
|
|
316
|
-
stream
|
|
310
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
317
311
|
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
|
318
312
|
cgh);
|
|
319
313
|
|
|
320
314
|
const float eps_ct4 = eps;
|
|
321
315
|
|
|
322
|
-
cgh
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
eps_ct4, item_ct1,
|
|
329
|
-
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
330
|
-
});
|
|
331
|
-
});
|
|
316
|
+
sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
|
|
317
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
318
|
+
group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
|
319
|
+
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
320
|
+
});
|
|
321
|
+
});
|
|
332
322
|
}
|
|
333
323
|
}
|
|
334
324
|
|
|
@@ -340,14 +330,13 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
|
|
|
340
330
|
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
|
|
341
331
|
if (ncols < 1024) {
|
|
342
332
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
343
|
-
stream
|
|
344
|
-
cgh
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
});
|
|
333
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
334
|
+
sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
|
335
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
336
|
+
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
|
337
|
+
nullptr, WARP_SIZE);
|
|
338
|
+
});
|
|
339
|
+
});
|
|
351
340
|
}
|
|
352
341
|
else {
|
|
353
342
|
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
|
@@ -358,16 +347,15 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
|
|
|
358
347
|
the limit. To get the device limit, query
|
|
359
348
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
360
349
|
*/
|
|
361
|
-
stream
|
|
350
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
362
351
|
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
|
363
352
|
cgh);
|
|
364
|
-
cgh
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
});
|
|
353
|
+
sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
|
354
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
355
|
+
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
|
|
356
|
+
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
357
|
+
});
|
|
358
|
+
});
|
|
371
359
|
}
|
|
372
360
|
}
|
|
373
361
|
|
|
@@ -378,16 +366,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
|
378
366
|
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
|
379
367
|
if (ncols < 1024) {
|
|
380
368
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
381
|
-
stream
|
|
382
|
-
cgh
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
l2_norm_f32(x, dst, ncols, eps, item_ct1,
|
|
388
|
-
nullptr, WARP_SIZE);
|
|
389
|
-
});
|
|
390
|
-
});
|
|
369
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
370
|
+
sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
|
371
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
372
|
+
l2_norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE);
|
|
373
|
+
});
|
|
374
|
+
});
|
|
391
375
|
}
|
|
392
376
|
else {
|
|
393
377
|
const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
|
|
@@ -398,18 +382,15 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
|
398
382
|
the limit. To get the device limit, query
|
|
399
383
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
400
384
|
*/
|
|
401
|
-
stream
|
|
385
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
402
386
|
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
|
403
387
|
cgh);
|
|
404
|
-
cgh
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
411
|
-
});
|
|
412
|
-
});
|
|
388
|
+
sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
|
389
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
390
|
+
l2_norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1),
|
|
391
|
+
work_group_size);
|
|
392
|
+
});
|
|
393
|
+
});
|
|
413
394
|
}
|
|
414
395
|
}
|
|
415
396
|
|
|
@@ -14,12 +14,13 @@
|
|
|
14
14
|
#ifndef GGML_SYCL_QUANTS_HPP
|
|
15
15
|
#define GGML_SYCL_QUANTS_HPP
|
|
16
16
|
|
|
17
|
+
#include <utility>
|
|
18
|
+
|
|
17
19
|
#include "ggml-common.h"
|
|
18
20
|
#include "ggml.h"
|
|
19
21
|
|
|
20
22
|
namespace ggml_sycl_reordered {
|
|
21
23
|
|
|
22
|
-
|
|
23
24
|
// The reordered block moves quants (qs) and scales(d) to two
|
|
24
25
|
// uniform regions of memory that is contiguous in the same tensor.
|
|
25
26
|
// What this means is that instead of having:
|
|
@@ -32,7 +33,6 @@ namespace ggml_sycl_reordered {
|
|
|
32
33
|
|
|
33
34
|
template <ggml_type type> struct block_q_t;
|
|
34
35
|
|
|
35
|
-
|
|
36
36
|
// qk number of weights / quants in a block
|
|
37
37
|
// qr number of weights in a byte (described as 'before dequantization')
|
|
38
38
|
// for quantization types that has low and high bits split, qr is calculated with
|
|
@@ -47,10 +47,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
|
|
|
47
47
|
static constexpr uint32_t vdr_mmvq = 2;
|
|
48
48
|
};
|
|
49
49
|
|
|
50
|
-
static constexpr int get_block_offset(const int block_index
|
|
50
|
+
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
|
|
51
|
+
return { block_index * (traits::qk / traits::qr), 0 };
|
|
52
|
+
}
|
|
51
53
|
|
|
52
|
-
static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
|
|
53
|
-
return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
|
|
54
|
+
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
55
|
+
return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
|
|
54
56
|
}
|
|
55
57
|
|
|
56
58
|
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
@@ -64,20 +66,46 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
|
|
|
64
66
|
static constexpr uint32_t vdr_mmvq = 2;
|
|
65
67
|
};
|
|
66
68
|
|
|
67
|
-
static constexpr int get_block_offset(const int block_index
|
|
69
|
+
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
|
|
70
|
+
return { block_index * (traits::qk / traits::qr), 0 };
|
|
71
|
+
}
|
|
68
72
|
|
|
69
|
-
static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
|
|
73
|
+
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
70
74
|
auto nblocks = (nrows * (ncols / traits::qk));
|
|
71
|
-
return
|
|
75
|
+
return { nblocks * (QK_K / 2),
|
|
76
|
+
(nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
|
|
72
77
|
}
|
|
73
78
|
|
|
74
79
|
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
75
80
|
|
|
76
81
|
constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
|
|
77
|
-
|
|
78
|
-
constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
|
|
79
82
|
};
|
|
80
83
|
|
|
84
|
+
template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
|
85
|
+
struct traits {
|
|
86
|
+
static constexpr uint32_t qk = QK_K;
|
|
87
|
+
static constexpr uint32_t qi = QI6_K;
|
|
88
|
+
static constexpr uint32_t qr = QR6_K;
|
|
89
|
+
static constexpr uint32_t vdr_mmvq = 1;
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
|
|
93
|
+
auto low_bits_index = block_index * (traits::qk / traits::qr);
|
|
94
|
+
// the index of high bits it's after all low bits
|
|
95
|
+
auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
|
|
96
|
+
return { low_bits_index, high_bits_index };
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
100
|
+
auto nblocks = (nrows * (ncols / traits::qk));
|
|
101
|
+
auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
|
|
102
|
+
auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
|
|
103
|
+
auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
|
|
104
|
+
return { block_scales, sb_scale };
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
108
|
+
};
|
|
81
109
|
} // namespace ggml_sycl_reordered
|
|
82
110
|
|
|
83
111
|
#endif // GGML_SYCL_QUANTS_HPP
|
|
@@ -235,20 +235,22 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
|
|
|
235
235
|
the limit. To get the device limit, query
|
|
236
236
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
237
237
|
*/
|
|
238
|
-
stream
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
238
|
+
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
239
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
240
|
+
rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
|
241
|
+
attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
|
|
242
|
+
});
|
|
242
243
|
} else {
|
|
243
244
|
/*
|
|
244
245
|
DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
|
|
245
246
|
the limit. To get the device limit, query
|
|
246
247
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
247
248
|
*/
|
|
248
|
-
stream
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
249
|
+
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
250
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
251
|
+
rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
|
252
|
+
attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
|
|
253
|
+
});
|
|
252
254
|
}
|
|
253
255
|
}
|
|
254
256
|
|
|
@@ -267,15 +269,17 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
|
|
|
267
269
|
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
268
270
|
|
|
269
271
|
if (freq_factors == nullptr) {
|
|
270
|
-
stream
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
272
|
+
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
273
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
274
|
+
rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
|
275
|
+
attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
|
|
276
|
+
});
|
|
274
277
|
} else {
|
|
275
|
-
stream
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
278
|
+
sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
279
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
280
|
+
rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
|
|
281
|
+
attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
|
|
282
|
+
});
|
|
279
283
|
}
|
|
280
284
|
}
|
|
281
285
|
|
|
@@ -298,12 +302,12 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1,
|
|
|
298
302
|
}
|
|
299
303
|
// launch kernel
|
|
300
304
|
if (freq_factors == nullptr) {
|
|
301
|
-
stream
|
|
305
|
+
sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
|
|
302
306
|
rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
|
303
307
|
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
|
304
308
|
});
|
|
305
309
|
} else {
|
|
306
|
-
stream
|
|
310
|
+
sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
|
|
307
311
|
rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
|
308
312
|
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
|
309
313
|
});
|
|
@@ -333,12 +337,12 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
|
|
|
333
337
|
}
|
|
334
338
|
// launch kernel
|
|
335
339
|
if (freq_factors == nullptr) {
|
|
336
|
-
stream
|
|
340
|
+
sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
|
|
337
341
|
rope_vision<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
|
338
342
|
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
|
339
343
|
});
|
|
340
344
|
} else {
|
|
341
|
-
stream
|
|
345
|
+
sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
|
|
342
346
|
rope_vision<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
|
|
343
347
|
corr_dims, theta_scale, freq_factors, sections, item_ct1);
|
|
344
348
|
});
|
|
@@ -127,11 +127,11 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,
|
|
|
127
127
|
const int nrows_y, const float scale, const float max_bias, const float m0,
|
|
128
128
|
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
|
|
129
129
|
const size_t n_local_scratch, queue_ptr stream) {
|
|
130
|
-
stream
|
|
130
|
+
sycl_launch(stream, [&](sycl::handler & cgh) {
|
|
131
131
|
sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
|
|
132
132
|
|
|
133
|
-
|
|
134
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
133
|
+
sycl_parallel_for(
|
|
134
|
+
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
135
135
|
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
136
136
|
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
|
|
137
137
|
nrows_y, scale, max_bias, m0,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#include "sycl_hw.hpp"
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
// TODO: currently not used
|
|
4
|
+
/*
|
|
4
5
|
sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
|
|
5
6
|
sycl_hw_info res;
|
|
6
7
|
int32_t id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
|
|
@@ -11,3 +12,4 @@ sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
|
|
|
11
12
|
|
|
12
13
|
return res;
|
|
13
14
|
}
|
|
15
|
+
*/
|
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
|
|
11
11
|
namespace syclex = sycl::ext::oneapi::experimental;
|
|
12
12
|
|
|
13
|
+
// TODO: currently not used
|
|
14
|
+
/*
|
|
13
15
|
struct sycl_hw_info {
|
|
14
16
|
syclex::architecture arch;
|
|
15
17
|
int32_t device_id;
|
|
@@ -18,6 +20,7 @@ struct sycl_hw_info {
|
|
|
18
20
|
bool is_in_vector(std::vector<int> &vec, int item);
|
|
19
21
|
|
|
20
22
|
sycl_hw_info get_device_hw_info(sycl::device *device_ptr);
|
|
23
|
+
*/
|
|
21
24
|
|
|
22
25
|
|
|
23
26
|
#endif // SYCL_HW_HPP
|
|
@@ -45,14 +45,9 @@ static void timestep_embedding_f32_sycl(
|
|
|
45
45
|
int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
|
|
46
46
|
sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
|
|
47
47
|
sycl::range<3> gridDim(1, ne00, num_blocks);
|
|
48
|
-
stream
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
[=](sycl::nd_item<3> item_ct1) {
|
|
52
|
-
timestep_embedding_f32(
|
|
53
|
-
x, dst, nb1, dim, max_period, item_ct1
|
|
54
|
-
);
|
|
55
|
-
});
|
|
48
|
+
sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
|
49
|
+
timestep_embedding_f32(x, dst, nb1, dim, max_period, item_ct1);
|
|
50
|
+
});
|
|
56
51
|
}
|
|
57
52
|
|
|
58
53
|
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
@@ -284,22 +284,23 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
|
|
|
284
284
|
return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
|
|
285
285
|
}
|
|
286
286
|
|
|
287
|
-
__dpct_inline__ float operator()(const void * __restrict__ vbq, const int
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
const
|
|
287
|
+
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
|
288
|
+
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
|
|
289
|
+
const sycl::half2 * q8_1_ds, const int & iqs) {
|
|
290
|
+
const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset.first;
|
|
291
|
+
const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
|
|
291
292
|
int v[q4_0_traits::vdr_mmvq];
|
|
292
293
|
int u[2 * q4_0_traits::vdr_mmvq];
|
|
293
294
|
|
|
294
|
-
#pragma unroll
|
|
295
295
|
|
|
296
|
+
#pragma unroll
|
|
296
297
|
for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
|
|
297
298
|
v[i] = get_int_from_uint8(bq4_0, iqs + i);
|
|
298
|
-
u[2 * i + 0] = get_int_from_int8_aligned(
|
|
299
|
-
u[2 * i + 1] = get_int_from_int8_aligned(
|
|
299
|
+
u[2 * i + 0] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
|
|
300
|
+
u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
|
|
300
301
|
}
|
|
301
302
|
|
|
302
|
-
return vec_dot_q4_0_q8_1_impl(v, u, d,
|
|
303
|
+
return vec_dot_q4_0_q8_1_impl(v, u, d, *q8_1_ds);
|
|
303
304
|
};
|
|
304
305
|
};
|
|
305
306
|
|
|
@@ -346,24 +347,115 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
|
|
|
346
347
|
using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
|
|
347
348
|
using q4_k_traits = typename q4_k_block::traits;
|
|
348
349
|
|
|
349
|
-
float operator()(const void * __restrict__ vbq, const int
|
|
350
|
-
|
|
351
|
-
|
|
350
|
+
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
|
351
|
+
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
|
|
352
|
+
const sycl::half2 * q8_1_ds, const int & iqs) {
|
|
353
|
+
const int ib = ibx_offset.first / (QK_K / 2);
|
|
352
354
|
|
|
353
355
|
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
|
354
|
-
const uint8_t * qs = base + ibx_offset;
|
|
355
|
-
const
|
|
356
|
-
const
|
|
357
|
-
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset);
|
|
356
|
+
const uint8_t * qs = base + ibx_offset.first;
|
|
357
|
+
const uint8_t * scs = base + d_offset.first + ib * K_SCALE_SIZE;
|
|
358
|
+
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
|
|
358
359
|
|
|
359
360
|
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
|
|
360
361
|
const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
|
|
361
362
|
const uint16_t * scales = (const uint16_t *) scs;
|
|
362
363
|
|
|
363
|
-
|
|
364
|
+
int v[2];
|
|
365
|
+
int u[2 * QR4_K];
|
|
366
|
+
float d8[QR4_K];
|
|
367
|
+
|
|
368
|
+
v[0] = q4[0];
|
|
369
|
+
v[1] = q4[4];
|
|
370
|
+
|
|
371
|
+
uint16_t aux[2];
|
|
372
|
+
const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
|
|
373
|
+
if (j < 2) {
|
|
374
|
+
aux[0] = scales[j + 0] & 0x3f3f;
|
|
375
|
+
aux[1] = scales[j + 2] & 0x3f3f;
|
|
376
|
+
} else {
|
|
377
|
+
aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
|
|
378
|
+
aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
const uint8_t * sc = (const uint8_t *) aux;
|
|
382
|
+
const uint8_t * m = sc + 2;
|
|
383
|
+
|
|
384
|
+
for (int i = 0; i < QR4_K; ++i) {
|
|
385
|
+
const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
|
|
386
|
+
sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
|
|
387
|
+
|
|
388
|
+
d8[i] = ds_values[0];
|
|
389
|
+
|
|
390
|
+
const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
|
|
391
|
+
u[2 * i + 0] = q8[0];
|
|
392
|
+
u[2 * i + 1] = q8[4];
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8);
|
|
364
396
|
}
|
|
365
397
|
};
|
|
366
398
|
|
|
399
|
+
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
|
|
400
|
+
static constexpr ggml_type gtype = GGML_TYPE_Q6_K;
|
|
401
|
+
|
|
402
|
+
using q6_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q6_K>;
|
|
403
|
+
using q6_k_traits = typename q6_k_block::traits;
|
|
404
|
+
|
|
405
|
+
__dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u,
|
|
406
|
+
const int8_t * __restrict__ scales, const float d,
|
|
407
|
+
const float * __restrict__ d8) {
|
|
408
|
+
float sumf = 0.0f;
|
|
409
|
+
|
|
410
|
+
#pragma unroll
|
|
411
|
+
for (int i = 0; i < QR6_K; ++i) {
|
|
412
|
+
const int sc = scales[4 * i];
|
|
413
|
+
|
|
414
|
+
const int vil = (vl >> (4 * i)) & 0x0F0F0F0F;
|
|
415
|
+
|
|
416
|
+
const int vih = ((vh >> (4 * i)) << 4) & 0x30303030;
|
|
417
|
+
|
|
418
|
+
const int vi = dpct::vectorized_binary<sycl::char4>((vil | vih), 0x20202020,
|
|
419
|
+
dpct::sub_sat()); // vi = (vil | vih) - 32
|
|
420
|
+
|
|
421
|
+
sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
return d * sumf;
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
|
428
|
+
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds,
|
|
429
|
+
const int iqs) {
|
|
430
|
+
const int ib = ibx_offset.first / (QK_K / 2);
|
|
431
|
+
|
|
432
|
+
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
|
433
|
+
const uint8_t * ql = base + ibx_offset.first;
|
|
434
|
+
const uint8_t * qh = base + ibx_offset.second;
|
|
435
|
+
const int8_t * scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
|
|
436
|
+
const ggml_half * d = (const ggml_half *) (base + d_offset.second) + ib;
|
|
437
|
+
|
|
438
|
+
const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
|
|
439
|
+
const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);
|
|
440
|
+
const int vh_shift = 2 * ((iqs % (QI6_K / 2)) / (QI6_K / 4));
|
|
441
|
+
|
|
442
|
+
const int vl = get_int_from_uint8(ql, iqs);
|
|
443
|
+
const int vh = get_int_from_uint8(qh, (QI6_K / 4) * (iqs / (QI6_K / 2)) + iqs % (QI6_K / 4)) >> vh_shift;
|
|
444
|
+
|
|
445
|
+
const int8_t * scs = scales + scale_offset;
|
|
446
|
+
|
|
447
|
+
int u[QR6_K];
|
|
448
|
+
float d8[QR6_K];
|
|
449
|
+
|
|
450
|
+
#pragma unroll
|
|
451
|
+
for (int i = 0; i < QR6_K; ++i) {
|
|
452
|
+
u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1);
|
|
453
|
+
const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i);
|
|
454
|
+
d8[i] = ds_values[0];
|
|
455
|
+
}
|
|
456
|
+
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8);
|
|
457
|
+
}
|
|
458
|
+
};
|
|
367
459
|
#define VDR_Q4_0_Q8_1_MMVQ 2
|
|
368
460
|
#define VDR_Q4_0_Q8_1_MMQ 4
|
|
369
461
|
|