@novastera-oss/llamarn 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +4 -2
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
|
@@ -76,6 +76,7 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
|
|
|
76
76
|
}
|
|
77
77
|
|
|
78
78
|
void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
79
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/5);
|
|
79
80
|
const float * k_d = static_cast<const float *>(dst->src[0]->data);
|
|
80
81
|
const float * v_d = static_cast<const float *>(dst->src[1]->data);
|
|
81
82
|
const float * r_d = static_cast<const float *>(dst->src[2]->data);
|
|
@@ -24,6 +24,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
|
|
|
24
24
|
const int blocks_per_row = ncols / block_traits::qk;
|
|
25
25
|
constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
|
|
26
26
|
constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
|
|
27
|
+
const int nblocks = nrows * (ncols / block_traits::qk);
|
|
27
28
|
|
|
28
29
|
static_assert(blocks_per_subgroup > 0);
|
|
29
30
|
static_assert(block_elements_per_subgroup > 0);
|
|
@@ -45,7 +46,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
|
|
|
45
46
|
// x block quant index when casting the quants to int
|
|
46
47
|
const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
|
|
47
48
|
|
|
48
|
-
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
|
|
49
|
+
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
|
|
49
50
|
}
|
|
50
51
|
}
|
|
51
52
|
|
|
@@ -739,6 +740,27 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
739
740
|
}
|
|
740
741
|
}
|
|
741
742
|
|
|
743
|
+
static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
|
744
|
+
const int nrows, dpct::queue_ptr stream) {
|
|
745
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
|
746
|
+
|
|
747
|
+
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
|
748
|
+
constexpr size_t num_subgroups = 16;
|
|
749
|
+
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
|
750
|
+
|
|
751
|
+
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
|
752
|
+
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
|
753
|
+
|
|
754
|
+
stream->submit([&](sycl::handler & cgh) {
|
|
755
|
+
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
|
756
|
+
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
757
|
+
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
|
|
758
|
+
nrows, nd_item);
|
|
759
|
+
});
|
|
760
|
+
});
|
|
761
|
+
}
|
|
762
|
+
|
|
763
|
+
|
|
742
764
|
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|
743
765
|
float *dst, const int ncols,
|
|
744
766
|
const int nrows,
|
|
@@ -1035,7 +1057,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
|
|
1035
1057
|
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
|
1036
1058
|
break;
|
|
1037
1059
|
case GGML_TYPE_Q4_K:
|
|
1038
|
-
|
|
1060
|
+
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
|
1061
|
+
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
|
1062
|
+
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n");
|
|
1063
|
+
reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
|
1064
|
+
} else {
|
|
1065
|
+
GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n");
|
|
1066
|
+
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
|
1067
|
+
}
|
|
1039
1068
|
break;
|
|
1040
1069
|
case GGML_TYPE_Q5_K:
|
|
1041
1070
|
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
|
@@ -1,40 +1,50 @@
|
|
|
1
1
|
#include "norm.hpp"
|
|
2
|
+
#include "ggml-sycl/common.hpp"
|
|
3
|
+
#include "ggml-sycl/presets.hpp"
|
|
2
4
|
|
|
3
|
-
static void norm_f32(const float* x, float* dst, const int ncols, const
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
const int
|
|
5
|
+
static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
|
|
6
|
+
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
|
|
7
|
+
|
|
8
|
+
const int nrows = item_ct1.get_group_range(2);
|
|
9
|
+
const int nchannels = item_ct1.get_group_range(1);
|
|
8
10
|
|
|
9
11
|
const int nthreads = item_ct1.get_local_range(2);
|
|
12
|
+
const int sample = item_ct1.get_group(0);
|
|
13
|
+
const int channel = item_ct1.get_group(1);
|
|
14
|
+
const int row = item_ct1.get_group(2);
|
|
15
|
+
|
|
16
|
+
const int tid = item_ct1.get_local_id(2);
|
|
10
17
|
const int nwarps = nthreads / WARP_SIZE;
|
|
18
|
+
|
|
19
|
+
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
|
|
20
|
+
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
|
|
21
|
+
|
|
22
|
+
x += strided_offset;
|
|
23
|
+
dst += packed_offset;
|
|
24
|
+
|
|
11
25
|
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
|
12
26
|
|
|
13
27
|
for (int col = tid; col < ncols; col += block_size) {
|
|
14
|
-
const float xi = x[
|
|
28
|
+
const float xi = x[col];
|
|
15
29
|
mean_var.x() += xi;
|
|
16
30
|
mean_var.y() += xi * xi;
|
|
17
31
|
}
|
|
18
32
|
|
|
19
33
|
// sum up partial sums
|
|
20
34
|
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
|
21
|
-
if
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
if (
|
|
26
|
-
s_sum[
|
|
35
|
+
if (block_size > WARP_SIZE) {
|
|
36
|
+
const auto sub_group = item_ct1.get_sub_group();
|
|
37
|
+
const auto sg_id = sub_group.get_group_linear_id();
|
|
38
|
+
const auto wi_in_sg = sub_group.get_local_linear_id();
|
|
39
|
+
if (wi_in_sg == 0) {
|
|
40
|
+
s_sum[sg_id] = mean_var;
|
|
27
41
|
}
|
|
28
|
-
/*
|
|
29
|
-
DPCT1118:0: SYCL group functions and algorithms must be encountered in
|
|
30
|
-
converged control flow. You may need to adjust the code.
|
|
31
|
-
*/
|
|
32
42
|
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
33
43
|
mean_var = 0.f;
|
|
34
|
-
size_t nreduce = nwarps
|
|
44
|
+
const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
|
|
35
45
|
for (size_t i = 0; i < nreduce; i += 1)
|
|
36
46
|
{
|
|
37
|
-
mean_var += s_sum[
|
|
47
|
+
mean_var += s_sum[wi_in_sg + i * WARP_SIZE];
|
|
38
48
|
}
|
|
39
49
|
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
|
40
50
|
}
|
|
@@ -44,7 +54,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
|
|
|
44
54
|
const float inv_std = sycl::rsqrt(var + eps);
|
|
45
55
|
|
|
46
56
|
for (int col = tid; col < ncols; col += block_size) {
|
|
47
|
-
dst[
|
|
57
|
+
dst[col] = (x[col] - mean) * inv_std;
|
|
48
58
|
}
|
|
49
59
|
}
|
|
50
60
|
|
|
@@ -135,39 +145,51 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
|
|
|
135
145
|
}
|
|
136
146
|
}
|
|
137
147
|
|
|
138
|
-
static void rms_norm_f32(const float* x, float* dst, const int ncols, const
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
const int
|
|
148
|
+
static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
|
|
149
|
+
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
|
150
|
+
|
|
151
|
+
const int nrows = item_ct1.get_group_range(2);
|
|
152
|
+
const int nchannels = item_ct1.get_group_range(1);
|
|
153
|
+
|
|
154
|
+
const int sample = item_ct1.get_group(0);
|
|
155
|
+
const int channel = item_ct1.get_group(1);
|
|
156
|
+
const int row = item_ct1.get_group(2);
|
|
157
|
+
|
|
143
158
|
const int nthreads = item_ct1.get_local_range(2);
|
|
159
|
+
|
|
160
|
+
const int tid = item_ct1.get_local_id(2);
|
|
144
161
|
const int nwarps = nthreads / WARP_SIZE;
|
|
162
|
+
|
|
163
|
+
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
|
|
164
|
+
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
|
|
165
|
+
|
|
166
|
+
x += strided_offset;
|
|
167
|
+
dst += packed_offset;
|
|
168
|
+
|
|
169
|
+
|
|
145
170
|
float tmp = 0.0f; // partial sum for thread in warp
|
|
146
171
|
|
|
147
172
|
for (int col = tid; col < ncols; col += block_size) {
|
|
148
|
-
const float xi = x[
|
|
173
|
+
const float xi = x[col];
|
|
149
174
|
tmp += xi * xi;
|
|
150
175
|
}
|
|
151
176
|
|
|
152
177
|
// sum up partial sums
|
|
153
178
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
154
179
|
if (block_size > WARP_SIZE) {
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
if (
|
|
159
|
-
s_sum[
|
|
180
|
+
const auto sub_group = item_ct1.get_sub_group();
|
|
181
|
+
const auto sg_id = sub_group.get_group_linear_id();
|
|
182
|
+
const auto wi_in_sg = sub_group.get_local_linear_id();
|
|
183
|
+
if (wi_in_sg == 0) {
|
|
184
|
+
s_sum[sg_id] = tmp;
|
|
160
185
|
}
|
|
161
|
-
|
|
162
|
-
DPCT1118:3: SYCL group functions and algorithms must be encountered in
|
|
163
|
-
converged control flow. You may need to adjust the code.
|
|
164
|
-
*/
|
|
186
|
+
|
|
165
187
|
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
166
|
-
size_t nreduce = nwarps
|
|
188
|
+
const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
|
|
167
189
|
tmp = 0.f;
|
|
168
190
|
for (size_t i = 0; i < nreduce; i += 1)
|
|
169
191
|
{
|
|
170
|
-
tmp += s_sum[
|
|
192
|
+
tmp += s_sum[wi_in_sg + i * WARP_SIZE];
|
|
171
193
|
}
|
|
172
194
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
173
195
|
}
|
|
@@ -176,7 +198,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
|
|
|
176
198
|
const float scale = sycl::rsqrt(mean + eps);
|
|
177
199
|
|
|
178
200
|
for (int col = tid; col < ncols; col += block_size) {
|
|
179
|
-
dst[
|
|
201
|
+
dst[col] = scale * x[col];
|
|
180
202
|
}
|
|
181
203
|
}
|
|
182
204
|
|
|
@@ -224,20 +246,20 @@ static void l2_norm_f32(const float* x, float* dst, const int ncols, const float
|
|
|
224
246
|
}
|
|
225
247
|
}
|
|
226
248
|
|
|
227
|
-
static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
228
|
-
|
|
229
|
-
|
|
249
|
+
static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
|
|
250
|
+
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
|
|
251
|
+
const float eps, queue_ptr stream, int device) {
|
|
252
|
+
|
|
253
|
+
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
|
|
230
254
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
|
231
255
|
if (ncols < 1024) {
|
|
232
256
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
233
257
|
stream->submit([&](sycl::handler& cgh) {
|
|
234
258
|
cgh.parallel_for(
|
|
235
|
-
sycl::nd_range<3>(
|
|
236
|
-
block_dims),
|
|
259
|
+
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
|
237
260
|
[=](sycl::nd_item<3> item_ct1)
|
|
238
261
|
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
239
|
-
norm_f32(x, dst, ncols, eps, item_ct1,
|
|
240
|
-
nullptr, WARP_SIZE);
|
|
262
|
+
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
|
|
241
263
|
});
|
|
242
264
|
});
|
|
243
265
|
}
|
|
@@ -252,15 +274,12 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
|
252
274
|
*/
|
|
253
275
|
stream->submit([&](sycl::handler& cgh) {
|
|
254
276
|
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
|
255
|
-
|
|
256
|
-
|
|
277
|
+
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
|
|
257
278
|
cgh.parallel_for(
|
|
258
|
-
sycl::nd_range<3>(
|
|
259
|
-
block_dims),
|
|
279
|
+
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
|
260
280
|
[=](sycl::nd_item<3> item_ct1)
|
|
261
281
|
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
262
|
-
norm_f32(x, dst, ncols, eps, item_ct1,
|
|
263
|
-
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
282
|
+
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
|
264
283
|
});
|
|
265
284
|
});
|
|
266
285
|
}
|
|
@@ -313,21 +332,20 @@ static void group_norm_f32_sycl(const float* x, float* dst,
|
|
|
313
332
|
}
|
|
314
333
|
}
|
|
315
334
|
|
|
316
|
-
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
317
|
-
|
|
318
|
-
queue_ptr stream, int device) {
|
|
335
|
+
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
|
|
336
|
+
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
|
|
319
337
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
|
320
338
|
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
|
339
|
+
|
|
340
|
+
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
|
|
321
341
|
if (ncols < 1024) {
|
|
322
342
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
323
343
|
stream->submit([&](sycl::handler& cgh) {
|
|
324
344
|
cgh.parallel_for(
|
|
325
|
-
sycl::nd_range<3>(
|
|
326
|
-
block_dims),
|
|
345
|
+
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
|
327
346
|
[=](sycl::nd_item<3> item_ct1)
|
|
328
347
|
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
329
|
-
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
|
330
|
-
nullptr, WARP_SIZE);
|
|
348
|
+
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
|
|
331
349
|
});
|
|
332
350
|
});
|
|
333
351
|
}
|
|
@@ -344,12 +362,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
|
344
362
|
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
|
345
363
|
cgh);
|
|
346
364
|
cgh.parallel_for(
|
|
347
|
-
sycl::nd_range<3>(
|
|
348
|
-
block_dims),
|
|
365
|
+
sycl::nd_range<3>(global_dims * block_dims, block_dims),
|
|
349
366
|
[=](sycl::nd_item<3> item_ct1)
|
|
350
367
|
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
351
|
-
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
|
352
|
-
get_pointer(s_sum_acc_ct1), work_group_size);
|
|
368
|
+
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
|
|
353
369
|
});
|
|
354
370
|
});
|
|
355
371
|
}
|
|
@@ -398,12 +414,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
|
|
398
414
|
}
|
|
399
415
|
|
|
400
416
|
void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
417
|
+
const ggml_tensor * src0 = dst->src[0];
|
|
401
418
|
|
|
402
419
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
403
420
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
404
421
|
|
|
405
|
-
|
|
406
|
-
const int64_t nrows = ggml_nrows(dst->src[0]);
|
|
422
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
|
407
423
|
dpct::queue_ptr main_stream = ctx.stream();
|
|
408
424
|
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
409
425
|
const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
|
|
@@ -411,8 +427,14 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
|
411
427
|
|
|
412
428
|
float eps;
|
|
413
429
|
memcpy(&eps, dst->op_params, sizeof(float));
|
|
414
|
-
|
|
415
|
-
|
|
430
|
+
GGML_ASSERT(eps >= 0.0f);
|
|
431
|
+
const size_t ts0 = ggml_type_size(src0->type);
|
|
432
|
+
GGML_ASSERT(nb00 == ts0);
|
|
433
|
+
const int64_t s01 = nb01 / ts0;
|
|
434
|
+
const int64_t s02 = nb02 / ts0;
|
|
435
|
+
const int64_t s03 = nb03 / ts0;
|
|
436
|
+
|
|
437
|
+
norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
|
|
416
438
|
}
|
|
417
439
|
|
|
418
440
|
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
@@ -436,11 +458,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
|
436
458
|
|
|
437
459
|
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
438
460
|
|
|
461
|
+
const ggml_tensor * src0 = dst->src[0];
|
|
439
462
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
440
463
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
441
464
|
|
|
442
|
-
const int64_t ne00 = dst->src[0]->ne[0];
|
|
443
|
-
const int64_t nrows = ggml_nrows(dst->src[0]);
|
|
444
465
|
dpct::queue_ptr main_stream = ctx.stream();
|
|
445
466
|
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
446
467
|
|
|
@@ -450,7 +471,13 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
|
450
471
|
float eps;
|
|
451
472
|
memcpy(&eps, dst->op_params, sizeof(float));
|
|
452
473
|
|
|
453
|
-
|
|
474
|
+
GGML_TENSOR_UNARY_OP_LOCALS
|
|
475
|
+
const size_t ts0 = ggml_type_size(src0->type);
|
|
476
|
+
GGML_ASSERT(nb00 == ts0);
|
|
477
|
+
const int64_t s01 = nb01 / ts0;
|
|
478
|
+
const int64_t s02 = nb02 / ts0;
|
|
479
|
+
const int64_t s03 = nb03 / ts0;
|
|
480
|
+
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
|
|
454
481
|
}
|
|
455
482
|
|
|
456
483
|
void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
@@ -56,6 +56,28 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
|
|
|
56
56
|
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
57
57
|
};
|
|
58
58
|
|
|
59
|
+
template <> struct block_q_t<GGML_TYPE_Q4_K> {
|
|
60
|
+
struct traits {
|
|
61
|
+
static constexpr uint32_t qk = QK_K;
|
|
62
|
+
static constexpr uint32_t qi = QI4_K;
|
|
63
|
+
static constexpr uint32_t qr = QR4_K;
|
|
64
|
+
static constexpr uint32_t vdr_mmvq = 2;
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
|
|
68
|
+
|
|
69
|
+
static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
|
|
70
|
+
auto nblocks = (nrows * (ncols / traits::qk));
|
|
71
|
+
return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
75
|
+
|
|
76
|
+
constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
|
|
77
|
+
|
|
78
|
+
constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
|
|
79
|
+
};
|
|
80
|
+
|
|
59
81
|
} // namespace ggml_sycl_reordered
|
|
60
82
|
|
|
61
83
|
#endif // GGML_SYCL_QUANTS_HPP
|
|
@@ -355,8 +355,7 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
|
|
355
355
|
}
|
|
356
356
|
|
|
357
357
|
void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
358
|
-
|
|
358
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
|
|
359
359
|
ggml_sycl_op_rope(ctx, dst);
|
|
360
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
361
360
|
}
|
|
362
361
|
|
|
@@ -225,7 +225,7 @@ static void soft_max_f32_sycl(const float * x, const T * mask,
|
|
|
225
225
|
}
|
|
226
226
|
|
|
227
227
|
void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
228
|
-
|
|
228
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
229
229
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
230
230
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
231
231
|
|
|
@@ -249,16 +249,13 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
|
249
249
|
|
|
250
250
|
if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
|
|
251
251
|
const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
|
|
252
|
-
GGML_SYCL_DEBUG("%s: F16 mask\n", __func__);
|
|
253
252
|
soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
|
|
254
253
|
main_stream, ctx.device);
|
|
255
254
|
} else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
|
|
256
255
|
const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
|
|
257
|
-
GGML_SYCL_DEBUG("%s: F32 mask\n", __func__);
|
|
258
256
|
soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
|
|
259
257
|
} else {
|
|
260
258
|
/* mask unavailable */
|
|
261
|
-
GGML_SYCL_DEBUG("%s: No mask\n", __func__);
|
|
262
259
|
soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
|
|
263
260
|
}
|
|
264
261
|
}
|
|
@@ -56,8 +56,8 @@ static void timestep_embedding_f32_sycl(
|
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
59
|
-
|
|
60
|
-
const ggml_tensor *
|
|
59
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
60
|
+
const ggml_tensor * src0 = dst->src[0];
|
|
61
61
|
const float * src0_d = (const float *)src0->data;
|
|
62
62
|
float * dst_d = (float *)dst->data;
|
|
63
63
|
dpct::queue_ptr stream = ctx.stream();
|
|
@@ -69,5 +69,4 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso
|
|
|
69
69
|
const int max_period = dst->op_params[1];
|
|
70
70
|
|
|
71
71
|
timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
|
|
72
|
-
GGML_UNUSED(src1);
|
|
73
72
|
}
|
|
@@ -285,7 +285,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
|
|
|
285
285
|
}
|
|
286
286
|
|
|
287
287
|
__dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
|
|
288
|
-
const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
|
288
|
+
const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
|
|
289
289
|
const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
|
|
290
290
|
const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
|
|
291
291
|
int v[q4_0_traits::vdr_mmvq];
|
|
@@ -303,6 +303,67 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
|
|
|
303
303
|
};
|
|
304
304
|
};
|
|
305
305
|
|
|
306
|
+
static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
|
|
307
|
+
const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
|
|
308
|
+
const int & iqs) {
|
|
309
|
+
int v[2];
|
|
310
|
+
int u[2 * QR4_K];
|
|
311
|
+
float d8[QR4_K];
|
|
312
|
+
|
|
313
|
+
v[0] = q4[0];
|
|
314
|
+
v[1] = q4[4];
|
|
315
|
+
|
|
316
|
+
uint16_t aux[2];
|
|
317
|
+
const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
|
|
318
|
+
if (j < 2) {
|
|
319
|
+
aux[0] = scales[j + 0] & 0x3f3f;
|
|
320
|
+
aux[1] = scales[j + 2] & 0x3f3f;
|
|
321
|
+
} else {
|
|
322
|
+
aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
|
|
323
|
+
aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
const uint8_t * sc = (const uint8_t *) aux;
|
|
327
|
+
const uint8_t * m = sc + 2;
|
|
328
|
+
|
|
329
|
+
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
|
|
330
|
+
|
|
331
|
+
for (int i = 0; i < QR4_K; ++i) {
|
|
332
|
+
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
|
333
|
+
d8[i] = bq8i->ds[0];
|
|
334
|
+
|
|
335
|
+
const int * q8 = (const int *) bq8i->qs + ((iqs / 2) % 4);
|
|
336
|
+
u[2 * i + 0] = q8[0];
|
|
337
|
+
u[2 * i + 1] = q8[4];
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, dm, d8);
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
|
|
344
|
+
static constexpr ggml_type gtype = GGML_TYPE_Q4_K;
|
|
345
|
+
|
|
346
|
+
using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
|
|
347
|
+
using q4_k_traits = typename q4_k_block::traits;
|
|
348
|
+
|
|
349
|
+
float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
|
|
350
|
+
const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
|
|
351
|
+
const int ib = ibx_offset / (QK_K / 2);
|
|
352
|
+
|
|
353
|
+
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
|
354
|
+
const uint8_t * qs = base + ibx_offset;
|
|
355
|
+
const int total_qs_bytes = nblocks * (QK_K / 2);
|
|
356
|
+
const uint8_t * scs = base + total_qs_bytes + ib * K_SCALE_SIZE;
|
|
357
|
+
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset);
|
|
358
|
+
|
|
359
|
+
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
|
|
360
|
+
const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
|
|
361
|
+
const uint16_t * scales = (const uint16_t *) scs;
|
|
362
|
+
|
|
363
|
+
return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
|
|
364
|
+
}
|
|
365
|
+
};
|
|
366
|
+
|
|
306
367
|
#define VDR_Q4_0_Q8_1_MMVQ 2
|
|
307
368
|
#define VDR_Q4_0_Q8_1_MMQ 4
|
|
308
369
|
|
|
@@ -649,52 +710,17 @@ vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
|
|
|
649
710
|
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
|
650
711
|
}
|
|
651
712
|
|
|
652
|
-
static __dpct_inline__ float
|
|
653
|
-
|
|
654
|
-
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
|
655
|
-
|
|
713
|
+
static __dpct_inline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
|
|
714
|
+
const int & iqs) {
|
|
656
715
|
#ifndef GGML_QKK_64
|
|
657
|
-
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
|
658
|
-
|
|
659
|
-
int v[2];
|
|
660
|
-
int u[2*QR4_K];
|
|
661
|
-
float d8[QR4_K];
|
|
662
716
|
|
|
663
|
-
|
|
664
|
-
const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
|
|
665
|
-
|
|
666
|
-
// iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
|
|
667
|
-
// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
|
|
668
|
-
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
|
|
669
|
-
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
|
|
670
|
-
|
|
671
|
-
const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
|
|
672
|
-
v[0] = q4[0];
|
|
673
|
-
v[1] = q4[4];
|
|
674
|
-
|
|
675
|
-
const uint16_t * scales = (const uint16_t *)bq4_K->scales;
|
|
676
|
-
uint16_t aux[2];
|
|
677
|
-
const int j = bq8_offset/2;
|
|
678
|
-
if (j < 2) {
|
|
679
|
-
aux[0] = scales[j+0] & 0x3f3f;
|
|
680
|
-
aux[1] = scales[j+2] & 0x3f3f;
|
|
681
|
-
} else {
|
|
682
|
-
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
|
|
683
|
-
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
|
|
684
|
-
}
|
|
685
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
|
686
|
-
const uint8_t * m = sc + 2;
|
|
687
|
-
|
|
688
|
-
for (int i = 0; i < QR4_K; ++i) {
|
|
689
|
-
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
|
690
|
-
d8[i] = bq8i->ds[0];
|
|
717
|
+
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
|
691
718
|
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
}
|
|
719
|
+
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
|
|
720
|
+
const int * q4 = (const int *) (bq4_K->qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
|
|
721
|
+
const uint16_t * scales = (const uint16_t *) bq4_K->scales;
|
|
696
722
|
|
|
697
|
-
return
|
|
723
|
+
return vec_dot_q4_K_q8_1_common(q4, scales, bq4_K->dm, bq8_1, iqs);
|
|
698
724
|
|
|
699
725
|
#else
|
|
700
726
|
|
|
@@ -180,10 +180,7 @@ static void rwkv_wkv7_f32_kernel(
|
|
|
180
180
|
}
|
|
181
181
|
|
|
182
182
|
void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
183
|
-
|
|
184
|
-
const ggml_tensor *src0 = dst->src[0];
|
|
185
|
-
const ggml_tensor *src1 = dst->src[1];
|
|
186
|
-
|
|
183
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/6);
|
|
187
184
|
const float* k_d = (const float*)dst->src[0]->data;
|
|
188
185
|
const float* v_d = (const float*)dst->src[1]->data;
|
|
189
186
|
const float* r_d = (const float*)dst->src[2]->data;
|
|
@@ -236,16 +233,10 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
|
236
233
|
});
|
|
237
234
|
});
|
|
238
235
|
}
|
|
239
|
-
|
|
240
|
-
GGML_UNUSED(src0);
|
|
241
|
-
GGML_UNUSED(src1);
|
|
242
236
|
}
|
|
243
237
|
|
|
244
238
|
void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
245
|
-
|
|
246
|
-
const ggml_tensor *src0 = dst->src[0];
|
|
247
|
-
const ggml_tensor *src1 = dst->src[1];
|
|
248
|
-
|
|
239
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/7);
|
|
249
240
|
const float* r_d = (const float*)dst->src[0]->data;
|
|
250
241
|
const float* w_d = (const float*)dst->src[1]->data;
|
|
251
242
|
const float* k_d = (const float*)dst->src[2]->data;
|
|
@@ -299,7 +290,4 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
|
|
299
290
|
});
|
|
300
291
|
});
|
|
301
292
|
}
|
|
302
|
-
|
|
303
|
-
GGML_UNUSED(src0);
|
|
304
|
-
GGML_UNUSED(src1);
|
|
305
293
|
}
|