@novastera-oss/llamarn 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/PureCppImpl.cpp +9 -27
- package/cpp/SystemUtils.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +60 -5
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
|
@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
|
|
|
49
49
|
int g_ggml_sycl_debug = 0;
|
|
50
50
|
int g_ggml_sycl_disable_optimize = 0;
|
|
51
51
|
int g_ggml_sycl_disable_graph = 0;
|
|
52
|
+
int g_ggml_sycl_disable_dnn = 0;
|
|
52
53
|
int g_ggml_sycl_prioritize_dmmv = 0;
|
|
53
54
|
|
|
54
55
|
static ggml_sycl_device_info ggml_sycl_init() {
|
|
@@ -196,12 +197,22 @@ static void ggml_check_sycl() try {
|
|
|
196
197
|
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
|
197
198
|
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
|
|
198
199
|
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
|
200
|
+
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
|
199
201
|
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
|
200
202
|
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
|
201
203
|
GGML_LOG_INFO("Running with Environment Variables:\n");
|
|
202
204
|
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
|
203
205
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
|
|
206
|
+
#ifdef GGML_SYCL_GRAPH
|
|
204
207
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
|
|
208
|
+
#else
|
|
209
|
+
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
|
|
210
|
+
#endif
|
|
211
|
+
#if GGML_SYCL_DNNL
|
|
212
|
+
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
|
|
213
|
+
#else
|
|
214
|
+
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
|
|
215
|
+
#endif
|
|
205
216
|
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
|
206
217
|
GGML_LOG_INFO("Build with Macros:\n");
|
|
207
218
|
#if defined(GGML_SYCL_FORCE_MMQ)
|
|
@@ -335,13 +346,15 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
335
346
|
static enum ggml_status
|
|
336
347
|
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
337
348
|
ggml_tensor *tensor) try {
|
|
349
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
350
|
+
debug_print_tensor(": tensor=", tensor, "\n");
|
|
338
351
|
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
|
339
352
|
|
|
340
353
|
if (tensor->view_src != NULL) {
|
|
341
354
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
342
355
|
return GGML_STATUS_SUCCESS;
|
|
343
356
|
}
|
|
344
|
-
if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
|
|
357
|
+
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
|
|
345
358
|
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
|
346
359
|
tensor->extra = extra;
|
|
347
360
|
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
|
|
@@ -370,20 +383,23 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
370
383
|
ggml_tensor *tensor,
|
|
371
384
|
const void *data, size_t offset,
|
|
372
385
|
size_t size) try {
|
|
373
|
-
|
|
386
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
387
|
+
debug_print_tensor(": tensor=", tensor);
|
|
388
|
+
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
374
389
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
|
375
390
|
ggml_sycl_set_device(ctx->device);
|
|
376
391
|
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
|
|
377
|
-
SYCL_CHECK(
|
|
378
|
-
|
|
392
|
+
SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
|
|
393
|
+
#ifndef _WIN32
|
|
379
394
|
// Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU.
|
|
380
395
|
// This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here.
|
|
381
|
-
char* host_buf = (char*)malloc(size);
|
|
396
|
+
char * host_buf = (char *) malloc(size);
|
|
382
397
|
memcpy(host_buf, data, size);
|
|
383
|
-
SYCL_CHECK(
|
|
384
|
-
CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
|
|
385
|
-
.wait()));
|
|
398
|
+
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait()));
|
|
386
399
|
free(host_buf);
|
|
400
|
+
#else
|
|
401
|
+
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait()));
|
|
402
|
+
#endif
|
|
387
403
|
}
|
|
388
404
|
catch (sycl::exception const &exc) {
|
|
389
405
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
@@ -395,7 +411,9 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
395
411
|
const ggml_tensor *tensor,
|
|
396
412
|
void *data, size_t offset,
|
|
397
413
|
size_t size) try {
|
|
398
|
-
|
|
414
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
415
|
+
debug_print_tensor(": tensor=", tensor);
|
|
416
|
+
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
399
417
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
|
400
418
|
|
|
401
419
|
ggml_sycl_set_device(ctx->device);
|
|
@@ -423,7 +441,12 @@ static bool
|
|
|
423
441
|
ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
|
424
442
|
const ggml_tensor *src,
|
|
425
443
|
ggml_tensor *dst) try {
|
|
426
|
-
|
|
444
|
+
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
|
|
445
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
446
|
+
debug_print_tensor(": dst=", dst);
|
|
447
|
+
debug_print_tensor(" src=", src);
|
|
448
|
+
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
|
449
|
+
if (is_cpy_supported) {
|
|
427
450
|
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
|
|
428
451
|
ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
|
|
429
452
|
|
|
@@ -480,7 +503,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
|
|
480
503
|
|
|
481
504
|
static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
|
|
482
505
|
uint8_t value) try {
|
|
483
|
-
|
|
506
|
+
GGML_SYCL_DEBUG("[SYCL] call %s: size=%zu\n", __func__, buffer->size);
|
|
507
|
+
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
|
484
508
|
|
|
485
509
|
ggml_sycl_set_device(ctx->device);
|
|
486
510
|
queue_ptr stream = ctx->stream;
|
|
@@ -499,7 +523,9 @@ catch (sycl::exception const &exc) {
|
|
|
499
523
|
|
|
500
524
|
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
|
|
501
525
|
size_t offset, size_t size) {
|
|
502
|
-
GGML_SYCL_DEBUG("
|
|
526
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
527
|
+
debug_print_tensor(": tensor=", tensor);
|
|
528
|
+
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
|
|
503
529
|
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
|
504
530
|
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
|
|
505
531
|
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
|
|
@@ -777,6 +803,8 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff
|
|
|
777
803
|
static enum ggml_status
|
|
778
804
|
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
779
805
|
ggml_tensor *tensor) try {
|
|
806
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
807
|
+
debug_print_tensor(": tensor=", tensor, "\n");
|
|
780
808
|
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
|
781
809
|
|
|
782
810
|
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
|
@@ -861,6 +889,9 @@ static void
|
|
|
861
889
|
ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
862
890
|
ggml_tensor *tensor, const void *data,
|
|
863
891
|
size_t offset, size_t size) try {
|
|
892
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
893
|
+
debug_print_tensor(": tensor=", tensor);
|
|
894
|
+
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
864
895
|
// split tensors must always be set in their entirety at once
|
|
865
896
|
GGML_ASSERT(offset == 0);
|
|
866
897
|
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
@@ -914,6 +945,9 @@ static void
|
|
|
914
945
|
ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
915
946
|
const ggml_tensor *tensor, void *data,
|
|
916
947
|
size_t offset, size_t size) try {
|
|
948
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
949
|
+
debug_print_tensor(": tensor=", tensor);
|
|
950
|
+
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
917
951
|
// split tensors must always be set in their entirety at once
|
|
918
952
|
GGML_ASSERT(offset == 0);
|
|
919
953
|
GGML_ASSERT(size == ggml_nbytes(tensor));
|
|
@@ -1985,31 +2019,30 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
1985
2019
|
|
|
1986
2020
|
const int64_t ne00 = src0->ne[0];
|
|
1987
2021
|
const int64_t ne10 = src1->ne[0];
|
|
1988
|
-
|
|
2022
|
+
GGML_ASSERT(ne00 == ne10);
|
|
1989
2023
|
|
|
1990
2024
|
const int64_t row_diff = row_high - row_low;
|
|
1991
2025
|
|
|
1992
2026
|
int id;
|
|
1993
2027
|
SYCL_CHECK(
|
|
1994
2028
|
CHECK_TRY_ERROR(id = get_current_device_id()));
|
|
1995
|
-
|
|
1996
|
-
const int64_t ne0 = dst->ne[0];
|
|
2029
|
+
|
|
2030
|
+
const int64_t ne0 = dst->ne[0]; // used by MKL only
|
|
1997
2031
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
|
1998
2032
|
// ldc == nrows of the matrix that cuBLAS writes into
|
|
1999
|
-
int ldc = id == ctx.device ? ne0 : row_diff;
|
|
2000
|
-
#endif
|
|
2033
|
+
int ldc = id == ctx.device ? ne0 : row_diff; // used by MKL only
|
|
2001
2034
|
|
|
2002
2035
|
#ifdef GGML_SYCL_F16
|
|
2003
2036
|
bool use_fp16 = true; // TODO(Yu) SYCL capability check
|
|
2004
2037
|
#else
|
|
2005
2038
|
bool use_fp16 = false;
|
|
2006
2039
|
#endif
|
|
2007
|
-
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
|
2008
|
-
|
|
2009
|
-
dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
2010
|
-
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
|
|
2040
|
+
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
|
|
2041
|
+
row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
|
2011
2042
|
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
|
|
2012
2043
|
if (src0->type != GGML_TYPE_F16) {
|
|
2044
|
+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
|
|
2045
|
+
" : converting src0 to fp16");
|
|
2013
2046
|
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
|
|
2014
2047
|
GGML_ASSERT(to_fp16_sycl != nullptr);
|
|
2015
2048
|
size_t ne = row_diff*ne00;
|
|
@@ -2022,6 +2055,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2022
2055
|
|
|
2023
2056
|
ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
|
|
2024
2057
|
if (src1->type != GGML_TYPE_F16) {
|
|
2058
|
+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
|
|
2059
|
+
" : converting src1 to fp16");
|
|
2025
2060
|
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
|
2026
2061
|
GGML_ASSERT(to_fp16_sycl != nullptr);
|
|
2027
2062
|
size_t ne = src1_ncols*ne10;
|
|
@@ -2033,37 +2068,47 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2033
2068
|
: src1_as_f16.get();
|
|
2034
2069
|
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
|
2035
2070
|
|
|
2036
|
-
#if
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
dst_f16.get(),
|
|
2045
|
-
|
|
2046
|
-
|
|
2047
|
-
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
|
2048
|
-
#else
|
|
2049
|
-
DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
|
|
2050
|
-
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
|
2051
|
-
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
|
|
2052
|
-
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
|
2053
|
-
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
|
2071
|
+
#if GGML_SYCL_DNNL
|
|
2072
|
+
if (!g_ggml_sycl_disable_dnn) {
|
|
2073
|
+
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
|
|
2074
|
+
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
|
2075
|
+
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
|
|
2076
|
+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
|
2077
|
+
" : converting dst to fp32");
|
|
2078
|
+
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
|
2079
|
+
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
|
2080
|
+
}
|
|
2081
|
+
else
|
|
2054
2082
|
#endif
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2083
|
+
{
|
|
2084
|
+
const sycl::half alpha_f16 = 1.0f;
|
|
2085
|
+
const sycl::half beta_f16 = 0.0f;
|
|
2086
|
+
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
|
2087
|
+
*stream, oneapi::math::transpose::trans,
|
|
2088
|
+
oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
|
|
2089
|
+
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
|
|
2090
|
+
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
|
|
2091
|
+
dst_f16.get(), dpct::library_data_t::real_half, ldc,
|
|
2092
|
+
dpct::library_data_t::real_half)));
|
|
2093
|
+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
|
2094
|
+
" : converting dst to fp32");
|
|
2095
|
+
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
|
2096
|
+
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
|
2097
|
+
}
|
|
2098
|
+
} else {
|
|
2058
2099
|
ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
|
|
2059
2100
|
ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
|
|
2060
2101
|
if (src0->type != GGML_TYPE_F32) {
|
|
2102
|
+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
|
2103
|
+
" : converting src0 to fp32");
|
|
2061
2104
|
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
|
|
2062
2105
|
GGML_ASSERT(to_fp32_sycl != nullptr);
|
|
2063
2106
|
src0_ddq_as_f32.alloc(row_diff*ne00);
|
|
2064
2107
|
to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
|
|
2065
2108
|
}
|
|
2066
2109
|
if (src1->type != GGML_TYPE_F32) {
|
|
2110
|
+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
|
2111
|
+
" : converting src1 to fp32");
|
|
2067
2112
|
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
|
|
2068
2113
|
GGML_ASSERT(to_fp32_sycl != nullptr);
|
|
2069
2114
|
src1_ddq_as_f32.alloc(src1_ncols*ne10);
|
|
@@ -2072,18 +2117,22 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2072
2117
|
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
|
2073
2118
|
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
|
|
2074
2119
|
|
|
2075
|
-
#if
|
|
2076
|
-
|
|
2077
|
-
|
|
2078
|
-
|
|
2079
|
-
|
|
2080
|
-
|
|
2081
|
-
|
|
2082
|
-
#else
|
|
2083
|
-
DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
|
|
2084
|
-
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
|
|
2085
|
-
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
|
2120
|
+
#if GGML_SYCL_DNNL
|
|
2121
|
+
if (!g_ggml_sycl_disable_dnn) {
|
|
2122
|
+
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
|
|
2123
|
+
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
|
|
2124
|
+
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
|
2125
|
+
}
|
|
2126
|
+
else
|
|
2086
2127
|
#endif
|
|
2128
|
+
{
|
|
2129
|
+
const float alpha = 1.0f;
|
|
2130
|
+
const float beta = 0.0f;
|
|
2131
|
+
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
|
|
2132
|
+
get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
|
|
2133
|
+
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
|
|
2134
|
+
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
|
|
2135
|
+
}
|
|
2087
2136
|
}
|
|
2088
2137
|
GGML_UNUSED(dst);
|
|
2089
2138
|
GGML_UNUSED(src1_ddq_i);
|
|
@@ -2095,8 +2144,7 @@ catch (sycl::exception const &exc) {
|
|
|
2095
2144
|
std::exit(1);
|
|
2096
2145
|
}
|
|
2097
2146
|
|
|
2098
|
-
static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
2099
|
-
|
|
2147
|
+
static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2100
2148
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
2101
2149
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
2102
2150
|
dpct::queue_ptr main_stream = ctx.stream();
|
|
@@ -2148,8 +2196,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
|
|
|
2148
2196
|
sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
|
|
2149
2197
|
}
|
|
2150
2198
|
|
|
2151
|
-
inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
2152
|
-
|
|
2199
|
+
inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2153
2200
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
2154
2201
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
2155
2202
|
dpct::queue_ptr main_stream = ctx.stream();
|
|
@@ -2180,8 +2227,7 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
|
|
|
2180
2227
|
argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream);
|
|
2181
2228
|
}
|
|
2182
2229
|
|
|
2183
|
-
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
2184
|
-
|
|
2230
|
+
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2185
2231
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
2186
2232
|
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
|
2187
2233
|
|
|
@@ -2196,8 +2242,7 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *ds
|
|
|
2196
2242
|
argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
|
|
2197
2243
|
}
|
|
2198
2244
|
|
|
2199
|
-
inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tensor *dst) {
|
|
2200
|
-
|
|
2245
|
+
inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2201
2246
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
2202
2247
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
2203
2248
|
dpct::queue_ptr main_stream = ctx.stream();
|
|
@@ -2214,8 +2259,7 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tens
|
|
|
2214
2259
|
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
|
|
2215
2260
|
}
|
|
2216
2261
|
|
|
2217
|
-
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
2218
|
-
|
|
2262
|
+
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2219
2263
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
2220
2264
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
2221
2265
|
dpct::queue_ptr main_stream = ctx.stream();
|
|
@@ -2402,6 +2446,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
|
2402
2446
|
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
|
2403
2447
|
|
|
2404
2448
|
if (src1_on_device && src1_is_contiguous) {
|
|
2449
|
+
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
|
2450
|
+
/*num_src=*/2, " : converting src1 to Q8_1");
|
|
2405
2451
|
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
|
2406
2452
|
/*
|
|
2407
2453
|
DPCT1010:90: SYCL uses exceptions to report errors and does not
|
|
@@ -2506,6 +2552,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
|
2506
2552
|
}
|
|
2507
2553
|
|
|
2508
2554
|
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
|
|
2555
|
+
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
|
2556
|
+
/*num_src=*/2, " : converting src1 to Q8_1");
|
|
2509
2557
|
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
|
2510
2558
|
/*
|
|
2511
2559
|
DPCT1010:92: SYCL uses exceptions to report errors and does
|
|
@@ -2600,33 +2648,28 @@ catch (sycl::exception const &exc) {
|
|
|
2600
2648
|
|
|
2601
2649
|
|
|
2602
2650
|
static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2603
|
-
|
|
2651
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
2604
2652
|
ggml_sycl_op_get_rows(ctx, dst);
|
|
2605
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
2606
2653
|
}
|
|
2607
2654
|
|
|
2608
2655
|
static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2609
|
-
|
|
2656
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
2610
2657
|
ggml_sycl_op_norm(ctx, dst);
|
|
2611
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
2612
2658
|
}
|
|
2613
2659
|
|
|
2614
2660
|
static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2615
|
-
|
|
2661
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
2616
2662
|
ggml_sycl_op_rms_norm(ctx, dst);
|
|
2617
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
2618
2663
|
}
|
|
2619
2664
|
|
|
2620
2665
|
static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2621
|
-
|
|
2666
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
2622
2667
|
ggml_sycl_op_l2_norm(ctx, dst);
|
|
2623
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
2624
2668
|
}
|
|
2625
2669
|
|
|
2626
2670
|
static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2627
|
-
|
|
2671
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
2628
2672
|
ggml_sycl_op_group_norm(ctx, dst);
|
|
2629
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
2630
2673
|
}
|
|
2631
2674
|
|
|
2632
2675
|
static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
|
@@ -2697,7 +2740,7 @@ catch (sycl::exception const &exc) {
|
|
|
2697
2740
|
std::exit(1);
|
|
2698
2741
|
}
|
|
2699
2742
|
|
|
2700
|
-
static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16,
|
|
2743
|
+
static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, void * dst,
|
|
2701
2744
|
const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23,
|
|
2702
2745
|
size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nbd2, size_t nbd3,
|
|
2703
2746
|
int64_t r2, int64_t r3, const sycl::nd_item<3> & item_ct1) {
|
|
@@ -2713,7 +2756,7 @@ static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::h
|
|
|
2713
2756
|
|
|
2714
2757
|
const uint8_t * src0_bytes = reinterpret_cast<const uint8_t *>(src0_as_f16);
|
|
2715
2758
|
const uint8_t * src1_bytes = reinterpret_cast<const uint8_t *>(src1_as_f16);
|
|
2716
|
-
uint8_t * dst_bytes =
|
|
2759
|
+
uint8_t * dst_bytes = static_cast<uint8_t *>(dst);
|
|
2717
2760
|
|
|
2718
2761
|
ptrs_src[0 * ne23 + i12 + i13 * ne12] = src0_bytes + i02 * nb02 + i03 * nb03;
|
|
2719
2762
|
ptrs_src[1 * ne23 + i12 + i13 * ne12] = src1_bytes + i12 * nb12 + i13 * nb13;
|
|
@@ -2726,6 +2769,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
|
2726
2769
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
2727
2770
|
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
|
|
2728
2771
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
2772
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
2729
2773
|
|
|
2730
2774
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
2731
2775
|
|
|
@@ -2753,6 +2797,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
|
2753
2797
|
|
|
2754
2798
|
// convert src1 to fp16
|
|
2755
2799
|
if (src1->type != GGML_TYPE_F16) {
|
|
2800
|
+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
|
|
2801
|
+
" : converting src1 to fp16");
|
|
2756
2802
|
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
|
|
2757
2803
|
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
|
|
2758
2804
|
const int64_t ne_src1 = ggml_nelements(src1);
|
|
@@ -2766,7 +2812,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
|
2766
2812
|
}
|
|
2767
2813
|
|
|
2768
2814
|
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
|
|
2769
|
-
char * dst_t = reinterpret_cast<char *>(dst_ddf);
|
|
2770
2815
|
|
|
2771
2816
|
dpct::library_data_t mkl_compute_type = dpct::library_data_t::real_float;
|
|
2772
2817
|
dpct::library_data_t mkl_data_type = dpct::library_data_t::real_float;
|
|
@@ -2783,42 +2828,83 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
|
|
2783
2828
|
|
|
2784
2829
|
GGML_ASSERT(ne12 % ne02 == 0);
|
|
2785
2830
|
GGML_ASSERT(ne13 % ne03 == 0);
|
|
2831
|
+
GGML_ASSERT(ne01 == static_cast<int64_t>(nb1/nb0));
|
|
2832
|
+
GGML_ASSERT(ne10 == ne00);
|
|
2786
2833
|
|
|
2787
2834
|
// broadcast factors
|
|
2788
2835
|
const int64_t r2 = ne12 / ne02;
|
|
2789
2836
|
const int64_t r3 = ne13 / ne03;
|
|
2790
2837
|
|
|
2791
|
-
|
|
2792
|
-
|
|
2793
|
-
|
|
2794
|
-
|
|
2795
|
-
|
|
2796
|
-
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
|
|
2804
|
-
|
|
2805
|
-
|
|
2806
|
-
|
|
2807
|
-
|
|
2808
|
-
|
|
2809
|
-
|
|
2810
|
-
|
|
2811
|
-
|
|
2812
|
-
|
|
2813
|
-
|
|
2838
|
+
#if GGML_SYCL_DNNL
|
|
2839
|
+
if (!g_ggml_sycl_disable_dnn) {
|
|
2840
|
+
auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
|
|
2841
|
+
(const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {
|
|
2842
|
+
|
|
2843
|
+
DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
|
|
2844
|
+
src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
|
|
2845
|
+
src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
|
|
2846
|
+
dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
|
|
2847
|
+
};
|
|
2848
|
+
|
|
2849
|
+
if (r2 == 1 && r3 == 1) {
|
|
2850
|
+
if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
|
2851
|
+
dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
|
|
2852
|
+
}
|
|
2853
|
+
else {
|
|
2854
|
+
for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
|
|
2855
|
+
const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
|
|
2856
|
+
const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
|
|
2857
|
+
float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
|
|
2858
|
+
dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
|
|
2859
|
+
}
|
|
2860
|
+
}
|
|
2861
|
+
} else {
|
|
2862
|
+
// iterate over batches from smaller set of matrices (matrix 0)
|
|
2863
|
+
for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
|
|
2864
|
+
for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
|
|
2865
|
+
const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
|
|
2866
|
+
const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
|
|
2867
|
+
float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
|
|
2868
|
+
dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
|
|
2869
|
+
}
|
|
2870
|
+
}
|
|
2871
|
+
}
|
|
2872
|
+
}
|
|
2873
|
+
else
|
|
2874
|
+
#endif
|
|
2875
|
+
{
|
|
2876
|
+
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
|
2877
|
+
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
|
2878
|
+
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
|
|
2879
|
+
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
|
2880
|
+
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
|
|
2881
|
+
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
|
|
2882
|
+
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
|
2883
|
+
} else {
|
|
2884
|
+
const int ne23 = ne12 * ne13;
|
|
2885
|
+
|
|
2886
|
+
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
|
|
2887
|
+
ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
|
|
2888
|
+
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
|
|
2889
|
+
|
|
2890
|
+
sycl::range<3> block_dims(1, ne12, ne13);
|
|
2891
|
+
queue->submit([&](sycl::handler & cgh) {
|
|
2892
|
+
const void ** ptrs_src_get = ptrs_src.get();
|
|
2893
|
+
void ** ptrs_dst_get = ptrs_dst.get();
|
|
2894
|
+
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
|
|
2895
|
+
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
|
|
2896
|
+
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
|
2897
|
+
k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
|
|
2898
|
+
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
|
|
2899
|
+
});
|
|
2814
2900
|
});
|
|
2815
|
-
});
|
|
2816
2901
|
|
|
2817
|
-
|
|
2818
|
-
|
|
2819
|
-
|
|
2820
|
-
|
|
2821
|
-
|
|
2902
|
+
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
|
2903
|
+
*queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
|
2904
|
+
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
|
|
2905
|
+
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
|
|
2906
|
+
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
|
|
2907
|
+
}
|
|
2822
2908
|
}
|
|
2823
2909
|
} catch (const sycl::exception & exc) {
|
|
2824
2910
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
|
|
@@ -2841,6 +2927,8 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
|
|
2841
2927
|
switch (type) {
|
|
2842
2928
|
case GGML_TYPE_Q4_0:
|
|
2843
2929
|
return true;
|
|
2930
|
+
case GGML_TYPE_Q4_K:
|
|
2931
|
+
return !g_ggml_sycl_prioritize_dmmv;
|
|
2844
2932
|
default:
|
|
2845
2933
|
return false;
|
|
2846
2934
|
}
|
|
@@ -2858,6 +2946,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
|
|
2858
2946
|
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
|
2859
2947
|
switch (type) {
|
|
2860
2948
|
case GGML_TYPE_Q4_0:
|
|
2949
|
+
case GGML_TYPE_Q4_K:
|
|
2861
2950
|
return true;
|
|
2862
2951
|
default:
|
|
2863
2952
|
return false;
|
|
@@ -2883,16 +2972,16 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
|
|
2883
2972
|
}
|
|
2884
2973
|
}
|
|
2885
2974
|
|
|
2886
|
-
static void
|
|
2887
|
-
|
|
2888
|
-
auto tmp_buf = sycl::malloc_shared<
|
|
2975
|
+
static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
|
|
2976
|
+
dpct::queue_ptr stream) {
|
|
2977
|
+
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
|
|
2889
2978
|
SYCL_CHECK(
|
|
2890
2979
|
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
|
|
2891
2980
|
.wait()));
|
|
2892
2981
|
GGML_ASSERT((size % sizeof(block_q4_0) == 0));
|
|
2893
2982
|
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
|
|
2894
2983
|
int offset_blks = offset / sizeof(block_q4_0);
|
|
2895
|
-
auto qs_ptr
|
|
2984
|
+
auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
|
|
2896
2985
|
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
|
|
2897
2986
|
|
|
2898
2987
|
stream->parallel_for(
|
|
@@ -2906,25 +2995,66 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
|
|
|
2906
2995
|
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
|
|
2907
2996
|
}
|
|
2908
2997
|
*(d_ptr + ib) = x[ib].d;
|
|
2909
|
-
});
|
|
2998
|
+
}).wait_and_throw();
|
|
2999
|
+
|
|
3000
|
+
sycl::free(tmp_buf, *stream);
|
|
3001
|
+
}
|
|
3002
|
+
|
|
3003
|
+
static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3004
|
+
GGML_ASSERT(size % sizeof(block_q4_K) == 0);
|
|
3005
|
+
GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
|
|
3006
|
+
|
|
3007
|
+
const int nblocks = size / sizeof(block_q4_K);
|
|
3008
|
+
|
|
3009
|
+
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
|
|
3010
|
+
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
|
|
3011
|
+
|
|
3012
|
+
auto * qs_ptr = data_device;
|
|
3013
|
+
auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
|
|
3014
|
+
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
|
|
3015
|
+
|
|
3016
|
+
stream->parallel_for(nblocks, [=](auto i) {
|
|
3017
|
+
const block_q4_K * x = (const block_q4_K *) tmp_buf;
|
|
3018
|
+
const int ib = i;
|
|
3019
|
+
|
|
3020
|
+
for (int j = 0; j < QK_K / 2; ++j) {
|
|
3021
|
+
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
|
3022
|
+
}
|
|
3023
|
+
|
|
3024
|
+
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
|
3025
|
+
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
|
3026
|
+
}
|
|
3027
|
+
|
|
3028
|
+
dm_ptr[ib] = x[ib].dm;
|
|
3029
|
+
}).wait_and_throw();
|
|
2910
3030
|
|
|
2911
3031
|
sycl::free(tmp_buf, *stream);
|
|
2912
3032
|
}
|
|
2913
3033
|
|
|
2914
3034
|
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
2915
|
-
|
|
3035
|
+
uint8_t * data_device = (uint8_t *) src0->data;
|
|
2916
3036
|
size_t ncols = src0->ne[0];
|
|
2917
3037
|
size_t nrows = src0->ne[1];
|
|
2918
3038
|
size_t size = ggml_nbytes(src0);
|
|
2919
3039
|
|
|
2920
|
-
|
|
3040
|
+
switch (src0->type) {
|
|
3041
|
+
case GGML_TYPE_Q4_0:
|
|
3042
|
+
reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
|
|
3043
|
+
break;
|
|
3044
|
+
case GGML_TYPE_Q4_K:
|
|
3045
|
+
reorder_qw_q4_k(data_device, size, 0, stream);
|
|
3046
|
+
break;
|
|
3047
|
+
default:
|
|
3048
|
+
GGML_ABORT("reorder_qw() called with unsupported type");
|
|
3049
|
+
break;
|
|
3050
|
+
}
|
|
2921
3051
|
}
|
|
2922
3052
|
|
|
2923
3053
|
static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
|
|
2924
3054
|
return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
|
|
2925
3055
|
ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
|
|
2926
3056
|
dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
|
|
2927
|
-
dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
|
|
3057
|
+
dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
|
|
2928
3058
|
}
|
|
2929
3059
|
|
|
2930
3060
|
static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
|
|
@@ -2960,8 +3090,19 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
|
|
|
2960
3090
|
extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
|
|
2961
3091
|
}
|
|
2962
3092
|
|
|
2963
|
-
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
2964
3093
|
|
|
3094
|
+
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3095
|
+
return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
|
3096
|
+
src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
|
|
3097
|
+
}
|
|
3098
|
+
|
|
3099
|
+
static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3100
|
+
return ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
|
|
3101
|
+
src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
|
3102
|
+
}
|
|
3103
|
+
|
|
3104
|
+
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
3105
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
2965
3106
|
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
|
|
2966
3107
|
int64_t min_compute_capability = INT_MAX;
|
|
2967
3108
|
|
|
@@ -2984,13 +3125,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
|
2984
3125
|
}
|
|
2985
3126
|
|
|
2986
3127
|
// check data types and tensor shapes for custom matrix multiplication kernels:
|
|
2987
|
-
bool use_dequantize_mul_mat_vec =
|
|
2988
|
-
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
2989
|
-
&& src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
|
|
3128
|
+
bool use_dequantize_mul_mat_vec = can_use_dequantize_mul_mat_vec(src0, src1, dst);
|
|
2990
3129
|
|
|
2991
|
-
bool use_mul_mat_vec_q =
|
|
2992
|
-
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
2993
|
-
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
|
3130
|
+
bool use_mul_mat_vec_q = can_use_mul_mat_vec_q(src0, src1, dst);
|
|
2994
3131
|
|
|
2995
3132
|
bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
|
|
2996
3133
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
|
@@ -3041,11 +3178,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
|
|
3041
3178
|
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
|
|
3042
3179
|
} else {
|
|
3043
3180
|
constexpr bool convert_src1_to_q8_1 = false;
|
|
3044
|
-
// MUL_MAT_SYCL supports reorder
|
|
3045
|
-
opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
|
|
3046
3181
|
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
|
|
3047
3182
|
}
|
|
3048
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
3049
3183
|
}
|
|
3050
3184
|
|
|
3051
3185
|
|
|
@@ -3116,6 +3250,7 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
|
|
|
3116
3250
|
|
|
3117
3251
|
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
3118
3252
|
ggml_tensor *dst) try {
|
|
3253
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
|
|
3119
3254
|
const ggml_tensor *src0 = dst->src[0];
|
|
3120
3255
|
const ggml_tensor *src1 = dst->src[1];
|
|
3121
3256
|
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
|
@@ -3284,37 +3419,45 @@ catch (sycl::exception const &exc) {
|
|
|
3284
3419
|
}
|
|
3285
3420
|
|
|
3286
3421
|
static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3422
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3287
3423
|
ggml_sycl_op_scale(ctx, dst);
|
|
3288
3424
|
}
|
|
3289
3425
|
|
|
3290
3426
|
static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3427
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3291
3428
|
ggml_sycl_op_diag_mask_inf(ctx, dst);
|
|
3292
3429
|
}
|
|
3293
3430
|
|
|
3294
3431
|
static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3432
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3295
3433
|
ggml_sycl_op_pool2d(ctx, dst);
|
|
3296
3434
|
}
|
|
3297
3435
|
|
|
3298
3436
|
static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3437
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
3299
3438
|
ggml_sycl_op_im2col(ctx, dst);
|
|
3300
3439
|
}
|
|
3301
3440
|
|
|
3302
3441
|
static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3442
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3303
3443
|
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
|
3304
3444
|
ggml_sycl_op_sum(ctx, dst);
|
|
3305
3445
|
}
|
|
3306
3446
|
|
|
3307
3447
|
static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3448
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3308
3449
|
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
|
3309
3450
|
ggml_sycl_op_sum_rows(ctx, dst);
|
|
3310
3451
|
}
|
|
3311
3452
|
|
|
3312
3453
|
static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3454
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3313
3455
|
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
|
3314
3456
|
ggml_sycl_op_argsort(ctx, dst);
|
|
3315
3457
|
}
|
|
3316
3458
|
|
|
3317
3459
|
static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
3460
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
3318
3461
|
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
|
|
3319
3462
|
ggml_sycl_op_argmax(ctx, dst);
|
|
3320
3463
|
}
|
|
@@ -3400,6 +3543,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
|
3400
3543
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
3401
3544
|
ggml_sycl_gelu_quick(ctx, dst);
|
|
3402
3545
|
break;
|
|
3546
|
+
case GGML_UNARY_OP_GELU_ERF:
|
|
3547
|
+
ggml_sycl_gelu_erf(ctx, dst);
|
|
3548
|
+
break;
|
|
3403
3549
|
case GGML_UNARY_OP_TANH:
|
|
3404
3550
|
ggml_sycl_tanh(ctx, dst);
|
|
3405
3551
|
break;
|
|
@@ -3608,6 +3754,9 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
|
|
3608
3754
|
ggml_tensor *tensor,
|
|
3609
3755
|
const void *data, size_t offset,
|
|
3610
3756
|
size_t size) try {
|
|
3757
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
3758
|
+
debug_print_tensor(": tensor=", tensor);
|
|
3759
|
+
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
3611
3760
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
3612
3761
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
3613
3762
|
|
|
@@ -3626,13 +3775,16 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
|
|
3626
3775
|
const ggml_tensor *tensor,
|
|
3627
3776
|
void *data, size_t offset,
|
|
3628
3777
|
size_t size) try {
|
|
3778
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
3779
|
+
debug_print_tensor(": tensor=", tensor);
|
|
3780
|
+
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
3629
3781
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
3630
3782
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
3631
3783
|
|
|
3632
3784
|
GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
|
3633
3785
|
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
|
|
3634
3786
|
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
|
|
3635
|
-
data, (const char *)tensor->data + offset, size)
|
|
3787
|
+
data, (const char *)tensor->data + offset, size)));
|
|
3636
3788
|
}
|
|
3637
3789
|
catch (sycl::exception const &exc) {
|
|
3638
3790
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
@@ -3644,7 +3796,13 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
|
|
3644
3796
|
const ggml_tensor *src,
|
|
3645
3797
|
ggml_tensor *dst) try {
|
|
3646
3798
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
3647
|
-
|
|
3799
|
+
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
|
|
3800
|
+
ggml_backend_buffer_is_sycl(src->buffer);
|
|
3801
|
+
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
3802
|
+
debug_print_tensor(": dst=", dst);
|
|
3803
|
+
debug_print_tensor(" src=", src);
|
|
3804
|
+
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
|
3805
|
+
if (is_cpy_supported) {
|
|
3648
3806
|
/*
|
|
3649
3807
|
DPCT1009:215: SYCL uses exceptions to report errors and does not use the
|
|
3650
3808
|
error codes. The original code was commented out and a warning string
|
|
@@ -3652,7 +3810,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
|
|
3652
3810
|
*/
|
|
3653
3811
|
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
|
|
3654
3812
|
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
|
|
3655
|
-
dst->data, src->data, ggml_nbytes(dst))
|
|
3813
|
+
dst->data, src->data, ggml_nbytes(dst))));
|
|
3656
3814
|
return true;
|
|
3657
3815
|
}
|
|
3658
3816
|
|
|
@@ -3665,6 +3823,7 @@ catch (sycl::exception const &exc) {
|
|
|
3665
3823
|
}
|
|
3666
3824
|
|
|
3667
3825
|
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
|
|
3826
|
+
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
|
3668
3827
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
3669
3828
|
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
|
|
3670
3829
|
SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
|
|
@@ -3701,11 +3860,43 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
|
|
|
3701
3860
|
}
|
|
3702
3861
|
}
|
|
3703
3862
|
|
|
3863
|
+
#ifdef GGML_SYCL_GRAPH
|
|
3864
|
+
// Decide whether a ggml compute graph may be recorded into a SYCL command graph.
//
// Returns false, after logging the reason with GGML_LOG_INFO, when either:
//   - ggml_sycl_info().device_count is greater than 1, or
//   - any node in cgraph has op GGML_OP_CONCAT or GGML_OP_MUL_MAT_ID.
// Returns true otherwise. Only cgraph->n_nodes and each node's op are read;
// the scan stops at the first unsupported node.
static bool check_graph_compatibility(ggml_cgraph * cgraph) {
|
|
3865
|
+
if (ggml_sycl_info().device_count > 1) {
|
|
3866
|
+
// A sycl_ex::command_graph object can only be created for a single device
|
|
3867
|
+
GGML_LOG_INFO("%s: disabling SYCL graphs due to multiple devices\n", __func__);
|
|
3868
|
+
return false;
|
|
3869
|
+
}
|
|
3870
|
+
|
|
3871
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
3872
|
+
const ggml_op node_op = cgraph->nodes[i]->op;
|
|
3873
|
+
switch (node_op) {
|
|
3874
|
+
default:
|
|
3875
|
+
break;
|
|
3876
|
+
case GGML_OP_CONCAT:
|
|
3877
|
+
// ggml_sycl_op_concat() does a blocking host wait after memcpy operations,
|
|
3878
|
+
// but wait() can't be called on the events returned by a queue recording
|
|
3879
|
+
// to a graph.
|
|
3880
|
+
[[fallthrough]];
|
|
3881
|
+
case GGML_OP_MUL_MAT_ID:
|
|
3882
|
+
// ggml_sycl_mul_mat_id() does a blocking host wait on the sycl queue after
|
|
3883
|
+
// submitting a memcpy operation, but wait() can't be called on a queue that
|
|
3884
|
+
// is recording to a graph.
|
|
3885
|
+
GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
|
|
3886
|
+
ggml_op_name(node_op));
|
|
3887
|
+
return false;
|
|
3888
|
+
}
|
|
3889
|
+
}
|
|
3890
|
+
return true;
|
|
3891
|
+
}
|
|
3892
|
+
#endif
|
|
3893
|
+
|
|
3704
3894
|
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
3705
3895
|
auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
|
|
3706
3896
|
|
|
3707
3897
|
#ifdef GGML_SYCL_GRAPH
|
|
3708
|
-
|
|
3898
|
+
bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph);
|
|
3899
|
+
if (use_sycl_graph) {
|
|
3709
3900
|
const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph);
|
|
3710
3901
|
if (!graph_support) {
|
|
3711
3902
|
GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
|
|
@@ -3713,7 +3904,8 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
|
|
|
3713
3904
|
return GGML_STATUS_SUCCESS;
|
|
3714
3905
|
}
|
|
3715
3906
|
|
|
3716
|
-
sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
|
|
3907
|
+
sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()), {sycl_ex::property::graph::assume_buffer_outlives_graph{}});
|
|
3908
|
+
|
|
3717
3909
|
model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
|
|
3718
3910
|
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
|
|
3719
3911
|
model_sycl_graph.end_recording();
|
|
@@ -3765,7 +3957,7 @@ catch (sycl::exception const &exc)
|
|
|
3765
3957
|
}
|
|
3766
3958
|
|
|
3767
3959
|
static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
|
|
3768
|
-
|
|
3960
|
+
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
|
3769
3961
|
sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
|
|
3770
3962
|
|
|
3771
3963
|
if (ggml_backend_is_sycl(backend)) {
|
|
@@ -3907,6 +4099,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3907
4099
|
case GGML_UNARY_OP_HARDSIGMOID:
|
|
3908
4100
|
case GGML_UNARY_OP_HARDSWISH:
|
|
3909
4101
|
case GGML_UNARY_OP_GELU_QUICK:
|
|
4102
|
+
case GGML_UNARY_OP_GELU_ERF:
|
|
3910
4103
|
case GGML_UNARY_OP_TANH:
|
|
3911
4104
|
case GGML_UNARY_OP_EXP:
|
|
3912
4105
|
case GGML_UNARY_OP_SGN:
|
|
@@ -4052,6 +4245,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4052
4245
|
#endif
|
|
4053
4246
|
case GGML_OP_NORM:
|
|
4054
4247
|
case GGML_OP_RMS_NORM:
|
|
4248
|
+
return true;
|
|
4055
4249
|
case GGML_OP_L2_NORM:
|
|
4056
4250
|
case GGML_OP_GROUP_NORM:
|
|
4057
4251
|
return ggml_is_contiguous(op->src[0]);
|
|
@@ -4160,6 +4354,7 @@ static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_bac
|
|
|
4160
4354
|
|
|
4161
4355
|
static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
|
|
4162
4356
|
GGML_UNUSED(dev);
|
|
4357
|
+
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
|
4163
4358
|
|
|
4164
4359
|
sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
|
|
4165
4360
|
SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
|