@novastera-oss/llamarn 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -14
- package/RNLlamaCpp.podspec +10 -3
- package/android/CMakeLists.txt +8 -0
- package/android/src/main/cpp/include/llama.h +62 -125
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/README.md +11 -3
- package/cpp/llama.cpp/build-xcframework.sh +1 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/common/arg.cpp +153 -113
- package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
- package/cpp/llama.cpp/common/chat-parser.h +117 -0
- package/cpp/llama.cpp/common/chat.cpp +847 -699
- package/cpp/llama.cpp/common/chat.h +73 -6
- package/cpp/llama.cpp/common/common.cpp +50 -82
- package/cpp/llama.cpp/common/common.h +21 -17
- package/cpp/llama.cpp/common/json-partial.cpp +255 -0
- package/cpp/llama.cpp/common/json-partial.h +37 -0
- package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
- package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
- package/cpp/llama.cpp/common/regex-partial.h +56 -0
- package/cpp/llama.cpp/common/sampling.cpp +7 -8
- package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
- package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
- package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
- package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
- package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
- package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
- package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
- package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
- package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
- package/cpp/llama.cpp/include/llama.h +62 -125
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
- package/cpp/llama.cpp/src/llama-arch.h +2 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
- package/cpp/llama.cpp/src/llama-context.cpp +340 -123
- package/cpp/llama.cpp/src/llama-context.h +30 -0
- package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-cparams.h +2 -0
- package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
- package/cpp/llama.cpp/src/llama-graph.h +52 -7
- package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
- package/cpp/llama.cpp/src/llama-hparams.h +37 -5
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
- package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
- package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
- package/cpp/llama.cpp/src/llama-memory.h +4 -3
- package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
- package/cpp/llama.cpp/src/llama-model.cpp +529 -172
- package/cpp/llama.cpp/src/llama-model.h +6 -1
- package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
- package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
- package/cpp/llama.cpp/src/llama-vocab.h +6 -0
- package/cpp/llama.cpp/src/llama.cpp +14 -0
- package/cpp/rn-completion.cpp +4 -2
- package/ios/include/chat.h +73 -6
- package/ios/include/common/minja/chat-template.hpp +9 -5
- package/ios/include/common/minja/minja.hpp +69 -36
- package/ios/include/common.h +21 -17
- package/ios/include/llama.h +62 -125
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/common/stb_image.h +0 -7988
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
|
@@ -49,35 +49,38 @@ endif()
|
|
|
49
49
|
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
|
|
50
50
|
|
|
51
51
|
# Link against oneDNN
|
|
52
|
-
find_package(DNNL)
|
|
53
52
|
set(GGML_SYCL_DNNL 0)
|
|
54
|
-
if(
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
53
|
+
if(GGML_SYCL_DNN)
|
|
54
|
+
find_package(DNNL)
|
|
55
|
+
if(DNNL_FOUND)
|
|
56
|
+
if (NOT DEFINED DNNL_GPU_VENDOR)
|
|
57
|
+
# default to intel target
|
|
58
|
+
set(DNNL_GPU_VENDOR "INTEL")
|
|
59
|
+
if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
|
|
60
|
+
message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
|
|
61
|
+
endif()
|
|
61
62
|
endif()
|
|
62
|
-
endif()
|
|
63
63
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
64
|
+
# Verify oneDNN was compiled for the same target as llama
|
|
65
|
+
if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
|
|
66
|
+
target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
|
|
67
|
+
set(GGML_SYCL_DNNL 1)
|
|
68
|
+
get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
|
|
69
|
+
foreach(CONFIG ${CONFIGS})
|
|
70
|
+
get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
|
|
71
|
+
message(STATUS "Found oneDNN: ${DNNL_LIB}")
|
|
72
|
+
endforeach()
|
|
73
|
+
else()
|
|
74
|
+
message(WARNING
|
|
75
|
+
"oneDNN must be compiled for the same target as llama.cpp.
|
|
76
|
+
llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
|
|
77
|
+
Disabling oneDNN support.")
|
|
78
|
+
endif()
|
|
73
79
|
else()
|
|
74
|
-
message(
|
|
75
|
-
"oneDNN must be compiled for the same target as llama.cpp.
|
|
76
|
-
llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
|
|
77
|
-
Disabling oneDNN support.")
|
|
80
|
+
message(STATUS "oneDNN not found, disabling oneDNN support")
|
|
78
81
|
endif()
|
|
79
82
|
else()
|
|
80
|
-
message(STATUS "oneDNN
|
|
83
|
+
message(STATUS "oneDNN support disabled by the user")
|
|
81
84
|
endif()
|
|
82
85
|
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
|
|
83
86
|
|
|
@@ -108,6 +111,9 @@ endif()
|
|
|
108
111
|
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
|
109
112
|
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
|
|
110
113
|
# See https://github.com/uxlfoundation/oneMath/issues/654
|
|
114
|
+
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
|
115
|
+
set(SYCL_COMPILER ON)
|
|
116
|
+
endif()
|
|
111
117
|
find_package(MKL REQUIRED)
|
|
112
118
|
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
|
|
113
119
|
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
|
|
@@ -319,32 +319,27 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *ds
|
|
|
319
319
|
|
|
320
320
|
|
|
321
321
|
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
322
|
-
|
|
322
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
323
323
|
ggml_sycl_op_add(ctx, dst);
|
|
324
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
325
324
|
}
|
|
326
325
|
|
|
327
326
|
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
328
|
-
|
|
327
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
329
328
|
ggml_sycl_op_sub(ctx, dst);
|
|
330
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
331
329
|
}
|
|
332
330
|
|
|
333
331
|
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
334
|
-
|
|
332
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
335
333
|
ggml_sycl_op_mul(ctx, dst);
|
|
336
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
337
334
|
}
|
|
338
335
|
|
|
339
336
|
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
340
|
-
|
|
337
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
341
338
|
ggml_sycl_op_div(ctx, dst);
|
|
342
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
343
339
|
}
|
|
344
340
|
|
|
345
341
|
void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
346
|
-
|
|
342
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
347
343
|
ggml_sycl_op_repeat(ctx, dst);
|
|
348
|
-
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
349
344
|
}
|
|
350
345
|
|
|
@@ -13,8 +13,10 @@
|
|
|
13
13
|
#ifndef GGML_SYCL_COMMON_HPP
|
|
14
14
|
#define GGML_SYCL_COMMON_HPP
|
|
15
15
|
|
|
16
|
+
#include <cstddef>
|
|
16
17
|
#include <fstream>
|
|
17
18
|
#include <iostream>
|
|
19
|
+
#include <string>
|
|
18
20
|
|
|
19
21
|
#include "dpct/helper.hpp"
|
|
20
22
|
#include "ggml-sycl.h"
|
|
@@ -44,11 +46,20 @@ extern int g_ggml_sycl_debug;
|
|
|
44
46
|
extern int g_ggml_sycl_disable_optimize;
|
|
45
47
|
extern int g_ggml_sycl_prioritize_dmmv;
|
|
46
48
|
|
|
47
|
-
#
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
49
|
+
#if defined(__clang__) && __has_builtin(__builtin_expect)
|
|
50
|
+
// Hint the optimizer to pipeline the more likely following instruction in branches
|
|
51
|
+
# define LIKELY(expr) __builtin_expect(expr, true)
|
|
52
|
+
# define UNLIKELY(expr) __builtin_expect(expr, false)
|
|
53
|
+
#else
|
|
54
|
+
# define LIKELY(expr) (expr)
|
|
55
|
+
# define UNLIKELY(expr) (expr)
|
|
56
|
+
#endif
|
|
57
|
+
|
|
58
|
+
#define GGML_SYCL_DEBUG(...) \
|
|
59
|
+
do { \
|
|
60
|
+
if (UNLIKELY(g_ggml_sycl_debug)) \
|
|
61
|
+
fprintf(stderr, __VA_ARGS__); \
|
|
62
|
+
} while (0)
|
|
52
63
|
|
|
53
64
|
#define CHECK_TRY_ERROR(expr) \
|
|
54
65
|
[&]() { \
|
|
@@ -471,6 +482,19 @@ static __dpct_inline__ float warp_reduce_max(float x,
|
|
|
471
482
|
return x;
|
|
472
483
|
}
|
|
473
484
|
|
|
485
|
+
/* Helper for Computing the linear offset of a ggml_tensor given
|
|
486
|
+
per-dimension sizes, strides, and indices */
|
|
487
|
+
template<int N>
|
|
488
|
+
__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
|
|
489
|
+
size_t offset = 0;
|
|
490
|
+
#pragma unroll
|
|
491
|
+
for (int i = 0; i < N; i++) {
|
|
492
|
+
auto index_i = indices[i];
|
|
493
|
+
offset += strides[i] * index_i;
|
|
494
|
+
}
|
|
495
|
+
return offset;
|
|
496
|
+
}
|
|
497
|
+
|
|
474
498
|
// Helper for vec loading aligned data
|
|
475
499
|
template <typename Tp, int n>
|
|
476
500
|
inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
|
|
@@ -490,4 +514,76 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
|
|
|
490
514
|
}
|
|
491
515
|
|
|
492
516
|
bool gpu_has_xmx(sycl::device &dev);
|
|
517
|
+
|
|
518
|
+
template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
|
|
519
|
+
if (LIKELY(!g_ggml_sycl_debug)) {
|
|
520
|
+
return;
|
|
521
|
+
}
|
|
522
|
+
std::stringstream ss;
|
|
523
|
+
ss << prefix << "=[";
|
|
524
|
+
for (std::size_t i = 0; i < N - 1; ++i) {
|
|
525
|
+
ss << array[i] << ", ";
|
|
526
|
+
}
|
|
527
|
+
if constexpr (N > 0) {
|
|
528
|
+
ss << array[N - 1];
|
|
529
|
+
}
|
|
530
|
+
ss << "]";
|
|
531
|
+
GGML_SYCL_DEBUG("%s", ss.str().c_str());
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
|
|
535
|
+
const std::string & suffix = "") {
|
|
536
|
+
if (LIKELY(!g_ggml_sycl_debug)) {
|
|
537
|
+
return;
|
|
538
|
+
}
|
|
539
|
+
GGML_SYCL_DEBUG("%s=", prefix.c_str());
|
|
540
|
+
if (tensor) {
|
|
541
|
+
GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
|
|
542
|
+
debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
|
|
543
|
+
debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
|
|
544
|
+
if (!ggml_is_contiguous(tensor)) {
|
|
545
|
+
GGML_SYCL_DEBUG(";strided");
|
|
546
|
+
}
|
|
547
|
+
if (ggml_is_permuted(tensor)) {
|
|
548
|
+
GGML_SYCL_DEBUG(";permuted");
|
|
549
|
+
}
|
|
550
|
+
} else {
|
|
551
|
+
GGML_SYCL_DEBUG("nullptr");
|
|
552
|
+
}
|
|
553
|
+
GGML_SYCL_DEBUG("%s", suffix.c_str());
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
// Use scope_op_debug_print to log operations coming from running a model
|
|
557
|
+
struct scope_op_debug_print {
|
|
558
|
+
// Use string_views to avoid the cost of creating a string and concatenating them
|
|
559
|
+
// string_views must be alive for as long as the object is alive
|
|
560
|
+
// scope_op_debug_print are used with string literals in practice which are stored in constant space so always accessible
|
|
561
|
+
scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
|
|
562
|
+
std::size_t num_src, const std::string_view & suffix = "") :
|
|
563
|
+
func(func),
|
|
564
|
+
func_suffix(func_suffix) {
|
|
565
|
+
if (LIKELY(!g_ggml_sycl_debug)) {
|
|
566
|
+
return;
|
|
567
|
+
}
|
|
568
|
+
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
|
|
569
|
+
debug_print_tensor(" dst", dst);
|
|
570
|
+
if (dst) {
|
|
571
|
+
for (std::size_t i = 0; i < num_src; ++i) {
|
|
572
|
+
debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
GGML_SYCL_DEBUG("%s\n", suffix.data());
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
|
|
579
|
+
const std::string_view & suffix = "") :
|
|
580
|
+
scope_op_debug_print(func, "", dst, num_src, suffix) {}
|
|
581
|
+
|
|
582
|
+
~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }
|
|
583
|
+
|
|
584
|
+
private:
|
|
585
|
+
std::string_view func;
|
|
586
|
+
std::string_view func_suffix;
|
|
587
|
+
};
|
|
588
|
+
|
|
493
589
|
#endif // GGML_SYCL_COMMON_HPP
|
|
@@ -159,39 +159,37 @@ static void concat_f32_sycl_non_cont(
|
|
|
159
159
|
}
|
|
160
160
|
|
|
161
161
|
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
162
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
163
|
+
const ggml_tensor * src0 = dst->src[0];
|
|
164
|
+
const ggml_tensor * src1 = dst->src[1];
|
|
165
|
+
queue_ptr stream = ctx.stream();
|
|
166
|
+
|
|
167
|
+
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
|
168
|
+
|
|
169
|
+
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
|
170
|
+
const float * src0_d = (const float *) src0->data;
|
|
171
|
+
const float * src1_d = (const float *) src1->data;
|
|
172
|
+
|
|
173
|
+
float * dst_d = (float *) dst->data;
|
|
174
|
+
|
|
175
|
+
if (dim != 3) {
|
|
176
|
+
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
|
177
|
+
concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
|
|
178
|
+
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
|
|
179
|
+
dst->ne[1], dst->ne[2], dim, stream);
|
|
180
|
+
}
|
|
181
|
+
} else {
|
|
182
|
+
const size_t size0 = ggml_nbytes(src0);
|
|
183
|
+
const size_t size1 = ggml_nbytes(src1);
|
|
184
|
+
|
|
185
|
+
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
|
|
186
|
+
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
|
|
187
|
+
}
|
|
181
188
|
} else {
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
|
|
189
|
+
concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
|
|
190
|
+
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
|
|
191
|
+
src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
|
192
|
+
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
|
|
193
|
+
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
|
|
188
194
|
}
|
|
189
|
-
} else
|
|
190
|
-
concat_f32_sycl_non_cont(
|
|
191
|
-
stream, (const char *)src0->data, (const char *)src1->data,
|
|
192
|
-
(char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
|
193
|
-
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
|
|
194
|
-
src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
|
|
195
|
-
src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
|
|
196
|
-
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
|
|
197
195
|
}
|
|
@@ -72,6 +72,7 @@ static void conv_transpose_1d_f32_f32_sycl(
|
|
|
72
72
|
}
|
|
73
73
|
|
|
74
74
|
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
75
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
|
75
76
|
const ggml_tensor *src0 = dst->src[0];
|
|
76
77
|
const ggml_tensor *src1 = dst->src[1];
|
|
77
78
|
const float * src0_d = (const float *)src0->data;
|
|
@@ -183,6 +183,24 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
183
183
|
}
|
|
184
184
|
}
|
|
185
185
|
|
|
186
|
+
template <typename dst_t>
|
|
187
|
+
static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
|
|
188
|
+
const int64_t nb = k / QK_K;
|
|
189
|
+
const size_t local_size = 32;
|
|
190
|
+
const size_t global_size = nb * local_size;
|
|
191
|
+
|
|
192
|
+
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
193
|
+
|
|
194
|
+
stream->submit([&](sycl::handler & cgh) {
|
|
195
|
+
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
|
|
196
|
+
|
|
197
|
+
cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
|
|
198
|
+
[=](sycl::nd_item<1> item_ct1) {
|
|
199
|
+
dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
|
|
200
|
+
});
|
|
201
|
+
});
|
|
202
|
+
}
|
|
203
|
+
|
|
186
204
|
template <typename dst_t>
|
|
187
205
|
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
188
206
|
dpct::queue_ptr stream) {
|
|
@@ -504,7 +522,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
|
|
504
522
|
case GGML_TYPE_Q3_K:
|
|
505
523
|
return dequantize_row_q3_K_sycl;
|
|
506
524
|
case GGML_TYPE_Q4_K:
|
|
507
|
-
|
|
525
|
+
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
|
526
|
+
return dequantize_row_q4_K_sycl_reorder;
|
|
527
|
+
} else {
|
|
528
|
+
return dequantize_row_q4_K_sycl;
|
|
529
|
+
}
|
|
508
530
|
case GGML_TYPE_Q5_K:
|
|
509
531
|
return dequantize_row_q5_K_sycl;
|
|
510
532
|
case GGML_TYPE_Q6_K:
|
|
@@ -556,7 +578,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
|
|
556
578
|
case GGML_TYPE_Q3_K:
|
|
557
579
|
return dequantize_row_q3_K_sycl;
|
|
558
580
|
case GGML_TYPE_Q4_K:
|
|
559
|
-
|
|
581
|
+
if (dst->src[0]->extra &&
|
|
582
|
+
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
|
583
|
+
return dequantize_row_q4_K_sycl_reorder;
|
|
584
|
+
} else {
|
|
585
|
+
return dequantize_row_q4_K_sycl;
|
|
586
|
+
}
|
|
560
587
|
case GGML_TYPE_Q5_K:
|
|
561
588
|
return dequantize_row_q5_K_sycl;
|
|
562
589
|
case GGML_TYPE_Q6_K:
|
|
@@ -616,6 +616,9 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
|
|
|
616
616
|
}
|
|
617
617
|
|
|
618
618
|
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
|
619
|
+
// Unlike other operators, ggml_sycl_cpy takes 2 distinct tensors instead of taking a dst ggml_tensor and relying on its src field
|
|
620
|
+
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
|
|
621
|
+
std::string(" src0 type=") + ggml_type_name(src0->type));
|
|
619
622
|
const int64_t ne = ggml_nelements(src0);
|
|
620
623
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
|
621
624
|
|
|
@@ -629,8 +632,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
|
|
|
629
632
|
|
|
630
633
|
char * src0_ddc = (char *) src0->data;
|
|
631
634
|
char * src1_ddc = (char *) src1->data;
|
|
632
|
-
GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
|
|
633
|
-
ggml_type_name(src1->type));
|
|
634
635
|
|
|
635
636
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
|
636
637
|
ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
|
@@ -694,8 +695,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
|
|
|
694
695
|
}
|
|
695
696
|
|
|
696
697
|
void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
697
|
-
|
|
698
|
-
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
|
698
|
+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
|
699
699
|
ggml_sycl_cpy(ctx, dst->src[0], dst);
|
|
700
|
-
GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
|
|
701
700
|
}
|
|
@@ -357,6 +357,28 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
|
|
|
357
357
|
}
|
|
358
358
|
#endif
|
|
359
359
|
|
|
360
|
+
template <typename dst_t>
|
|
361
|
+
inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
|
|
362
|
+
const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) {
|
|
363
|
+
const int is = 2 * il;
|
|
364
|
+
constexpr int n = 4;
|
|
365
|
+
|
|
366
|
+
uint8_t sc, m;
|
|
367
|
+
get_scale_min_k4(is + 0, scales_local, sc, m);
|
|
368
|
+
const float d1 = dall * sc;
|
|
369
|
+
const float m1 = dmin * m;
|
|
370
|
+
|
|
371
|
+
get_scale_min_k4(is + 1, scales_local, sc, m);
|
|
372
|
+
const float d2 = dall * sc;
|
|
373
|
+
const float m2 = dmin * m;
|
|
374
|
+
|
|
375
|
+
sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
|
|
376
|
+
for (int l = 0; l < n; ++l) {
|
|
377
|
+
y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
|
|
378
|
+
y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
|
|
360
382
|
template<typename dst_t>
|
|
361
383
|
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
362
384
|
uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
|
|
@@ -365,36 +387,22 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
365
387
|
const int64_t i = item_ct1.get_group(2);
|
|
366
388
|
|
|
367
389
|
#if QK_K == 256
|
|
368
|
-
// assume 32 threads
|
|
369
390
|
const int64_t tid = item_ct1.get_local_id(2);
|
|
370
|
-
const int64_t il = tid/8;
|
|
371
|
-
const int64_t ir = tid%8;
|
|
372
|
-
const int64_t is = 2*il;
|
|
373
|
-
const int64_t n = 4;
|
|
391
|
+
const int64_t il = tid / 8;
|
|
392
|
+
const int64_t ir = tid % 8;
|
|
374
393
|
|
|
375
|
-
dst_t * y = yy + i*QK_K + 64*il +
|
|
394
|
+
dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
|
|
376
395
|
|
|
377
396
|
const sycl::half2 dm = x[i].dm;
|
|
378
397
|
const float dall = dm[0];
|
|
379
398
|
const float dmin = dm[1];
|
|
380
399
|
|
|
381
|
-
if (tid < 12)
|
|
400
|
+
if (tid < 12) {
|
|
382
401
|
scales_local[tid] = x[i].scales[tid];
|
|
383
|
-
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
384
|
-
|
|
385
|
-
uint8_t sc, m;
|
|
386
|
-
get_scale_min_k4(is + 0, scales_local, sc, m);
|
|
387
|
-
const float d1 = dall * sc;
|
|
388
|
-
const float m1 = dmin * m;
|
|
389
|
-
get_scale_min_k4(is + 1, scales_local, sc, m);
|
|
390
|
-
const float d2 = dall * sc;
|
|
391
|
-
const float m2 = dmin * m;
|
|
392
|
-
|
|
393
|
-
sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
|
|
394
|
-
for (int l = 0; l < n; ++l) {
|
|
395
|
-
y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
|
|
396
|
-
y[l +32] = d2 * (q_vec[l] >> 4) - m2;
|
|
397
402
|
}
|
|
403
|
+
|
|
404
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
405
|
+
dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir);
|
|
398
406
|
#else
|
|
399
407
|
const int64_t tid = item_ct1.get_local_id(2);
|
|
400
408
|
const uint8_t * q = x[i].qs;
|
|
@@ -406,6 +414,36 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
406
414
|
#endif
|
|
407
415
|
}
|
|
408
416
|
|
|
417
|
+
template <typename dst_t>
|
|
418
|
+
static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
|
|
419
|
+
const sycl::nd_item<1> & item_ct1, int64_t nb) {
|
|
420
|
+
const int64_t i = item_ct1.get_group(0); // block index
|
|
421
|
+
const int64_t tid = item_ct1.get_local_id(0); // thread index within block
|
|
422
|
+
const int64_t il = tid / 8;
|
|
423
|
+
const int64_t ir = tid % 8;
|
|
424
|
+
|
|
425
|
+
dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
|
|
426
|
+
|
|
427
|
+
const uint8_t * base = static_cast<const uint8_t *>(vx);
|
|
428
|
+
const size_t qs_offset = i * (QK_K / 2);
|
|
429
|
+
const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
|
|
430
|
+
const size_t dm_offset = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);
|
|
431
|
+
|
|
432
|
+
const uint8_t * qs_ptr = base + qs_offset;
|
|
433
|
+
const uint8_t * scales_ptr = base + scales_offset;
|
|
434
|
+
ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
|
|
435
|
+
|
|
436
|
+
const float dall = dm_values.x();
|
|
437
|
+
const float dmin = dm_values.y();
|
|
438
|
+
|
|
439
|
+
if (tid < 12) {
|
|
440
|
+
scales_local[tid] = scales_ptr[tid];
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
444
|
+
dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir);
|
|
445
|
+
}
|
|
446
|
+
|
|
409
447
|
template<typename dst_t>
|
|
410
448
|
static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
411
449
|
const sycl::nd_item<3> &item_ct1) {
|
|
@@ -1092,6 +1092,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
|
1092
1092
|
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
|
1093
1093
|
|
|
1094
1094
|
if (src1_convert_f16) {
|
|
1095
|
+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
|
|
1096
|
+
" : converting src1 to fp16");
|
|
1095
1097
|
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
|
1096
1098
|
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
|
1097
1099
|
GGML_ASSERT(to_fp16_sycl != nullptr);
|
|
@@ -1129,7 +1131,13 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
|
1129
1131
|
dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
|
1130
1132
|
break;
|
|
1131
1133
|
case GGML_TYPE_Q4_K:
|
|
1132
|
-
|
|
1134
|
+
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
|
1135
|
+
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
|
1136
|
+
// reorder is currently not supported for dmmv
|
|
1137
|
+
GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
|
|
1138
|
+
} else {
|
|
1139
|
+
dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
|
1140
|
+
}
|
|
1133
1141
|
break;
|
|
1134
1142
|
case GGML_TYPE_Q5_K:
|
|
1135
1143
|
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|