@novastera-oss/llamarn 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +134 -36
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -2
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +30 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +50 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +134 -36
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
- package/cpp/llama.cpp/src/llama-batch.h +36 -11
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +313 -213
- package/cpp/llama.cpp/src/llama-context.h +16 -12
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
- package/cpp/llama.cpp/src/llama-graph.h +90 -34
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
- package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +64 -23
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +726 -141
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/llama.h +134 -36
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
|
@@ -347,14 +347,15 @@ static enum ggml_status
|
|
|
347
347
|
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
348
348
|
ggml_tensor *tensor) try {
|
|
349
349
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
350
|
-
|
|
350
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
|
351
351
|
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
|
352
352
|
|
|
353
353
|
if (tensor->view_src != NULL) {
|
|
354
354
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
355
355
|
return GGML_STATUS_SUCCESS;
|
|
356
356
|
}
|
|
357
|
-
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K
|
|
357
|
+
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
|
|
358
|
+
!g_ggml_sycl_disable_optimize) {
|
|
358
359
|
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
|
359
360
|
tensor->extra = extra;
|
|
360
361
|
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
|
|
@@ -384,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
384
385
|
const void *data, size_t offset,
|
|
385
386
|
size_t size) try {
|
|
386
387
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
387
|
-
|
|
388
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
|
388
389
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
389
390
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
|
390
391
|
ggml_sycl_set_device(ctx->device);
|
|
@@ -412,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
412
413
|
void *data, size_t offset,
|
|
413
414
|
size_t size) try {
|
|
414
415
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
415
|
-
|
|
416
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
|
416
417
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
417
418
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
|
418
419
|
|
|
@@ -443,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
|
|
443
444
|
ggml_tensor *dst) try {
|
|
444
445
|
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
|
|
445
446
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
446
|
-
|
|
447
|
-
|
|
447
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
|
448
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
|
448
449
|
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
|
449
450
|
if (is_cpy_supported) {
|
|
450
451
|
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
|
|
@@ -524,7 +525,7 @@ catch (sycl::exception const &exc) {
|
|
|
524
525
|
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
|
|
525
526
|
size_t offset, size_t size) {
|
|
526
527
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
527
|
-
|
|
528
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
|
528
529
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
|
|
529
530
|
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
|
530
531
|
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
|
|
@@ -804,7 +805,7 @@ static enum ggml_status
|
|
|
804
805
|
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
805
806
|
ggml_tensor *tensor) try {
|
|
806
807
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
807
|
-
|
|
808
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
|
808
809
|
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
|
809
810
|
|
|
810
811
|
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
|
@@ -890,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
890
891
|
ggml_tensor *tensor, const void *data,
|
|
891
892
|
size_t offset, size_t size) try {
|
|
892
893
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
893
|
-
|
|
894
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
|
894
895
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
895
896
|
// split tensors must always be set in their entirety at once
|
|
896
897
|
GGML_ASSERT(offset == 0);
|
|
@@ -946,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
946
947
|
const ggml_tensor *tensor, void *data,
|
|
947
948
|
size_t offset, size_t size) try {
|
|
948
949
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
949
|
-
|
|
950
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
|
950
951
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
951
952
|
// split tensors must always be set in their entirety at once
|
|
952
953
|
GGML_ASSERT(offset == 0);
|
|
@@ -1434,6 +1435,59 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
|
|
|
1434
1435
|
reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
|
|
1435
1436
|
}
|
|
1436
1437
|
|
|
1438
|
+
template <int ElementsPerWI>
|
|
1439
|
+
static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor,
|
|
1440
|
+
const int kx, const int kx_padded, const sycl::nd_item<1> & it) {
|
|
1441
|
+
/*
|
|
1442
|
+
Quantizes and reorders the resultant q8 tensor in a per row fashion
|
|
1443
|
+
Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
|
|
1444
|
+
*/
|
|
1445
|
+
|
|
1446
|
+
auto subgroup_id = it.get_group(0);
|
|
1447
|
+
auto wi_id = it.get_local_id(0);
|
|
1448
|
+
|
|
1449
|
+
const int num_blocks_per_row = kx / QK8_1;
|
|
1450
|
+
auto row = subgroup_id / num_blocks_per_row;
|
|
1451
|
+
auto col = subgroup_id % num_blocks_per_row;
|
|
1452
|
+
|
|
1453
|
+
auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
|
|
1454
|
+
auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;
|
|
1455
|
+
|
|
1456
|
+
auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
|
|
1457
|
+
auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
|
|
1458
|
+
|
|
1459
|
+
sycl::vec<float, ElementsPerWI> wi_f32_vals;
|
|
1460
|
+
sycl::vec<int8_t, ElementsPerWI> quantized_values;
|
|
1461
|
+
|
|
1462
|
+
auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
|
|
1463
|
+
wi_f32_vals = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);
|
|
1464
|
+
|
|
1465
|
+
float sum = 0.0f;
|
|
1466
|
+
float amax = 0.0f;
|
|
1467
|
+
|
|
1468
|
+
#pragma unroll(ElementsPerWI)
|
|
1469
|
+
for (int i = 0; i < ElementsPerWI; i++) {
|
|
1470
|
+
sum += wi_f32_vals[i];
|
|
1471
|
+
amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
|
|
1472
|
+
quantized_values[i] = 0;
|
|
1473
|
+
}
|
|
1474
|
+
sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus<float>());
|
|
1475
|
+
amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum<float>());
|
|
1476
|
+
float d = amax == 0 ? 1 : amax / 127;
|
|
1477
|
+
|
|
1478
|
+
#pragma unroll(ElementsPerWI)
|
|
1479
|
+
for (int i = 0; i < ElementsPerWI; i++) {
|
|
1480
|
+
quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
|
|
1481
|
+
}
|
|
1482
|
+
|
|
1483
|
+
d = amax == 0 ? 0 : d;
|
|
1484
|
+
|
|
1485
|
+
*reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;
|
|
1486
|
+
if (wi_id == 0) {
|
|
1487
|
+
*ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
|
|
1488
|
+
}
|
|
1489
|
+
}
|
|
1490
|
+
|
|
1437
1491
|
static void mul_mat_p021_f16_f32(
|
|
1438
1492
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
|
1439
1493
|
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
|
|
@@ -1718,23 +1772,30 @@ static void pool2d_nchw_kernel(
|
|
|
1718
1772
|
o_ptr[cur_oh * ow + cur_ow] = res;
|
|
1719
1773
|
}
|
|
1720
1774
|
|
|
1721
|
-
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1775
|
+
static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
|
|
1776
|
+
bool reorder_q8_tensor, queue_ptr stream) {
|
|
1777
|
+
if (reorder_q8_tensor) {
|
|
1778
|
+
auto local_range = std::size_t(WARP_SIZE);
|
|
1779
|
+
auto num_quant_blocks = ky * (kx / QK8_1);
|
|
1780
|
+
auto global_range = num_quant_blocks * local_range;
|
|
1781
|
+
stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
|
|
1782
|
+
[=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
1783
|
+
quantize_and_reorder_q8_1<QK8_1 / WARP_SIZE>(x, vy, kx, kx_padded, it);
|
|
1784
|
+
});
|
|
1785
|
+
} else {
|
|
1786
|
+
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
|
|
1787
|
+
const sycl::range<3> num_blocks(1, ky, block_num_x);
|
|
1788
|
+
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
|
|
1789
|
+
static_assert(QK8_1 % WARP_SIZE == 0);
|
|
1790
|
+
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
|
|
1791
|
+
{
|
|
1792
|
+
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
1732
1793
|
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1794
|
+
stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size),
|
|
1795
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
1796
|
+
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
|
1797
|
+
});
|
|
1798
|
+
}
|
|
1738
1799
|
}
|
|
1739
1800
|
}
|
|
1740
1801
|
|
|
@@ -2066,21 +2127,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2066
2127
|
const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
|
|
2067
2128
|
? (const sycl::half *)src1->data + src1_padded_row_size
|
|
2068
2129
|
: src1_as_f16.get();
|
|
2069
|
-
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
|
2070
2130
|
|
|
2071
2131
|
#if GGML_SYCL_DNNL
|
|
2072
2132
|
if (!g_ggml_sycl_disable_dnn) {
|
|
2073
2133
|
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
|
|
2074
2134
|
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
|
2075
|
-
|
|
2076
|
-
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
|
2077
|
-
" : converting dst to fp32");
|
|
2078
|
-
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
|
2079
|
-
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
|
2135
|
+
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
|
2080
2136
|
}
|
|
2081
2137
|
else
|
|
2082
2138
|
#endif
|
|
2083
2139
|
{
|
|
2140
|
+
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
|
2141
|
+
|
|
2084
2142
|
const sycl::half alpha_f16 = 1.0f;
|
|
2085
2143
|
const sycl::half beta_f16 = 0.0f;
|
|
2086
2144
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
|
@@ -2446,9 +2504,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
|
2446
2504
|
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
|
|
2447
2505
|
|
|
2448
2506
|
if (src1_on_device && src1_is_contiguous) {
|
|
2507
|
+
bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder;
|
|
2449
2508
|
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
|
2450
2509
|
/*num_src=*/2, " : converting src1 to Q8_1");
|
|
2451
|
-
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
|
|
2510
|
+
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream);
|
|
2452
2511
|
/*
|
|
2453
2512
|
DPCT1010:90: SYCL uses exceptions to report errors and does not
|
|
2454
2513
|
use the error codes. The call was replaced with 0. You need to
|
|
@@ -2554,7 +2613,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
|
|
|
2554
2613
|
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
|
|
2555
2614
|
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
|
|
2556
2615
|
/*num_src=*/2, " : converting src1 to Q8_1");
|
|
2557
|
-
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
|
2616
|
+
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream);
|
|
2558
2617
|
/*
|
|
2559
2618
|
DPCT1010:92: SYCL uses exceptions to report errors and does
|
|
2560
2619
|
not use the error codes. The call was replaced with 0. You
|
|
@@ -2928,6 +2987,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
|
|
2928
2987
|
case GGML_TYPE_Q4_0:
|
|
2929
2988
|
return true;
|
|
2930
2989
|
case GGML_TYPE_Q4_K:
|
|
2990
|
+
case GGML_TYPE_Q6_K:
|
|
2931
2991
|
return !g_ggml_sycl_prioritize_dmmv;
|
|
2932
2992
|
default:
|
|
2933
2993
|
return false;
|
|
@@ -2947,6 +3007,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
|
|
2947
3007
|
switch (type) {
|
|
2948
3008
|
case GGML_TYPE_Q4_0:
|
|
2949
3009
|
case GGML_TYPE_Q4_K:
|
|
3010
|
+
case GGML_TYPE_Q6_K:
|
|
2950
3011
|
return true;
|
|
2951
3012
|
default:
|
|
2952
3013
|
return false;
|
|
@@ -3031,6 +3092,50 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
|
|
|
3031
3092
|
sycl::free(tmp_buf, *stream);
|
|
3032
3093
|
}
|
|
3033
3094
|
|
|
3095
|
+
static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3096
|
+
GGML_ASSERT(size % sizeof(block_q6_K) == 0);
|
|
3097
|
+
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
|
|
3098
|
+
|
|
3099
|
+
const int nblocks = size / sizeof(block_q6_K);
|
|
3100
|
+
|
|
3101
|
+
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
|
|
3102
|
+
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
|
|
3103
|
+
|
|
3104
|
+
auto * ql_ptr = data_device;
|
|
3105
|
+
auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
|
|
3106
|
+
auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
|
|
3107
|
+
sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
|
|
3108
|
+
|
|
3109
|
+
stream
|
|
3110
|
+
->parallel_for(nblocks,
|
|
3111
|
+
[=](auto i) {
|
|
3112
|
+
const block_q6_K * x = (const block_q6_K *) tmp_buf;
|
|
3113
|
+
const int ib = i;
|
|
3114
|
+
|
|
3115
|
+
const uint8_t * ql = x[ib].ql;
|
|
3116
|
+
const uint8_t * qh = x[ib].qh;
|
|
3117
|
+
uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
|
|
3118
|
+
uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
|
|
3119
|
+
uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
|
|
3120
|
+
|
|
3121
|
+
for (int j = 0; j < QK_K / 2; ++j) {
|
|
3122
|
+
base_ql_ptr[j] = ql[j];
|
|
3123
|
+
}
|
|
3124
|
+
for (int j = 0; j < QK_K / 4; ++j) {
|
|
3125
|
+
base_qh_ptr[j] = qh[j];
|
|
3126
|
+
}
|
|
3127
|
+
|
|
3128
|
+
for (int j = 0; j < QK_K / 16; ++j) {
|
|
3129
|
+
base_scales_ptr[j] = x[ib].scales[j];
|
|
3130
|
+
}
|
|
3131
|
+
|
|
3132
|
+
dm_ptr[ib] = x[ib].d;
|
|
3133
|
+
})
|
|
3134
|
+
.wait_and_throw();
|
|
3135
|
+
|
|
3136
|
+
sycl::free(tmp_buf, *stream);
|
|
3137
|
+
}
|
|
3138
|
+
|
|
3034
3139
|
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
3035
3140
|
uint8_t * data_device = (uint8_t *) src0->data;
|
|
3036
3141
|
size_t ncols = src0->ne[0];
|
|
@@ -3044,6 +3149,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
|
3044
3149
|
case GGML_TYPE_Q4_K:
|
|
3045
3150
|
reorder_qw_q4_k(data_device, size, 0, stream);
|
|
3046
3151
|
break;
|
|
3152
|
+
case GGML_TYPE_Q6_K:
|
|
3153
|
+
reorder_qw_q6_k(data_device, size, 0, stream);
|
|
3154
|
+
break;
|
|
3047
3155
|
default:
|
|
3048
3156
|
GGML_ABORT("reorder_qw() called with unsupported type");
|
|
3049
3157
|
break;
|
|
@@ -3755,7 +3863,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
|
|
3755
3863
|
const void *data, size_t offset,
|
|
3756
3864
|
size_t size) try {
|
|
3757
3865
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
3758
|
-
|
|
3866
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
|
3759
3867
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
3760
3868
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
3761
3869
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
@@ -3776,7 +3884,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
|
|
3776
3884
|
void *data, size_t offset,
|
|
3777
3885
|
size_t size) try {
|
|
3778
3886
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
3779
|
-
|
|
3887
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
|
3780
3888
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
|
3781
3889
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
3782
3890
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
@@ -3799,8 +3907,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
|
|
3799
3907
|
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
|
|
3800
3908
|
ggml_backend_buffer_is_sycl(src->buffer);
|
|
3801
3909
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
|
3802
|
-
|
|
3803
|
-
|
|
3910
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
|
3911
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
|
3804
3912
|
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
|
3805
3913
|
if (is_cpy_supported) {
|
|
3806
3914
|
/*
|
|
@@ -4165,6 +4273,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4165
4273
|
{
|
|
4166
4274
|
ggml_type src0_type = op->src[0]->type;
|
|
4167
4275
|
ggml_type src1_type = op->src[1]->type;
|
|
4276
|
+
if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) && src0_type != GGML_TYPE_BF16) {
|
|
4277
|
+
return true;
|
|
4278
|
+
}
|
|
4168
4279
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
|
4169
4280
|
return true;
|
|
4170
4281
|
}
|
|
@@ -4210,6 +4321,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4210
4321
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
|
|
4211
4322
|
return true;
|
|
4212
4323
|
}
|
|
4324
|
+
if(src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) {
|
|
4325
|
+
return true;
|
|
4326
|
+
}
|
|
4327
|
+
if(src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) {
|
|
4328
|
+
return true;
|
|
4329
|
+
}
|
|
4330
|
+
if(src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) {
|
|
4331
|
+
return true;
|
|
4332
|
+
}
|
|
4333
|
+
if(src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) {
|
|
4334
|
+
return true;
|
|
4335
|
+
}
|
|
4336
|
+
if(src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) {
|
|
4337
|
+
return true;
|
|
4338
|
+
}
|
|
4213
4339
|
return false;
|
|
4214
4340
|
}
|
|
4215
4341
|
case GGML_OP_CONCAT:
|
|
@@ -29,24 +29,23 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
|
|
|
29
29
|
static_assert(blocks_per_subgroup > 0);
|
|
30
30
|
static_assert(block_elements_per_subgroup > 0);
|
|
31
31
|
|
|
32
|
-
const block_q8_1 * y = (const block_q8_1 *) vy;
|
|
33
|
-
|
|
34
32
|
float partial_sum = 0.0f;
|
|
35
33
|
for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
|
|
36
|
-
const int ibx
|
|
37
|
-
// TODO: Generalize offsets, right now only works for quantizations that don't split high and low bits
|
|
38
|
-
const int bx_offset = block_type::get_block_offset(ibx);
|
|
39
|
-
const int d_offset = block_type::get_d_offset(nrows, ncols, ibx);
|
|
34
|
+
const int ibx = row * blocks_per_row + i; // x block index
|
|
40
35
|
|
|
36
|
+
const auto bx_offset = block_type::get_block_offset(ibx, nblocks);
|
|
37
|
+
const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx);
|
|
41
38
|
// Y block index that aligns with ibx
|
|
42
39
|
const int iby = i * block_type::block_to_q8_1_ratio();
|
|
40
|
+
const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1;
|
|
41
|
+
const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2));
|
|
43
42
|
|
|
44
43
|
#pragma unroll
|
|
45
44
|
for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
|
|
46
45
|
// x block quant index when casting the quants to int
|
|
47
46
|
const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
|
|
48
47
|
|
|
49
|
-
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset,
|
|
48
|
+
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs);
|
|
50
49
|
}
|
|
51
50
|
}
|
|
52
51
|
|
|
@@ -785,6 +784,24 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
785
784
|
}
|
|
786
785
|
}
|
|
787
786
|
|
|
787
|
+
static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
|
788
|
+
const int nrows, dpct::queue_ptr stream) {
|
|
789
|
+
GGML_ASSERT(ncols % QK_K == 0);
|
|
790
|
+
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
|
791
|
+
constexpr size_t num_subgroups = 16;
|
|
792
|
+
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
|
793
|
+
|
|
794
|
+
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
|
795
|
+
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
|
796
|
+
|
|
797
|
+
stream->submit([&](sycl::handler & cgh) {
|
|
798
|
+
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
|
799
|
+
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
800
|
+
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
|
|
801
|
+
nd_item);
|
|
802
|
+
});
|
|
803
|
+
});
|
|
804
|
+
}
|
|
788
805
|
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|
789
806
|
float *dst, const int ncols,
|
|
790
807
|
const int nrows,
|
|
@@ -1070,7 +1087,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
|
|
1070
1087
|
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
|
1071
1088
|
break;
|
|
1072
1089
|
case GGML_TYPE_Q6_K:
|
|
1073
|
-
|
|
1090
|
+
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
|
1091
|
+
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
|
1092
|
+
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n");
|
|
1093
|
+
reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
|
1094
|
+
} else {
|
|
1095
|
+
GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_k_q8_1_sycl\n");
|
|
1096
|
+
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
|
1097
|
+
}
|
|
1074
1098
|
break;
|
|
1075
1099
|
case GGML_TYPE_IQ1_S:
|
|
1076
1100
|
mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
|
@@ -14,12 +14,13 @@
|
|
|
14
14
|
#ifndef GGML_SYCL_QUANTS_HPP
|
|
15
15
|
#define GGML_SYCL_QUANTS_HPP
|
|
16
16
|
|
|
17
|
+
#include <utility>
|
|
18
|
+
|
|
17
19
|
#include "ggml-common.h"
|
|
18
20
|
#include "ggml.h"
|
|
19
21
|
|
|
20
22
|
namespace ggml_sycl_reordered {
|
|
21
23
|
|
|
22
|
-
|
|
23
24
|
// The reordered block moves quants (qs) and scales(d) to two
|
|
24
25
|
// uniform regions of memory that is contiguous in the same tensor.
|
|
25
26
|
// What this means is that instead of having:
|
|
@@ -32,7 +33,6 @@ namespace ggml_sycl_reordered {
|
|
|
32
33
|
|
|
33
34
|
template <ggml_type type> struct block_q_t;
|
|
34
35
|
|
|
35
|
-
|
|
36
36
|
// qk number of weights / quants in a block
|
|
37
37
|
// qr number of weights in a byte (described as 'before dequantization')
|
|
38
38
|
// for quantization types that has low and high bits split, qr is calculated with
|
|
@@ -47,10 +47,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
|
|
|
47
47
|
static constexpr uint32_t vdr_mmvq = 2;
|
|
48
48
|
};
|
|
49
49
|
|
|
50
|
-
static constexpr int get_block_offset(const int block_index
|
|
50
|
+
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
|
|
51
|
+
return { block_index * (traits::qk / traits::qr), 0 };
|
|
52
|
+
}
|
|
51
53
|
|
|
52
|
-
static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
|
|
53
|
-
return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
|
|
54
|
+
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
55
|
+
return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
|
|
54
56
|
}
|
|
55
57
|
|
|
56
58
|
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
@@ -64,20 +66,46 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
|
|
|
64
66
|
static constexpr uint32_t vdr_mmvq = 2;
|
|
65
67
|
};
|
|
66
68
|
|
|
67
|
-
static constexpr int get_block_offset(const int block_index
|
|
69
|
+
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
|
|
70
|
+
return { block_index * (traits::qk / traits::qr), 0 };
|
|
71
|
+
}
|
|
68
72
|
|
|
69
|
-
static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
|
|
73
|
+
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
70
74
|
auto nblocks = (nrows * (ncols / traits::qk));
|
|
71
|
-
return
|
|
75
|
+
return { nblocks * (QK_K / 2),
|
|
76
|
+
(nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
|
|
72
77
|
}
|
|
73
78
|
|
|
74
79
|
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
75
80
|
|
|
76
81
|
constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
|
|
77
|
-
|
|
78
|
-
constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
|
|
79
82
|
};
|
|
80
83
|
|
|
84
|
+
template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
|
85
|
+
struct traits {
|
|
86
|
+
static constexpr uint32_t qk = QK_K;
|
|
87
|
+
static constexpr uint32_t qi = QI6_K;
|
|
88
|
+
static constexpr uint32_t qr = QR6_K;
|
|
89
|
+
static constexpr uint32_t vdr_mmvq = 1;
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
|
|
93
|
+
auto low_bits_index = block_index * (traits::qk / traits::qr);
|
|
94
|
+
// the index of high bits it's after all low bits
|
|
95
|
+
auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
|
|
96
|
+
return { low_bits_index, high_bits_index };
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
|
100
|
+
auto nblocks = (nrows * (ncols / traits::qk));
|
|
101
|
+
auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
|
|
102
|
+
auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
|
|
103
|
+
auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
|
|
104
|
+
return { block_scales, sb_scale };
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
|
108
|
+
};
|
|
81
109
|
} // namespace ggml_sycl_reordered
|
|
82
110
|
|
|
83
111
|
#endif // GGML_SYCL_QUANTS_HPP
|