@novastera-oss/llamarn 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +134 -36
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -2
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +30 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +50 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +134 -36
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
- package/cpp/llama.cpp/src/llama-batch.h +36 -11
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +313 -213
- package/cpp/llama.cpp/src/llama-context.h +16 -12
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
- package/cpp/llama.cpp/src/llama-graph.h +90 -34
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
- package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +64 -23
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +726 -141
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/llama.h +134 -36
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
|
@@ -2425,8 +2425,6 @@ void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_REST
|
|
|
2425
2425
|
}
|
|
2426
2426
|
}
|
|
2427
2427
|
|
|
2428
|
-
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
|
2429
|
-
|
|
2430
2428
|
void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2431
2429
|
assert(k % QK4_NL == 0);
|
|
2432
2430
|
const int64_t nb = k / QK4_NL;
|
|
@@ -53,6 +53,9 @@ struct socket_t {
|
|
|
53
53
|
}
|
|
54
54
|
};
|
|
55
55
|
|
|
56
|
+
// macro for nicer error messages on server crash
|
|
57
|
+
#define RPC_STATUS_ASSERT(x) if (!(x)) GGML_ABORT("Remote RPC server crashed or returned malformed response")
|
|
58
|
+
|
|
56
59
|
// all RPC structures must be packed
|
|
57
60
|
#pragma pack(push, 1)
|
|
58
61
|
// ggml_tensor is serialized into rpc_tensor
|
|
@@ -425,7 +428,7 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
|
|
425
428
|
static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
|
|
426
429
|
rpc_msg_hello_rsp response;
|
|
427
430
|
bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
|
|
428
|
-
|
|
431
|
+
RPC_STATUS_ASSERT(status);
|
|
429
432
|
if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
|
|
430
433
|
fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
|
431
434
|
return false;
|
|
@@ -481,7 +484,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
|
481
484
|
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
|
482
485
|
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
|
|
483
486
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
|
484
|
-
|
|
487
|
+
RPC_STATUS_ASSERT(status);
|
|
485
488
|
delete ctx;
|
|
486
489
|
}
|
|
487
490
|
|
|
@@ -493,7 +496,7 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
|
493
496
|
rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
|
|
494
497
|
rpc_msg_buffer_get_base_rsp response;
|
|
495
498
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
|
|
496
|
-
|
|
499
|
+
RPC_STATUS_ASSERT(status);
|
|
497
500
|
ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
|
|
498
501
|
return ctx->base_ptr;
|
|
499
502
|
}
|
|
@@ -545,7 +548,7 @@ static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_
|
|
|
545
548
|
request.tensor = serialize_tensor(tensor);
|
|
546
549
|
|
|
547
550
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
|
|
548
|
-
|
|
551
|
+
RPC_STATUS_ASSERT(status);
|
|
549
552
|
}
|
|
550
553
|
return GGML_STATUS_SUCCESS;
|
|
551
554
|
}
|
|
@@ -560,7 +563,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
|
|
560
563
|
request.hash = fnv_hash((const uint8_t*)data, size);
|
|
561
564
|
rpc_msg_set_tensor_hash_rsp response;
|
|
562
565
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
|
|
563
|
-
|
|
566
|
+
RPC_STATUS_ASSERT(status);
|
|
564
567
|
if (response.result) {
|
|
565
568
|
// the server has the same data, no need to send it
|
|
566
569
|
return;
|
|
@@ -573,7 +576,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
|
|
573
576
|
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
|
574
577
|
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
|
575
578
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
|
|
576
|
-
|
|
579
|
+
RPC_STATUS_ASSERT(status);
|
|
577
580
|
}
|
|
578
581
|
|
|
579
582
|
static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
@@ -583,7 +586,7 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con
|
|
|
583
586
|
request.offset = offset;
|
|
584
587
|
request.size = size;
|
|
585
588
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size);
|
|
586
|
-
|
|
589
|
+
RPC_STATUS_ASSERT(status);
|
|
587
590
|
}
|
|
588
591
|
|
|
589
592
|
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
|
@@ -601,7 +604,7 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
|
|
601
604
|
request.dst = serialize_tensor(dst);
|
|
602
605
|
rpc_msg_copy_tensor_rsp response;
|
|
603
606
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
|
|
604
|
-
|
|
607
|
+
RPC_STATUS_ASSERT(status);
|
|
605
608
|
return response.result;
|
|
606
609
|
}
|
|
607
610
|
|
|
@@ -609,7 +612,7 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|
|
609
612
|
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
|
610
613
|
rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
|
|
611
614
|
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0);
|
|
612
|
-
|
|
615
|
+
RPC_STATUS_ASSERT(status);
|
|
613
616
|
}
|
|
614
617
|
|
|
615
618
|
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
|
|
@@ -635,7 +638,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
|
|
635
638
|
rpc_msg_alloc_buffer_rsp response;
|
|
636
639
|
auto sock = get_socket(buft_ctx->endpoint);
|
|
637
640
|
bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
|
|
638
|
-
|
|
641
|
+
RPC_STATUS_ASSERT(status);
|
|
639
642
|
if (response.remote_ptr != 0) {
|
|
640
643
|
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
|
641
644
|
ggml_backend_rpc_buffer_interface,
|
|
@@ -650,7 +653,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
|
|
650
653
|
static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
|
|
651
654
|
rpc_msg_get_alignment_rsp response;
|
|
652
655
|
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response));
|
|
653
|
-
|
|
656
|
+
RPC_STATUS_ASSERT(status);
|
|
654
657
|
return response.alignment;
|
|
655
658
|
}
|
|
656
659
|
|
|
@@ -662,7 +665,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ
|
|
|
662
665
|
static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
|
|
663
666
|
rpc_msg_get_max_size_rsp response;
|
|
664
667
|
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response));
|
|
665
|
-
|
|
668
|
+
RPC_STATUS_ASSERT(status);
|
|
666
669
|
return response.max_size;
|
|
667
670
|
}
|
|
668
671
|
|
|
@@ -683,7 +686,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_ty
|
|
|
683
686
|
|
|
684
687
|
rpc_msg_get_alloc_size_rsp response;
|
|
685
688
|
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
|
|
686
|
-
|
|
689
|
+
RPC_STATUS_ASSERT(status);
|
|
687
690
|
|
|
688
691
|
return response.alloc_size;
|
|
689
692
|
} else {
|
|
@@ -761,7 +764,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
|
|
|
761
764
|
rpc_msg_graph_compute_rsp response;
|
|
762
765
|
auto sock = get_socket(rpc_ctx->endpoint);
|
|
763
766
|
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
|
|
764
|
-
|
|
767
|
+
RPC_STATUS_ASSERT(status);
|
|
765
768
|
return (enum ggml_status)response.result;
|
|
766
769
|
}
|
|
767
770
|
|
|
@@ -835,7 +838,7 @@ bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
|
|
835
838
|
static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * free, size_t * total) {
|
|
836
839
|
rpc_msg_get_device_memory_rsp response;
|
|
837
840
|
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response));
|
|
838
|
-
|
|
841
|
+
RPC_STATUS_ASSERT(status);
|
|
839
842
|
*free = response.free_mem;
|
|
840
843
|
*total = response.total_mem;
|
|
841
844
|
}
|
|
@@ -142,7 +142,7 @@ else()
|
|
|
142
142
|
FetchContent_Declare(
|
|
143
143
|
ONEMATH
|
|
144
144
|
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
|
145
|
-
GIT_TAG
|
|
145
|
+
GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
|
|
146
146
|
)
|
|
147
147
|
FetchContent_MakeAvailable(ONEMATH)
|
|
148
148
|
# Create alias to match with find_package targets name
|
|
@@ -149,8 +149,6 @@ typedef sycl::float2 dfloat2;
|
|
|
149
149
|
|
|
150
150
|
#define MMVQ_MAX_BATCH_SIZE 8
|
|
151
151
|
|
|
152
|
-
static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
|
153
|
-
|
|
154
152
|
static int g_all_sycl_device_count = -1;
|
|
155
153
|
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
|
|
156
154
|
|
|
@@ -515,9 +513,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
|
|
|
515
513
|
|
|
516
514
|
bool gpu_has_xmx(sycl::device &dev);
|
|
517
515
|
|
|
518
|
-
template <int N, class T>
|
|
516
|
+
template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
|
|
519
517
|
if (LIKELY(!g_ggml_sycl_debug)) {
|
|
520
|
-
return;
|
|
518
|
+
return "";
|
|
521
519
|
}
|
|
522
520
|
std::stringstream ss;
|
|
523
521
|
ss << prefix << "=[";
|
|
@@ -528,29 +526,26 @@ template <int N, class T> void debug_print_array(const std::string & prefix, con
|
|
|
528
526
|
ss << array[N - 1];
|
|
529
527
|
}
|
|
530
528
|
ss << "]";
|
|
531
|
-
|
|
529
|
+
return ss.str();
|
|
532
530
|
}
|
|
533
531
|
|
|
534
|
-
inline
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
GGML_SYCL_DEBUG("%s=", prefix.c_str());
|
|
532
|
+
inline std::string debug_get_tensor_str(const std::string &prefix,
|
|
533
|
+
const ggml_tensor *tensor, const std::string &suffix = "") {
|
|
534
|
+
std::stringstream ss;
|
|
535
|
+
if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
|
|
536
|
+
ss << prefix.c_str() << "=";
|
|
540
537
|
if (tensor) {
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
}
|
|
547
|
-
if (ggml_is_permuted(tensor)) {
|
|
548
|
-
GGML_SYCL_DEBUG(";permuted");
|
|
549
|
-
}
|
|
538
|
+
ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
|
|
539
|
+
ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
|
|
540
|
+
ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);
|
|
541
|
+
|
|
542
|
+
if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
|
|
543
|
+
if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
|
|
550
544
|
} else {
|
|
551
|
-
|
|
545
|
+
ss << "nullptr";
|
|
552
546
|
}
|
|
553
|
-
|
|
547
|
+
ss << suffix;
|
|
548
|
+
return ss.str();
|
|
554
549
|
}
|
|
555
550
|
|
|
556
551
|
// Use scope_op_debug_print to log operations coming from running a model
|
|
@@ -566,10 +561,10 @@ struct scope_op_debug_print {
|
|
|
566
561
|
return;
|
|
567
562
|
}
|
|
568
563
|
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
|
|
569
|
-
|
|
564
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
|
|
570
565
|
if (dst) {
|
|
571
566
|
for (std::size_t i = 0; i < num_src; ++i) {
|
|
572
|
-
|
|
567
|
+
GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
|
|
573
568
|
}
|
|
574
569
|
}
|
|
575
570
|
GGML_SYCL_DEBUG("%s\n", suffix.data());
|
|
@@ -265,6 +265,17 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
|
265
265
|
#endif
|
|
266
266
|
}
|
|
267
267
|
|
|
268
|
+
template <typename dst_t>
|
|
269
|
+
static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
|
|
270
|
+
const int64_t nb = k / QK_K;
|
|
271
|
+
|
|
272
|
+
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
|
273
|
+
|
|
274
|
+
stream->parallel_for(
|
|
275
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
|
276
|
+
[=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
|
|
277
|
+
}
|
|
278
|
+
|
|
268
279
|
template <typename dst_t>
|
|
269
280
|
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
|
|
270
281
|
dpct::queue_ptr stream) {
|
|
@@ -530,7 +541,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
|
|
530
541
|
case GGML_TYPE_Q5_K:
|
|
531
542
|
return dequantize_row_q5_K_sycl;
|
|
532
543
|
case GGML_TYPE_Q6_K:
|
|
533
|
-
|
|
544
|
+
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
|
545
|
+
return dequantize_row_q6_K_sycl_reorder;
|
|
546
|
+
} else {
|
|
547
|
+
return dequantize_row_q6_K_sycl;
|
|
548
|
+
}
|
|
534
549
|
case GGML_TYPE_IQ1_S:
|
|
535
550
|
return dequantize_row_iq1_s_sycl;
|
|
536
551
|
case GGML_TYPE_IQ1_M:
|
|
@@ -587,7 +602,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
|
|
587
602
|
case GGML_TYPE_Q5_K:
|
|
588
603
|
return dequantize_row_q5_K_sycl;
|
|
589
604
|
case GGML_TYPE_Q6_K:
|
|
590
|
-
|
|
605
|
+
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
|
606
|
+
return dequantize_row_q6_K_sycl_reorder;
|
|
607
|
+
} else {
|
|
608
|
+
return dequantize_row_q6_K_sycl;
|
|
609
|
+
}
|
|
591
610
|
case GGML_TYPE_IQ1_S:
|
|
592
611
|
return dequantize_row_iq1_s_sycl;
|
|
593
612
|
case GGML_TYPE_IQ1_M:
|
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
#include "cpy.hpp"
|
|
2
2
|
|
|
3
3
|
#include <float.h>
|
|
4
|
+
#include <string>
|
|
4
5
|
|
|
5
6
|
#include "dequantize.hpp"
|
|
7
|
+
#include "ggml-sycl/common.hpp"
|
|
8
|
+
#include "ggml-sycl/presets.hpp"
|
|
9
|
+
#include "ggml.h"
|
|
6
10
|
|
|
7
11
|
static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
|
|
8
12
|
if (x <= val[0]) {
|
|
@@ -116,6 +120,15 @@ static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
|
|
116
120
|
}
|
|
117
121
|
}
|
|
118
122
|
|
|
123
|
+
/* quantized type same copy */
|
|
124
|
+
template<typename T>
|
|
125
|
+
static void cpy_blck_q_q(const char * cxi, char * cdsti) {
|
|
126
|
+
const T * xi = (const T *) cxi;
|
|
127
|
+
T * dsti = (T *) cdsti;
|
|
128
|
+
*dsti = *xi;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
|
|
119
132
|
static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
|
|
120
133
|
float * cdstf = (float *) (cdsti);
|
|
121
134
|
|
|
@@ -311,6 +324,34 @@ template <dequantize_kernel_t dequant, int qk> static void cpy_blck_q_f32(const
|
|
|
311
324
|
}
|
|
312
325
|
}
|
|
313
326
|
|
|
327
|
+
|
|
328
|
+
template <typename T, int qk>
|
|
329
|
+
static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
|
|
330
|
+
const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
|
|
331
|
+
const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
|
|
332
|
+
const sycl::nd_item<3> & item_ct1) {
|
|
333
|
+
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;
|
|
334
|
+
|
|
335
|
+
if (i >= ne) {
|
|
336
|
+
return;
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
const int i03 = i / (ne00 * ne01 * ne02);
|
|
340
|
+
const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
|
|
341
|
+
const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
|
|
342
|
+
const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
|
|
343
|
+
const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
const int i13 = i / (ne10 * ne11 * ne12);
|
|
347
|
+
const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
|
|
348
|
+
const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
|
|
349
|
+
const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
|
|
350
|
+
const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
|
|
351
|
+
|
|
352
|
+
cpy_blck_q_q<T>(cx + x_offset, cdst + dst_offset);
|
|
353
|
+
}
|
|
354
|
+
|
|
314
355
|
template <cpy_kernel_t cpy_blck, int qk>
|
|
315
356
|
static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
|
|
316
357
|
const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
|
|
@@ -322,6 +363,7 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00
|
|
|
322
363
|
return;
|
|
323
364
|
}
|
|
324
365
|
|
|
366
|
+
|
|
325
367
|
const int i03 = i / (ne00 * ne01 * ne02);
|
|
326
368
|
const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
|
|
327
369
|
const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
|
|
@@ -615,10 +657,73 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
|
|
|
615
657
|
}
|
|
616
658
|
}
|
|
617
659
|
|
|
660
|
+
static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
661
|
+
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
|
662
|
+
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
663
|
+
const int nb12, const int nb13, queue_ptr stream) {
|
|
664
|
+
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
665
|
+
stream->parallel_for(
|
|
666
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
667
|
+
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
668
|
+
cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
669
|
+
});
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
674
|
+
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
|
675
|
+
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
676
|
+
const int nb12, const int nb13, queue_ptr stream) {
|
|
677
|
+
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
678
|
+
stream->parallel_for(
|
|
679
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
680
|
+
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
681
|
+
cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
682
|
+
});
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
687
|
+
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
|
688
|
+
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
689
|
+
const int nb12, const int nb13, queue_ptr stream) {
|
|
690
|
+
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
691
|
+
|
|
692
|
+
stream->parallel_for(
|
|
693
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
|
694
|
+
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
695
|
+
cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
696
|
+
});
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
701
|
+
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
|
702
|
+
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
703
|
+
const int nb12, const int nb13, queue_ptr stream) {
|
|
704
|
+
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
705
|
+
stream->parallel_for(
|
|
706
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
707
|
+
cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
708
|
+
});
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
|
|
713
|
+
const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
|
|
714
|
+
const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
715
|
+
const int nb12, const int nb13, queue_ptr stream) {
|
|
716
|
+
|
|
717
|
+
const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
|
|
718
|
+
stream->parallel_for(
|
|
719
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
|
|
720
|
+
cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
|
|
721
|
+
});
|
|
722
|
+
}
|
|
723
|
+
|
|
618
724
|
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
|
619
725
|
// Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
|
|
620
|
-
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
|
|
621
|
-
std::string(" src0 type=") + ggml_type_name(src0->type));
|
|
726
|
+
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0));
|
|
622
727
|
const int64_t ne = ggml_nelements(src0);
|
|
623
728
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
|
624
729
|
|
|
@@ -632,8 +737,10 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
|
|
|
632
737
|
|
|
633
738
|
char * src0_ddc = (char *) src0->data;
|
|
634
739
|
char * src1_ddc = (char *) src1->data;
|
|
635
|
-
|
|
636
|
-
|
|
740
|
+
if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) {
|
|
741
|
+
GGML_SYCL_DEBUG("%s: memcpy path\n", __func__);
|
|
742
|
+
main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0));
|
|
743
|
+
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
|
637
744
|
ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
|
|
638
745
|
nb11, nb12, nb13, main_stream);
|
|
639
746
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
|
@@ -684,6 +791,16 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
|
|
|
684
791
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
|
|
685
792
|
ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
|
|
686
793
|
nb10, nb11, nb12, nb13, main_stream);
|
|
794
|
+
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_Q8_0) {
|
|
795
|
+
ggml_cpy_q8_0_q8_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
|
796
|
+
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_Q5_0) {
|
|
797
|
+
ggml_cpy_q5_0_q5_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
|
798
|
+
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_Q5_1) {
|
|
799
|
+
ggml_cpy_q5_1_q5_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
|
800
|
+
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_Q4_0) {
|
|
801
|
+
ggml_cpy_q4_0_q4_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
|
802
|
+
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_Q4_1) {
|
|
803
|
+
ggml_cpy_q4_1_q4_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
|
687
804
|
} else {
|
|
688
805
|
GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type),
|
|
689
806
|
ggml_type_name(src1->type));
|
|
@@ -538,6 +538,38 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
|
|
538
538
|
#endif
|
|
539
539
|
}
|
|
540
540
|
|
|
541
|
+
// Dequantize one q6_K super-block stored in the SYCL "reorder" layout into
// 4 float outputs per work-item.
//
// NOTE(review): the reorder layout appears to pack, for all n_blocks
// super-blocks, first every ql array (QK_K/2 bytes each), then every qh
// array (QK_K/4 bytes), then every scales array (QK_K/16 bytes), and
// finally every per-block d scale (ggml_half) — the offset arithmetic
// below matches that assumption; confirm against the reorder writer.
//
//  vx       : base of the reordered q6_K data
//  yy       : destination buffer, QK_K dst_t values per super-block
//  item_ct1 : work-item; group id selects the super-block, local id the lane
//  n_blocks : total number of q6_K super-blocks in vx
template <typename dst_t>
static void dequantize_block_q6_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                          const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
    const int64_t ib = item_ct1.get_group(2);   // super-block index

    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t ip = tid / 32;       // ip is 0 or 1 (which 128-element half)
    const int64_t il = tid - 32 * ip;  // 0...32 (lane within the half)
    const int64_t is = 8 * ip + il / 16;  // scale index for this lane

    // Compute section offsets inside the reordered buffer (see NOTE above).
    const uint8_t * base_ptr = static_cast<const uint8_t *>(vx);
    const auto ql_offset = ib * (QK_K / 2);
    const auto qh_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * ib;
    const auto base_scales_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * n_blocks + (QK_K / 16) * ib;
    const auto base_d_offset = ((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks;
    const uint8_t * ql_ptr = base_ptr + ql_offset;
    const uint8_t * qh_ptr = base_ptr + qh_offset;
    const uint8_t * scales_ptr = base_ptr + base_scales_offset;
    const ggml_half * d = (const ggml_half *) (base_ptr + base_d_offset) + ib;

    // Each work-item writes 4 outputs, strided 32 apart within its half.
    dst_t * y = yy + ib * QK_K + 128 * ip + il;

    const uint8_t * ql = ql_ptr + 64 * ip + il;   // low 4 bits of two quants
    const uint8_t qh = *(qh_ptr + 32 * ip + il);  // packed high 2-bit parts for all 4 outputs
    const int8_t * sc = reinterpret_cast<const int8_t *>(scales_ptr + is);

    // Reassemble each 6-bit quant as (low nibble | high 2 bits << 4),
    // recenter by -32, then scale by the block scale d and sub-scale sc[...].
    y[0] = *d * sc[0] * ((int8_t) ((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = *d * sc[2] * ((int8_t) ((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = *d * sc[4] * ((int8_t) ((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = *d * sc[6] * ((int8_t) ((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
|
|
572
|
+
|
|
541
573
|
template<typename dst_t>
|
|
542
574
|
static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
|
543
575
|
const sycl::nd_item<3> &item_ct1,
|
|
@@ -65,6 +65,9 @@ public:
|
|
|
65
65
|
|
|
66
66
|
dnnl::primitive_attr primitive_attr;
|
|
67
67
|
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
|
|
68
|
+
#ifdef GGML_SYCL_F16
|
|
69
|
+
primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
|
|
70
|
+
#endif
|
|
68
71
|
|
|
69
72
|
auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
|
|
70
73
|
auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
|
|
@@ -60,54 +60,6 @@ static void k_get_rows(
|
|
|
60
60
|
dst_row[iybs + iqs + y_offset] = v.y();
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
template<int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_recorder, typename dst_t>
|
|
64
|
-
static void k_get_rows_reorder(
|
|
65
|
-
const void * src0, const void *src0_dq, const int32_t * src1, dst_t * dst,
|
|
66
|
-
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
|
|
67
|
-
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
|
|
68
|
-
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
|
|
69
|
-
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
|
|
70
|
-
size_t s10, size_t s11, size_t s12,
|
|
71
|
-
const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
|
|
72
|
-
|
|
73
|
-
const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
|
|
74
|
-
item_ct1.get_local_id(2)) *
|
|
75
|
-
2;
|
|
76
|
-
const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
|
|
77
|
-
item_ct1.get_local_id(1);
|
|
78
|
-
const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
|
|
79
|
-
item_ct1.get_local_id(0)) /
|
|
80
|
-
ne12;
|
|
81
|
-
const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
|
|
82
|
-
item_ct1.get_local_id(0)) %
|
|
83
|
-
ne12;
|
|
84
|
-
|
|
85
|
-
if (i00 >= ne00) {
|
|
86
|
-
return;
|
|
87
|
-
}
|
|
88
|
-
auto ncols = ne00;
|
|
89
|
-
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
|
90
|
-
|
|
91
|
-
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
|
92
|
-
|
|
93
|
-
const int src0_off = i01 * ncols + i00;
|
|
94
|
-
const int ib = src0_off / QK4_0; // block index
|
|
95
|
-
const int iqs = (i00%qk)/qr; // x quant index
|
|
96
|
-
const int iybs = i00 - i00%qk; // dst block start index
|
|
97
|
-
const int y_offset = qr == 1 ? 1 : qk/2;
|
|
98
|
-
|
|
99
|
-
// dequantize
|
|
100
|
-
dfloat2 v;
|
|
101
|
-
dequantize_kernel_recorder((const void *)src0_dq, ib, (const void *)src0, src0_off/2, v);
|
|
102
|
-
|
|
103
|
-
dst_row[iybs + iqs + 0] = v.x();
|
|
104
|
-
dst_row[iybs + iqs + y_offset] = v.y();
|
|
105
|
-
|
|
106
|
-
GGML_UNUSED(nb01);
|
|
107
|
-
GGML_UNUSED(nb02);
|
|
108
|
-
GGML_UNUSED(nb03);
|
|
109
|
-
}
|
|
110
|
-
|
|
111
63
|
template<typename src0_t, typename dst_t>
|
|
112
64
|
static void k_get_rows_float(
|
|
113
65
|
const src0_t * src0, const int32_t * src1, dst_t * dst,
|
|
@@ -177,47 +129,6 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
|
|
|
177
129
|
GGML_UNUSED(ctx);
|
|
178
130
|
}
|
|
179
131
|
|
|
180
|
-
template <int qk, int qr, dequantize_kernel_t_reorder dq_reorder>
|
|
181
|
-
static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
182
|
-
ggml_tensor *dst, const void *src0_dd,
|
|
183
|
-
const int32_t *src1_dd, float *dst_dd,
|
|
184
|
-
queue_ptr stream) {
|
|
185
|
-
|
|
186
|
-
GGML_TENSOR_BINARY_OP_LOCALS
|
|
187
|
-
|
|
188
|
-
const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
|
|
189
|
-
const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
|
|
190
|
-
const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
|
|
191
|
-
|
|
192
|
-
// strides in elements
|
|
193
|
-
//const size_t s0 = nb0 / ggml_element_size(dst);
|
|
194
|
-
const size_t s1 = nb1 / ggml_element_size(dst);
|
|
195
|
-
const size_t s2 = nb2 / ggml_element_size(dst);
|
|
196
|
-
const size_t s3 = nb3 / ggml_element_size(dst);
|
|
197
|
-
|
|
198
|
-
const size_t s10 = nb10 / ggml_element_size(src1);
|
|
199
|
-
const size_t s11 = nb11 / ggml_element_size(src1);
|
|
200
|
-
const size_t s12 = nb12 / ggml_element_size(src1);
|
|
201
|
-
//const size_t s13 = nb13 / ggml_element_size(src1);
|
|
202
|
-
|
|
203
|
-
GGML_ASSERT(ne00 % 2 == 0);
|
|
204
|
-
|
|
205
|
-
const uint8_t* src0_q = (const uint8_t*)src0_dd;
|
|
206
|
-
const size_t ncols = ne00;
|
|
207
|
-
const size_t nrows = ne01;
|
|
208
|
-
const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2);
|
|
209
|
-
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
210
|
-
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
|
|
211
|
-
k_get_rows_reorder<qk, qr, dq_reorder>(
|
|
212
|
-
src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2,
|
|
213
|
-
s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
|
|
214
|
-
});
|
|
215
|
-
|
|
216
|
-
GGML_UNUSED(dst);
|
|
217
|
-
GGML_UNUSED(ctx);
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
|
|
221
132
|
template <typename src0_t>
|
|
222
133
|
static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
|
223
134
|
const ggml_tensor *src1, ggml_tensor *dst,
|
|
@@ -277,13 +188,8 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
|
277
188
|
src1_i32, (float *)dst->data, ctx.stream());
|
|
278
189
|
break;
|
|
279
190
|
case GGML_TYPE_Q4_0:
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
src1_i32, (float *)dst->data, ctx.stream());
|
|
283
|
-
} else {
|
|
284
|
-
get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
|
|
285
|
-
src1_i32, (float *)dst->data, ctx.stream());
|
|
286
|
-
}
|
|
191
|
+
get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
|
|
192
|
+
src1_i32, (float *)dst->data, ctx.stream());
|
|
287
193
|
break;
|
|
288
194
|
case GGML_TYPE_Q4_1:
|
|
289
195
|
get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
|