@fugood/llama.node 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +1 -1
- package/src/LlamaContext.cpp +81 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
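Of the files listed above, only the SYCL backend sources are captured in expanded form below (ggml-sycl.cpp and im2col.cpp). Two mechanical cleanups account for most of that churn: raw `fprintf(stderr, ...)` logging is replaced by the `GGML_LOG_*` macros, and `(void) x;` casts for unused parameters are replaced by `GGML_UNUSED(x)`. A minimal sketch of both patterns follows; the macro bodies here are stand-ins (in ggml itself, `GGML_UNUSED(x)` expands to `(void)(x)` in ggml.h, and the `GGML_LOG_*` macros route through ggml's registered log callback rather than writing straight to stderr):

```cpp
#include <cstdio>

// Stand-ins for the real ggml macros, reduced for this sketch.
#define GGML_UNUSED(x) (void)(x)
#define GGML_LOG_ERROR(...) fprintf(stderr, __VA_ARGS__)

static bool backend_init(int device_count, void * ctx) {
    GGML_UNUSED(ctx);  // replaces the old `(void) ctx;` style seen on the removed lines
    if (device_count == 0) {
        // Previously a bare fprintf(stderr, ...); the macro form lets an
        // embedder such as llama.node capture, filter, or silence backend logs.
        GGML_LOG_ERROR("%s: failed to initialize: no devices found\n", __func__);
        return false;
    }
    return true;
}

int main() {
    return backend_init(0, nullptr) ? 0 : 1;
}
```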
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp (+159 −114; removed lines whose text was not captured are shown as "…"):

```diff
@@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {

     info.device_count = dpct::dev_mgr::instance().device_count();
     if (info.device_count == 0) {
-        …
+        GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__);
         return info;
     }

@@ -55,16 +55,16 @@ static ggml_sycl_device_info ggml_sycl_init() {

     int64_t total_vram = 0;
 #if defined(GGML_SYCL_FORCE_MMQ)
-    …
+    GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
 #else
-    …
+    GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(SYCL_USE_XMX)
-    …
+    GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
 #else
-    …
+    GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-    …
+    GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME);

     for (int i = 0; i < info.device_count; ++i) {
         info.devices[i].vmm = 0;
@@ -110,7 +110,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)

     auto global_mem_size = prop.get_global_mem_size()/1000000;

-    …
+    GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
                   name.c_str(), version.c_str(), prop.get_max_compute_units(),
                   prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
                   global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
@@ -120,19 +120,29 @@ void ggml_backend_sycl_print_sycl_devices() {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
     int device_count = dpct::dev_mgr::instance().device_count();
     std::map<std::string, size_t> DeviceNums;
-    …
-    …
-    …
-    …
-    …
+    GGML_LOG_INFO("Found %d SYCL devices:\n", device_count);
+
+    GGML_LOG_INFO(
+        "| | | | "
+        " |Max | |Max |Global | |\n");
+    GGML_LOG_INFO(
+        "| | | | "
+        " |compute|Max work|sub |mem | |\n");
+    GGML_LOG_INFO(
+        "|ID| Device Type| "
+        "Name|Version|units |group |group|size | Driver version|\n");
+    GGML_LOG_INFO(
+        "|--|-------------------|---------------------------------------|------"
+        "-|-------|--------|-----|-------|---------------------|\n");
+
     for (int id = 0; id < device_count; ++id) {
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id = DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" << backend_type << ":" << std::to_string(type_id)
+                    << "]";
+        print_device_detail(id, device, device_type.str());
     }
 }

@@ -154,15 +164,14 @@ static void ggml_check_sycl() try {
     static bool initialized = false;

     if (!initialized) {
-        …
+        GGML_LOG_INFO("[SYCL] call ggml_check_sycl\n");
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        …
-        fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
+        GGML_LOG_INFO("%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);

 #if defined(GGML_SYCL_F16)
-        …
+        GGML_LOG_INFO("%s: GGML_SYCL_F16: yes\n", __func__);
 #else
-        …
+        GGML_LOG_INFO("%s: GGML_SYCL_F16: no\n", __func__);
 #endif

 /* NOT REMOVE, keep it for next optimize for XMX.
@@ -180,9 +189,10 @@ static void ggml_check_sycl() try {
             return;
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
-        …
+
         initialized = true;
         g_sycl_loaded = true;
+        ggml_backend_sycl_print_sycl_devices();
     }
 }
 catch (sycl::exception const &exc) {
@@ -205,7 +215,7 @@ inline void check_allow_gpu_index(const int device_index) {
                  __func__,
                  device_index,
                  ggml_sycl_info().device_count - 1);
-        …
+        GGML_LOG_ERROR("%s\n", error_buf);
         assert(false);
     }
 }
@@ -409,14 +419,12 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
         return true;
     }
     return false;
-    …
-    catch (sycl::exception …
-    …
-    …
-    std::exit(1);
+    GGML_UNUSED(buffer);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
 }

-…
 static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
                                            uint8_t value) try {
     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@@ -475,8 +483,8 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, *stream)));
     if (!dev_ptr) {
-        …
-        …
+        GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
+        return nullptr;
     }
     ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
@@ -752,7 +760,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                     size, *stream)));
         if (!buf) {
             char err_buf[1024];
-            snprintf(err_buf, 1023, "%s: can't …
+            snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
             throw std::runtime_error(err_buf);
         }
         // set padding to 0 to avoid possible NaN values
@@ -1081,10 +1089,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
     size_t pool_size = 0;

-    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
-        qptr(qptr_),
-        device(device_) {
-    }
+    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}

     ~ggml_sycl_pool_leg() {
         for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
@@ -1142,17 +1147,18 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
             CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                                 look_ahead_size, *qptr)));
         if (!ptr) {
-            …
+            GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
             return nullptr;
         }

         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;

-        …
-        …
+#ifdef DEBUG_SYCL_MALLOC
+        GGML_LOG_DEBUG("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
                 (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
-        …
+#endif
+
         // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr);
         return ptr;
     }
@@ -1166,7 +1172,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
                 return;
             }
         }
-        …
+        GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
         SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
         pool_size -= size;
     }
@@ -1226,7 +1232,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
             zeros[i] = 0.f;
             qzeros[i] = 0;
         }
-        const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
+        const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
         float sum = xi[0];
         float amax = sycl::fabs(xi[0]);
 #pragma unroll
@@ -1787,6 +1793,9 @@ static void pool2d_nchw_kernel(
         switch (op) {
             case GGML_OP_POOL_AVG: res = 0; break;
             case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+            default:
+                res = (To) sycl::nan(uint32_t(0));
+                break;
         }

         for (int i = bh; i < eh; i += 1) {
@@ -1805,6 +1814,9 @@ static void pool2d_nchw_kernel(
                 switch (op) {
                     case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
                     case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
+                    default:
+                        res = (To) sycl::nan(uint32_t(0));
+                        break;
                 }
             }
         }
@@ -1843,7 +1855,8 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
                               s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
         });

-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }

 template <typename src0_t>
@@ -1881,10 +1894,10 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
         });
     }

-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }

-…
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
@@ -2437,7 +2450,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
             break;
         default:
             // TODO: k-quants
-            …
+            GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             GGML_ABORT("fatal error");
             break;
     }
@@ -2452,8 +2465,8 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tens

     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);

-    (void) src1;
-    (void) src1_d;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_d);
 }


@@ -2472,17 +2485,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
     const int64_t ne00 = src0->ne[0];
     const int64_t ne10 = src1->ne[0];

-    const int64_t ne0 = dst->ne[0];

     const int64_t row_diff = row_high - row_low;

     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
-    …
+#if !GGML_SYCL_DNNL
+    const int64_t ne0 = dst->ne[0];
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = id == ctx.device ? ne0 : row_diff;
+#endif

 #ifdef GGML_SYCL_F16
     bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -2519,9 +2533,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
                                          : src1_as_f16.get();
         ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);

-        const sycl::half alpha_f16 = 1.0f;
-        const sycl::half beta_f16 = 0.0f;
 #if !GGML_SYCL_DNNL
+        const sycl::half alpha_f16 = 1.0f;
+        const sycl::half beta_f16 = 0.0f;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
             *stream, oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@@ -2558,24 +2572,29 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
         const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

-        const float alpha = 1.0f;
-        const float beta = 0.0f;
 #if !GGML_SYCL_DNNL
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+# ifdef GGML_SYCL_NVIDIA
         SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
-            *stream, oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
-            dpct::get_value(&…
-            …
+            oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i,
+            ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
+# else
+        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
+            *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
             dst_dd_i, ldc)));
+# endif
 #else
         auto dnnl_stream = ctx.stream_dnnl(stream);
         DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
                                   src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
 #endif
     }
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_padded_row_size);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2621,8 +2640,9 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
                 item_ct1);
         });

-    (void) src1;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2637,9 +2657,10 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor

     sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2656,9 +2677,10 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_te

     sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2677,9 +2699,10 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_ten

     argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2696,9 +2719,10 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tens

     argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2718,9 +2742,10 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const gg

     diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -2741,9 +2766,10 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tenso
     */
     SYCL_CHECK(0);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -2766,9 +2792,10 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tenso
     */
     SYCL_CHECK(0);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
@@ -2845,7 +2872,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten

     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
@@ -3272,7 +3298,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,

     GGML_TENSOR_BINARY_OP_LOCALS

-    const int64_t ne_dst = ggml_nelements(dst);

     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();;
@@ -3380,6 +3405,7 @@ catch (sycl::exception const &exc) {

 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     // TODO: accuracy issues in MMQ
+    GGML_UNUSED(type);
     return false;
 }

@@ -3447,8 +3473,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;

     if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // …
-        …
+        // TODO: Refactor and cleanup of mul mat dispatching.
+        if (src0->ne[3] == 1 && src1->ne[3] == 1) {
+            // KQ single-batch
+            // mmv p021 was specific for these dimensions
+            ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
+        } else {
+            // The kernel from the if path is faster for that specific case, but does not support all mul mats.
+            ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
+        }
     } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
@@ -3743,12 +3776,12 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
         ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else {
-        …
+        GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
         GGML_ABORT("fatal error");
     }

-    (void) dst;
+    GGML_UNUSED(dst);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3759,7 +3792,7 @@ catch (sycl::exception const &exc) {
 static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // TODO: why do we pass dst as src1 here?
     ggml_sycl_cpy(ctx, src0, dst, nullptr);
-    (void) src1;
+    GGML_UNUSED(src1);
 }

 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3804,13 +3837,16 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor
 }

 static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }

 void ggml_sycl_set_main_device(const int main_device) try {
-    if (dpct::get_current_device_id() == main_device) …
+    if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
+        return;
+    }
     check_allow_gpu_index(main_device);
     dpct::select_device(main_device);

@@ -3818,7 +3854,7 @@ void ggml_sycl_set_main_device(const int main_device) try {
         dpct::device_info prop;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(main_device))));
-        …
+        GGML_LOG_INFO("Using device %d (%s) as main device\n",
                 main_device, prop.get_name());
     }
 }
@@ -4165,7 +4201,7 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
 #endif
         bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
         if (!ok) {
-            …
+            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
     }
@@ -4178,6 +4214,7 @@ try
 {
     ggml_backend_sycl_context *sycl_ctx =
         (ggml_backend_sycl_context *)backend->context;
+
     sycl::event *sycl_event = static_cast<sycl::event *>(event->context);

     const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
@@ -4192,7 +4229,7 @@ catch (sycl::exception const &exc)
 }

 static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
-    …
+
     sycl::event* sycl_event = static_cast<sycl::event*>(event->context);

     if (ggml_backend_is_sycl(backend)) {
@@ -4350,10 +4387,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             if (op->op == GGML_OP_MUL_MAT) {
                 a = op->src[0];
                 b = op->src[1];
-                if (ggml_is_permuted(a) || ggml_is_permuted(b)) {
-                    // TODO: fix like https://github.com/ggerganov/llama.cpp/pull/10021
-                    return false;
-                }
             } else {
                 a = op->src[2];
                 b = op->src[1];
@@ -4455,7 +4488,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
-            …
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
         case GGML_OP_IM2COL:
             // TODO: add support for the new F32 operations
             return op->src[0]->type == GGML_TYPE_F16;
@@ -4490,7 +4532,7 @@ static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_
 static int64_t get_op_batch_size(const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_GET_ROWS:
-            return …
+            return 0;
         case GGML_OP_MUL_MAT:
             return op->ne[1];
         case GGML_OP_MUL_MAT_ID:
@@ -4604,13 +4646,14 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
     // "ggml_backend_unregister_host_buffer"
+    GGML_UNUSED(name);
     return nullptr;
 }

 static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = {
     /* .get_name = */ ggml_backend_sycl_reg_get_name,
     /* .get_device_count = */ ggml_backend_sycl_reg_get_device_count,
-    /* .…
+    /* .get_device = */ ggml_backend_sycl_reg_get_device,
     /* .get_proc_address = */ ggml_backend_sycl_reg_get_proc_address,
 };

@@ -4641,16 +4684,17 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
             dev_ctx->description = prop.get_name();

             ggml_backend_dev_t dev = new ggml_backend_device {
-                /* .…
-                /* .reg…
-                /* .context…
+                /* .iface = */ ggml_backend_sycl_device_interface,
+                /* .reg = */ &reg,
+                /* .context = */ dev_ctx
             };
             ctx->devices.push_back(dev);
         }

         reg = ggml_backend_reg {
-            /* .…
-            /* .…
+            /* .api_version = */ GGML_BACKEND_API_VERSION,
+            /* .iface = */ ggml_backend_sycl_reg_interface,
+            /* .context = */ ctx
         };
     }

@@ -4668,7 +4712,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {

     ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
     if (ctx == nullptr) {
-        …
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
        return nullptr;
     };

@@ -4682,3 +4726,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     return sycl_backend;
 }

+GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
```
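Besides the logging cleanup, the one behavioral change worth calling out in ggml-sycl.cpp above is the new `GGML_OP_ROPE` case in `supports_op`: the RoPE mode is a bitfield stored in `op_params`, and the backend now declines the multimodal M-RoPE and vision variants (which arrive alongside the new qwen2vl-cli example in this release) instead of claiming support for them. A standalone sketch of that gate follows; the `GGML_ROPE_TYPE_*` values mirror ggml.h's flag style but should be treated as assumptions here:

```cpp
#include <cstdint>
#include <cstdio>

// Assumed flag values in the style of ggml.h; VISION includes the MROPE bit.
enum : int32_t {
    GGML_ROPE_TYPE_NEOX   = 2,
    GGML_ROPE_TYPE_MROPE  = 8,
    GGML_ROPE_TYPE_VISION = 24,
};

static bool sycl_supports_rope(const int32_t * op_params, bool src0_is_contiguous) {
    const int mode = op_params[2];      // the third op_param carries the rope mode bits
    if (mode & GGML_ROPE_TYPE_MROPE) {
        return false;                   // multimodal rope: not implemented in SYCL yet
    }
    if (mode & GGML_ROPE_TYPE_VISION) {
        return false;
    }
    return src0_is_contiguous;          // plain rope still requires contiguous input
}

int main() {
    const int32_t op_params[3] = { 0, 0, GGML_ROPE_TYPE_VISION };
    printf("vision rope supported: %d\n", sycl_supports_rope(op_params, true));
}
```

With the assumed values, VISION (24) already has the MROPE bit (8) set, so the first check rejects it too; both checks are kept only to stay aligned with the diff.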
package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp (+3 −2):

```diff
@@ -120,6 +120,7 @@ void ggml_sycl_op_im2col(
         im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
     }

-    (void) src0;
-    (void) src0_dd;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src0_dd);
+    GGML_UNUSED(ctx);
 }
```