@fugood/llama.node 0.3.14 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/llama.cpp/.github/workflows/build.yml +30 -1
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/arg.cpp +20 -2
- package/src/llama.cpp/common/common.cpp +6 -3
- package/src/llama.cpp/common/speculative.cpp +4 -4
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +6 -6
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/run.cpp +91 -46
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +37 -15
- package/src/llama.cpp/examples/server/utils.hpp +3 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/tts/tts.cpp +20 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +24 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -28
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +284 -29
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +35 -12
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +93 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +109 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +398 -158
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +7 -2
- package/src/llama.cpp/ggml/src/ggml.c +85 -2
- package/src/llama.cpp/include/llama.h +86 -22
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +103 -16
- package/src/llama.cpp/src/llama-arch.h +18 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -110
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-model.cpp +8244 -173
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama.cpp +51 -9984
- package/src/llama.cpp/tests/test-backend-ops.cpp +145 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
|
@@ -46,6 +46,7 @@
|
|
|
46
46
|
static bool g_sycl_loaded = false;
|
|
47
47
|
int g_ggml_sycl_debug = 0;
|
|
48
48
|
int g_ggml_sycl_disable_optimize = 0;
|
|
49
|
+
int g_ggml_sycl_disable_graph = 0;
|
|
49
50
|
|
|
50
51
|
static ggml_sycl_device_info ggml_sycl_init() {
|
|
51
52
|
ggml_sycl_device_info info = {};
|
|
@@ -95,7 +96,7 @@ const ggml_sycl_device_info & ggml_sycl_info() {
|
|
|
95
96
|
return info;
|
|
96
97
|
}
|
|
97
98
|
|
|
98
|
-
void print_device_detail(int id, sycl::device &device, std::string device_type) {
|
|
99
|
+
static void print_device_detail(int id, sycl::device &device, std::string device_type) {
|
|
99
100
|
|
|
100
101
|
dpct::device_info prop;
|
|
101
102
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
@@ -118,7 +119,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
|
|
|
118
119
|
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
|
|
119
120
|
}
|
|
120
121
|
|
|
121
|
-
void print_device_opt_feature(int device_count) {
|
|
122
|
+
static void print_device_opt_feature(int device_count) {
|
|
122
123
|
GGML_LOG_INFO("SYCL Optimization Feature:\n");
|
|
123
124
|
GGML_LOG_INFO(
|
|
124
125
|
"|ID| Device Type|Reorder|\n");
|
|
@@ -191,10 +192,12 @@ static void ggml_check_sycl() try {
|
|
|
191
192
|
if (!initialized) {
|
|
192
193
|
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
|
193
194
|
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
|
195
|
+
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
|
194
196
|
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
|
195
197
|
GGML_LOG_INFO("Running with Environment Variables:\n");
|
|
196
198
|
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
|
197
199
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
|
|
200
|
+
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
|
|
198
201
|
GGML_LOG_INFO("Build with Macros:\n");
|
|
199
202
|
#if defined(GGML_SYCL_FORCE_MMQ)
|
|
200
203
|
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
|
|
@@ -333,10 +336,11 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
|
333
336
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
334
337
|
return GGML_STATUS_SUCCESS;
|
|
335
338
|
}
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
339
|
+
if (tensor->type == GGML_TYPE_Q4_0) {
|
|
340
|
+
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
|
341
|
+
tensor->extra = extra;
|
|
342
|
+
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
|
|
343
|
+
}
|
|
340
344
|
|
|
341
345
|
if (ggml_is_quantized(tensor->type)) {
|
|
342
346
|
// initialize padding to 0 to avoid possible NaN values
|
|
@@ -400,7 +404,7 @@ catch (sycl::exception const &exc) {
|
|
|
400
404
|
std::exit(1);
|
|
401
405
|
}
|
|
402
406
|
|
|
403
|
-
void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
|
407
|
+
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
|
404
408
|
const void *ptr_src, size_t size) {
|
|
405
409
|
char *host_buf = (char *)malloc(size);
|
|
406
410
|
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
|
|
@@ -486,6 +490,22 @@ catch (sycl::exception const &exc) {
|
|
|
486
490
|
std::exit(1);
|
|
487
491
|
}
|
|
488
492
|
|
|
493
|
+
static void ggml_backend_sycl_buffer_reset(ggml_backend_buffer_t buffer) {
|
|
494
|
+
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
|
|
495
|
+
if (buffer == nullptr) {
|
|
496
|
+
return;
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
|
500
|
+
|
|
501
|
+
if (ctx != nullptr) {
|
|
502
|
+
for (ggml_tensor_extra_gpu * extra : ctx->tensor_extras) {
|
|
503
|
+
release_extra_gpu(extra);
|
|
504
|
+
}
|
|
505
|
+
ctx->tensor_extras.clear(); // reset the tensor_extras vector
|
|
506
|
+
}
|
|
507
|
+
}
|
|
508
|
+
|
|
489
509
|
static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
|
490
510
|
/* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
|
|
491
511
|
/* .get_base = */ ggml_backend_sycl_buffer_get_base,
|
|
@@ -495,7 +515,7 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
|
|
|
495
515
|
/* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
|
|
496
516
|
/* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
|
|
497
517
|
/* .clear = */ ggml_backend_sycl_buffer_clear,
|
|
498
|
-
/* .reset = */
|
|
518
|
+
/* .reset = */ ggml_backend_sycl_buffer_reset,
|
|
499
519
|
};
|
|
500
520
|
|
|
501
521
|
// sycl buffer type
|
|
@@ -576,7 +596,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|
|
576
596
|
static std::mutex mutex;
|
|
577
597
|
std::lock_guard<std::mutex> lock(mutex);
|
|
578
598
|
|
|
579
|
-
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
|
580
599
|
|
|
581
600
|
auto dev_count = ggml_backend_sycl_get_device_count();
|
|
582
601
|
|
|
@@ -604,7 +623,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|
|
604
623
|
return &ggml_backend_sycl_buffer_types[device];
|
|
605
624
|
}
|
|
606
625
|
|
|
607
|
-
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
|
|
626
|
+
static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
|
|
608
627
|
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
|
|
609
628
|
|
|
610
629
|
int device = ctx->device;
|
|
@@ -1666,7 +1685,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
|
|
1666
1685
|
|
|
1667
1686
|
stream->parallel_for(
|
|
1668
1687
|
sycl::nd_range<3>(num_blocks * block_size, block_size),
|
|
1669
|
-
[=](sycl::nd_item<3> item_ct1) [[
|
|
1688
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
1670
1689
|
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
|
|
1671
1690
|
});
|
|
1672
1691
|
}
|
|
@@ -1687,7 +1706,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
|
|
|
1687
1706
|
|
|
1688
1707
|
stream->parallel_for(
|
|
1689
1708
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
1690
|
-
[=](sycl::nd_item<3> item_ct1) [[
|
|
1709
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
1691
1710
|
mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
|
|
1692
1711
|
nchannels_y, item_ct1);
|
|
1693
1712
|
});
|
|
@@ -1707,7 +1726,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
|
|
1707
1726
|
|
|
1708
1727
|
stream->parallel_for(
|
|
1709
1728
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
1710
|
-
[=](sycl::nd_item<3> item_ct1) [[
|
|
1729
|
+
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
1711
1730
|
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
|
|
1712
1731
|
row_stride_x, channel_stride_x,
|
|
1713
1732
|
nchannels_y / nchannels_x, item_ct1);
|
|
@@ -1748,7 +1767,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
|
|
|
1748
1767
|
const sycl::range<3> block_nums(1, nrows, 1);
|
|
1749
1768
|
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
1750
1769
|
[=](sycl::nd_item<3> item_ct1)
|
|
1751
|
-
[[
|
|
1770
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
1752
1771
|
k_sum_rows_f32(x, dst, ncols, item_ct1);
|
|
1753
1772
|
});
|
|
1754
1773
|
}
|
|
@@ -2039,9 +2058,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2039
2058
|
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
|
2040
2059
|
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
|
2041
2060
|
#else
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2061
|
+
DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
|
|
2062
|
+
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
|
2063
|
+
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
|
|
2045
2064
|
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
|
2046
2065
|
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
|
2047
2066
|
#endif
|
|
@@ -2080,9 +2099,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
2080
2099
|
dst_dd_i, ldc)));
|
|
2081
2100
|
# endif
|
|
2082
2101
|
#else
|
|
2083
|
-
|
|
2084
|
-
|
|
2085
|
-
|
|
2102
|
+
DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
|
|
2103
|
+
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
|
|
2104
|
+
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
|
2086
2105
|
#endif
|
|
2087
2106
|
}
|
|
2088
2107
|
GGML_UNUSED(dst);
|
|
@@ -2680,6 +2699,12 @@ static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * ds
|
|
|
2680
2699
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
2681
2700
|
}
|
|
2682
2701
|
|
|
2702
|
+
static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2703
|
+
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
2704
|
+
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_l2_norm);
|
|
2705
|
+
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
2706
|
+
}
|
|
2707
|
+
|
|
2683
2708
|
static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
2684
2709
|
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
2685
2710
|
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm);
|
|
@@ -2898,7 +2923,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
|
|
2898
2923
|
return false;
|
|
2899
2924
|
}
|
|
2900
2925
|
|
|
2901
|
-
bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
|
2926
|
+
static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
|
2902
2927
|
switch (type) {
|
|
2903
2928
|
case GGML_TYPE_Q4_0:
|
|
2904
2929
|
case GGML_TYPE_Q4_1:
|
|
@@ -3113,8 +3138,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
|
|
3113
3138
|
const int64_t i2 = i12;
|
|
3114
3139
|
|
|
3115
3140
|
src0_row.data = src0_original + i02*nb02;
|
|
3116
|
-
src1_row.data = src1_original +
|
|
3117
|
-
dst_row.data = dst_original + i1*nb1
|
|
3141
|
+
src1_row.data = src1_original + i11*nb11 + i12*nb12;
|
|
3142
|
+
dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
|
3118
3143
|
|
|
3119
3144
|
ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
|
3120
3145
|
}
|
|
@@ -3271,7 +3296,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
|
|
|
3271
3296
|
}
|
|
3272
3297
|
|
|
3273
3298
|
|
|
3274
|
-
void ggml_sycl_set_main_device(const int main_device) try {
|
|
3299
|
+
static void ggml_sycl_set_main_device(const int main_device) try {
|
|
3275
3300
|
if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
|
|
3276
3301
|
return;
|
|
3277
3302
|
}
|
|
@@ -3292,7 +3317,7 @@ catch (sycl::exception const &exc) {
|
|
|
3292
3317
|
std::exit(1);
|
|
3293
3318
|
}
|
|
3294
3319
|
|
|
3295
|
-
bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
|
|
3320
|
+
static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
|
|
3296
3321
|
if (!g_sycl_loaded) return false;
|
|
3297
3322
|
|
|
3298
3323
|
if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
|
|
@@ -3394,6 +3419,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
|
|
3394
3419
|
case GGML_OP_RMS_NORM:
|
|
3395
3420
|
ggml_sycl_rms_norm(ctx, dst);
|
|
3396
3421
|
break;
|
|
3422
|
+
case GGML_OP_L2_NORM:
|
|
3423
|
+
ggml_sycl_l2_norm(ctx, dst);
|
|
3424
|
+
break;
|
|
3397
3425
|
case GGML_OP_MUL_MAT:
|
|
3398
3426
|
if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
|
|
3399
3427
|
return false;
|
|
@@ -3471,6 +3499,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
|
|
3471
3499
|
case GGML_OP_RWKV_WKV6:
|
|
3472
3500
|
ggml_sycl_op_rwkv_wkv6(ctx, dst);
|
|
3473
3501
|
break;
|
|
3502
|
+
case GGML_OP_RWKV_WKV7:
|
|
3503
|
+
ggml_sycl_op_rwkv_wkv7(ctx, dst);
|
|
3504
|
+
break;
|
|
3474
3505
|
case GGML_OP_GATED_LINEAR_ATTN:
|
|
3475
3506
|
ggml_sycl_op_gated_linear_attn(ctx, dst);
|
|
3476
3507
|
break;
|
|
@@ -3610,7 +3641,7 @@ catch (sycl::exception const &exc) {
|
|
|
3610
3641
|
std::exit(1);
|
|
3611
3642
|
}
|
|
3612
3643
|
|
|
3613
|
-
void reorder_qw(char *data_device, const int ncols, const int nrows,
|
|
3644
|
+
static void reorder_qw(char *data_device, const int ncols, const int nrows,
|
|
3614
3645
|
size_t size, size_t offset, dpct::queue_ptr stream) {
|
|
3615
3646
|
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
|
|
3616
3647
|
SYCL_CHECK(
|
|
@@ -3624,7 +3655,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
|
|
|
3624
3655
|
|
|
3625
3656
|
stream->parallel_for(
|
|
3626
3657
|
size / sizeof(block_q4_0),
|
|
3627
|
-
[=](auto i) [[
|
|
3658
|
+
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
3628
3659
|
const block_q4_0* x = (const block_q4_0*)tmp_buf;
|
|
3629
3660
|
const int ib = i;
|
|
3630
3661
|
|
|
@@ -3638,7 +3669,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
|
|
|
3638
3669
|
sycl::free(tmp_buf, *stream);
|
|
3639
3670
|
}
|
|
3640
3671
|
|
|
3641
|
-
void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
3672
|
+
static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
3642
3673
|
char*data_device = (char*)src0->data;
|
|
3643
3674
|
size_t ncols = src0->ne[0];
|
|
3644
3675
|
size_t nrows = src0->ne[1];
|
|
@@ -3647,7 +3678,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
|
|
|
3647
3678
|
reorder_qw(data_device, ncols, nrows, size, 0, stream);
|
|
3648
3679
|
}
|
|
3649
3680
|
|
|
3650
|
-
void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
|
|
3681
|
+
static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
|
|
3651
3682
|
ggml_tensor *src0 = dst->src[0];
|
|
3652
3683
|
ggml_tensor *src1 = dst->src[1];
|
|
3653
3684
|
|
|
@@ -3660,7 +3691,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
|
|
|
3660
3691
|
}
|
|
3661
3692
|
}
|
|
3662
3693
|
|
|
3663
|
-
void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
|
|
3694
|
+
static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
|
|
3664
3695
|
dpct::queue_ptr stream = ctx->stream();
|
|
3665
3696
|
if (ctx->optimized_graph) {
|
|
3666
3697
|
return;
|
|
@@ -3671,10 +3702,9 @@ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx)
|
|
|
3671
3702
|
if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
|
|
3672
3703
|
}
|
|
3673
3704
|
}
|
|
3674
|
-
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
3675
|
-
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
3676
|
-
ggml_sycl_set_main_device(sycl_ctx->device);
|
|
3677
3705
|
|
|
3706
|
+
static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
|
|
3707
|
+
ggml_sycl_set_main_device(sycl_ctx->device);
|
|
3678
3708
|
if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
|
|
3679
3709
|
|
|
3680
3710
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
@@ -3696,7 +3726,46 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
|
|
|
3696
3726
|
}
|
|
3697
3727
|
GGML_ASSERT(ok);
|
|
3698
3728
|
}
|
|
3729
|
+
}
|
|
3730
|
+
|
|
3731
|
+
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
3732
|
+
auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
|
|
3733
|
+
|
|
3734
|
+
#ifdef GGML_SYCL_GRAPH
|
|
3735
|
+
if (!g_ggml_sycl_disable_graph) {
|
|
3736
|
+
if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
|
|
3737
|
+
GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
|
|
3738
|
+
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
|
|
3739
|
+
return GGML_STATUS_SUCCESS;
|
|
3740
|
+
}
|
|
3741
|
+
|
|
3742
|
+
sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
|
|
3743
|
+
model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
|
|
3744
|
+
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
|
|
3745
|
+
model_sycl_graph.end_recording();
|
|
3746
|
+
|
|
3747
|
+
if (!sycl_ctx->exec_graph) {
|
|
3748
|
+
auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
|
|
3749
|
+
sycl_ctx->exec_graph = std::make_unique<
|
|
3750
|
+
sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
|
|
3751
|
+
} else {
|
|
3752
|
+
try {
|
|
3753
|
+
sycl_ctx->exec_graph->update(model_sycl_graph);
|
|
3754
|
+
GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n");
|
|
3755
|
+
} catch (sycl::exception const & e) {
|
|
3756
|
+
GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what());
|
|
3757
|
+
auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
|
|
3758
|
+
sycl_ctx->exec_graph = std::make_unique<
|
|
3759
|
+
sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
|
|
3760
|
+
}
|
|
3761
|
+
}
|
|
3699
3762
|
|
|
3763
|
+
sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
|
|
3764
|
+
} else
|
|
3765
|
+
#endif
|
|
3766
|
+
{
|
|
3767
|
+
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
|
|
3768
|
+
}
|
|
3700
3769
|
return GGML_STATUS_SUCCESS;
|
|
3701
3770
|
}
|
|
3702
3771
|
|
|
@@ -3761,7 +3830,6 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
|
|
|
3761
3830
|
}
|
|
3762
3831
|
|
|
3763
3832
|
int ggml_backend_sycl_get_device_count() {
|
|
3764
|
-
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
|
|
3765
3833
|
return ggml_sycl_info().device_count;
|
|
3766
3834
|
}
|
|
3767
3835
|
|
|
@@ -3851,7 +3919,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3851
3919
|
return true;
|
|
3852
3920
|
}
|
|
3853
3921
|
return false;
|
|
3854
|
-
}
|
|
3922
|
+
}
|
|
3855
3923
|
case GGML_OP_UNARY:
|
|
3856
3924
|
switch (ggml_get_unary_op(op)) {
|
|
3857
3925
|
case GGML_UNARY_OP_NEG:
|
|
@@ -3869,7 +3937,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3869
3937
|
default:
|
|
3870
3938
|
return false;
|
|
3871
3939
|
}
|
|
3872
|
-
break;
|
|
3873
3940
|
case GGML_OP_MUL_MAT:
|
|
3874
3941
|
case GGML_OP_MUL_MAT_ID:
|
|
3875
3942
|
{
|
|
@@ -3900,7 +3967,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3900
3967
|
return false;
|
|
3901
3968
|
}
|
|
3902
3969
|
return true;
|
|
3903
|
-
}
|
|
3970
|
+
}
|
|
3904
3971
|
case GGML_OP_OUT_PROD:
|
|
3905
3972
|
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
|
|
3906
3973
|
case GGML_OP_GET_ROWS:
|
|
@@ -3917,7 +3984,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3917
3984
|
default:
|
|
3918
3985
|
return false;
|
|
3919
3986
|
}
|
|
3920
|
-
}
|
|
3987
|
+
}
|
|
3921
3988
|
case GGML_OP_CPY:
|
|
3922
3989
|
{
|
|
3923
3990
|
ggml_type src0_type = op->src[0]->type;
|
|
@@ -3968,12 +4035,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3968
4035
|
return true;
|
|
3969
4036
|
}
|
|
3970
4037
|
return false;
|
|
3971
|
-
}
|
|
4038
|
+
}
|
|
3972
4039
|
case GGML_OP_CONCAT:
|
|
3973
4040
|
{
|
|
3974
4041
|
ggml_type src0_type = op->src[0]->type;
|
|
3975
4042
|
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
|
3976
|
-
}
|
|
4043
|
+
}
|
|
3977
4044
|
case GGML_OP_DUP:
|
|
3978
4045
|
case GGML_OP_ARGMAX:
|
|
3979
4046
|
case GGML_OP_NONE:
|
|
@@ -3997,6 +4064,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
3997
4064
|
return (op->src[0]->type == GGML_TYPE_F32);
|
|
3998
4065
|
case GGML_OP_NORM:
|
|
3999
4066
|
case GGML_OP_RMS_NORM:
|
|
4067
|
+
case GGML_OP_L2_NORM:
|
|
4000
4068
|
case GGML_OP_GROUP_NORM:
|
|
4001
4069
|
return ggml_is_contiguous(op->src[0]);
|
|
4002
4070
|
case GGML_OP_SCALE:
|
|
@@ -4030,6 +4098,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
|
4030
4098
|
case GGML_OP_LEAKY_RELU:
|
|
4031
4099
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
4032
4100
|
case GGML_OP_RWKV_WKV6:
|
|
4101
|
+
case GGML_OP_RWKV_WKV7:
|
|
4033
4102
|
case GGML_OP_GATED_LINEAR_ATTN:
|
|
4034
4103
|
return true;
|
|
4035
4104
|
default:
|
|
@@ -495,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
|
|
495
495
|
cgh.parallel_for(
|
|
496
496
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
497
497
|
[=](sycl::nd_item<3> item_ct1)
|
|
498
|
-
[[
|
|
498
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
499
499
|
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
|
|
500
500
|
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
|
501
501
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -519,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|
|
519
519
|
cgh.parallel_for(
|
|
520
520
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
521
521
|
[=](sycl::nd_item<3> item_ct1)
|
|
522
|
-
[[
|
|
522
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
523
523
|
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
|
|
524
524
|
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
|
525
525
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -543,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|
|
543
543
|
cgh.parallel_for(
|
|
544
544
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
545
545
|
[=](sycl::nd_item<3> item_ct1)
|
|
546
|
-
[[
|
|
546
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
547
547
|
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
|
|
548
548
|
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
|
549
549
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -567,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|
|
567
567
|
cgh.parallel_for(
|
|
568
568
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
569
569
|
[=](sycl::nd_item<3> item_ct1)
|
|
570
|
-
[[
|
|
570
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
571
571
|
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
|
|
572
572
|
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
|
573
573
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -591,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|
|
591
591
|
cgh.parallel_for(
|
|
592
592
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
593
593
|
[=](sycl::nd_item<3> item_ct1)
|
|
594
|
-
[[
|
|
594
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
595
595
|
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
|
|
596
596
|
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
|
597
597
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -615,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
615
615
|
cgh.parallel_for(
|
|
616
616
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
617
617
|
[=](sycl::nd_item<3> item_ct1)
|
|
618
|
-
[[
|
|
618
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
619
619
|
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
|
|
620
620
|
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
|
621
621
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -639,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
639
639
|
cgh.parallel_for(
|
|
640
640
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
641
641
|
[=](sycl::nd_item<3> item_ct1)
|
|
642
|
-
[[
|
|
642
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
643
643
|
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
|
|
644
644
|
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
|
645
645
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -663,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
663
663
|
cgh.parallel_for(
|
|
664
664
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
665
665
|
[=](sycl::nd_item<3> item_ct1)
|
|
666
|
-
[[
|
|
666
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
667
667
|
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
|
|
668
668
|
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
|
669
669
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -687,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
687
687
|
cgh.parallel_for(
|
|
688
688
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
689
689
|
[=](sycl::nd_item<3> item_ct1)
|
|
690
|
-
[[
|
|
690
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
691
691
|
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
|
|
692
692
|
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
|
693
693
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -711,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|
|
711
711
|
cgh.parallel_for(
|
|
712
712
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
713
713
|
[=](sycl::nd_item<3> item_ct1)
|
|
714
|
-
[[
|
|
714
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
715
715
|
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
|
|
716
716
|
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
|
717
717
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
@@ -734,7 +734,7 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
|
734
734
|
cgh.parallel_for(
|
|
735
735
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
736
736
|
[=](sycl::nd_item<3> item_ct1)
|
|
737
|
-
[[
|
|
737
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
738
738
|
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
|
|
739
739
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
740
740
|
});
|
|
@@ -755,7 +755,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
|
|
|
755
755
|
cgh.parallel_for(
|
|
756
756
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
757
757
|
[=](sycl::nd_item<3> item_ct1)
|
|
758
|
-
[[
|
|
758
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
759
759
|
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
|
|
760
760
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
761
761
|
});
|
|
@@ -777,7 +777,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
|
|
|
777
777
|
cgh.parallel_for(
|
|
778
778
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
779
779
|
[=](sycl::nd_item<3> item_ct1)
|
|
780
|
-
[[
|
|
780
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
781
781
|
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
|
|
782
782
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
783
783
|
});
|
|
@@ -799,7 +799,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
|
|
799
799
|
cgh.parallel_for(
|
|
800
800
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
801
801
|
[=](sycl::nd_item<3> item_ct1)
|
|
802
|
-
[[
|
|
802
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
803
803
|
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
|
|
804
804
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
805
805
|
});
|
|
@@ -821,7 +821,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
|
|
821
821
|
cgh.parallel_for(
|
|
822
822
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
823
823
|
[=](sycl::nd_item<3> item_ct1)
|
|
824
|
-
[[
|
|
824
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
825
825
|
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
|
|
826
826
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
827
827
|
});
|
|
@@ -843,7 +843,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
|
|
843
843
|
cgh.parallel_for(
|
|
844
844
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
845
845
|
[=](sycl::nd_item<3> item_ct1)
|
|
846
|
-
[[
|
|
846
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
847
847
|
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
|
848
848
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
849
849
|
});
|
|
@@ -864,7 +864,7 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
|
|
|
864
864
|
cgh.parallel_for(
|
|
865
865
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
866
866
|
[=](sycl::nd_item<3> item_ct1)
|
|
867
|
-
[[
|
|
867
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
868
868
|
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
|
869
869
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
870
870
|
});
|
|
@@ -886,7 +886,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
|
|
|
886
886
|
cgh.parallel_for(
|
|
887
887
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
888
888
|
[=](sycl::nd_item<3> item_ct1)
|
|
889
|
-
[[
|
|
889
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
890
890
|
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
|
|
891
891
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
892
892
|
});
|
|
@@ -908,7 +908,7 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
|
|
|
908
908
|
cgh.parallel_for(
|
|
909
909
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
910
910
|
[=](sycl::nd_item<3> item_ct1)
|
|
911
|
-
[[
|
|
911
|
+
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
|
912
912
|
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
|
|
913
913
|
vx, vy, dst, ncols, nrows, item_ct1);
|
|
914
914
|
});
|
|
@@ -1003,7 +1003,6 @@ void ggml_sycl_op_mul_mat_vec_q(
|
|
|
1003
1003
|
break;
|
|
1004
1004
|
default:
|
|
1005
1005
|
GGML_ABORT("fatal error");
|
|
1006
|
-
break;
|
|
1007
1006
|
}
|
|
1008
1007
|
}
|
|
1009
1008
|
GGML_UNUSED(src1);
|