llama_cpp 0.12.7 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-sycl.cpp

@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){

 size_t total_elements = ggml_nelements(src);

-const bool src_on_device = src->backend ==
+const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
 float *src_data =NULL;
 if(src_on_device) {
 ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
 int ixj = col ^ j;
 if (ixj > col) {
 if ((col & k) == 0) {
-if (order ==
+if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
 swap(dst_row[col], dst_row[ixj]);
 }
 } else {
-if (order ==
+if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
 swap(dst_row[col], dst_row[ixj]);
 }
 }
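The compare-swap above drives its direction from the sort-order enum. An illustrative host-side sketch of the rule used in the `(col & k) == 0` branch, assuming only the `GGML_SORT_ORDER_ASC` / `GGML_SORT_ORDER_DESC` values that appear in the added lines (the `else` branch simply flips the comparison):

```cpp
// Host-side sketch of the compare-swap rule from k_argsort_f32_i32 above.
// The enum here is a local stand-in; only the two order values from the diff are assumed.
#include <utility>

enum sort_order { GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC };

static void compare_swap(const float * x_row, int * dst_row, int col, int ixj, sort_order order) {
    const bool wrong_order = order == GGML_SORT_ORDER_ASC
        ? x_row[dst_row[col]] > x_row[dst_row[ixj]]
        : x_row[dst_row[col]] < x_row[dst_row[ixj]];
    if (wrong_order) {
        std::swap(dst_row[col], dst_row[ixj]);
    }
}
```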
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
 dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }

-
-
+
+template <bool vals_smem, int ncols_template, int block_size_template>
+static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+const int nrows_y, const float scale, const float max_bias, const float m0,
+const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
+const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
+
 const int tid = item_ct1.get_local_id(2);
 const int rowx = item_ct1.get_group(2);
 const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension

-const int block_size = item_ct1.get_local_range(2);
+const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;

 const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
 const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

+float slope = 0.0f;
+
+// ALiBi
+if (max_bias > 0.0f) {
+const uint32_t h = rowx/nrows_y; // head index
+
+const float base = h < n_head_log2 ? m0 : m1;
+const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+slope = sycl::pow(base, float(exp));
+}
+
+float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
 float max_val = -INFINITY;

-for (int
+for (int col0 = 0; col0 < ncols; col0 += block_size) {
+const int col = col0 + tid;
+
+if (ncols_template == 0 && col >= ncols) {
+break;
+}
+
 const int ix = rowx*ncols + col;
 const int iy = rowy*ncols + col;
-
+
+const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+
+vals[col] = val;
+max_val = sycl::max(max_val, val);
 }

 // find the max value in the block
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
 if (warp_id == 0) {
 buf[lane_id] = -INFINITY;
 }
-
-DPCT1118:12: SYCL group functions and algorithms must be encountered in
-converged control flow. You may need to adjust the code.
-*/
-/*
-DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
-sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-better performance if there is no access to global memory.
-*/
-item_ct1.barrier();
+item_ct1.barrier(sycl::access::fence_space::local_space);

 if (lane_id == 0) {
 buf[warp_id] = max_val;
 }
-
-DPCT1118:13: SYCL group functions and algorithms must be encountered in
-converged control flow. You may need to adjust the code.
-*/
-/*
-DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
-sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-better performance if there is no access to global memory.
-*/
-item_ct1.barrier();
+item_ct1.barrier(sycl::access::fence_space::local_space);

 max_val = buf[lane_id];
 max_val = warp_reduce_max(max_val, item_ct1);
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in

 float tmp = 0.f;

-
-
-const int
-
-
+#pragma unroll
+for (int col0 = 0; col0 < ncols; col0 += block_size) {
+const int col = col0 + tid;
+if (ncols_template == 0 && col >= ncols) {
+break;
+}
+
+const float val = sycl::native::exp(vals[col] - max_val);
 tmp += val;
-
+vals[col] = val;
 }

 // find the sum of exps in the block
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
 if (warp_id == 0) {
 buf[lane_id] = 0.f;
 }
-
-DPCT1118:14: SYCL group functions and algorithms must be encountered in
-converged control flow. You may need to adjust the code.
-*/
-/*
-DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
-sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-better performance if there is no access to global memory.
-*/
-item_ct1.barrier();
+item_ct1.barrier(sycl::access::fence_space::local_space);

 if (lane_id == 0) {
 buf[warp_id] = tmp;
 }
-
-DPCT1118:15: SYCL group functions and algorithms must be encountered in
-converged control flow. You may need to adjust the code.
-*/
-/*
-DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
-sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-better performance if there is no access to global memory.
-*/
-item_ct1.barrier();
+item_ct1.barrier(sycl::access::fence_space::local_space);

 tmp = buf[lane_id];
 tmp = warp_reduce_sum(tmp, item_ct1);
 }

-const float
+const float inv_sum = 1.f / tmp;

-
-
-
+#pragma unroll
+for (int col0 = 0; col0 < ncols; col0 += block_size) {
+const int col = col0 + tid;
+
+if (ncols_template == 0 && col >= ncols) {
+return;
+}
+
+const int idst = rowx*ncols + col;
+dst[idst] = vals[col] * inv_sum;
 }
 }

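The rewritten soft_max_f32 kernel above folds ALiBi into the soft-max: each head gets a slope derived from m0, m1 and n_head_log2, and slope*pos[col] is added to the scaled, masked logits. An illustrative standalone sketch of that slope computation, using only the quantities visible in the added lines:

```cpp
// Host-side sketch of the per-head ALiBi slope computed in soft_max_f32 above.
// h is the head index (rowx / nrows_y in the kernel); m0, m1 and n_head_log2
// are the values the caller derives from max_bias.
#include <cmath>
#include <cstdint>

static float alibi_slope(uint32_t h, uint32_t n_head_log2, float m0, float m1) {
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
    return powf(base, (float) exp); // sycl::pow in the kernel
}
```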
@@ -10825,7 +10827,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,

 const sycl::range<3> block_dims(1, 1, ncols);
 const sycl::range<3> block_nums(1, nrows, 1);
-if (order ==
+if (order == GGML_SORT_ORDER_ASC) {
 /*
 DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
 the limit. To get the device limit, query
@@ -10834,9 +10836,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
 stream->parallel_for(
 sycl::nd_range<3>(block_nums * block_dims, block_dims),
 [=](sycl::nd_item<3> item_ct1) {
-k_argsort_f32_i32<
+k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
 });
-} else if (order ==
+} else if (order == GGML_SORT_ORDER_DESC) {
 /*
 DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
 the limit. To get the device limit, query
@@ -10845,7 +10847,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
 stream->parallel_for(
 sycl::nd_range<3>(block_nums * block_dims, block_dims),
 [=](sycl::nd_item<3> item_ct1) {
-k_argsort_f32_i32<
+k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
 });
 } else {
 GGML_ASSERT(false);
@@ -10867,37 +10869,98 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
 });
 }

-
-
-
-
-
-while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-const sycl::range<3> block_dims(1, 1, nth);
-const sycl::range<3> block_nums(1, 1, nrows_x);
-/*
-DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
-limit. To get the device limit, query info::device::max_work_group_size.
-Adjust the work-group size if needed.
-*/
+template <bool vals_smem, int ncols_template, int block_size_template>
+static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+const int nrows_y, const float scale, const float max_bias, const float m0,
+const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
+const size_t n_local_scratch, dpct::queue_ptr stream) {
 stream->submit([&](sycl::handler &cgh) {
-
-DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
-replaced with a value. Modify the code to use the original expression,
-provided in comments, if it is correct.
-*/
-sycl::local_accessor<float, 1> buf_acc_ct1(
-sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
+sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);

 cgh.parallel_for(
 sycl::nd_range<3>(block_nums * block_dims, block_dims),
 [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-soft_max_f32
-
+soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+nrows_y, scale, max_bias, m0,
+m1, n_head_log2, item_ct1,
+local_buf_acc.get_pointer());
 });
 });
 }

+static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+float * dst, const int ncols_x, const int nrows_x,
+const int nrows_y, const float scale, const float max_bias,
+dpct::queue_ptr stream) {
+int nth = WARP_SIZE;
+while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+const sycl::range<3> block_dims(1, 1, nth);
+const sycl::range<3> block_nums(1, 1, nrows_x);
+const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
+static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
+
+const uint32_t n_head_kv = nrows_x/nrows_y;
+const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
+if (n_local_scratch*sizeof(float) < local_mem_size) {
+switch (ncols_x) {
+case 32:
+soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+case 64:
+soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+case 128:
+soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+case 256:
+soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+case 512:
+soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+case 1024:
+soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+case 2048:
+soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+case 4096:
+soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+default:
+soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, n_local_scratch, stream);
+break;
+}
+} else {
+soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+max_bias, m0, m1, n_head_log2, block_nums,
+block_dims, WARP_SIZE, stream);
+}
+}
+
 template <typename T>
 static void im2col_sycl(const float *x, T *dst, int IW, int IH,
 int OW, int OH, int KW, int KH, int IC,
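The new soft_max_f32_sycl above picks a compile-time specialization from the runtime column count, and falls back to a generic, bounds-checked kernel in global memory when the shared-memory scratch would not fit. An illustrative, self-contained sketch of that dispatch pattern; kernel_stub and dispatch are hypothetical stand-ins for the submitter calls:

```cpp
// Minimal illustration of dispatching a runtime size to compile-time specializations,
// with <0, 0> meaning "sizes not known at compile time" (the bounds-checked path).
#include <cstdio>

template <int ncols_template, int block_size_template>
static void kernel_stub(int ncols) {
    // The real code submits soft_max_f32<vals_smem, ncols_template, block_size_template> here.
    std::printf("ncols=%d -> specialization <%d, %d>\n", ncols, ncols_template, block_size_template);
}

static void dispatch(int ncols) {
    switch (ncols) {
        case   32: kernel_stub<  32,   32>(ncols); break;
        case   64: kernel_stub<  64,   64>(ncols); break;
        case 1024: kernel_stub<1024, 1024>(ncols); break;
        case 4096: kernel_stub<4096, 1024>(ncols); break; // block size capped at 1024
        default:   kernel_stub<   0,    0>(ncols); break; // generic fallback
    }
}
```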
@@ -11407,12 +11470,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,

 dpct::memcpy_direction kind;
 char * src_ptr;
-if (src->backend ==
+if (src->backend == GGML_BACKEND_TYPE_CPU) {
 kind = dpct::host_to_device;
 src_ptr = (char *) src->data;
-// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d
-} else if (src->backend ==
-GGML_ASSERT(src->backend !=
+// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
+} else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
+GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
 kind = dpct::device_to_device;
 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
 int id;
@@ -11846,7 +11909,7 @@ inline void ggml_sycl_op_mul_mat_q(

 // the main device has a larger memory buffer to hold the results from all GPUs
 // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-const int64_t nrows_dst = dst->backend ==
+const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;

 switch (src0->type) {
 case GGML_TYPE_Q4_0:
@@ -12119,7 +12182,7 @@ inline void ggml_sycl_op_mul_mat_sycl(

 // the main device has a larger memory buffer to hold the results from all GPUs
 // ldc == nrows of the matrix that cuBLAS writes into
-int ldc = dst->backend ==
+int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;

 #ifdef GGML_SYCL_F16
 bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -12435,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,

 const int64_t ne00 = src0->ne[0];
 const int64_t nrows_x = ggml_nrows(src0);
-const int64_t nrows_y =
+const int64_t nrows_y = src0->ne[1];

 float scale = 1.0f;
-
+float max_bias = 0.0f;

-
+memcpy(&scale, dst->op_params + 0, sizeof(float));
+memcpy(&max_bias, dst->op_params + 1, sizeof(float));

-
+// positions tensor
+float * src2_dd = nullptr;
+sycl_pool_alloc<float> src2_f;
+
+ggml_tensor * src2 = dst->src[2];
+const bool use_src2 = src2 != nullptr;
+
+if (use_src2) {
+const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
+
+if (src2_on_device) {
+ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
+src2_dd = (float *) src2_extra->data_device[g_main_device];
+} else {
+src2_dd = src2_f.alloc(ggml_nelements(src2));
+SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
+}
+}
+
+soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+nrows_x, nrows_y, scale, max_bias, main_stream);
 }

 inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
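The reworked ggml_sycl_op_soft_max above now unpacks both scale and max_bias from the op's parameter block and forwards an optional positions tensor (dst->src[2]) for ALiBi. A small sketch of the parameter unpacking; params_view is a hypothetical stand-in for the tensor's op_params field:

```cpp
// Sketch of the parameter unpacking done in ggml_sycl_op_soft_max above:
// scale lives in op_params slot 0, max_bias in slot 1, both reinterpreted as float.
#include <cstdint>
#include <cstring>

struct params_view { int32_t op_params[16]; }; // hypothetical stand-in for ggml_tensor::op_params

static void read_soft_max_params(const params_view * dst, float * scale, float * max_bias) {
    std::memcpy(scale,    dst->op_params + 0, sizeof(float));
    std::memcpy(max_bias, dst->op_params + 1, sizeof(float));
}
```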
@@ -12501,16 +12585,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
 const bool use_src1 = src1 != nullptr;
 const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

-GGML_ASSERT(!use_src1 || src1->backend !=
-GGML_ASSERT( dst->backend !=
+GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

 ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
 ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-const bool src0_on_device = src0->backend ==
-const bool src1_on_device = use_src1 && src1->backend ==
-const bool dst_on_device = dst->backend ==
+const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
+const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;

 // dd = data device
 float * src0_ddf = nullptr;
@@ -12565,7 +12649,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
 main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
 }

-if (dst->backend ==
+if (dst->backend == GGML_BACKEND_TYPE_CPU) {
 SYCL_CHECK(CHECK_TRY_ERROR(
 dpct::get_current_device().queues_wait_and_throw()));
 }
@@ -12640,8 +12724,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
 const int nb2 = dst->nb[2];
 const int nb3 = dst->nb[3];

-GGML_ASSERT(dst->backend !=
-GGML_ASSERT(src1->backend !=
+GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

 GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);

@@ -12656,13 +12741,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
 ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-const bool src0_on_device = src0->backend ==
+const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
 const bool src0_is_contiguous = ggml_is_contiguous(src0);
 const bool src1_is_contiguous = ggml_is_contiguous(src1);

 int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

-const bool split = src0->backend ==
+const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
 GGML_ASSERT(!(split && ne02 > 1));
 GGML_ASSERT(!(split && ne03 > 1));
 GGML_ASSERT(!(split && ne02 < ne12));
@@ -12717,8 +12802,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,

 used_devices++;

-const bool src1_on_device = src1->backend ==
-const bool dst_on_device = dst->backend ==
+const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
+const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;

 ggml_sycl_set_device(get_device_id_by_index(id));
 const dpct::queue_ptr stream = g_syclStreams[id][0];
@@ -12782,8 +12867,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
 continue;
 }

-const bool src1_on_device = src1->backend ==
-const bool dst_on_device = dst->backend ==
+const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
+const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
 const int64_t row_diff = row_high[id] - row_low[id];

 ggml_sycl_set_device(get_device_id_by_index(id));
@@ -12809,12 +12894,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,

 // the main device memory buffer can be on VRAM scratch, with space for all partial results
 // in that case an offset on dst_ddf_i is needed
-if (dst->backend ==
+if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
 dst_dd_i += row_low[id]; // offset is 0 if no tensor split
 }

 // copy src0, src1 to device if necessary
-if (src1->backend ==
+if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
 if (id != g_main_device_index) {
 if (convert_src1_to_q8_1) {
 char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
@@ -12830,14 +12915,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
 src1_ncols * ne10 * sizeof(float))));
 }
 }
-} else if (src1->backend ==
+} else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
 SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
 src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
 } else {
 GGML_ASSERT(false);
 }

-if (convert_src1_to_q8_1 && (src1->backend ==
+if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
 quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
 /*
 DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -12867,10 +12952,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
 if (!dst_on_device) {
 void * dst_off_device;
 dpct::memcpy_direction kind;
-if (dst->backend ==
+if (dst->backend == GGML_BACKEND_TYPE_CPU) {
 dst_off_device = dst->data;
 kind = dpct::device_to_host;
-} else if (dst->backend ==
+} else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
 dst_off_device = dst_extra->data_device[g_main_device_index];
 kind = dpct::device_to_device;
 } else {
@@ -12954,7 +13039,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
 }
 }

-if (dst->backend ==
+if (dst->backend == GGML_BACKEND_TYPE_CPU) {
 SYCL_CHECK(ggml_sycl_set_device(g_main_device));
 SYCL_CHECK(CHECK_TRY_ERROR(
 dpct::get_current_device().queues_wait_and_throw()));
@@ -13091,7 +13176,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
 const ggml_tensor *src1,
 ggml_tensor *dst) try {
 GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-GGML_ASSERT(src0->backend !=
+GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
 GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
 GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -13129,7 +13214,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
 GGML_ASSERT(!ggml_is_transposed(src0));
 GGML_ASSERT(!ggml_is_transposed(src1));
 GGML_ASSERT(!ggml_is_permuted(src0));
-GGML_ASSERT(src0->backend !=
+GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 GGML_ASSERT(src0->type == GGML_TYPE_F16);
 GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -13185,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
 int64_t i03 = i13 / r3;
 int64_t i02 = i12 / r2;

-ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02
-ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12
-ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2
+ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }

-static void
-
-
+static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
+const ggml_tensor *src1,
+ggml_tensor *dst) try {
 GGML_ASSERT(!ggml_is_transposed(src0));
 GGML_ASSERT(!ggml_is_transposed(src1));

-GGML_ASSERT(src0->backend !=
+GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 GGML_ASSERT(src0->type == GGML_TYPE_F16);
-GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);

-
-
-GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
-
-GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
+GGML_TENSOR_BINARY_OP_LOCALS

-const int64_t
-const int64_t ne = ggml_nelements(dst);
+const int64_t ne_dst = ggml_nelements(dst);

 SYCL_CHECK(ggml_sycl_set_device(g_main_device));
 dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@@ -13228,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
 float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];

 // convert src1 to fp16
-
-
-
-
-
+sycl_pool_alloc<sycl::half> src1_f16_alloc;
+if (src1->type != GGML_TYPE_F16) {
+const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+const int64_t ne_src1 = ggml_nelements(src1);
+src1_f16_alloc.alloc(ne_src1);
+GGML_ASSERT(to_fp16_sycl != nullptr);
+to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+}
+sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
+: src1_f16_alloc.get();

 sycl_pool_alloc<sycl::half> dst_f16;
 char * dst_t;
@@ -13253,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
 const void * alpha = &alpha_f16;
 const void * beta = &beta_f16;

-
-
-
-nbd2 /= sizeof(float) / sizeof(sycl::half);
-nbd3 /= sizeof(float) / sizeof(sycl::half);
-} else {
-dst_t = (char *) dst_ddf;
-
-cu_compute_type = dpct::library_data_t::real_float;
-cu_data_type = dpct::library_data_t::real_float;
+// TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
+// once oneMKL open source supports half, half, float, float: datatypes
+dst_t = (char *) dst_f16.alloc(ne_dst);

-
-
-}
+nbd2 /= sizeof(float) / sizeof(sycl::half);
+nbd3 /= sizeof(float) / sizeof(sycl::half);

 GGML_ASSERT(ne12 % ne02 == 0);
 GGML_ASSERT(ne13 % ne03 == 0);
@@ -13302,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
 *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
 oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
 (const char *)src0_as_f16, dpct::library_data_t::real_half,
-nb01 /
-(const char *)
-nb11 /
-(char *)dst_t, cu_data_type, ne01,
+nb01 / nb00, nb02 / nb00,
+(const char *)src1_f16, dpct::library_data_t::real_half,
+nb11 / nb10, nb12 / nb10, beta,
+(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
 ne12 * ne13, cu_compute_type)));
 } else {
 // use syclGemmBatchedEx
@@ -13325,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
 {sycl::aspect::fp16});

 main_stream->submit([&](sycl::handler &cgh) {
-const
-
-
-
+const void **ptrs_src_get = ptrs_src.get();
+void **ptrs_dst_get = ptrs_dst.get();
+size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
+size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
 cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
 [=](sycl::nd_item<3> item_ct1) {
 k_compute_batched_ptrs(
-src0_as_f16,
-dst_t,
-
-nb02, nb03,
-r3, item_ct1);
+src0_as_f16, src1_f16,
+dst_t, ptrs_src_get,
+ptrs_dst_get, ne12, ne13, ne23,
+nb02, nb03, nb12_scaled, nb13_scaled,
+nbd2, nbd3, r2, r3, item_ct1);
 });
 });
 }
-/*
-DPCT1010:95: SYCL uses exceptions to report errors and does not use the
-error codes. The call was replaced with 0. You need to rewrite this
-code.
-*/
-SYCL_CHECK(0);
-
 SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
 *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
 oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
 (const void **)(ptrs_src.get() + 0 * ne23),
-dpct::library_data_t::real_half, nb01 /
+dpct::library_data_t::real_half, nb01 / nb00,
 (const void **)(ptrs_src.get() + 1 * ne23),
-dpct::library_data_t::real_half, nb11 /
+dpct::library_data_t::real_half, nb11 / nb10, beta,
 (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
 cu_compute_type)));
 }
 #endif

-
-
-to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream);
-}
+const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
 }
 catch (sycl::exception const &exc) {
 std::cerr << exc.what() << "Exception caught at file:" << __FILE__
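A detail of the batched path above: when src1 arrives as f32 it is converted to f16, so the byte strides passed to k_compute_batched_ptrs are halved (nb12_scaled, nb13_scaled). An illustrative sketch of that adjustment with a hypothetical helper name:

```cpp
// Hypothetical helper mirroring the nb12_scaled / nb13_scaled computation above:
// the stride is kept as-is for f16 inputs and halved after an f32 -> f16 conversion.
#include <cstddef>

static size_t scaled_src1_stride(bool src1_is_f16, size_t nb) {
    return src1_is_f16 ? nb : nb / 2; // 2 == sizeof(float) / sizeof(sycl::half)
}
```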
@@ -13372,11 +13437,11 @@ catch (sycl::exception const &exc) {

 static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 const bool all_on_device =
-(src0->backend ==
-(src1->backend ==
-( dst->backend ==
+(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
+(src1->backend == GGML_BACKEND_TYPE_GPU) &&
+( dst->backend == GGML_BACKEND_TYPE_GPU);

-const bool split = src0->backend ==
+const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;

 int64_t min_compute_capability = INT_MAX;
 for (int64_t id = 0; id < g_device_count; ++id) {
@@ -13407,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 // KQV single-batch
 // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
 ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
-} else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 &&
+} else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
 // KQ + KQV multi-batch
-// GGML_SYCL_DEBUG("
-
+// GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
+ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
 } else if (src0->type == GGML_TYPE_F32) {
 // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
 ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
@@ -13505,7 +13570,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
 GGML_ASSERT(!ggml_is_transposed(src00));
 GGML_ASSERT(!ggml_is_transposed(src1));

-GGML_ASSERT(src00->backend !=
+GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
 GGML_ASSERT(src1->type == GGML_TYPE_F32);

 GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
@@ -13643,7 +13708,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

 const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];

-if (ids->backend ==
+if (ids->backend == GGML_BACKEND_TYPE_GPU) {
 const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
 SYCL_CHECK(CHECK_TRY_ERROR(
 stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
@@ -13661,20 +13726,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 ggml_tensor src1_row = *src1;
 ggml_tensor dst_row = *dst;

-src1_row.backend =
-dst_row.backend =
+src1_row.backend = GGML_BACKEND_TYPE_GPU;
+dst_row.backend = GGML_BACKEND_TYPE_GPU;

 src1_row.extra = &src1_row_extra;
 dst_row.extra = &dst_row_extra;

-char * src1_original = src1->backend ==
+char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
 (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
-char * dst_original = dst->backend ==
+char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
 (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];

 if (src1->ne[1] == 1) {
-GGML_ASSERT(src1->backend ==
-GGML_ASSERT(dst->backend ==
+GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
+GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

 for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
 //int32_t row_id;
@@ -13756,7 +13821,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 }
 }

-if (dst->backend ==
+if (dst->backend == GGML_BACKEND_TYPE_CPU) {
 SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
 }
 }
@@ -13779,8 +13844,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
 const int64_t ne = ggml_nelements(src0);
 GGML_ASSERT(ne == ggml_nelements(src1));

-GGML_ASSERT(src0->backend ==
-GGML_ASSERT(src1->backend ==
+GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);

 GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
 GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -13887,17 +13952,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
 memset(extra, 0, sizeof(*extra));

 for (int64_t id = 0; id < g_device_count; ++id) {
-if (backend ==
+if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
 continue;
 }
 ggml_sycl_set_device(get_device_id_by_index(id));
 const dpct::queue_ptr stream = g_syclStreams[id][0];

 int64_t row_low, row_high;
-if (backend ==
+if (backend == GGML_BACKEND_TYPE_GPU) {
 row_low = 0;
 row_high = nrows;
-} else if (backend ==
+} else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
 const int64_t rounding = get_row_rounding(tensor->type);

 row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
@@ -13946,7 +14011,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {

 extra->data_device[id] = buf;

-if (backend ==
+if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
 for (int64_t is = 0; is < MAX_STREAMS; ++is) {
 SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
 new sycl::event()));
@@ -13963,7 +14028,7 @@ catch (sycl::exception const &exc) {
 }

 void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
-if (!tensor || !tensor->extra || (tensor->backend !=
+if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
 return;
 }

@@ -14016,15 +14081,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
 return;
 }

-tensor->backend =
+tensor->backend = GGML_BACKEND_TYPE_GPU;

-if (tensor->src[0] != nullptr && tensor->src[0]->backend ==
+if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
 const ggml_op src0_op = tensor->src[0]->op;
 if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
 ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
 }
 }
-if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend ==
+if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
 ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
 }

@@ -14042,7 +14107,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
 SYCL_CHECK(ggml_sycl_set_device(g_main_device));
 const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];

-if (inplace && (tensor->src[0]->backend ==
+if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
 ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
 char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
 size_t offset = 0;
@@ -14111,7 +14176,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,

 const bool inplace = tensor->view_src != nullptr;

-if (inplace && (tensor->view_src->backend ==
+if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
 ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
 char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
 size_t view_offset = 0;
@@ -14132,7 +14197,7 @@ catch (sycl::exception const &exc) {
 }

 void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
-GGML_ASSERT(tensor->backend ==
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
 GGML_ASSERT(ggml_is_contiguous(tensor));

 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -14219,9 +14284,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
 if (!g_sycl_loaded) return false;

 ggml_sycl_func_t func;
-const bool any_on_device = tensor->backend ==
-|| (tensor->src[0] != nullptr && (tensor->src[0]->backend ==
-|| (tensor->src[1] != nullptr && tensor->src[1]->backend ==
+const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

 if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
 return false;
@@ -14359,14 +14424,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
 return false;
 }

-if (tensor->src[0] != nullptr && tensor->src[0]->backend ==
+if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
 ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
 }

 if (params->ith != 0) {
 return true;
 }
-if (params->type ==
+if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
 return true;
 }
 func(tensor->src[0], tensor->src[1], tensor);
@@ -14517,7 +14582,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,

 extra->data_device[ctx->device] = tensor->data;

-tensor->backend =
+tensor->backend = GGML_BACKEND_TYPE_GPU;
 tensor->extra = extra;

 if (ggml_is_quantized(tensor->type)) {
@@ -14548,7 +14613,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
 ggml_tensor *tensor,
 const void *data, size_t offset,
 size_t size) try {
-GGML_ASSERT(tensor->backend ==
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

 ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

@@ -14573,7 +14638,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
 const ggml_tensor *tensor,
 void *data, size_t offset,
 size_t size) try {
-GGML_ASSERT(tensor->backend ==
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

 ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

@@ -14809,7 +14874,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
 ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

 GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
-GGML_ASSERT(tensor->backend ==
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

 SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
 (char *)tensor->data + offset, data, size)));
@@ -14827,7 +14892,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
 ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

 GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
-GGML_ASSERT(tensor->backend ==
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

 SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
 data, (const char *)tensor->data + offset, size)));
@@ -14880,7 +14945,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
 ggml_sycl_set_main_device(sycl_ctx->device);

 ggml_compute_params params = {};
-params.type =
+params.type = GGML_TASK_TYPE_COMPUTE;
 params.ith = 0;
 for (int i = 0; i < cgraph->n_nodes; i++) {
 ggml_tensor * node = cgraph->nodes[i];
@@ -14888,13 +14953,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
 if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
 continue;

-assert(node->backend ==
+assert(node->backend == GGML_BACKEND_TYPE_GPU);
 assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
 assert(node->extra != nullptr);

 for (int j = 0; j < GGML_MAX_SRC; j++) {
 if (node->src[j] != nullptr) {
-assert(node->src[j]->backend ==
+assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
 assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
 assert(node->src[j]->extra != nullptr);
 }
@@ -15078,6 +15143,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
 /* .supports_op = */ ggml_backend_sycl_supports_op,
 };

+static ggml_guid_t ggml_backend_sycl_guid() {
+static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
+return &guid;
+}
+
 ggml_backend_t ggml_backend_sycl_init(int device) {
 ggml_init_sycl(); // TODO: remove from ggml.c

@@ -15095,6 +15165,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
 };

 ggml_backend_t sycl_backend = new ggml_backend {
+/* .guid = */ ggml_backend_sycl_guid(),
 /* .interface = */ ggml_backend_sycl_interface,
 /* .context = */ ctx
 };
@@ -15103,7 +15174,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
 }

 bool ggml_backend_is_sycl(ggml_backend_t backend) {
-return backend->
+return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
 }

 static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
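The backend is now identified by a 16-byte GUID rather than by its interface pointer (see ggml_backend_sycl_guid and ggml_backend_is_sycl above). An illustrative sketch of that kind of identity check, assuming the guid is raw bytes compared byte-wise, which is what ggml_guid_matches appears to do:

```cpp
// Sketch of a GUID-based identity check like the ggml_backend_is_sycl change above.
// example_guid is a local stand-in for ggml_guid (assumed to be 16 raw bytes).
#include <cstdint>
#include <cstring>

typedef uint8_t example_guid[16];

static bool example_guid_matches(const example_guid a, const example_guid b) {
    return std::memcmp(a, b, sizeof(example_guid)) == 0;
}
```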
|