llama_cpp 0.12.7 → 0.13.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-sycl.cpp

@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){

     size_t total_elements = ggml_nelements(src);

-    const bool src_on_device = src->backend ==
+    const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     float *src_data =NULL;
     if(src_on_device) {
         ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;

@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
     int ixj = col ^ j;
     if (ixj > col) {
         if ((col & k) == 0) {
-            if (order ==
+            if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
                 swap(dst_row[col], dst_row[ixj]);
             }
         } else {
-            if (order ==
+            if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
                 swap(dst_row[col], dst_row[ixj]);
             }
         }
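The two argsort hunks above adopt the renamed GGML_SORT_ORDER_* values while keeping the bitonic compare-and-swap shape of the kernel. A minimal, self-contained sketch of that comparator (a hypothetical host-side helper for illustration, not code from the patch):

    #include <utility>

    enum ggml_sort_order { GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC };

    // One compare-and-swap step of the bitonic argsort: for the index pair
    // (col, ixj), keep dst_row ordered by the values in x_row, ascending or
    // descending depending on the compile-time order parameter.
    template <ggml_sort_order order>
    static void compare_swap(const float * x_row, int * dst_row, int col, int ixj) {
        const bool out_of_order = order == GGML_SORT_ORDER_ASC
            ? x_row[dst_row[col]] > x_row[dst_row[ixj]]
            : x_row[dst_row[col]] < x_row[dst_row[ixj]];
        if (out_of_order) {
            std::swap(dst_row[col], dst_row[ixj]);
        }
    }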
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }

-
-
+
+template <bool vals_smem, int ncols_template, int block_size_template>
+static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+                         const int nrows_y, const float scale, const float max_bias, const float m0,
+                         const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
+    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
+
     const int tid = item_ct1.get_local_id(2);
     const int rowx = item_ct1.get_group(2);
     const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension

-    const int block_size = item_ct1.get_local_range(2);
+    const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;

     const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
     const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

+    float slope = 0.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const uint32_t h = rowx/nrows_y; // head index
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = sycl::pow(base, float(exp));
+    }
+
+    float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
     float max_val = -INFINITY;

-    for (int
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            break;
+        }
+
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;
-
+
+        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+
+        vals[col] = val;
+        max_val = sycl::max(max_val, val);
     }

     // find the max value in the block

@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
     if (warp_id == 0) {
         buf[lane_id] = -INFINITY;
     }
-
-    DPCT1118:12: SYCL group functions and algorithms must be encountered in
-    converged control flow. You may need to adjust the code.
-    */
-    /*
-    DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-    better performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
+    item_ct1.barrier(sycl::access::fence_space::local_space);

     if (lane_id == 0) {
         buf[warp_id] = max_val;
     }
-
-    DPCT1118:13: SYCL group functions and algorithms must be encountered in
-    converged control flow. You may need to adjust the code.
-    */
-    /*
-    DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-    better performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
+    item_ct1.barrier(sycl::access::fence_space::local_space);

     max_val = buf[lane_id];
     max_val = warp_reduce_max(max_val, item_ct1);

@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in

     float tmp = 0.f;

-
-
-    const int
-
-
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+        if (ncols_template == 0 && col >= ncols) {
+            break;
+        }
+
+        const float val = sycl::native::exp(vals[col] - max_val);
         tmp += val;
-
+        vals[col] = val;
     }

     // find the sum of exps in the block

@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
     if (warp_id == 0) {
         buf[lane_id] = 0.f;
     }
-
-    DPCT1118:14: SYCL group functions and algorithms must be encountered in
-    converged control flow. You may need to adjust the code.
-    */
-    /*
-    DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-    better performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
+    item_ct1.barrier(sycl::access::fence_space::local_space);

     if (lane_id == 0) {
         buf[warp_id] = tmp;
     }
-
-    DPCT1118:15: SYCL group functions and algorithms must be encountered in
-    converged control flow. You may need to adjust the code.
-    */
-    /*
-    DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-    better performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
+    item_ct1.barrier(sycl::access::fence_space::local_space);

     tmp = buf[lane_id];
     tmp = warp_reduce_sum(tmp, item_ct1);
     }

-    const float
+    const float inv_sum = 1.f / tmp;

-
-
-
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            return;
+        }
+
+        const int idst = rowx*ncols + col;
+        dst[idst] = vals[col] * inv_sum;
     }
 }

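The rewritten soft_max_f32 kernel above adds ALiBi support: the row's head index selects a per-head slope that scales the position bias before the usual max / exp / normalize passes. A small host-side sketch of just that slope math (self-contained illustration, not the kernel itself; m0/m1 are derived from max_bias exactly as in soft_max_f32_sycl further below):

    #include <cmath>
    #include <cstdint>

    // Per-head ALiBi slope as in the new kernel: heads below n_head_log2 use
    // powers of m0, the remaining heads use odd powers of m1.
    static float alibi_slope(uint32_t h, uint32_t n_head_log2, float m0, float m1) {
        const float base = h < n_head_log2 ? m0 : m1;
        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
        return std::pow(base, (float) exp);
    }

    // m0 = 2^(-max_bias       / n_head_log2)
    // m1 = 2^(-max_bias / 2.0 / n_head_log2)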
@@ -10825,7 +10827,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,

     const sycl::range<3> block_dims(1, 1, ncols);
     const sycl::range<3> block_nums(1, nrows, 1);
-    if (order ==
+    if (order == GGML_SORT_ORDER_ASC) {
         /*
         DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query

@@ -10834,9 +10836,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<
+                k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
             });
-    } else if (order ==
+    } else if (order == GGML_SORT_ORDER_DESC) {
         /*
         DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query

@@ -10845,7 +10847,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<
+                k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
             });
     } else {
         GGML_ASSERT(false);

@@ -10867,37 +10869,98 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
         });
 }

-
-
-
-
-
-    while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const sycl::range<3> block_dims(1, 1, nth);
-    const sycl::range<3> block_nums(1, 1, nrows_x);
-    /*
-    DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
-    limit. To get the device limit, query info::device::max_work_group_size.
-    Adjust the work-group size if needed.
-    */
+template <bool vals_smem, int ncols_template, int block_size_template>
+static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+                                   const int nrows_y, const float scale, const float max_bias, const float m0,
+                                   const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
+                                   const size_t n_local_scratch, dpct::queue_ptr stream) {
     stream->submit([&](sycl::handler &cgh) {
-
-        DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
-        replaced with a value. Modify the code to use the original expression,
-        provided in comments, if it is correct.
-        */
-        sycl::local_accessor<float, 1> buf_acc_ct1(
-            sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
+        sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);

         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                soft_max_f32
-
+                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+                                                                             nrows_y, scale, max_bias, m0,
+                                                                             m1, n_head_log2, item_ct1,
+                                                                             local_buf_acc.get_pointer());
             });
     });
 }

+static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+                              float * dst, const int ncols_x, const int nrows_x,
+                              const int nrows_y, const float scale, const float max_bias,
+                              dpct::queue_ptr stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const sycl::range<3> block_dims(1, 1, nth);
+    const sycl::range<3> block_nums(1, 1, nrows_x);
+    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
+    static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
+
+    const uint32_t n_head_kv = nrows_x/nrows_y;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+    const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
+    if (n_local_scratch*sizeof(float) < local_mem_size) {
+        switch (ncols_x) {
+            case 32:
+                soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                     max_bias, m0, m1, n_head_log2, block_nums,
+                                                     block_dims, n_local_scratch, stream);
+                break;
+            case 64:
+                soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                     max_bias, m0, m1, n_head_log2, block_nums,
+                                                     block_dims, n_local_scratch, stream);
+                break;
+            case 128:
+                soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                       max_bias, m0, m1, n_head_log2, block_nums,
+                                                       block_dims, n_local_scratch, stream);
+                break;
+            case 256:
+                soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                       max_bias, m0, m1, n_head_log2, block_nums,
+                                                       block_dims, n_local_scratch, stream);
+                break;
+            case 512:
+                soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                       max_bias, m0, m1, n_head_log2, block_nums,
+                                                       block_dims, n_local_scratch, stream);
+                break;
+            case 1024:
+                soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                         max_bias, m0, m1, n_head_log2, block_nums,
+                                                         block_dims, n_local_scratch, stream);
+                break;
+            case 2048:
+                soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                         max_bias, m0, m1, n_head_log2, block_nums,
+                                                         block_dims, n_local_scratch, stream);
+                break;
+            case 4096:
+                soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                         max_bias, m0, m1, n_head_log2, block_nums,
+                                                         block_dims, n_local_scratch, stream);
+                break;
+            default:
+                soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                                   max_bias, m0, m1, n_head_log2, block_nums,
+                                                   block_dims, n_local_scratch, stream);
+                break;
+        }
+    } else {
+        soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                            max_bias, m0, m1, n_head_log2, block_nums,
+                                            block_dims, WARP_SIZE, stream);
+    }
+}
+
 template <typename T>
 static void im2col_sycl(const float *x, T *dst, int IW, int IH,
                         int OW, int OH, int KW, int KH, int IC,
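soft_max_f32_submitter makes the column count and block size template parameters of the kernel launch; soft_max_f32_sycl then switches on ncols_x and only falls back to the generic <true, 0, 0> or global-memory <false, 0, 0> instantiation when no specialization fits or the row does not fit in local memory. The shape of that dispatch, reduced to a sketch with a hypothetical stand-in for the real submitter:

    #include <cstddef>

    // Placeholder for soft_max_f32_submitter<...>(...): only the selection
    // logic is illustrated, the SYCL submission is omitted.
    template <bool vals_smem, int ncols_template, int block_size_template>
    void launch_softmax() {}

    void dispatch_softmax(int ncols_x, size_t n_local_scratch, size_t local_mem_size) {
        if (n_local_scratch * sizeof(float) < local_mem_size) {
            switch (ncols_x) {                  // row cached in local memory
                case   32: launch_softmax<true,   32,   32>(); break;
                case 1024: launch_softmax<true, 1024, 1024>(); break;
                // ... 64/128/256/512/2048/4096 follow the same pattern ...
                default:   launch_softmax<true, 0, 0>();       break; // runtime ncols
            }
        } else {
            launch_softmax<false, 0, 0>();      // fall back to writing through dst
        }
    }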
@@ -11407,12 +11470,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,

     dpct::memcpy_direction kind;
     char * src_ptr;
-    if (src->backend ==
+    if (src->backend == GGML_BACKEND_TYPE_CPU) {
         kind = dpct::host_to_device;
         src_ptr = (char *) src->data;
-        // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d
-    } else if (src->backend ==
-        GGML_ASSERT(src->backend !=
+        // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
+    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = dpct::device_to_device;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;

@@ -11846,7 +11909,7 @@ inline void ggml_sycl_op_mul_mat_q(

     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend ==
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;

     switch (src0->type) {
         case GGML_TYPE_Q4_0:

@@ -12119,7 +12182,7 @@ inline void ggml_sycl_op_mul_mat_sycl(

     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend ==
+    int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;

 #ifdef GGML_SYCL_F16
     bool use_fp16 = true; // TODO(Yu) SYCL capability check

@@ -12435,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,

     const int64_t ne00 = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y =
+    const int64_t nrows_y = src0->ne[1];

     float scale = 1.0f;
-
+    float max_bias = 0.0f;

-
+    memcpy(&scale, dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, dst->op_params + 1, sizeof(float));

-
+    // positions tensor
+    float * src2_dd = nullptr;
+    sycl_pool_alloc<float> src2_f;
+
+    ggml_tensor * src2 = dst->src[2];
+    const bool use_src2 = src2 != nullptr;
+
+    if (use_src2) {
+        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
+
+        if (src2_on_device) {
+            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
+            src2_dd = (float *) src2_extra->data_device[g_main_device];
+        } else {
+            src2_dd = src2_f.alloc(ggml_nelements(src2));
+            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
+        }
+    }
+
+    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+                      nrows_x, nrows_y, scale, max_bias, main_stream);
 }

 inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
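The host op now reads both the scale and the new max_bias out of dst->op_params before launching; the two floats sit back-to-back in the op's parameter block and are copied out with memcpy. A tiny sketch of that unpacking pattern (the layout shown is the one used in the hunk above; the helper name is hypothetical):

    #include <cstring>
    #include <cstdint>

    // ggml keeps small op parameters in a raw int32 array; soft_max packs
    // scale at slot 0 and max_bias at slot 1, read back with memcpy to avoid
    // strict-aliasing issues.
    static void read_soft_max_params(const int32_t * op_params, float & scale, float & max_bias) {
        std::memcpy(&scale,    op_params + 0, sizeof(float));
        std::memcpy(&max_bias, op_params + 1, sizeof(float));
    }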
@@ -12501,16 +12585,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
     const bool use_src1 = src1 != nullptr;
     const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

-    GGML_ASSERT(!use_src1 || src1->backend !=
-    GGML_ASSERT( dst->backend !=
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend ==
-    const bool src1_on_device = use_src1 && src1->backend ==
-    const bool dst_on_device = dst->backend ==
+    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
+    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;

     // dd = data device
     float * src0_ddf = nullptr;

@@ -12565,7 +12649,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
             main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
     }

-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         SYCL_CHECK(CHECK_TRY_ERROR(
             dpct::get_current_device().queues_wait_and_throw()));
     }

@@ -12640,8 +12724,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

-    GGML_ASSERT(dst->backend !=
-    GGML_ASSERT(src1->backend !=
+    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);

@@ -12656,13 +12741,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend ==
+    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);

     int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

-    const bool split = src0->backend ==
+    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));

@@ -12717,8 +12802,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,

         used_devices++;

-        const bool src1_on_device = src1->backend ==
-        const bool dst_on_device = dst->backend ==
+        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;

         ggml_sycl_set_device(get_device_id_by_index(id));
         const dpct::queue_ptr stream = g_syclStreams[id][0];

@@ -12782,8 +12867,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
             continue;
         }

-        const bool src1_on_device = src1->backend ==
-        const bool dst_on_device = dst->backend ==
+        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
         const int64_t row_diff = row_high[id] - row_low[id];

         ggml_sycl_set_device(get_device_id_by_index(id));

@@ -12809,12 +12894,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,

             // the main device memory buffer can be on VRAM scratch, with space for all partial results
             // in that case an offset on dst_ddf_i is needed
-            if (dst->backend ==
+            if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
                 dst_dd_i += row_low[id]; // offset is 0 if no tensor split
             }

             // copy src0, src1 to device if necessary
-            if (src1->backend ==
+            if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
                 if (id != g_main_device_index) {
                     if (convert_src1_to_q8_1) {
                         char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;

@@ -12830,14 +12915,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
                             src1_ncols * ne10 * sizeof(float))));
                     }
                 }
-            } else if (src1->backend ==
+            } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
                 SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
                     src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
                 GGML_ASSERT(false);
             }

-            if (convert_src1_to_q8_1 && (src1->backend ==
+            if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
                 quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 /*
                 DPCT1010:92: SYCL uses exceptions to report errors and does

@@ -12867,10 +12952,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
             if (!dst_on_device) {
                 void * dst_off_device;
                 dpct::memcpy_direction kind;
-                if (dst->backend ==
+                if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                     dst_off_device = dst->data;
                     kind = dpct::device_to_host;
-                } else if (dst->backend ==
+                } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
                     dst_off_device = dst_extra->data_device[g_main_device_index];
                     kind = dpct::device_to_device;
                 } else {

@@ -12954,7 +13039,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
         }
     }

-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         SYCL_CHECK(ggml_sycl_set_device(g_main_device));
         SYCL_CHECK(CHECK_TRY_ERROR(
             dpct::get_current_device().queues_wait_and_throw()));
@@ -13091,7 +13176,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst) try {
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend !=
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
     GGML_ASSERT(src0->type == GGML_TYPE_F16);

@@ -13129,7 +13214,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend !=
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -13185,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
         int64_t i03 = i13 / r3;
         int64_t i02 = i12 / r2;

-        ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02
-        ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12
-        ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2
+        ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
+        ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
+        ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }

-static void
-
-
+static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
+                                           const ggml_tensor *src1,
+                                           ggml_tensor *dst) try {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));

-    GGML_ASSERT(src0->backend !=
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);

-
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
-
-    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
+    GGML_TENSOR_BINARY_OP_LOCALS

-    const int64_t
-    const int64_t ne = ggml_nelements(dst);
+    const int64_t ne_dst = ggml_nelements(dst);

     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];

@@ -13228,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];

     // convert src1 to fp16
-
-
-
-
-
+    sycl_pool_alloc<sycl::half> src1_f16_alloc;
+    if (src1->type != GGML_TYPE_F16) {
+        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+        const int64_t ne_src1 = ggml_nelements(src1);
+        src1_f16_alloc.alloc(ne_src1);
+        GGML_ASSERT(to_fp16_sycl != nullptr);
+        to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+    }
+    sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
+                                                       : src1_f16_alloc.get();

     sycl_pool_alloc<sycl::half> dst_f16;
     char * dst_t;

@@ -13253,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
         const void * alpha = &alpha_f16;
         const void * beta = &beta_f16;

-
-
-
-        nbd2 /= sizeof(float) / sizeof(sycl::half);
-        nbd3 /= sizeof(float) / sizeof(sycl::half);
-    } else {
-        dst_t = (char *) dst_ddf;
-
-        cu_compute_type = dpct::library_data_t::real_float;
-        cu_data_type = dpct::library_data_t::real_float;
+    // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
+    // once oneMKL open source supports half, half, float, float: datatypes
+    dst_t = (char *) dst_f16.alloc(ne_dst);

-
-
-    }
+    nbd2 /= sizeof(float) / sizeof(sycl::half);
+    nbd3 /= sizeof(float) / sizeof(sycl::half);

     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);

@@ -13302,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
             *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const char *)src0_as_f16, dpct::library_data_t::real_half,
-            nb01 /
-            (const char *)
-            nb11 /
-            (char *)dst_t, cu_data_type, ne01,
+            nb01 / nb00, nb02 / nb00,
+            (const char *)src1_f16, dpct::library_data_t::real_half,
+            nb11 / nb10, nb12 / nb10, beta,
+            (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
             ne12 * ne13, cu_compute_type)));
     } else {
         // use syclGemmBatchedEx

@@ -13325,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
             {sycl::aspect::fp16});

         main_stream->submit([&](sycl::handler &cgh) {
-            const
-
-
-
+            const void **ptrs_src_get = ptrs_src.get();
+            void **ptrs_dst_get = ptrs_dst.get();
+            size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
+            size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
             cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
                              [=](sycl::nd_item<3> item_ct1) {
                                  k_compute_batched_ptrs(
-                                     src0_as_f16,
-                                     dst_t,
-
-                                     nb02, nb03,
-                                     r3, item_ct1);
+                                     src0_as_f16, src1_f16,
+                                     dst_t, ptrs_src_get,
+                                     ptrs_dst_get, ne12, ne13, ne23,
+                                     nb02, nb03, nb12_scaled, nb13_scaled,
+                                     nbd2, nbd3, r2, r3, item_ct1);
                              });
         });
     }
-    /*
-    DPCT1010:95: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this
-    code.
-    */
-    SYCL_CHECK(0);
-
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
         *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
         oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
         (const void **)(ptrs_src.get() + 0 * ne23),
-        dpct::library_data_t::real_half, nb01 /
+        dpct::library_data_t::real_half, nb01 / nb00,
         (const void **)(ptrs_src.get() + 1 * ne23),
-        dpct::library_data_t::real_half, nb11 /
+        dpct::library_data_t::real_half, nb11 / nb10, beta,
         (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
         cu_compute_type)));
     }
 #endif

-
-
-    to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream);
-    }
+    const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+    to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -13372,11 +13437,11 @@ catch (sycl::exception const &exc) {

 static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
-        (src0->backend ==
-        (src1->backend ==
-        ( dst->backend ==
+        (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
+        (src1->backend == GGML_BACKEND_TYPE_GPU) &&
+        ( dst->backend == GGML_BACKEND_TYPE_GPU);

-    const bool split = src0->backend ==
+    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;

     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {

@@ -13407,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         // KQV single-batch
         // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
         ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 &&
+    } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
-        // GGML_SYCL_DEBUG("
-
+        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
+        ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
         ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);

@@ -13505,7 +13570,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src00));
     GGML_ASSERT(!ggml_is_transposed(src1));

-    GGML_ASSERT(src00->backend !=
+    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

     GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);

@@ -13643,7 +13708,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

     const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];

-    if (ids->backend ==
+    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
         SYCL_CHECK(CHECK_TRY_ERROR(
             stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));

@@ -13661,20 +13726,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;

-    src1_row.backend =
-    dst_row.backend =
+    src1_row.backend = GGML_BACKEND_TYPE_GPU;
+    dst_row.backend = GGML_BACKEND_TYPE_GPU;

     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;

-    char * src1_original = src1->backend ==
+    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
         (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
-    char * dst_original = dst->backend ==
+    char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
         (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];

     if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend ==
-        GGML_ASSERT(dst->backend ==
+        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
+        GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
             //int32_t row_id;

@@ -13756,7 +13821,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
         }
     }

-    if (dst->backend ==
+    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
         SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
     }
 }

@@ -13779,8 +13844,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));

-    GGML_ASSERT(src0->backend ==
-    GGML_ASSERT(src1->backend ==
+    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);

     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -13887,17 +13952,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
     memset(extra, 0, sizeof(*extra));

     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (backend ==
+        if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
             continue;
         }
         ggml_sycl_set_device(get_device_id_by_index(id));
         const dpct::queue_ptr stream = g_syclStreams[id][0];

         int64_t row_low, row_high;
-        if (backend ==
+        if (backend == GGML_BACKEND_TYPE_GPU) {
             row_low = 0;
             row_high = nrows;
-        } else if (backend ==
+        } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
             const int64_t rounding = get_row_rounding(tensor->type);

             row_low = id == 0 ? 0 : nrows*g_tensor_split[id];

@@ -13946,7 +14011,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {

         extra->data_device[id] = buf;

-        if (backend ==
+        if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
             for (int64_t is = 0; is < MAX_STREAMS; ++is) {
                 SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
                     new sycl::event()));

@@ -13963,7 +14028,7 @@ catch (sycl::exception const &exc) {
 }

 void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
-    if (!tensor || !tensor->extra || (tensor->backend !=
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
         return;
     }

@@ -14016,15 +14081,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
         return;
     }

-    tensor->backend =
+    tensor->backend = GGML_BACKEND_TYPE_GPU;

-    if (tensor->src[0] != nullptr && tensor->src[0]->backend ==
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
-    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend ==
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
         ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }

@@ -14042,7 +14107,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];

-    if (inplace && (tensor->src[0]->backend ==
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
         ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
         size_t offset = 0;

@@ -14111,7 +14176,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,

     const bool inplace = tensor->view_src != nullptr;

-    if (inplace && (tensor->view_src->backend ==
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
         ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
         size_t view_offset = 0;

@@ -14132,7 +14197,7 @@ catch (sycl::exception const &exc) {
 }

 void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(ggml_is_contiguous(tensor));

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

@@ -14219,9 +14284,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (!g_sycl_loaded) return false;

     ggml_sycl_func_t func;
-    const bool any_on_device = tensor->backend ==
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend ==
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend ==
+    const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

     if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;

@@ -14359,14 +14424,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }

-    if (tensor->src[0] != nullptr && tensor->src[0]->backend ==
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
         ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
     }

     if (params->ith != 0) {
         return true;
     }
-    if (params->type ==
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return true;
     }
     func(tensor->src[0], tensor->src[1], tensor);
@@ -14517,7 +14582,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,

     extra->data_device[ctx->device] = tensor->data;

-    tensor->backend =
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;

     if (ggml_is_quantized(tensor->type)) {

@@ -14548,7 +14613,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                 ggml_tensor *tensor,
                                                 const void *data, size_t offset,
                                                 size_t size) try {
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

@@ -14573,7 +14638,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                                 const ggml_tensor *tensor,
                                                 void *data, size_t offset,
                                                 size_t size) try {
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

@@ -14809,7 +14874,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

     GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
         (char *)tensor->data + offset, data, size)));

@@ -14827,7 +14892,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

     GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend ==
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
         data, (const char *)tensor->data + offset, size)));

@@ -14880,7 +14945,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
     ggml_sycl_set_main_device(sycl_ctx->device);

     ggml_compute_params params = {};
-    params.type =
+    params.type = GGML_TASK_TYPE_COMPUTE;
     params.ith = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

@@ -14888,13 +14953,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;

-        assert(node->backend ==
+        assert(node->backend == GGML_BACKEND_TYPE_GPU);
         assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
         assert(node->extra != nullptr);

         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend ==
+                assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
                 assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
                 assert(node->src[j]->extra != nullptr);
             }

@@ -15078,6 +15143,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .supports_op = */ ggml_backend_sycl_supports_op,
 };

+static ggml_guid_t ggml_backend_sycl_guid() {
+    static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
+    return &guid;
+}
+
 ggml_backend_t ggml_backend_sycl_init(int device) {
     ggml_init_sycl(); // TODO: remove from ggml.c

@@ -15095,6 +15165,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     };

     ggml_backend_t sycl_backend = new ggml_backend {
+        /* .guid = */ ggml_backend_sycl_guid(),
         /* .interface = */ ggml_backend_sycl_interface,
         /* .context = */ ctx
     };

@@ -15103,7 +15174,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
 }

 bool ggml_backend_is_sycl(ggml_backend_t backend) {
-    return backend->
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
 }

 static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {