llama_cpp 0.12.6 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/ext/llama_cpp/llama_cpp.cpp +90 -269
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +28 -23
- data/vendor/tmp/llama.cpp/Makefile +51 -15
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +32 -11
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +191 -22
- data/vendor/tmp/llama.cpp/ggml-metal.metal +2472 -862
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +3176 -667
- data/vendor/tmp/llama.cpp/ggml-quants.h +77 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +373 -424
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +186 -102
- data/vendor/tmp/llama.cpp/ggml.c +1266 -699
- data/vendor/tmp/llama.cpp/ggml.h +59 -30
- data/vendor/tmp/llama.cpp/llama.cpp +1517 -717
- data/vendor/tmp/llama.cpp/llama.h +87 -63
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
|
@@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
|
|
|
3338
3338
|
|
|
3339
3339
|
size_t total_elements = ggml_nelements(src);
|
|
3340
3340
|
|
|
3341
|
-
const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
|
|
3341
|
+
const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
|
3342
3342
|
float *src_data =NULL;
|
|
3343
3343
|
if(src_on_device) {
|
|
3344
3344
|
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
|
|
@@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
|
|
|
8086
8086
|
int ixj = col ^ j;
|
|
8087
8087
|
if (ixj > col) {
|
|
8088
8088
|
if ((col & k) == 0) {
|
|
8089
|
-
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
|
8089
|
+
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
|
|
8090
8090
|
swap(dst_row[col], dst_row[ixj]);
|
|
8091
8091
|
}
|
|
8092
8092
|
} else {
|
|
8093
|
-
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
|
8093
|
+
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
|
|
8094
8094
|
swap(dst_row[col], dst_row[ixj]);
|
|
8095
8095
|
}
|
|
8096
8096
|
}
|
|
@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
|
|
|
8126
8126
|
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
|
|
8127
8127
|
}
|
|
8128
8128
|
|
|
8129
|
-
|
|
8130
|
-
|
|
8129
|
+
|
|
8130
|
+
template <bool vals_smem, int ncols_template, int block_size_template>
|
|
8131
|
+
static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
|
|
8132
|
+
const int nrows_y, const float scale, const float max_bias, const float m0,
|
|
8133
|
+
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
|
|
8134
|
+
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
|
8135
|
+
|
|
8131
8136
|
const int tid = item_ct1.get_local_id(2);
|
|
8132
8137
|
const int rowx = item_ct1.get_group(2);
|
|
8133
8138
|
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
|
|
8134
8139
|
|
|
8135
|
-
const int block_size = item_ct1.get_local_range(2);
|
|
8140
|
+
const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
|
|
8136
8141
|
|
|
8137
8142
|
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
|
8138
8143
|
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
|
8139
8144
|
|
|
8145
|
+
float slope = 0.0f;
|
|
8146
|
+
|
|
8147
|
+
// ALiBi
|
|
8148
|
+
if (max_bias > 0.0f) {
|
|
8149
|
+
const uint32_t h = rowx/nrows_y; // head index
|
|
8150
|
+
|
|
8151
|
+
const float base = h < n_head_log2 ? m0 : m1;
|
|
8152
|
+
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
|
8153
|
+
|
|
8154
|
+
slope = sycl::pow(base, float(exp));
|
|
8155
|
+
}
|
|
8156
|
+
|
|
8157
|
+
float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
|
|
8140
8158
|
float max_val = -INFINITY;
|
|
8141
8159
|
|
|
8142
|
-
for (int col = tid; col < ncols; col += block_size) {
|
|
8160
|
+
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
|
8161
|
+
const int col = col0 + tid;
|
|
8162
|
+
|
|
8163
|
+
if (ncols_template == 0 && col >= ncols) {
|
|
8164
|
+
break;
|
|
8165
|
+
}
|
|
8166
|
+
|
|
8143
8167
|
const int ix = rowx*ncols + col;
|
|
8144
8168
|
const int iy = rowy*ncols + col;
|
|
8145
|
-
|
|
8169
|
+
|
|
8170
|
+
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
|
8171
|
+
|
|
8172
|
+
vals[col] = val;
|
|
8173
|
+
max_val = sycl::max(max_val, val);
|
|
8146
8174
|
}
|
|
8147
8175
|
|
|
8148
8176
|
// find the max value in the block
|
|
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
|
|
8151
8179
|
if (warp_id == 0) {
|
|
8152
8180
|
buf[lane_id] = -INFINITY;
|
|
8153
8181
|
}
|
|
8154
|
-
|
|
8155
|
-
DPCT1118:12: SYCL group functions and algorithms must be encountered in
|
|
8156
|
-
converged control flow. You may need to adjust the code.
|
|
8157
|
-
*/
|
|
8158
|
-
/*
|
|
8159
|
-
DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
|
|
8160
|
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
|
8161
|
-
better performance if there is no access to global memory.
|
|
8162
|
-
*/
|
|
8163
|
-
item_ct1.barrier();
|
|
8182
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
8164
8183
|
|
|
8165
8184
|
if (lane_id == 0) {
|
|
8166
8185
|
buf[warp_id] = max_val;
|
|
8167
8186
|
}
|
|
8168
|
-
|
|
8169
|
-
DPCT1118:13: SYCL group functions and algorithms must be encountered in
|
|
8170
|
-
converged control flow. You may need to adjust the code.
|
|
8171
|
-
*/
|
|
8172
|
-
/*
|
|
8173
|
-
DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
|
|
8174
|
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
|
8175
|
-
better performance if there is no access to global memory.
|
|
8176
|
-
*/
|
|
8177
|
-
item_ct1.barrier();
|
|
8187
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
8178
8188
|
|
|
8179
8189
|
max_val = buf[lane_id];
|
|
8180
8190
|
max_val = warp_reduce_max(max_val, item_ct1);
|
|
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
|
|
8182
8192
|
|
|
8183
8193
|
float tmp = 0.f;
|
|
8184
8194
|
|
|
8185
|
-
|
|
8186
|
-
|
|
8187
|
-
const int
|
|
8188
|
-
|
|
8189
|
-
|
|
8195
|
+
#pragma unroll
|
|
8196
|
+
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
|
8197
|
+
const int col = col0 + tid;
|
|
8198
|
+
if (ncols_template == 0 && col >= ncols) {
|
|
8199
|
+
break;
|
|
8200
|
+
}
|
|
8201
|
+
|
|
8202
|
+
const float val = sycl::native::exp(vals[col] - max_val);
|
|
8190
8203
|
tmp += val;
|
|
8191
|
-
|
|
8204
|
+
vals[col] = val;
|
|
8192
8205
|
}
|
|
8193
8206
|
|
|
8194
8207
|
// find the sum of exps in the block
|
|
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
|
|
|
8197
8210
|
if (warp_id == 0) {
|
|
8198
8211
|
buf[lane_id] = 0.f;
|
|
8199
8212
|
}
|
|
8200
|
-
|
|
8201
|
-
DPCT1118:14: SYCL group functions and algorithms must be encountered in
|
|
8202
|
-
converged control flow. You may need to adjust the code.
|
|
8203
|
-
*/
|
|
8204
|
-
/*
|
|
8205
|
-
DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
|
|
8206
|
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
|
8207
|
-
better performance if there is no access to global memory.
|
|
8208
|
-
*/
|
|
8209
|
-
item_ct1.barrier();
|
|
8213
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
8210
8214
|
|
|
8211
8215
|
if (lane_id == 0) {
|
|
8212
8216
|
buf[warp_id] = tmp;
|
|
8213
8217
|
}
|
|
8214
|
-
|
|
8215
|
-
DPCT1118:15: SYCL group functions and algorithms must be encountered in
|
|
8216
|
-
converged control flow. You may need to adjust the code.
|
|
8217
|
-
*/
|
|
8218
|
-
/*
|
|
8219
|
-
DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
|
|
8220
|
-
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
|
8221
|
-
better performance if there is no access to global memory.
|
|
8222
|
-
*/
|
|
8223
|
-
item_ct1.barrier();
|
|
8218
|
+
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
8224
8219
|
|
|
8225
8220
|
tmp = buf[lane_id];
|
|
8226
8221
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
8227
8222
|
}
|
|
8228
8223
|
|
|
8229
|
-
const float inv_tmp = 1.f / tmp;
|
|
8224
|
+
const float inv_sum = 1.f / tmp;
|
|
8230
8225
|
|
|
8231
|
-
|
|
8232
|
-
|
|
8233
|
-
|
|
8226
|
+
#pragma unroll
|
|
8227
|
+
for (int col0 = 0; col0 < ncols; col0 += block_size) {
|
|
8228
|
+
const int col = col0 + tid;
|
|
8229
|
+
|
|
8230
|
+
if (ncols_template == 0 && col >= ncols) {
|
|
8231
|
+
return;
|
|
8232
|
+
}
|
|
8233
|
+
|
|
8234
|
+
const int idst = rowx*ncols + col;
|
|
8235
|
+
dst[idst] = vals[col] * inv_sum;
|
|
8234
8236
|
}
|
|
8235
8237
|
}
|
|
8236
8238
|
|
|
@@ -9188,174 +9190,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
|
|
9188
9190
|
}
|
|
9189
9191
|
}
|
|
9190
9192
|
|
|
9191
|
-
|
|
9192
|
-
|
|
9193
|
-
|
|
9194
|
-
|
|
9195
|
-
|
|
9196
|
-
|
|
9197
|
-
|
|
9198
|
-
|
|
9199
|
-
|
|
9200
|
-
|
|
9201
|
-
|
|
9202
|
-
|
|
9203
|
-
|
|
9204
|
-
|
|
9205
|
-
|
|
9206
|
-
}
|
|
9207
|
-
|
|
9208
|
-
static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
|
9209
|
-
float *dst, const int ncols,
|
|
9210
|
-
const int nrows,
|
|
9211
|
-
dpct::queue_ptr stream) {
|
|
9212
|
-
GGML_ASSERT(ncols % QK4_1 == 0);
|
|
9213
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9214
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9215
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9216
|
-
stream->parallel_for(
|
|
9217
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9218
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9219
|
-
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
|
|
9220
|
-
vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9221
|
-
item_ct1);
|
|
9222
|
-
});
|
|
9223
|
-
}
|
|
9224
|
-
|
|
9225
|
-
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
|
9226
|
-
float *dst, const int ncols,
|
|
9227
|
-
const int nrows,
|
|
9228
|
-
dpct::queue_ptr stream) {
|
|
9229
|
-
GGML_ASSERT(ncols % QK5_0 == 0);
|
|
9230
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9231
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9232
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9233
|
-
stream->parallel_for(
|
|
9234
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9235
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9236
|
-
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
|
|
9237
|
-
vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9238
|
-
item_ct1);
|
|
9239
|
-
});
|
|
9240
|
-
}
|
|
9241
|
-
|
|
9242
|
-
static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|
9243
|
-
float *dst, const int ncols,
|
|
9244
|
-
const int nrows,
|
|
9245
|
-
dpct::queue_ptr stream) {
|
|
9246
|
-
GGML_ASSERT(ncols % QK5_1 == 0);
|
|
9247
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9248
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9249
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9250
|
-
stream->parallel_for(
|
|
9251
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9252
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9253
|
-
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
|
|
9254
|
-
vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9255
|
-
item_ct1);
|
|
9256
|
-
});
|
|
9257
|
-
}
|
|
9258
|
-
|
|
9259
|
-
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
|
9260
|
-
float *dst, const int ncols,
|
|
9261
|
-
const int nrows,
|
|
9262
|
-
dpct::queue_ptr stream) {
|
|
9263
|
-
GGML_ASSERT(ncols % QK8_0 == 0);
|
|
9264
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9265
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9266
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9267
|
-
stream->parallel_for(
|
|
9268
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9269
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9270
|
-
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
|
|
9271
|
-
vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9272
|
-
item_ct1);
|
|
9273
|
-
});
|
|
9274
|
-
}
|
|
9275
|
-
|
|
9276
|
-
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
|
9277
|
-
float *dst, const int ncols,
|
|
9278
|
-
const int nrows,
|
|
9279
|
-
dpct::queue_ptr stream) {
|
|
9280
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
|
9281
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9282
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9283
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9284
|
-
stream->parallel_for(
|
|
9285
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9286
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9287
|
-
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
|
|
9288
|
-
vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9289
|
-
item_ct1);
|
|
9290
|
-
});
|
|
9291
|
-
}
|
|
9292
|
-
|
|
9293
|
-
static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
|
9294
|
-
float *dst, const int ncols,
|
|
9295
|
-
const int nrows,
|
|
9296
|
-
dpct::queue_ptr stream) {
|
|
9297
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
|
9298
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9299
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9300
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9301
|
-
stream->parallel_for(
|
|
9302
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9303
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9304
|
-
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
|
|
9305
|
-
vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9306
|
-
item_ct1);
|
|
9307
|
-
});
|
|
9308
|
-
}
|
|
9309
|
-
|
|
9310
|
-
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
|
9311
|
-
float *dst, const int ncols,
|
|
9312
|
-
const int nrows,
|
|
9313
|
-
dpct::queue_ptr stream) {
|
|
9314
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
|
9315
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9316
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9317
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9318
|
-
stream->parallel_for(
|
|
9319
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9320
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9321
|
-
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
|
|
9322
|
-
vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9323
|
-
item_ct1);
|
|
9324
|
-
});
|
|
9325
|
-
}
|
|
9326
|
-
|
|
9327
|
-
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
|
9328
|
-
float *dst, const int ncols,
|
|
9329
|
-
const int nrows,
|
|
9330
|
-
dpct::queue_ptr stream) {
|
|
9331
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
|
9332
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9333
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9334
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9335
|
-
stream->parallel_for(
|
|
9336
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9337
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9338
|
-
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
|
|
9339
|
-
vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9340
|
-
item_ct1);
|
|
9341
|
-
});
|
|
9342
|
-
}
|
|
9343
|
-
|
|
9344
|
-
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
|
9345
|
-
float *dst, const int ncols,
|
|
9346
|
-
const int nrows,
|
|
9347
|
-
dpct::queue_ptr stream) {
|
|
9348
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
|
9349
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9350
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9351
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9352
|
-
stream->parallel_for(
|
|
9353
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
9354
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9355
|
-
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
|
|
9356
|
-
vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
|
|
9357
|
-
item_ct1);
|
|
9358
|
-
});
|
|
9193
|
+
template <int qk, int qi, typename block_q_t, int vdr,
|
|
9194
|
+
vec_dot_q_sycl_t vec_dot_q_sycl>
|
|
9195
|
+
static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
|
|
9196
|
+
float *dst, const int ncols,
|
|
9197
|
+
const int nrows,
|
|
9198
|
+
dpct::queue_ptr stream) {
|
|
9199
|
+
GGML_ASSERT(ncols % QK4_0 == 0);
|
|
9200
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
|
9201
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
|
9202
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
|
9203
|
+
stream->parallel_for(
|
|
9204
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
|
|
9205
|
+
](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
9206
|
+
mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
|
|
9207
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
|
9208
|
+
});
|
|
9359
9209
|
}
|
|
9360
9210
|
|
|
9361
9211
|
int get_device_index_by_id(int id){
|
|
@@ -10977,7 +10827,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
|
|
10977
10827
|
|
|
10978
10828
|
const sycl::range<3> block_dims(1, 1, ncols);
|
|
10979
10829
|
const sycl::range<3> block_nums(1, nrows, 1);
|
|
10980
|
-
if (order == GGML_SORT_ASC) {
|
|
10830
|
+
if (order == GGML_SORT_ORDER_ASC) {
|
|
10981
10831
|
/*
|
|
10982
10832
|
DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
|
|
10983
10833
|
the limit. To get the device limit, query
|
|
@@ -10986,9 +10836,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
|
|
10986
10836
|
stream->parallel_for(
|
|
10987
10837
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
10988
10838
|
[=](sycl::nd_item<3> item_ct1) {
|
|
10989
|
-
k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
|
|
10839
|
+
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
|
|
10990
10840
|
});
|
|
10991
|
-
} else if (order == GGML_SORT_DESC) {
|
|
10841
|
+
} else if (order == GGML_SORT_ORDER_DESC) {
|
|
10992
10842
|
/*
|
|
10993
10843
|
DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
|
|
10994
10844
|
the limit. To get the device limit, query
|
|
@@ -10997,7 +10847,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
|
|
10997
10847
|
stream->parallel_for(
|
|
10998
10848
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
10999
10849
|
[=](sycl::nd_item<3> item_ct1) {
|
|
11000
|
-
k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
|
|
10850
|
+
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
|
|
11001
10851
|
});
|
|
11002
10852
|
} else {
|
|
11003
10853
|
GGML_ASSERT(false);
|
|
@@ -11019,37 +10869,98 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
|
|
|
11019
10869
|
});
|
|
11020
10870
|
}
|
|
11021
10871
|
|
|
11022
|
-
|
|
11023
|
-
|
|
11024
|
-
|
|
11025
|
-
|
|
11026
|
-
|
|
11027
|
-
while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
|
11028
|
-
const sycl::range<3> block_dims(1, 1, nth);
|
|
11029
|
-
const sycl::range<3> block_nums(1, 1, nrows_x);
|
|
11030
|
-
/*
|
|
11031
|
-
DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
|
|
11032
|
-
limit. To get the device limit, query info::device::max_work_group_size.
|
|
11033
|
-
Adjust the work-group size if needed.
|
|
11034
|
-
*/
|
|
10872
|
+
template <bool vals_smem, int ncols_template, int block_size_template>
|
|
10873
|
+
static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
|
|
10874
|
+
const int nrows_y, const float scale, const float max_bias, const float m0,
|
|
10875
|
+
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
|
|
10876
|
+
const size_t n_local_scratch, dpct::queue_ptr stream) {
|
|
11035
10877
|
stream->submit([&](sycl::handler &cgh) {
|
|
11036
|
-
|
|
11037
|
-
DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
|
|
11038
|
-
replaced with a value. Modify the code to use the original expression,
|
|
11039
|
-
provided in comments, if it is correct.
|
|
11040
|
-
*/
|
|
11041
|
-
sycl::local_accessor<float, 1> buf_acc_ct1(
|
|
11042
|
-
sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
|
|
10878
|
+
sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
|
|
11043
10879
|
|
|
11044
10880
|
cgh.parallel_for(
|
|
11045
10881
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
|
11046
10882
|
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
|
11047
|
-
soft_max_f32
|
|
11048
|
-
|
|
10883
|
+
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
|
|
10884
|
+
nrows_y, scale, max_bias, m0,
|
|
10885
|
+
m1, n_head_log2, item_ct1,
|
|
10886
|
+
local_buf_acc.get_pointer());
|
|
11049
10887
|
});
|
|
11050
10888
|
});
|
|
11051
10889
|
}
|
|
11052
10890
|
|
|
10891
|
+
static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
|
|
10892
|
+
float * dst, const int ncols_x, const int nrows_x,
|
|
10893
|
+
const int nrows_y, const float scale, const float max_bias,
|
|
10894
|
+
dpct::queue_ptr stream) {
|
|
10895
|
+
int nth = WARP_SIZE;
|
|
10896
|
+
while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
|
10897
|
+
const sycl::range<3> block_dims(1, 1, nth);
|
|
10898
|
+
const sycl::range<3> block_nums(1, 1, nrows_x);
|
|
10899
|
+
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
|
|
10900
|
+
static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
|
10901
|
+
|
|
10902
|
+
const uint32_t n_head_kv = nrows_x/nrows_y;
|
|
10903
|
+
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
|
10904
|
+
|
|
10905
|
+
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
|
10906
|
+
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
|
10907
|
+
|
|
10908
|
+
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
|
|
10909
|
+
if (n_local_scratch*sizeof(float) < local_mem_size) {
|
|
10910
|
+
switch (ncols_x) {
|
|
10911
|
+
case 32:
|
|
10912
|
+
soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10913
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10914
|
+
block_dims, n_local_scratch, stream);
|
|
10915
|
+
break;
|
|
10916
|
+
case 64:
|
|
10917
|
+
soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10918
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10919
|
+
block_dims, n_local_scratch, stream);
|
|
10920
|
+
break;
|
|
10921
|
+
case 128:
|
|
10922
|
+
soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10923
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10924
|
+
block_dims, n_local_scratch, stream);
|
|
10925
|
+
break;
|
|
10926
|
+
case 256:
|
|
10927
|
+
soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10928
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10929
|
+
block_dims, n_local_scratch, stream);
|
|
10930
|
+
break;
|
|
10931
|
+
case 512:
|
|
10932
|
+
soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10933
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10934
|
+
block_dims, n_local_scratch, stream);
|
|
10935
|
+
break;
|
|
10936
|
+
case 1024:
|
|
10937
|
+
soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10938
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10939
|
+
block_dims, n_local_scratch, stream);
|
|
10940
|
+
break;
|
|
10941
|
+
case 2048:
|
|
10942
|
+
soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10943
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10944
|
+
block_dims, n_local_scratch, stream);
|
|
10945
|
+
break;
|
|
10946
|
+
case 4096:
|
|
10947
|
+
soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10948
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10949
|
+
block_dims, n_local_scratch, stream);
|
|
10950
|
+
break;
|
|
10951
|
+
default:
|
|
10952
|
+
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10953
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10954
|
+
block_dims, n_local_scratch, stream);
|
|
10955
|
+
break;
|
|
10956
|
+
}
|
|
10957
|
+
} else {
|
|
10958
|
+
soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
|
10959
|
+
max_bias, m0, m1, n_head_log2, block_nums,
|
|
10960
|
+
block_dims, WARP_SIZE, stream);
|
|
10961
|
+
}
|
|
10962
|
+
}
|
|
10963
|
+
|
|
11053
10964
|
template <typename T>
|
|
11054
10965
|
static void im2col_sycl(const float *x, T *dst, int IW, int IH,
|
|
11055
10966
|
int OW, int OH, int KW, int KH, int IC,
|
|
@@ -11559,12 +11470,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
|
|
|
11559
11470
|
|
|
11560
11471
|
dpct::memcpy_direction kind;
|
|
11561
11472
|
char * src_ptr;
|
|
11562
|
-
if (src->backend == GGML_BACKEND_CPU) {
|
|
11473
|
+
if (src->backend == GGML_BACKEND_TYPE_CPU) {
|
|
11563
11474
|
kind = dpct::host_to_device;
|
|
11564
11475
|
src_ptr = (char *) src->data;
|
|
11565
|
-
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
|
|
11566
|
-
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
|
|
11567
|
-
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
|
11476
|
+
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
|
|
11477
|
+
} else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
|
11478
|
+
GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
|
11568
11479
|
kind = dpct::device_to_device;
|
|
11569
11480
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
|
11570
11481
|
int id;
|
|
@@ -11998,7 +11909,7 @@ inline void ggml_sycl_op_mul_mat_q(
|
|
|
11998
11909
|
|
|
11999
11910
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
|
12000
11911
|
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
|
|
12001
|
-
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
|
|
11912
|
+
const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
|
|
12002
11913
|
|
|
12003
11914
|
switch (src0->type) {
|
|
12004
11915
|
case GGML_TYPE_Q4_0:
|
|
@@ -12095,37 +12006,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
|
12095
12006
|
const int64_t ne00 = src0->ne[0];
|
|
12096
12007
|
const int64_t row_diff = row_high - row_low;
|
|
12097
12008
|
|
|
12009
|
+
// TODO: support these quantization types
|
|
12010
|
+
GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
|
|
12011
|
+
src0->type == GGML_TYPE_IQ2_XS ||
|
|
12012
|
+
src0->type == GGML_TYPE_IQ3_XXS ||
|
|
12013
|
+
src0->type == GGML_TYPE_IQ1_S));
|
|
12014
|
+
|
|
12098
12015
|
switch (src0->type) {
|
|
12099
12016
|
case GGML_TYPE_Q4_0:
|
|
12100
|
-
|
|
12101
|
-
|
|
12017
|
+
mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
|
|
12018
|
+
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
|
12019
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12020
|
+
break;
|
|
12102
12021
|
case GGML_TYPE_Q4_1:
|
|
12103
|
-
|
|
12104
|
-
|
|
12022
|
+
mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
|
|
12023
|
+
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
|
12024
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12025
|
+
break;
|
|
12105
12026
|
case GGML_TYPE_Q5_0:
|
|
12106
|
-
|
|
12107
|
-
|
|
12027
|
+
mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
|
|
12028
|
+
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
|
12029
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12030
|
+
break;
|
|
12108
12031
|
case GGML_TYPE_Q5_1:
|
|
12109
|
-
|
|
12110
|
-
|
|
12032
|
+
mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
|
|
12033
|
+
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
|
12034
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12035
|
+
break;
|
|
12111
12036
|
case GGML_TYPE_Q8_0:
|
|
12112
|
-
|
|
12113
|
-
|
|
12037
|
+
mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
|
|
12038
|
+
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
|
12039
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12040
|
+
break;
|
|
12114
12041
|
case GGML_TYPE_Q2_K:
|
|
12115
|
-
|
|
12116
|
-
|
|
12042
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
|
|
12043
|
+
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
|
12044
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12045
|
+
break;
|
|
12117
12046
|
case GGML_TYPE_Q3_K:
|
|
12118
|
-
|
|
12119
|
-
|
|
12047
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
|
|
12048
|
+
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
|
12049
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12050
|
+
break;
|
|
12120
12051
|
case GGML_TYPE_Q4_K:
|
|
12121
|
-
|
|
12122
|
-
|
|
12052
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
|
|
12053
|
+
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
|
12054
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12055
|
+
break;
|
|
12123
12056
|
case GGML_TYPE_Q5_K:
|
|
12124
|
-
|
|
12125
|
-
|
|
12057
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
|
|
12058
|
+
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
|
12059
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12060
|
+
break;
|
|
12126
12061
|
case GGML_TYPE_Q6_K:
|
|
12127
|
-
|
|
12128
|
-
|
|
12062
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
|
|
12063
|
+
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
|
12064
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
|
12065
|
+
break;
|
|
12129
12066
|
default:
|
|
12130
12067
|
GGML_ASSERT(false);
|
|
12131
12068
|
break;
|
|
@@ -12145,7 +12082,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
|
12145
12082
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
|
12146
12083
|
const dpct::queue_ptr &stream) {
|
|
12147
12084
|
|
|
12148
|
-
GGML_TENSOR_BINARY_OP_LOCALS
|
|
12085
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
|
12149
12086
|
|
|
12150
12087
|
const int64_t row_diff = row_high - row_low;
|
|
12151
12088
|
|
|
@@ -12245,7 +12182,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
|
|
12245
12182
|
|
|
12246
12183
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
|
12247
12184
|
// ldc == nrows of the matrix that cuBLAS writes into
|
|
12248
|
-
int ldc = dst->backend ==
|
|
12185
|
+
int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
|
|
12249
12186
|
|
|
12250
12187
|
#ifdef GGML_SYCL_F16
|
|
12251
12188
|
bool use_fp16 = true; // TODO(Yu) SYCL capability check
|
|
@@ -12561,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
|
|
|
12561
12498
|
|
|
12562
12499
|
const int64_t ne00 = src0->ne[0];
|
|
12563
12500
|
const int64_t nrows_x = ggml_nrows(src0);
|
|
12564
|
-
const int64_t nrows_y =
|
|
12501
|
+
const int64_t nrows_y = src0->ne[1];
|
|
12565
12502
|
|
|
12566
12503
|
float scale = 1.0f;
|
|
12567
|
-
|
|
12504
|
+
float max_bias = 0.0f;
|
|
12568
12505
|
|
|
12569
|
-
|
|
12506
|
+
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
|
12507
|
+
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
|
12570
12508
|
|
|
12571
|
-
|
|
12509
|
+
// positions tensor
|
|
12510
|
+
float * src2_dd = nullptr;
|
|
12511
|
+
sycl_pool_alloc<float> src2_f;
|
|
12512
|
+
|
|
12513
|
+
ggml_tensor * src2 = dst->src[2];
|
|
12514
|
+
const bool use_src2 = src2 != nullptr;
|
|
12515
|
+
|
|
12516
|
+
if (use_src2) {
|
|
12517
|
+
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
|
|
12518
|
+
|
|
12519
|
+
if (src2_on_device) {
|
|
12520
|
+
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
|
12521
|
+
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
|
12522
|
+
} else {
|
|
12523
|
+
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
|
12524
|
+
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
|
|
12525
|
+
}
|
|
12526
|
+
}
|
|
12527
|
+
|
|
12528
|
+
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
|
|
12529
|
+
nrows_x, nrows_y, scale, max_bias, main_stream);
|
|
12572
12530
|
}
|
|
12573
12531
|
|
|
12574
12532
|
inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
@@ -12627,16 +12585,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
|
|
|
12627
12585
|
const bool use_src1 = src1 != nullptr;
|
|
12628
12586
|
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
|
12629
12587
|
|
|
12630
|
-
GGML_ASSERT(!use_src1 || src1->backend !=
|
|
12631
|
-
GGML_ASSERT( dst->backend !=
|
|
12588
|
+
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
12589
|
+
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
12632
12590
|
|
|
12633
12591
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
|
12634
12592
|
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
|
12635
12593
|
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
|
12636
12594
|
|
|
12637
|
-
const bool src0_on_device = src0->backend ==
|
|
12638
|
-
const bool src1_on_device = use_src1 && src1->backend ==
|
|
12639
|
-
const bool dst_on_device = dst->backend ==
|
|
12595
|
+
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
|
12596
|
+
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
|
|
12597
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
|
|
12640
12598
|
|
|
12641
12599
|
// dd = data device
|
|
12642
12600
|
float * src0_ddf = nullptr;
|
|
@@ -12691,7 +12649,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
|
|
|
12691
12649
|
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
|
|
12692
12650
|
}
|
|
12693
12651
|
|
|
12694
|
-
if (dst->backend ==
|
|
12652
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
12695
12653
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
12696
12654
|
dpct::get_current_device().queues_wait_and_throw()));
|
|
12697
12655
|
}
|
|
@@ -12766,8 +12724,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
|
12766
12724
|
const int nb2 = dst->nb[2];
|
|
12767
12725
|
const int nb3 = dst->nb[3];
|
|
12768
12726
|
|
|
12769
|
-
GGML_ASSERT(dst->backend !=
|
|
12770
|
-
GGML_ASSERT(src1->backend !=
|
|
12727
|
+
GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
12728
|
+
GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
12729
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
|
|
12771
12730
|
|
|
12772
12731
|
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
|
12773
12732
|
|
|
@@ -12782,13 +12741,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
|
12782
12741
|
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
|
12783
12742
|
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
|
12784
12743
|
|
|
12785
|
-
const bool src0_on_device = src0->backend ==
|
|
12744
|
+
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
|
12786
12745
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
|
12787
12746
|
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
|
12788
12747
|
|
|
12789
12748
|
int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
|
12790
12749
|
|
|
12791
|
-
const bool split = src0->backend ==
|
|
12750
|
+
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
|
12792
12751
|
GGML_ASSERT(!(split && ne02 > 1));
|
|
12793
12752
|
GGML_ASSERT(!(split && ne03 > 1));
|
|
12794
12753
|
GGML_ASSERT(!(split && ne02 < ne12));
|
|
@@ -12843,8 +12802,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
|
12843
12802
|
|
|
12844
12803
|
used_devices++;
|
|
12845
12804
|
|
|
12846
|
-
const bool src1_on_device = src1->backend ==
|
|
12847
|
-
const bool dst_on_device = dst->backend ==
|
|
12805
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
|
12806
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
|
12848
12807
|
|
|
12849
12808
|
ggml_sycl_set_device(get_device_id_by_index(id));
|
|
12850
12809
|
const dpct::queue_ptr stream = g_syclStreams[id][0];
|
|
@@ -12908,8 +12867,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
|
12908
12867
|
continue;
|
|
12909
12868
|
}
|
|
12910
12869
|
|
|
12911
|
-
const bool src1_on_device = src1->backend ==
|
|
12912
|
-
const bool dst_on_device = dst->backend ==
|
|
12870
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
|
12871
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
|
|
12913
12872
|
const int64_t row_diff = row_high[id] - row_low[id];
|
|
12914
12873
|
|
|
12915
12874
|
ggml_sycl_set_device(get_device_id_by_index(id));
|
|
@@ -12935,12 +12894,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
|
12935
12894
|
|
|
12936
12895
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
|
12937
12896
|
// in that case an offset on dst_ddf_i is needed
|
|
12938
|
-
if (dst->backend ==
|
|
12897
|
+
if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
|
|
12939
12898
|
dst_dd_i += row_low[id]; // offset is 0 if no tensor split
|
|
12940
12899
|
}
|
|
12941
12900
|
|
|
12942
12901
|
// copy src0, src1 to device if necessary
|
|
12943
|
-
if (src1->backend ==
|
|
12902
|
+
if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
|
|
12944
12903
|
if (id != g_main_device_index) {
|
|
12945
12904
|
if (convert_src1_to_q8_1) {
|
|
12946
12905
|
char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
|
|
@@ -12956,14 +12915,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
|
12956
12915
|
src1_ncols * ne10 * sizeof(float))));
|
|
12957
12916
|
}
|
|
12958
12917
|
}
|
|
12959
|
-
} else if (src1->backend ==
|
|
12918
|
+
} else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
|
|
12960
12919
|
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
|
|
12961
12920
|
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
|
12962
12921
|
} else {
|
|
12963
12922
|
GGML_ASSERT(false);
|
|
12964
12923
|
}
|
|
12965
12924
|
|
|
12966
|
-
if (convert_src1_to_q8_1 && (src1->backend ==
|
|
12925
|
+
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
|
|
12967
12926
|
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
|
12968
12927
|
/*
|
|
12969
12928
|
DPCT1010:92: SYCL uses exceptions to report errors and does
|
|
@@ -12993,10 +12952,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
|
12993
12952
|
if (!dst_on_device) {
|
|
12994
12953
|
void * dst_off_device;
|
|
12995
12954
|
dpct::memcpy_direction kind;
|
|
12996
|
-
if (dst->backend ==
|
|
12955
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
12997
12956
|
dst_off_device = dst->data;
|
|
12998
12957
|
kind = dpct::device_to_host;
|
|
12999
|
-
} else if (dst->backend ==
|
|
12958
|
+
} else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
|
|
13000
12959
|
dst_off_device = dst_extra->data_device[g_main_device_index];
|
|
13001
12960
|
kind = dpct::device_to_device;
|
|
13002
12961
|
} else {
|
|
@@ -13080,7 +13039,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
|
13080
13039
|
}
|
|
13081
13040
|
}
|
|
13082
13041
|
|
|
13083
|
-
if (dst->backend ==
|
|
13042
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
13084
13043
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
|
13085
13044
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
13086
13045
|
dpct::get_current_device().queues_wait_and_throw()));
|
|
@@ -13217,7 +13176,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
|
|
|
13217
13176
|
const ggml_tensor *src1,
|
|
13218
13177
|
ggml_tensor *dst) try {
|
|
13219
13178
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
|
13220
|
-
GGML_ASSERT(src0->backend !=
|
|
13179
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
13221
13180
|
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
|
13222
13181
|
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
|
13223
13182
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
@@ -13255,7 +13214,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
|
|
|
13255
13214
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
|
13256
13215
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
13257
13216
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
|
13258
|
-
GGML_ASSERT(src0->backend !=
|
|
13217
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
13259
13218
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
13260
13219
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
13261
13220
|
|
|
@@ -13311,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
|
|
|
13311
13270
|
int64_t i03 = i13 / r3;
|
|
13312
13271
|
int64_t i02 = i12 / r2;
|
|
13313
13272
|
|
|
13314
|
-
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02
|
|
13315
|
-
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12
|
|
13316
|
-
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2
|
|
13273
|
+
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
|
13274
|
+
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
|
13275
|
+
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
|
13317
13276
|
}
|
|
13318
13277
|
|
|
13319
|
-
static void
|
|
13320
|
-
|
|
13321
|
-
|
|
13278
|
+
static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
|
13279
|
+
const ggml_tensor *src1,
|
|
13280
|
+
ggml_tensor *dst) try {
|
|
13322
13281
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
|
13323
13282
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
13324
13283
|
|
|
13325
|
-
GGML_ASSERT(src0->backend !=
|
|
13284
|
+
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
13326
13285
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
13327
|
-
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
13328
|
-
|
|
13329
|
-
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
|
13330
13286
|
|
|
13331
|
-
|
|
13332
|
-
|
|
13333
|
-
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
|
13334
|
-
|
|
13335
|
-
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
|
|
13287
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
|
13336
13288
|
|
|
13337
|
-
const int64_t
|
|
13338
|
-
const int64_t ne = ggml_nelements(dst);
|
|
13289
|
+
const int64_t ne_dst = ggml_nelements(dst);
|
|
13339
13290
|
|
|
13340
13291
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
|
13341
13292
|
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
|
@@ -13354,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
|
|
13354
13305
|
float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
|
|
13355
13306
|
|
|
13356
13307
|
// convert src1 to fp16
|
|
13357
|
-
|
|
13358
|
-
|
|
13359
|
-
|
|
13360
|
-
|
|
13361
|
-
|
|
13308
|
+
sycl_pool_alloc<sycl::half> src1_f16_alloc;
|
|
13309
|
+
if (src1->type != GGML_TYPE_F16) {
|
|
13310
|
+
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
|
|
13311
|
+
const int64_t ne_src1 = ggml_nelements(src1);
|
|
13312
|
+
src1_f16_alloc.alloc(ne_src1);
|
|
13313
|
+
GGML_ASSERT(to_fp16_sycl != nullptr);
|
|
13314
|
+
to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
|
13315
|
+
}
|
|
13316
|
+
sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
|
|
13317
|
+
: src1_f16_alloc.get();
|
|
13362
13318
|
|
|
13363
13319
|
sycl_pool_alloc<sycl::half> dst_f16;
|
|
13364
13320
|
char * dst_t;
|
|
@@ -13379,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
|
|
13379
13335
|
const void * alpha = &alpha_f16;
|
|
13380
13336
|
const void * beta = &beta_f16;
|
|
13381
13337
|
|
|
13382
|
-
|
|
13383
|
-
|
|
13338
|
+
// TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway
|
|
13339
|
+
// once oneMKL open source supports half, half, float, float: datatypes
|
|
13340
|
+
dst_t = (char *) dst_f16.alloc(ne_dst);
|
|
13384
13341
|
|
|
13385
|
-
|
|
13386
|
-
|
|
13387
|
-
} else {
|
|
13388
|
-
dst_t = (char *) dst_ddf;
|
|
13389
|
-
|
|
13390
|
-
cu_compute_type = dpct::library_data_t::real_float;
|
|
13391
|
-
cu_data_type = dpct::library_data_t::real_float;
|
|
13392
|
-
|
|
13393
|
-
alpha = &alpha_f32;
|
|
13394
|
-
beta = &beta_f32;
|
|
13395
|
-
}
|
|
13342
|
+
nbd2 /= sizeof(float) / sizeof(sycl::half);
|
|
13343
|
+
nbd3 /= sizeof(float) / sizeof(sycl::half);
|
|
13396
13344
|
|
|
13397
13345
|
GGML_ASSERT(ne12 % ne02 == 0);
|
|
13398
13346
|
GGML_ASSERT(ne13 % ne03 == 0);
|
|
@@ -13428,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
|
|
13428
13376
|
*g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
|
|
13429
13377
|
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
|
13430
13378
|
(const char *)src0_as_f16, dpct::library_data_t::real_half,
|
|
13431
|
-
nb01 /
|
|
13432
|
-
(const char *)
|
|
13433
|
-
nb11 /
|
|
13434
|
-
(char *)dst_t, cu_data_type, ne01,
|
|
13379
|
+
nb01 / nb00, nb02 / nb00,
|
|
13380
|
+
(const char *)src1_f16, dpct::library_data_t::real_half,
|
|
13381
|
+
nb11 / nb10, nb12 / nb10, beta,
|
|
13382
|
+
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
|
13435
13383
|
ne12 * ne13, cu_compute_type)));
|
|
13436
13384
|
} else {
|
|
13437
13385
|
// use syclGemmBatchedEx
|
|
@@ -13451,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
|
|
13451
13399
|
{sycl::aspect::fp16});
|
|
13452
13400
|
|
|
13453
13401
|
main_stream->submit([&](sycl::handler &cgh) {
|
|
13454
|
-
const
|
|
13455
|
-
|
|
13456
|
-
|
|
13457
|
-
|
|
13402
|
+
const void **ptrs_src_get = ptrs_src.get();
|
|
13403
|
+
void **ptrs_dst_get = ptrs_dst.get();
|
|
13404
|
+
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
|
|
13405
|
+
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
|
|
13458
13406
|
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
|
|
13459
13407
|
[=](sycl::nd_item<3> item_ct1) {
|
|
13460
13408
|
k_compute_batched_ptrs(
|
|
13461
|
-
src0_as_f16,
|
|
13462
|
-
dst_t,
|
|
13463
|
-
|
|
13464
|
-
nb02, nb03,
|
|
13465
|
-
r3, item_ct1);
|
|
13409
|
+
src0_as_f16, src1_f16,
|
|
13410
|
+
dst_t, ptrs_src_get,
|
|
13411
|
+
ptrs_dst_get, ne12, ne13, ne23,
|
|
13412
|
+
nb02, nb03, nb12_scaled, nb13_scaled,
|
|
13413
|
+
nbd2, nbd3, r2, r3, item_ct1);
|
|
13466
13414
|
});
|
|
13467
13415
|
});
|
|
13468
13416
|
}
|
|
13469
|
-
/*
|
|
13470
|
-
DPCT1010:95: SYCL uses exceptions to report errors and does not use the
|
|
13471
|
-
error codes. The call was replaced with 0. You need to rewrite this
|
|
13472
|
-
code.
|
|
13473
|
-
*/
|
|
13474
|
-
SYCL_CHECK(0);
|
|
13475
|
-
|
|
13476
13417
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
|
13477
13418
|
*g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
|
|
13478
13419
|
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
|
13479
13420
|
(const void **)(ptrs_src.get() + 0 * ne23),
|
|
13480
|
-
dpct::library_data_t::real_half, nb01 /
|
|
13421
|
+
dpct::library_data_t::real_half, nb01 / nb00,
|
|
13481
13422
|
(const void **)(ptrs_src.get() + 1 * ne23),
|
|
13482
|
-
dpct::library_data_t::real_half, nb11 /
|
|
13423
|
+
dpct::library_data_t::real_half, nb11 / nb10, beta,
|
|
13483
13424
|
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
|
|
13484
13425
|
cu_compute_type)));
|
|
13485
13426
|
}
|
|
13486
13427
|
#endif
|
|
13487
13428
|
|
|
13488
|
-
|
|
13489
|
-
|
|
13490
|
-
to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream);
|
|
13491
|
-
}
|
|
13429
|
+
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
|
13430
|
+
to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream);
|
|
13492
13431
|
}
|
|
13493
13432
|
catch (sycl::exception const &exc) {
|
|
13494
13433
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
@@ -13498,11 +13437,11 @@ catch (sycl::exception const &exc) {
|
|
|
13498
13437
|
|
|
13499
13438
|
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
13500
13439
|
const bool all_on_device =
|
|
13501
|
-
(src0->backend ==
|
|
13502
|
-
(src1->backend ==
|
|
13503
|
-
( dst->backend ==
|
|
13440
|
+
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
|
|
13441
|
+
(src1->backend == GGML_BACKEND_TYPE_GPU) &&
|
|
13442
|
+
( dst->backend == GGML_BACKEND_TYPE_GPU);
|
|
13504
13443
|
|
|
13505
|
-
const bool split = src0->backend ==
|
|
13444
|
+
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
|
13506
13445
|
|
|
13507
13446
|
int64_t min_compute_capability = INT_MAX;
|
|
13508
13447
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
|
@@ -13533,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
|
13533
13472
|
// KQV single-batch
|
|
13534
13473
|
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
|
|
13535
13474
|
ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
|
|
13536
|
-
} else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 &&
|
|
13475
|
+
} else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
|
13537
13476
|
// KQ + KQV multi-batch
|
|
13538
|
-
// GGML_SYCL_DEBUG("
|
|
13539
|
-
|
|
13477
|
+
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
|
|
13478
|
+
ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
|
|
13540
13479
|
} else if (src0->type == GGML_TYPE_F32) {
|
|
13541
13480
|
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
|
|
13542
13481
|
ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
|
|
@@ -13631,7 +13570,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
|
|
|
13631
13570
|
GGML_ASSERT(!ggml_is_transposed(src00));
|
|
13632
13571
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
|
13633
13572
|
|
|
13634
|
-
GGML_ASSERT(src00->backend !=
|
|
13573
|
+
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
|
13635
13574
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
13636
13575
|
|
|
13637
13576
|
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
|
|
@@ -13769,7 +13708,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
|
|
13769
13708
|
|
|
13770
13709
|
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
|
13771
13710
|
|
|
13772
|
-
if (ids->backend ==
|
|
13711
|
+
if (ids->backend == GGML_BACKEND_TYPE_GPU) {
|
|
13773
13712
|
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
|
|
13774
13713
|
SYCL_CHECK(CHECK_TRY_ERROR(
|
|
13775
13714
|
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
|
|
@@ -13787,20 +13726,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
|
|
13787
13726
|
ggml_tensor src1_row = *src1;
|
|
13788
13727
|
ggml_tensor dst_row = *dst;
|
|
13789
13728
|
|
|
13790
|
-
src1_row.backend =
|
|
13791
|
-
dst_row.backend =
|
|
13729
|
+
src1_row.backend = GGML_BACKEND_TYPE_GPU;
|
|
13730
|
+
dst_row.backend = GGML_BACKEND_TYPE_GPU;
|
|
13792
13731
|
|
|
13793
13732
|
src1_row.extra = &src1_row_extra;
|
|
13794
13733
|
dst_row.extra = &dst_row_extra;
|
|
13795
13734
|
|
|
13796
|
-
char * src1_original = src1->backend ==
|
|
13735
|
+
char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
|
|
13797
13736
|
(char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
|
|
13798
|
-
char * dst_original = dst->backend ==
|
|
13737
|
+
char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
|
|
13799
13738
|
(char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
|
|
13800
13739
|
|
|
13801
13740
|
if (src1->ne[1] == 1) {
|
|
13802
|
-
GGML_ASSERT(src1->backend ==
|
|
13803
|
-
GGML_ASSERT(dst->backend ==
|
|
13741
|
+
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
|
13742
|
+
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
|
|
13804
13743
|
|
|
13805
13744
|
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
|
13806
13745
|
//int32_t row_id;
|
|
@@ -13882,7 +13821,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
|
|
13882
13821
|
}
|
|
13883
13822
|
}
|
|
13884
13823
|
|
|
13885
|
-
if (dst->backend ==
|
|
13824
|
+
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
13886
13825
|
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
|
|
13887
13826
|
}
|
|
13888
13827
|
}
|
|
@@ -13905,8 +13844,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
|
13905
13844
|
const int64_t ne = ggml_nelements(src0);
|
|
13906
13845
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
|
13907
13846
|
|
|
13908
|
-
GGML_ASSERT(src0->backend ==
|
|
13909
|
-
GGML_ASSERT(src1->backend ==
|
|
13847
|
+
GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
|
|
13848
|
+
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
|
|
13910
13849
|
|
|
13911
13850
|
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
|
13912
13851
|
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
|
@@ -14013,17 +13952,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
|
|
14013
13952
|
memset(extra, 0, sizeof(*extra));
|
|
14014
13953
|
|
|
14015
13954
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
|
14016
|
-
if (backend ==
|
|
13955
|
+
if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
|
|
14017
13956
|
continue;
|
|
14018
13957
|
}
|
|
14019
13958
|
ggml_sycl_set_device(get_device_id_by_index(id));
|
|
14020
13959
|
const dpct::queue_ptr stream = g_syclStreams[id][0];
|
|
14021
13960
|
|
|
14022
13961
|
int64_t row_low, row_high;
|
|
14023
|
-
if (backend ==
|
|
13962
|
+
if (backend == GGML_BACKEND_TYPE_GPU) {
|
|
14024
13963
|
row_low = 0;
|
|
14025
13964
|
row_high = nrows;
|
|
14026
|
-
} else if (backend ==
|
|
13965
|
+
} else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
|
14027
13966
|
const int64_t rounding = get_row_rounding(tensor->type);
|
|
14028
13967
|
|
|
14029
13968
|
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
|
@@ -14072,7 +14011,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
|
|
|
14072
14011
|
|
|
14073
14012
|
extra->data_device[id] = buf;
|
|
14074
14013
|
|
|
14075
|
-
if (backend ==
|
|
14014
|
+
if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
|
14076
14015
|
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
|
14077
14016
|
SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
|
|
14078
14017
|
new sycl::event()));
|
|
@@ -14089,7 +14028,7 @@ catch (sycl::exception const &exc) {
|
|
|
14089
14028
|
}
|
|
14090
14029
|
|
|
14091
14030
|
void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
|
|
14092
|
-
if (!tensor || !tensor->extra || (tensor->backend !=
|
|
14031
|
+
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
|
|
14093
14032
|
return;
|
|
14094
14033
|
}
|
|
14095
14034
|
|
|
@@ -14142,15 +14081,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
|
|
|
14142
14081
|
return;
|
|
14143
14082
|
}
|
|
14144
14083
|
|
|
14145
|
-
tensor->backend =
|
|
14084
|
+
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
|
14146
14085
|
|
|
14147
|
-
if (tensor->src[0] != nullptr && tensor->src[0]->backend ==
|
|
14086
|
+
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
|
|
14148
14087
|
const ggml_op src0_op = tensor->src[0]->op;
|
|
14149
14088
|
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
|
14150
14089
|
ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
|
14151
14090
|
}
|
|
14152
14091
|
}
|
|
14153
|
-
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend ==
|
|
14092
|
+
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
|
|
14154
14093
|
ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
|
14155
14094
|
}
|
|
14156
14095
|
|
|
@@ -14168,7 +14107,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
|
|
|
14168
14107
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
|
14169
14108
|
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
|
|
14170
14109
|
|
|
14171
|
-
if (inplace && (tensor->src[0]->backend ==
|
|
14110
|
+
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
|
|
14172
14111
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
|
14173
14112
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
|
14174
14113
|
size_t offset = 0;
|
|
@@ -14237,7 +14176,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
|
|
|
14237
14176
|
|
|
14238
14177
|
const bool inplace = tensor->view_src != nullptr;
|
|
14239
14178
|
|
|
14240
|
-
if (inplace && (tensor->view_src->backend ==
|
|
14179
|
+
if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
|
|
14241
14180
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
|
14242
14181
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
|
|
14243
14182
|
size_t view_offset = 0;
|
|
@@ -14258,7 +14197,7 @@ catch (sycl::exception const &exc) {
|
|
|
14258
14197
|
}
|
|
14259
14198
|
|
|
14260
14199
|
void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
|
|
14261
|
-
GGML_ASSERT(tensor->backend ==
|
|
14200
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
14262
14201
|
GGML_ASSERT(ggml_is_contiguous(tensor));
|
|
14263
14202
|
|
|
14264
14203
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
|
@@ -14345,9 +14284,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
|
14345
14284
|
if (!g_sycl_loaded) return false;
|
|
14346
14285
|
|
|
14347
14286
|
ggml_sycl_func_t func;
|
|
14348
|
-
const bool any_on_device = tensor->backend ==
|
|
14349
|
-
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend ==
|
|
14350
|
-
|| (tensor->src[1] != nullptr && tensor->src[1]->backend ==
|
|
14287
|
+
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|
|
14288
|
+
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
|
14289
|
+
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
|
14351
14290
|
|
|
14352
14291
|
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
|
|
14353
14292
|
return false;
|
|
@@ -14485,14 +14424,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
|
14485
14424
|
return false;
|
|
14486
14425
|
}
|
|
14487
14426
|
|
|
14488
|
-
if (tensor->src[0] != nullptr && tensor->src[0]->backend ==
|
|
14427
|
+
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
|
|
14489
14428
|
ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
|
|
14490
14429
|
}
|
|
14491
14430
|
|
|
14492
14431
|
if (params->ith != 0) {
|
|
14493
14432
|
return true;
|
|
14494
14433
|
}
|
|
14495
|
-
if (params->type ==
|
|
14434
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
14496
14435
|
return true;
|
|
14497
14436
|
}
|
|
14498
14437
|
func(tensor->src[0], tensor->src[1], tensor);
|
|
@@ -14643,7 +14582,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
|
14643
14582
|
|
|
14644
14583
|
extra->data_device[ctx->device] = tensor->data;
|
|
14645
14584
|
|
|
14646
|
-
tensor->backend =
|
|
14585
|
+
tensor->backend = GGML_BACKEND_TYPE_GPU;
|
|
14647
14586
|
tensor->extra = extra;
|
|
14648
14587
|
|
|
14649
14588
|
if (ggml_is_quantized(tensor->type)) {
|
|
@@ -14674,7 +14613,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
14674
14613
|
ggml_tensor *tensor,
|
|
14675
14614
|
const void *data, size_t offset,
|
|
14676
14615
|
size_t size) try {
|
|
14677
|
-
GGML_ASSERT(tensor->backend ==
|
|
14616
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
14678
14617
|
|
|
14679
14618
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
|
14680
14619
|
|
|
@@ -14699,7 +14638,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
14699
14638
|
const ggml_tensor *tensor,
|
|
14700
14639
|
void *data, size_t offset,
|
|
14701
14640
|
size_t size) try {
|
|
14702
|
-
GGML_ASSERT(tensor->backend ==
|
|
14641
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
14703
14642
|
|
|
14704
14643
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
|
14705
14644
|
|
|
@@ -14768,7 +14707,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
|
|
|
14768
14707
|
static ggml_backend_buffer_t
|
|
14769
14708
|
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
14770
14709
|
size_t size) try {
|
|
14771
|
-
|
|
14710
|
+
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
|
14711
|
+
int device = (int) buft_ctx->device;
|
|
14772
14712
|
|
|
14773
14713
|
ggml_sycl_set_device(device);
|
|
14774
14714
|
int device_index = get_device_index_by_id(device);
|
|
@@ -14846,7 +14786,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|
|
14846
14786
|
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
|
|
14847
14787
|
ggml_backend_sycl_buffer_types[i] = {
|
|
14848
14788
|
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
|
14849
|
-
/* .context = */
|
|
14789
|
+
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
|
|
14850
14790
|
};
|
|
14851
14791
|
}
|
|
14852
14792
|
ggml_backend_sycl_buffer_type_initialized = true;
|
|
@@ -14908,10 +14848,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|
|
14908
14848
|
|
|
14909
14849
|
// backend
|
|
14910
14850
|
|
|
14911
|
-
struct ggml_backend_context_sycl {
|
|
14912
|
-
int device;
|
|
14913
|
-
};
|
|
14914
|
-
|
|
14915
14851
|
static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
|
14916
14852
|
return GGML_SYCL_NAME;
|
|
14917
14853
|
|
|
@@ -14919,14 +14855,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
|
|
14919
14855
|
}
|
|
14920
14856
|
|
|
14921
14857
|
static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
|
14922
|
-
|
|
14858
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
14923
14859
|
|
|
14924
14860
|
delete sycl_ctx;
|
|
14925
14861
|
delete backend;
|
|
14926
14862
|
}
|
|
14927
14863
|
|
|
14928
14864
|
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
|
|
14929
|
-
|
|
14865
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
14930
14866
|
|
|
14931
14867
|
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
|
|
14932
14868
|
}
|
|
@@ -14935,10 +14871,10 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
|
|
14935
14871
|
ggml_tensor *tensor,
|
|
14936
14872
|
const void *data, size_t offset,
|
|
14937
14873
|
size_t size) try {
|
|
14938
|
-
|
|
14874
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
14939
14875
|
|
|
14940
14876
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
|
14941
|
-
GGML_ASSERT(tensor->backend ==
|
|
14877
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
14942
14878
|
|
|
14943
14879
|
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
|
14944
14880
|
(char *)tensor->data + offset, data, size)));
|
|
@@ -14953,10 +14889,10 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
|
|
14953
14889
|
const ggml_tensor *tensor,
|
|
14954
14890
|
void *data, size_t offset,
|
|
14955
14891
|
size_t size) try {
|
|
14956
|
-
|
|
14892
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
14957
14893
|
|
|
14958
14894
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
|
14959
|
-
GGML_ASSERT(tensor->backend ==
|
|
14895
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
|
|
14960
14896
|
|
|
14961
14897
|
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
|
|
14962
14898
|
data, (const char *)tensor->data + offset, size)));
|
|
@@ -14968,7 +14904,7 @@ catch (sycl::exception const &exc) {
|
|
|
14968
14904
|
}
|
|
14969
14905
|
|
|
14970
14906
|
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
|
|
14971
|
-
|
|
14907
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
14972
14908
|
|
|
14973
14909
|
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
|
|
14974
14910
|
|
|
@@ -15004,12 +14940,12 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
|
|
15004
14940
|
}
|
|
15005
14941
|
|
|
15006
14942
|
static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
15007
|
-
|
|
14943
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
15008
14944
|
|
|
15009
14945
|
ggml_sycl_set_main_device(sycl_ctx->device);
|
|
15010
14946
|
|
|
15011
14947
|
ggml_compute_params params = {};
|
|
15012
|
-
params.type =
|
|
14948
|
+
params.type = GGML_TASK_TYPE_COMPUTE;
|
|
15013
14949
|
params.ith = 0;
|
|
15014
14950
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
15015
14951
|
ggml_tensor * node = cgraph->nodes[i];
|
|
@@ -15017,13 +14953,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
|
|
|
15017
14953
|
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
|
15018
14954
|
continue;
|
|
15019
14955
|
|
|
15020
|
-
assert(node->backend ==
|
|
14956
|
+
assert(node->backend == GGML_BACKEND_TYPE_GPU);
|
|
15021
14957
|
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
|
15022
14958
|
assert(node->extra != nullptr);
|
|
15023
14959
|
|
|
15024
14960
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
|
15025
14961
|
if (node->src[j] != nullptr) {
|
|
15026
|
-
assert(node->src[j]->backend ==
|
|
14962
|
+
assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
|
|
15027
14963
|
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
|
|
15028
14964
|
assert(node->src[j]->extra != nullptr);
|
|
15029
14965
|
}
|
|
@@ -15093,6 +15029,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
|
|
|
15093
15029
|
return false;
|
|
15094
15030
|
}
|
|
15095
15031
|
|
|
15032
|
+
if (a->type == GGML_TYPE_IQ1_S) {
|
|
15033
|
+
return false;
|
|
15034
|
+
}
|
|
15035
|
+
if (a->type == GGML_TYPE_IQ3_XXS) {
|
|
15036
|
+
return false;
|
|
15037
|
+
}
|
|
15096
15038
|
if (a->type == GGML_TYPE_IQ2_XXS) {
|
|
15097
15039
|
return false;
|
|
15098
15040
|
}
|
|
@@ -15201,6 +15143,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|
|
15201
15143
|
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
|
15202
15144
|
};
|
|
15203
15145
|
|
|
15146
|
+
static ggml_guid_t ggml_backend_sycl_guid() {
|
|
15147
|
+
static ggml_guid guid = { 0x58, 0x05, 0x13, 0x8f, 0xcd, 0x3a, 0x61, 0x9d, 0xe7, 0xcd, 0x98, 0xa9, 0x03, 0xfd, 0x7c, 0x53 };
|
|
15148
|
+
return &guid;
|
|
15149
|
+
}
|
|
15150
|
+
|
|
15204
15151
|
ggml_backend_t ggml_backend_sycl_init(int device) {
|
|
15205
15152
|
ggml_init_sycl(); // TODO: remove from ggml.c
|
|
15206
15153
|
|
|
@@ -15212,11 +15159,13 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
|
|
15212
15159
|
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
|
15213
15160
|
ggml_sycl_set_main_device(device);
|
|
15214
15161
|
|
|
15215
|
-
|
|
15216
|
-
/* .device = */ device
|
|
15162
|
+
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
|
|
15163
|
+
/* .device = */ device,
|
|
15164
|
+
/* .name = */ GGML_SYCL_NAME + std::to_string(device),
|
|
15217
15165
|
};
|
|
15218
15166
|
|
|
15219
15167
|
ggml_backend_t sycl_backend = new ggml_backend {
|
|
15168
|
+
/* .guid = */ ggml_backend_sycl_guid(),
|
|
15220
15169
|
/* .interface = */ ggml_backend_sycl_interface,
|
|
15221
15170
|
/* .context = */ ctx
|
|
15222
15171
|
};
|
|
@@ -15225,7 +15174,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
|
|
15225
15174
|
}
|
|
15226
15175
|
|
|
15227
15176
|
bool ggml_backend_is_sycl(ggml_backend_t backend) {
|
|
15228
|
-
return backend->
|
|
15177
|
+
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
|
|
15229
15178
|
}
|
|
15230
15179
|
|
|
15231
15180
|
static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
|