llama_cpp 0.12.5 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
@@ -191,6 +191,21 @@ typedef struct {
|
|
191
191
|
} block_iq3_xxs;
|
192
192
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
193
193
|
|
194
|
+
// Storage block for the iq1_s quantization type: one fp16 value followed by
// two byte arrays whose lengths are derived from QK_K.
typedef struct {
|
195
|
+
ggml_fp16_t d; // presumably the block scale factor -- TODO confirm against the (de)quantization code
|
196
|
+
uint8_t qs[QK_K/8]; // QK_K/8 bytes; presumably the packed quantized values -- encoding not visible here
|
197
|
+
uint8_t scales[QK_K/16]; // QK_K/16 bytes; presumably per-group scales -- TODO confirm
|
198
|
+
} block_iq1_s;
|
199
|
+
// Compile-time check that the struct is exactly the sum of its three members (no padding).
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
200
|
+
|
201
|
+
// Non-linear quants
|
202
|
+
#define QK4_NL 32
|
203
|
+
// Storage block for the iq4_nl ("non-linear") quantization type: one fp16
// value followed by QK4_NL/2 bytes.
typedef struct {
|
204
|
+
ggml_fp16_t d; // presumably the block scale factor -- TODO confirm against the (de)quantization code
|
205
|
+
uint8_t qs[QK4_NL/2]; // QK4_NL/2 bytes; presumably two 4-bit values per byte -- TODO confirm
|
206
|
+
} block_iq4_nl;
|
207
|
+
// Compile-time check that the struct is exactly the sum of its two members (no padding).
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
208
|
+
|
194
209
|
#ifdef __cplusplus
|
195
210
|
extern "C" {
|
196
211
|
#endif
|
@@ -210,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
|
|
210
225
|
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
211
226
|
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
212
227
|
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
228
|
+
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
213
229
|
|
214
230
|
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
215
231
|
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
@@ -225,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|
225
241
|
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
226
242
|
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
227
243
|
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
244
|
+
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
228
245
|
|
229
246
|
// Dequantization
|
230
247
|
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
@@ -243,22 +260,26 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
|
|
243
260
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
244
261
|
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
245
262
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
263
|
+
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
264
|
+
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
246
265
|
|
247
266
|
// Dot product
|
248
|
-
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
249
|
-
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
250
|
-
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
251
|
-
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
252
|
-
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
253
|
-
|
254
|
-
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
255
|
-
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
256
|
-
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
257
|
-
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
258
|
-
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
259
|
-
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
260
|
-
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
261
|
-
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
267
|
+
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
268
|
+
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
269
|
+
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
270
|
+
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
271
|
+
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
272
|
+
|
273
|
+
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
274
|
+
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
275
|
+
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
276
|
+
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
277
|
+
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
278
|
+
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
279
|
+
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
280
|
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
281
|
+
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
282
|
+
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
262
283
|
|
263
284
|
//
|
264
285
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
@@ -266,6 +287,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML
|
|
266
287
|
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
267
288
|
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
268
289
|
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
290
|
+
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
291
|
+
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
269
292
|
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
270
293
|
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
271
294
|
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
@@ -276,8 +299,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
|
|
276
299
|
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
277
300
|
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
278
301
|
|
279
|
-
void iq2xs_init_impl(
|
280
|
-
void iq2xs_free_impl(
|
302
|
+
void iq2xs_init_impl(enum ggml_type type);
|
303
|
+
void iq2xs_free_impl(enum ggml_type type);
|
281
304
|
void iq3xs_init_impl(int grid_size);
|
282
305
|
void iq3xs_free_impl(int grid_size);
|
283
306
|
|
@@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
|
9188
9188
|
}
|
9189
9189
|
}
|
9190
9190
|
|
9191
|
-
|
9192
|
-
|
9193
|
-
|
9194
|
-
|
9195
|
-
|
9196
|
-
|
9197
|
-
|
9198
|
-
|
9199
|
-
|
9200
|
-
|
9201
|
-
|
9202
|
-
|
9203
|
-
|
9204
|
-
|
9205
|
-
|
9206
|
-
}
|
9207
|
-
|
9208
|
-
static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
9209
|
-
float *dst, const int ncols,
|
9210
|
-
const int nrows,
|
9211
|
-
dpct::queue_ptr stream) {
|
9212
|
-
GGML_ASSERT(ncols % QK4_1 == 0);
|
9213
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9214
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9215
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9216
|
-
stream->parallel_for(
|
9217
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9218
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9219
|
-
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
|
9220
|
-
vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
|
9221
|
-
item_ct1);
|
9222
|
-
});
|
9223
|
-
}
|
9224
|
-
|
9225
|
-
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
9226
|
-
float *dst, const int ncols,
|
9227
|
-
const int nrows,
|
9228
|
-
dpct::queue_ptr stream) {
|
9229
|
-
GGML_ASSERT(ncols % QK5_0 == 0);
|
9230
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9231
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9232
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9233
|
-
stream->parallel_for(
|
9234
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9235
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9236
|
-
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
|
9237
|
-
vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
|
9238
|
-
item_ct1);
|
9239
|
-
});
|
9240
|
-
}
|
9241
|
-
|
9242
|
-
static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
9243
|
-
float *dst, const int ncols,
|
9244
|
-
const int nrows,
|
9245
|
-
dpct::queue_ptr stream) {
|
9246
|
-
GGML_ASSERT(ncols % QK5_1 == 0);
|
9247
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9248
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9249
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9250
|
-
stream->parallel_for(
|
9251
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9252
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9253
|
-
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
|
9254
|
-
vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
|
9255
|
-
item_ct1);
|
9256
|
-
});
|
9257
|
-
}
|
9258
|
-
|
9259
|
-
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
9260
|
-
float *dst, const int ncols,
|
9261
|
-
const int nrows,
|
9262
|
-
dpct::queue_ptr stream) {
|
9263
|
-
GGML_ASSERT(ncols % QK8_0 == 0);
|
9264
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9265
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9266
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9267
|
-
stream->parallel_for(
|
9268
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9269
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9270
|
-
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
|
9271
|
-
vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
|
9272
|
-
item_ct1);
|
9273
|
-
});
|
9274
|
-
}
|
9275
|
-
|
9276
|
-
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
9277
|
-
float *dst, const int ncols,
|
9278
|
-
const int nrows,
|
9279
|
-
dpct::queue_ptr stream) {
|
9280
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9281
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9282
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9283
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9284
|
-
stream->parallel_for(
|
9285
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9286
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9287
|
-
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
|
9288
|
-
vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9289
|
-
item_ct1);
|
9290
|
-
});
|
9291
|
-
}
|
9292
|
-
|
9293
|
-
static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
9294
|
-
float *dst, const int ncols,
|
9295
|
-
const int nrows,
|
9296
|
-
dpct::queue_ptr stream) {
|
9297
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9298
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9299
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9300
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9301
|
-
stream->parallel_for(
|
9302
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9303
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9304
|
-
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
|
9305
|
-
vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9306
|
-
item_ct1);
|
9307
|
-
});
|
9308
|
-
}
|
9309
|
-
|
9310
|
-
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
9311
|
-
float *dst, const int ncols,
|
9312
|
-
const int nrows,
|
9313
|
-
dpct::queue_ptr stream) {
|
9314
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9315
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9316
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9317
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9318
|
-
stream->parallel_for(
|
9319
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9320
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9321
|
-
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
|
9322
|
-
vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9323
|
-
item_ct1);
|
9324
|
-
});
|
9325
|
-
}
|
9326
|
-
|
9327
|
-
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
9328
|
-
float *dst, const int ncols,
|
9329
|
-
const int nrows,
|
9330
|
-
dpct::queue_ptr stream) {
|
9331
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9332
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9333
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9334
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9335
|
-
stream->parallel_for(
|
9336
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9337
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9338
|
-
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
|
9339
|
-
vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9340
|
-
item_ct1);
|
9341
|
-
});
|
9342
|
-
}
|
9343
|
-
|
9344
|
-
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
9345
|
-
float *dst, const int ncols,
|
9346
|
-
const int nrows,
|
9347
|
-
dpct::queue_ptr stream) {
|
9348
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9349
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9350
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9351
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9352
|
-
stream->parallel_for(
|
9353
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9354
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9355
|
-
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
|
9356
|
-
vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9357
|
-
item_ct1);
|
9358
|
-
});
|
9191
|
+
template <int qk, int qi, typename block_q_t, int vdr,
|
9192
|
+
vec_dot_q_sycl_t vec_dot_q_sycl>
|
9193
|
+
static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
|
9194
|
+
float *dst, const int ncols,
|
9195
|
+
const int nrows,
|
9196
|
+
dpct::queue_ptr stream) {
|
9197
|
+
GGML_ASSERT(ncols % QK4_0 == 0);
|
9198
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9199
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9200
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9201
|
+
stream->parallel_for(
|
9202
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
|
9203
|
+
](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9204
|
+
mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
|
9205
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
9206
|
+
});
|
9359
9207
|
}
|
9360
9208
|
|
9361
9209
|
int get_device_index_by_id(int id){
|
@@ -11578,11 +11426,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
|
|
11578
11426
|
}
|
11579
11427
|
char * dst_ptr = (char *) dst;
|
11580
11428
|
|
11581
|
-
|
11582
|
-
|
11583
|
-
const int64_t nb1 = src->nb[1];
|
11584
|
-
const int64_t nb2 = src->nb[2];
|
11585
|
-
const int64_t nb3 = src->nb[3];
|
11429
|
+
GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
|
11430
|
+
GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
|
11586
11431
|
const enum ggml_type type = src->type;
|
11587
11432
|
const int64_t ts = ggml_type_size(type);
|
11588
11433
|
const int64_t bs = ggml_blck_size(type);
|
@@ -12098,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
12098
11943
|
const int64_t ne00 = src0->ne[0];
|
12099
11944
|
const int64_t row_diff = row_high - row_low;
|
12100
11945
|
|
11946
|
+
// TODO: support these quantization types
|
11947
|
+
GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
|
11948
|
+
src0->type == GGML_TYPE_IQ2_XS ||
|
11949
|
+
src0->type == GGML_TYPE_IQ3_XXS ||
|
11950
|
+
src0->type == GGML_TYPE_IQ1_S));
|
11951
|
+
|
12101
11952
|
switch (src0->type) {
|
12102
11953
|
case GGML_TYPE_Q4_0:
|
12103
|
-
|
12104
|
-
|
11954
|
+
mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
|
11955
|
+
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
11956
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11957
|
+
break;
|
12105
11958
|
case GGML_TYPE_Q4_1:
|
12106
|
-
|
12107
|
-
|
11959
|
+
mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
|
11960
|
+
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
11961
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11962
|
+
break;
|
12108
11963
|
case GGML_TYPE_Q5_0:
|
12109
|
-
|
12110
|
-
|
11964
|
+
mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
|
11965
|
+
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
11966
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11967
|
+
break;
|
12111
11968
|
case GGML_TYPE_Q5_1:
|
12112
|
-
|
12113
|
-
|
11969
|
+
mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
|
11970
|
+
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
11971
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11972
|
+
break;
|
12114
11973
|
case GGML_TYPE_Q8_0:
|
12115
|
-
|
12116
|
-
|
11974
|
+
mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
|
11975
|
+
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
11976
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11977
|
+
break;
|
12117
11978
|
case GGML_TYPE_Q2_K:
|
12118
|
-
|
12119
|
-
|
11979
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
|
11980
|
+
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
11981
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11982
|
+
break;
|
12120
11983
|
case GGML_TYPE_Q3_K:
|
12121
|
-
|
12122
|
-
|
11984
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
|
11985
|
+
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
11986
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11987
|
+
break;
|
12123
11988
|
case GGML_TYPE_Q4_K:
|
12124
|
-
|
12125
|
-
|
11989
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
|
11990
|
+
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
11991
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11992
|
+
break;
|
12126
11993
|
case GGML_TYPE_Q5_K:
|
12127
|
-
|
12128
|
-
|
11994
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
|
11995
|
+
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
11996
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11997
|
+
break;
|
12129
11998
|
case GGML_TYPE_Q6_K:
|
12130
|
-
|
12131
|
-
|
11999
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
|
12000
|
+
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
12001
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
12002
|
+
break;
|
12132
12003
|
default:
|
12133
12004
|
GGML_ASSERT(false);
|
12134
12005
|
break;
|
@@ -12148,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
12148
12019
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
12149
12020
|
const dpct::queue_ptr &stream) {
|
12150
12021
|
|
12151
|
-
GGML_TENSOR_BINARY_OP_LOCALS
|
12022
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
12152
12023
|
|
12153
12024
|
const int64_t row_diff = row_high - row_low;
|
12154
12025
|
|
@@ -12426,9 +12297,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
12426
12297
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
12427
12298
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
12428
12299
|
|
12429
|
-
|
12430
|
-
const int64_t ne01 = src0->ne[1];
|
12431
|
-
const int64_t ne02 = src0->ne[2];
|
12300
|
+
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
|
12432
12301
|
const int64_t nrows = ggml_nrows(src0);
|
12433
12302
|
|
12434
12303
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
@@ -12758,15 +12627,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
|
12758
12627
|
ggml_sycl_op_mul_mat_t op,
|
12759
12628
|
const bool convert_src1_to_q8_1) try {
|
12760
12629
|
|
12761
|
-
|
12762
|
-
const int64_t ne01 = src0->ne[1];
|
12763
|
-
const int64_t ne02 = src0->ne[2];
|
12764
|
-
const int64_t ne03 = src0->ne[3];
|
12630
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
12765
12631
|
|
12766
|
-
|
12767
|
-
const int64_t ne11 = src1->ne[1];
|
12768
|
-
const int64_t ne12 = src1->ne[2];
|
12769
|
-
const int64_t ne13 = src1->ne[3];
|
12632
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
12770
12633
|
const int64_t nrows1 = ggml_nrows(src1);
|
12771
12634
|
|
12772
12635
|
GGML_ASSERT(ne03 == ne13);
|
@@ -13337,23 +13200,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
|
|
13337
13200
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
13338
13201
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
13339
13202
|
|
13340
|
-
|
13341
|
-
const int64_t ne01 = src0->ne[1];
|
13342
|
-
const int64_t ne02 = src0->ne[2];
|
13343
|
-
const int64_t ne03 = src0->ne[3];
|
13203
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
|
13344
13204
|
|
13345
|
-
|
13346
|
-
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
13347
|
-
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
13205
|
+
GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
|
13348
13206
|
|
13349
|
-
|
13350
|
-
const int64_t ne11 = src1->ne[1];
|
13351
|
-
const int64_t ne12 = src1->ne[2];
|
13352
|
-
const int64_t ne13 = src1->ne[3];
|
13207
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
13353
13208
|
|
13354
|
-
|
13355
|
-
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
13356
|
-
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
13209
|
+
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
|
13357
13210
|
|
13358
13211
|
const int64_t ne1 = ggml_nelements(src1);
|
13359
13212
|
const int64_t ne = ggml_nelements(dst);
|
@@ -13655,23 +13508,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
|
|
13655
13508
|
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
|
13656
13509
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
13657
13510
|
|
13658
|
-
|
13659
|
-
const int64_t ne01 = src00->ne[1];
|
13660
|
-
const int64_t ne02 = src00->ne[2];
|
13661
|
-
const int64_t ne03 = src00->ne[3];
|
13511
|
+
GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
|
13662
13512
|
|
13663
13513
|
//const int64_t nb01 = src00->nb[1];
|
13664
|
-
|
13665
|
-
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
|
13514
|
+
GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
|
13666
13515
|
|
13667
|
-
|
13668
|
-
const int64_t ne11 = src1->ne[1];
|
13669
|
-
const int64_t ne12 = src1->ne[2];
|
13670
|
-
const int64_t ne13 = src1->ne[3];
|
13516
|
+
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
13671
13517
|
|
13518
|
+
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
|
13672
13519
|
//const int64_t nb11 = src1->nb[1];
|
13673
|
-
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
13674
|
-
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
13675
13520
|
|
13676
13521
|
const int64_t ne1 = ggml_nelements(src1);
|
13677
13522
|
const int64_t ne = ggml_nelements(dst);
|
@@ -13940,25 +13785,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
13940
13785
|
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
|
13941
13786
|
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
|
13942
13787
|
|
13943
|
-
|
13944
|
-
const int64_t ne01 = src0->ne[1];
|
13945
|
-
const int64_t ne02 = src0->ne[2];
|
13946
|
-
|
13947
|
-
|
13948
|
-
const int64_t nb00 = src0->nb[0];
|
13949
|
-
const int64_t nb01 = src0->nb[1];
|
13950
|
-
const int64_t nb02 = src0->nb[2];
|
13951
|
-
const int64_t nb03 = src0->nb[3];
|
13952
|
-
|
13953
|
-
const int64_t ne10 = src1->ne[0];
|
13954
|
-
const int64_t ne11 = src1->ne[1];
|
13955
|
-
const int64_t ne12 = src1->ne[2];
|
13956
|
-
|
13957
|
-
|
13958
|
-
const int64_t nb10 = src1->nb[0];
|
13959
|
-
const int64_t nb11 = src1->nb[1];
|
13960
|
-
const int64_t nb12 = src1->nb[2];
|
13961
|
-
const int64_t nb13 = src1->nb[3];
|
13788
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
13962
13789
|
|
13963
13790
|
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
13964
13791
|
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
@@ -14815,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
|
|
14815
14642
|
static ggml_backend_buffer_t
|
14816
14643
|
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
14817
14644
|
size_t size) try {
|
14818
|
-
|
14645
|
+
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
14646
|
+
int device = (int) buft_ctx->device;
|
14819
14647
|
|
14820
14648
|
ggml_sycl_set_device(device);
|
14821
14649
|
int device_index = get_device_index_by_id(device);
|
@@ -14893,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|
14893
14721
|
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
|
14894
14722
|
ggml_backend_sycl_buffer_types[i] = {
|
14895
14723
|
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
14896
|
-
/* .context = */
|
14724
|
+
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
|
14897
14725
|
};
|
14898
14726
|
}
|
14899
14727
|
ggml_backend_sycl_buffer_type_initialized = true;
|
@@ -14955,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|
14955
14783
|
|
14956
14784
|
// backend
|
14957
14785
|
|
14958
|
-
struct ggml_backend_context_sycl {
|
14959
|
-
int device;
|
14960
|
-
};
|
14961
|
-
|
14962
14786
|
static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
14963
14787
|
return GGML_SYCL_NAME;
|
14964
14788
|
|
@@ -14966,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
|
14966
14790
|
}
|
14967
14791
|
|
14968
14792
|
// Tears down a SYCL backend: deletes the context object stored in
// backend->context first, then the backend object itself.
static void ggml_backend_sycl_free(ggml_backend_t backend) {
    ggml_backend_sycl_context * ctx =
        (ggml_backend_sycl_context *) backend->context;
    delete ctx;
    delete backend;
}
|
14974
14798
|
|
14975
14799
|
// Returns the SYCL buffer type for the device recorded in this backend's
// context (backend->context is read as a ggml_backend_sycl_context).
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
    ggml_backend_sycl_context * ctx =
        (ggml_backend_sycl_context *) backend->context;
    return ggml_backend_sycl_buffer_type(ctx->device);
}
|
@@ -14982,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
|
14982
14806
|
ggml_tensor *tensor,
|
14983
14807
|
const void *data, size_t offset,
|
14984
14808
|
size_t size) try {
|
14985
|
-
|
14809
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
14986
14810
|
|
14987
14811
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
14988
14812
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
@@ -15000,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
|
15000
14824
|
const ggml_tensor *tensor,
|
15001
14825
|
void *data, size_t offset,
|
15002
14826
|
size_t size) try {
|
15003
|
-
|
14827
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
15004
14828
|
|
15005
14829
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
15006
14830
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
@@ -15015,7 +14839,7 @@ catch (sycl::exception const &exc) {
|
|
15015
14839
|
}
|
15016
14840
|
|
15017
14841
|
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
|
15018
|
-
|
14842
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
15019
14843
|
|
15020
14844
|
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
|
15021
14845
|
|
@@ -15051,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
|
15051
14875
|
}
|
15052
14876
|
|
15053
14877
|
static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
15054
|
-
|
14878
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
15055
14879
|
|
15056
14880
|
ggml_sycl_set_main_device(sycl_ctx->device);
|
15057
14881
|
|
@@ -15140,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
|
|
15140
14964
|
return false;
|
15141
14965
|
}
|
15142
14966
|
|
14967
|
+
if (a->type == GGML_TYPE_IQ1_S) {
|
14968
|
+
return false;
|
14969
|
+
}
|
14970
|
+
if (a->type == GGML_TYPE_IQ3_XXS) {
|
14971
|
+
return false;
|
14972
|
+
}
|
15143
14973
|
if (a->type == GGML_TYPE_IQ2_XXS) {
|
15144
14974
|
return false;
|
15145
14975
|
}
|
@@ -15259,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
|
15259
15089
|
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
15260
15090
|
ggml_sycl_set_main_device(device);
|
15261
15091
|
|
15262
|
-
|
15263
|
-
/* .device = */ device
|
15092
|
+
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
|
15093
|
+
/* .device = */ device,
|
15094
|
+
/* .name = */ GGML_SYCL_NAME + std::to_string(device),
|
15264
15095
|
};
|
15265
15096
|
|
15266
15097
|
ggml_backend_t sycl_backend = new ggml_backend {
|