llama_cpp 0.12.6 → 0.12.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
@@ -191,6 +191,21 @@ typedef struct {
|
|
191
191
|
} block_iq3_xxs;
|
192
192
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
193
193
|
|
194
|
+
typedef struct {
|
195
|
+
ggml_fp16_t d;
|
196
|
+
uint8_t qs[QK_K/8];
|
197
|
+
uint8_t scales[QK_K/16];
|
198
|
+
} block_iq1_s;
|
199
|
+
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
200
|
+
|
201
|
+
// Non-linear quants
|
202
|
+
#define QK4_NL 32
|
203
|
+
typedef struct {
|
204
|
+
ggml_fp16_t d;
|
205
|
+
uint8_t qs[QK4_NL/2];
|
206
|
+
} block_iq4_nl;
|
207
|
+
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
208
|
+
|
194
209
|
#ifdef __cplusplus
|
195
210
|
extern "C" {
|
196
211
|
#endif
|
@@ -210,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
|
|
210
225
|
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
211
226
|
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
212
227
|
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
228
|
+
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
213
229
|
|
214
230
|
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
215
231
|
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
@@ -225,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|
225
241
|
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
226
242
|
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
227
243
|
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
244
|
+
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
228
245
|
|
229
246
|
// Dequantization
|
230
247
|
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
@@ -243,6 +260,8 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
|
|
243
260
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
244
261
|
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
245
262
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
263
|
+
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
264
|
+
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
246
265
|
|
247
266
|
// Dot product
|
248
267
|
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
@@ -259,6 +278,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
259
278
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
260
279
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
261
280
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
281
|
+
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
282
|
+
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
262
283
|
|
263
284
|
//
|
264
285
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
@@ -266,6 +287,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
266
287
|
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
267
288
|
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
268
289
|
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
290
|
+
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
291
|
+
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
269
292
|
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
270
293
|
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
271
294
|
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
@@ -276,8 +299,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
|
|
276
299
|
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
277
300
|
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
278
301
|
|
279
|
-
void iq2xs_init_impl(
|
280
|
-
void iq2xs_free_impl(
|
302
|
+
void iq2xs_init_impl(enum ggml_type type);
|
303
|
+
void iq2xs_free_impl(enum ggml_type type);
|
281
304
|
void iq3xs_init_impl(int grid_size);
|
282
305
|
void iq3xs_free_impl(int grid_size);
|
283
306
|
|
@@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
|
9188
9188
|
}
|
9189
9189
|
}
|
9190
9190
|
|
9191
|
-
|
9192
|
-
|
9193
|
-
|
9194
|
-
|
9195
|
-
|
9196
|
-
|
9197
|
-
|
9198
|
-
|
9199
|
-
|
9200
|
-
|
9201
|
-
|
9202
|
-
|
9203
|
-
|
9204
|
-
|
9205
|
-
|
9206
|
-
}
|
9207
|
-
|
9208
|
-
static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
9209
|
-
float *dst, const int ncols,
|
9210
|
-
const int nrows,
|
9211
|
-
dpct::queue_ptr stream) {
|
9212
|
-
GGML_ASSERT(ncols % QK4_1 == 0);
|
9213
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9214
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9215
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9216
|
-
stream->parallel_for(
|
9217
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9218
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9219
|
-
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
|
9220
|
-
vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
|
9221
|
-
item_ct1);
|
9222
|
-
});
|
9223
|
-
}
|
9224
|
-
|
9225
|
-
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
9226
|
-
float *dst, const int ncols,
|
9227
|
-
const int nrows,
|
9228
|
-
dpct::queue_ptr stream) {
|
9229
|
-
GGML_ASSERT(ncols % QK5_0 == 0);
|
9230
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9231
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9232
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9233
|
-
stream->parallel_for(
|
9234
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9235
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9236
|
-
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
|
9237
|
-
vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
|
9238
|
-
item_ct1);
|
9239
|
-
});
|
9240
|
-
}
|
9241
|
-
|
9242
|
-
static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
9243
|
-
float *dst, const int ncols,
|
9244
|
-
const int nrows,
|
9245
|
-
dpct::queue_ptr stream) {
|
9246
|
-
GGML_ASSERT(ncols % QK5_1 == 0);
|
9247
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9248
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9249
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9250
|
-
stream->parallel_for(
|
9251
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9252
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9253
|
-
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
|
9254
|
-
vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
|
9255
|
-
item_ct1);
|
9256
|
-
});
|
9257
|
-
}
|
9258
|
-
|
9259
|
-
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
9260
|
-
float *dst, const int ncols,
|
9261
|
-
const int nrows,
|
9262
|
-
dpct::queue_ptr stream) {
|
9263
|
-
GGML_ASSERT(ncols % QK8_0 == 0);
|
9264
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9265
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9266
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9267
|
-
stream->parallel_for(
|
9268
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9269
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9270
|
-
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
|
9271
|
-
vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
|
9272
|
-
item_ct1);
|
9273
|
-
});
|
9274
|
-
}
|
9275
|
-
|
9276
|
-
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
9277
|
-
float *dst, const int ncols,
|
9278
|
-
const int nrows,
|
9279
|
-
dpct::queue_ptr stream) {
|
9280
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9281
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9282
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9283
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9284
|
-
stream->parallel_for(
|
9285
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9286
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9287
|
-
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
|
9288
|
-
vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9289
|
-
item_ct1);
|
9290
|
-
});
|
9291
|
-
}
|
9292
|
-
|
9293
|
-
static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
9294
|
-
float *dst, const int ncols,
|
9295
|
-
const int nrows,
|
9296
|
-
dpct::queue_ptr stream) {
|
9297
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9298
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9299
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9300
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9301
|
-
stream->parallel_for(
|
9302
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9303
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9304
|
-
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
|
9305
|
-
vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9306
|
-
item_ct1);
|
9307
|
-
});
|
9308
|
-
}
|
9309
|
-
|
9310
|
-
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
9311
|
-
float *dst, const int ncols,
|
9312
|
-
const int nrows,
|
9313
|
-
dpct::queue_ptr stream) {
|
9314
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9315
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9316
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9317
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9318
|
-
stream->parallel_for(
|
9319
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9320
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9321
|
-
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
|
9322
|
-
vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9323
|
-
item_ct1);
|
9324
|
-
});
|
9325
|
-
}
|
9326
|
-
|
9327
|
-
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
9328
|
-
float *dst, const int ncols,
|
9329
|
-
const int nrows,
|
9330
|
-
dpct::queue_ptr stream) {
|
9331
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9332
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9333
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9334
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9335
|
-
stream->parallel_for(
|
9336
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9337
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9338
|
-
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
|
9339
|
-
vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9340
|
-
item_ct1);
|
9341
|
-
});
|
9342
|
-
}
|
9343
|
-
|
9344
|
-
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
9345
|
-
float *dst, const int ncols,
|
9346
|
-
const int nrows,
|
9347
|
-
dpct::queue_ptr stream) {
|
9348
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
9349
|
-
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9350
|
-
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9351
|
-
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9352
|
-
stream->parallel_for(
|
9353
|
-
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
9354
|
-
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9355
|
-
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
|
9356
|
-
vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
|
9357
|
-
item_ct1);
|
9358
|
-
});
|
9191
|
+
template <int qk, int qi, typename block_q_t, int vdr,
|
9192
|
+
vec_dot_q_sycl_t vec_dot_q_sycl>
|
9193
|
+
static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
|
9194
|
+
float *dst, const int ncols,
|
9195
|
+
const int nrows,
|
9196
|
+
dpct::queue_ptr stream) {
|
9197
|
+
GGML_ASSERT(ncols % QK4_0 == 0);
|
9198
|
+
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
9199
|
+
const sycl::range<3> block_nums(1, 1, block_num_y);
|
9200
|
+
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
9201
|
+
stream->parallel_for(
|
9202
|
+
sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
|
9203
|
+
](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
9204
|
+
mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
|
9205
|
+
vx, vy, dst, ncols, nrows, item_ct1);
|
9206
|
+
});
|
9359
9207
|
}
|
9360
9208
|
|
9361
9209
|
int get_device_index_by_id(int id){
|
@@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
|
12095
11943
|
const int64_t ne00 = src0->ne[0];
|
12096
11944
|
const int64_t row_diff = row_high - row_low;
|
12097
11945
|
|
11946
|
+
// TODO: support these quantization types
|
11947
|
+
GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
|
11948
|
+
src0->type == GGML_TYPE_IQ2_XS ||
|
11949
|
+
src0->type == GGML_TYPE_IQ3_XXS ||
|
11950
|
+
src0->type == GGML_TYPE_IQ1_S));
|
11951
|
+
|
12098
11952
|
switch (src0->type) {
|
12099
11953
|
case GGML_TYPE_Q4_0:
|
12100
|
-
|
12101
|
-
|
11954
|
+
mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
|
11955
|
+
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
11956
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11957
|
+
break;
|
12102
11958
|
case GGML_TYPE_Q4_1:
|
12103
|
-
|
12104
|
-
|
11959
|
+
mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
|
11960
|
+
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
11961
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11962
|
+
break;
|
12105
11963
|
case GGML_TYPE_Q5_0:
|
12106
|
-
|
12107
|
-
|
11964
|
+
mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
|
11965
|
+
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
11966
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11967
|
+
break;
|
12108
11968
|
case GGML_TYPE_Q5_1:
|
12109
|
-
|
12110
|
-
|
11969
|
+
mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
|
11970
|
+
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
11971
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11972
|
+
break;
|
12111
11973
|
case GGML_TYPE_Q8_0:
|
12112
|
-
|
12113
|
-
|
11974
|
+
mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
|
11975
|
+
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
11976
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11977
|
+
break;
|
12114
11978
|
case GGML_TYPE_Q2_K:
|
12115
|
-
|
12116
|
-
|
11979
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
|
11980
|
+
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
11981
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11982
|
+
break;
|
12117
11983
|
case GGML_TYPE_Q3_K:
|
12118
|
-
|
12119
|
-
|
11984
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
|
11985
|
+
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
11986
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11987
|
+
break;
|
12120
11988
|
case GGML_TYPE_Q4_K:
|
12121
|
-
|
12122
|
-
|
11989
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
|
11990
|
+
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
11991
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11992
|
+
break;
|
12123
11993
|
case GGML_TYPE_Q5_K:
|
12124
|
-
|
12125
|
-
|
11994
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
|
11995
|
+
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
11996
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
11997
|
+
break;
|
12126
11998
|
case GGML_TYPE_Q6_K:
|
12127
|
-
|
12128
|
-
|
11999
|
+
mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
|
12000
|
+
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
12001
|
+
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
12002
|
+
break;
|
12129
12003
|
default:
|
12130
12004
|
GGML_ASSERT(false);
|
12131
12005
|
break;
|
@@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
|
12145
12019
|
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
12146
12020
|
const dpct::queue_ptr &stream) {
|
12147
12021
|
|
12148
|
-
GGML_TENSOR_BINARY_OP_LOCALS
|
12022
|
+
GGML_TENSOR_BINARY_OP_LOCALS;
|
12149
12023
|
|
12150
12024
|
const int64_t row_diff = row_high - row_low;
|
12151
12025
|
|
@@ -14768,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
|
|
14768
14642
|
static ggml_backend_buffer_t
|
14769
14643
|
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
14770
14644
|
size_t size) try {
|
14771
|
-
|
14645
|
+
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
14646
|
+
int device = (int) buft_ctx->device;
|
14772
14647
|
|
14773
14648
|
ggml_sycl_set_device(device);
|
14774
14649
|
int device_index = get_device_index_by_id(device);
|
@@ -14846,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|
14846
14721
|
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
|
14847
14722
|
ggml_backend_sycl_buffer_types[i] = {
|
14848
14723
|
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
14849
|
-
/* .context = */
|
14724
|
+
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
|
14850
14725
|
};
|
14851
14726
|
}
|
14852
14727
|
ggml_backend_sycl_buffer_type_initialized = true;
|
@@ -14908,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|
14908
14783
|
|
14909
14784
|
// backend
|
14910
14785
|
|
14911
|
-
struct ggml_backend_context_sycl {
|
14912
|
-
int device;
|
14913
|
-
};
|
14914
|
-
|
14915
14786
|
static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
14916
14787
|
return GGML_SYCL_NAME;
|
14917
14788
|
|
@@ -14919,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
|
14919
14790
|
}
|
14920
14791
|
|
14921
14792
|
// Releases a SYCL backend: deletes the ggml_backend_sycl_context stored in
// backend->context first, then the backend object itself.
static void ggml_backend_sycl_free(ggml_backend_t backend) {
    delete (ggml_backend_sycl_context *) backend->context;
    delete backend;
}
|
14927
14798
|
|
14928
14799
|
// Returns the SYCL buffer type for the device recorded in this backend's
// context (backend->context is a ggml_backend_sycl_context).
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
    return ggml_backend_sycl_buffer_type(
        ((ggml_backend_sycl_context *) backend->context)->device);
}
|
@@ -14935,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
|
14935
14806
|
ggml_tensor *tensor,
|
14936
14807
|
const void *data, size_t offset,
|
14937
14808
|
size_t size) try {
|
14938
|
-
|
14809
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
14939
14810
|
|
14940
14811
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
14941
14812
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
@@ -14953,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
|
14953
14824
|
const ggml_tensor *tensor,
|
14954
14825
|
void *data, size_t offset,
|
14955
14826
|
size_t size) try {
|
14956
|
-
|
14827
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
14957
14828
|
|
14958
14829
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
14959
14830
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
@@ -14968,7 +14839,7 @@ catch (sycl::exception const &exc) {
|
|
14968
14839
|
}
|
14969
14840
|
|
14970
14841
|
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
|
14971
|
-
|
14842
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
14972
14843
|
|
14973
14844
|
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
|
14974
14845
|
|
@@ -15004,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
|
15004
14875
|
}
|
15005
14876
|
|
15006
14877
|
static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
15007
|
-
|
14878
|
+
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
15008
14879
|
|
15009
14880
|
ggml_sycl_set_main_device(sycl_ctx->device);
|
15010
14881
|
|
@@ -15093,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
|
|
15093
14964
|
return false;
|
15094
14965
|
}
|
15095
14966
|
|
14967
|
+
if (a->type == GGML_TYPE_IQ1_S) {
|
14968
|
+
return false;
|
14969
|
+
}
|
14970
|
+
if (a->type == GGML_TYPE_IQ3_XXS) {
|
14971
|
+
return false;
|
14972
|
+
}
|
15096
14973
|
if (a->type == GGML_TYPE_IQ2_XXS) {
|
15097
14974
|
return false;
|
15098
14975
|
}
|
@@ -15212,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
|
15212
15089
|
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
15213
15090
|
ggml_sycl_set_main_device(device);
|
15214
15091
|
|
15215
|
-
|
15216
|
-
/* .device = */ device
|
15092
|
+
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
|
15093
|
+
/* .device = */ device,
|
15094
|
+
/* .name = */ GGML_SYCL_NAME + std::to_string(device),
|
15217
15095
|
};
|
15218
15096
|
|
15219
15097
|
ggml_backend_t sycl_backend = new ggml_backend {
|