llama_cpp 0.12.6 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-quants.h

```diff
@@ -191,6 +191,21 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
+typedef struct {
+    ggml_fp16_t d;
+    uint8_t qs[QK_K/8];
+    uint8_t scales[QK_K/16];
+} block_iq1_s;
+static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+
+// Non-linear quants
+#define QK4_NL 32
+typedef struct {
+    ggml_fp16_t d;
+    uint8_t qs[QK4_NL/2];
+} block_iq4_nl;
+static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
 #ifdef __cplusplus
 extern "C" {
 #endif
```
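The two structs added above fix the on-disk size of the new quant formats, and the static_asserts encode the arithmetic. A stand-alone sketch of that arithmetic, assuming the default super-block size QK_K = 256 and a 16-bit ggml_fp16_t (the typedef and macros below are stand-ins, not the real ggml headers):

```c++
#include <cstdint>
#include <cstdio>

// Stand-ins for the ggml definitions; ggml can also be built with QK_K = 64.
typedef uint16_t ggml_fp16_t;
#define QK_K   256
#define QK4_NL 32

typedef struct {
    ggml_fp16_t d;              // per-block scale (fp16)
    uint8_t qs[QK_K/8];         // 32 bytes of packed quantized data
    uint8_t scales[QK_K/16];    // 16 bytes of per-group scales
} block_iq1_s;

typedef struct {
    ggml_fp16_t d;              // per-block scale (fp16)
    uint8_t qs[QK4_NL/2];       // 16 bytes: two 4-bit indices per byte
} block_iq4_nl;

int main() {
    // 2 + 32 + 16 = 50 bytes per 256 weights -> ~1.56 bits per weight
    std::printf("block_iq1_s  = %zu bytes\n", sizeof(block_iq1_s));
    // 2 + 16      = 18 bytes per 32 weights  -> 4.5 bits per weight
    std::printf("block_iq4_nl = %zu bytes\n", sizeof(block_iq4_nl));
    return 0;
}
```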
```diff
@@ -210,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
 void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
 void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
 void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
+void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
 
 void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
```
```diff
@@ -225,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 
 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
```
```diff
@@ -243,6 +260,8 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
 void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq1_s  (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
```
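The new row-level helpers pair up in the usual ggml way: `quantize_row_iq4_nl` packs a row of floats into `block_iq4_nl` blocks and `dequantize_row_iq4_nl` unpacks them again, using exactly the signatures declared in the hunks above. A minimal round-trip sketch, assuming the row length is a multiple of QK4_NL (32) and that the translation unit is compiled and linked against the vendored ggml sources:

```c++
#include <algorithm>
#include <cmath>
#include <vector>

#include "ggml-quants.h"   // declares the functions and block types shown above

// Quantize a row to IQ4_NL and back, returning the worst-case absolute error.
static float iq4_nl_roundtrip_error(const std::vector<float> & src) {
    const int k = (int) src.size();               // must be a multiple of QK4_NL
    std::vector<block_iq4_nl> blocks(k / QK4_NL); // one block per 32 values
    std::vector<float> out(k);

    quantize_row_iq4_nl(src.data(), blocks.data(), k);   // float -> 4.5 bpw
    dequantize_row_iq4_nl(blocks.data(), out.data(), k); // 4.5 bpw -> float

    float err = 0.0f;
    for (int i = 0; i < k; ++i) {
        err = std::max(err, std::fabs(src[i] - out[i]));
    }
    return err;
}
```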
```diff
@@ -259,6 +278,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 //
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
```
```diff
@@ -266,6 +287,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq1_s  (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
```
```diff
@@ -276,8 +299,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 
-void iq2xs_init_impl(int grid_size);
-void iq2xs_free_impl(int grid_size);
+void iq2xs_init_impl(enum ggml_type type);
+void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
```
data/vendor/tmp/llama.cpp/ggml-sycl.cpp

```diff
@@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
 }
 }
 
-static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
-                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_1 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
-                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
-                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_1 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
-                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK8_0 == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
-                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
-                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
-                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
-                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
-                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
-                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
+template <int qk, int qi, typename block_q_t, int vdr,
+          vec_dot_q_sycl_t vec_dot_q_sycl>
+static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
+    ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
+                vx, vy, dst, ncols, nrows, item_ct1);
+    });
 }
 
 int get_device_index_by_id(int id){
```
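The hunk above collapses ten nearly identical per-type launchers into a single `mul_mat_vec_q_sycl_submitter` template; the dispatch switch in the next hunk then instantiates it with the per-type constants and dot-product kernel. The same pattern, sketched without a SYCL toolchain and with purely illustrative names (none of these are ggml-sycl symbols):

```c++
#include <cstdio>

// Stand-in block types and per-type kernels (think vec_dot_q4_0_q8_1, ...).
struct block_x { int data; };
struct block_y { int data; };
static float dot_x(const void *, const void *) { return 1.0f; }
static float dot_y(const void *, const void *) { return 2.0f; }

using vec_dot_t = float (*)(const void *, const void *);

// One template replaces a family of copy-pasted launcher functions: the
// per-type constants (qk, qi, vdr), block type and kernel become
// compile-time parameters instead of being baked into ten function bodies.
template <int qk, int qi, typename block_t, int vdr, vec_dot_t vec_dot>
static void mul_mat_vec_submitter(const void * vx, const void * vy) {
    std::printf("qk=%d qi=%d vdr=%d sizeof(block)=%zu dot=%.1f\n",
                qk, qi, vdr, sizeof(block_t), vec_dot(vx, vy));
}

int main() {
    // A dispatch switch (as in ggml_sycl_op_mul_mat_vec_q below) simply
    // picks the right instantiation per tensor type.
    mul_mat_vec_submitter<32,  4, block_x, 2, dot_x>(nullptr, nullptr);
    mul_mat_vec_submitter<256, 8, block_y, 4, dot_y>(nullptr, nullptr);
    return 0;
}
```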
```diff
@@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
+    // TODO: support these quantization types
+    GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
+                  src0->type == GGML_TYPE_IQ2_XS ||
+                  src0->type == GGML_TYPE_IQ3_XXS ||
+                  src0->type == GGML_TYPE_IQ1_S));
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
+                                         VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
+                                         VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
+                                         VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
+                                         VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
+                                         VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
+                                         VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
+                                         VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
+                                         VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
+                                         VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
+            mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
+                                         VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
+                src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
```
```diff
@@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
     const int64_t src1_ncols, const int64_t src1_padded_row_size,
     const dpct::queue_ptr &stream) {
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     const int64_t row_diff = row_high - row_low;
 
```
```diff
@@ -14768,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
 static ggml_backend_buffer_t
 ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                            size_t size) try {
-    int device = (int) (intptr_t) buft->context;
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    int device = (int) buft_ctx->device;
 
     ggml_sycl_set_device(device);
     int device_index = get_device_index_by_id(device);
```
```diff
@@ -14846,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
         for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface   = */ ggml_backend_sycl_buffer_type_interface,
-                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+                /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
             };
         }
         ggml_backend_sycl_buffer_type_initialized = true;
```
```diff
@@ -14908,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
 
 // backend
 
-struct ggml_backend_context_sycl {
-    int device;
-};
-
 static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
     return GGML_SYCL_NAME;
 
```
```diff
@@ -14919,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
 }
 
 static void ggml_backend_sycl_free(ggml_backend_t backend) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     delete sycl_ctx;
     delete backend;
 }
 
 static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     return ggml_backend_sycl_buffer_type(sycl_ctx->device);
 }
```
```diff
@@ -14935,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
                                                ggml_tensor *tensor,
                                                const void *data, size_t offset,
                                                size_t size) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
```
```diff
@@ -14953,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
                                                const ggml_tensor *tensor,
                                                void *data, size_t offset,
                                                size_t size) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
```
```diff
@@ -14968,7 +14839,7 @@ catch (sycl::exception const &exc) {
 }
 
 static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
 
```
```diff
@@ -15004,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
 }
 
 static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
 
     ggml_sycl_set_main_device(sycl_ctx->device);
 
```
```diff
@@ -15093,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
                     return false;
                 }
 
+                if (a->type == GGML_TYPE_IQ1_S) {
+                    return false;
+                }
+                if (a->type == GGML_TYPE_IQ3_XXS) {
+                    return false;
+                }
                 if (a->type == GGML_TYPE_IQ2_XXS) {
                     return false;
                 }
```
```diff
@@ -15212,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
     ggml_sycl_set_main_device(device);
 
-    ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
-        /* .device = */ device
+    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
+        /* .device = */ device,
+        /* .name = */ GGML_SYCL_NAME + std::to_string(device),
     };
 
     ggml_backend_t sycl_backend = new ggml_backend {
```