llama_cpp 0.15.3 → 0.15.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +4 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +27 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +65 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +69 -27
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +338 -160
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +2 -0
- data/vendor/tmp/llama.cpp/ggml.c +145 -101
- data/vendor/tmp/llama.cpp/ggml.h +18 -3
- data/vendor/tmp/llama.cpp/llama.cpp +637 -249
- data/vendor/tmp/llama.cpp/llama.h +11 -5
- metadata +2 -2
```diff
@@ -2944,6 +2944,57 @@ namespace dpct
     using shared_memory = detail::device_memory<T, shared, Dimension>;
 
 
+    template <typename T,
+              sycl::access::address_space addressSpace =
+                  sycl::access::address_space::global_space,
+              sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+              sycl::memory_scope memoryScope = sycl::memory_scope::device>
+    inline T atomic_fetch_add(T *addr, T operand) {
+      auto atm =
+          sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+      return atm.fetch_add(operand);
+    }
+
+    template <sycl::access::address_space addressSpace =
+                  sycl::access::address_space::global_space,
+              sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+              sycl::memory_scope memoryScope = sycl::memory_scope::device,
+              typename T1, typename T2>
+    inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
+      auto atm =
+          sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+      return atm.fetch_add(operand);
+    }
+
+    template <typename T, sycl::access::address_space addressSpace =
+                              sycl::access::address_space::global_space>
+    inline T atomic_fetch_add(T *addr, T operand,
+                              sycl::memory_order memoryOrder) {
+      switch (memoryOrder) {
+      case sycl::memory_order::relaxed:
+        return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
+                                sycl::memory_scope::device>(addr, operand);
+      case sycl::memory_order::acq_rel:
+        return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
+                                sycl::memory_scope::device>(addr, operand);
+      case sycl::memory_order::seq_cst:
+        return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
+                                sycl::memory_scope::device>(addr, operand);
+      default:
+        assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                        "atomics are: sycl::memory_order::relaxed, "
+                        "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+      }
+    }
+
+    template <sycl::access::address_space addressSpace =
+                  sycl::access::address_space::global_space,
+              typename T1, typename T2>
+    inline T1 atomic_fetch_add(T1 *addr, T2 operand,
+                               sycl::memory_order memoryOrder) {
+      atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
+    }
+
 } // COPY from DPCT head files
 
 #define GGML_COMMON_DECL_SYCL
```
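The four overloads added above all reduce to one `sycl::atomic_ref` pattern. A minimal standalone sketch of that pattern, assuming a SYCL 2020 toolchain (e.g. `icpx -fsycl`); the counter harness is illustrative and not code from this package:

```cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
    sycl::queue q;
    int *counter = sycl::malloc_shared<int>(1, q);
    *counter = 0;
    q.parallel_for(sycl::range<1>(1024), [=](sycl::id<1>) {
        // Same defaults as the dpct helper: relaxed order, device scope,
        // global address space.
        sycl::atomic_ref<int, sycl::memory_order::relaxed,
                         sycl::memory_scope::device,
                         sycl::access::address_space::global_space>
            atm(counter[0]);
        atm.fetch_add(1); // what dpct::atomic_fetch_add(counter, 1) boils down to
    }).wait();
    std::cout << *counter << std::endl; // 1024
    sycl::free(counter, q);
    return 0;
}
```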
```diff
@@ -2971,20 +3022,19 @@ static int g_work_group_size = 0;
 // typedef sycl::half ggml_fp16_t;
 
 #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
-#define VER_4VEC
+#define VER_4VEC 130 //todo for hardward optimize.
 #define VER_GEN9 700 //todo for hardward optimize.
 #define VER_GEN12 1000000 //todo for hardward optimize.
 #define VER_GEN13 (VER_GEN12 + 1030) //todo for hardward optimize.
 
 #define GGML_SYCL_MAX_NODES 8192 //TODO: adapt to hardwares
 
-
-
-
-#define SYCL_USE_XMX
+#if !defined(GGML_SYCL_FORCE_MMQ)
+    #define SYCL_USE_XMX
+#endif
 
 // max batch size to use MMQ kernels when tensor cores are available
-#define
+#define MMQ_MAX_BATCH_SIZE 32
 
 
 #if defined(_MSC_VER)
```
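`SYCL_USE_XMX` is now opt-out rather than unconditional: defining `GGML_SYCL_FORCE_MMQ` at build time suppresses it. A hypothetical compile-time probe of the guard (not code from the diff):

```cpp
#if !defined(GGML_SYCL_FORCE_MMQ)
#define SYCL_USE_XMX
#endif

#include <cstdio>

int main() {
#ifdef SYCL_USE_XMX
    // With XMX on, the MMQ path is additionally capped at
    // MMQ_MAX_BATCH_SIZE (32) further down in this file.
    std::puts("XMX enabled");
#else
    std::puts("GGML_SYCL_FORCE_MMQ defined: XMX disabled");
#endif
    return 0;
}
```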
```diff
@@ -3060,6 +3110,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
 bool   ggml_backend_is_sycl(ggml_backend_t backend);
 int    ggml_backend_sycl_get_device(ggml_backend_t backend);
 int    get_main_device();
+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
 void   print_ggml_tensor(const char*name, struct ggml_tensor *src);
 void   log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt);
 
```
```diff
@@ -8830,12 +8881,11 @@ static void rope(
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
-template<typename T, bool has_pos>
+template<typename T, bool has_pos, bool has_freq_facs>
 static void rope_neox(
     const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-    ,
-    const sycl::nd_item<3> &item_ct1) {
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
+    const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
     const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
                          item_ct1.get_local_id(1));
 
```
```diff
@@ -8863,8 +8913,10 @@ static void rope_neox(
         float cur_rot = inv_ndims * ic - ib;
 
         const int p = has_pos ? pos[i2] : 0;
+        const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
+
         const float theta_base =
-            p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
+            p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
 
         float cos_theta, sin_theta;
         rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
```
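The only arithmetic change in the kernel is dividing the base angle by a per-frequency factor (the Phi-3-style `freq_factors` tensor mentioned in the old assert further down). A CPU sketch of the formula, assuming the usual ggml definition `theta_scale = pow(freq_base, -2.0/n_dims)`; the constants are illustrative:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float freq_base   = 10000.0f;
    const int   n_dims      = 128;
    const float freq_scale  = 1.0f;
    const int   p           = 42;   // token position, pos[i2] in the kernel
    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);

    for (int col = 0; col < 8; col += 2) {
        const float freq_factor = 1.5f; // would come from freq_factors[ic/2]
        const float theta_old = p * freq_scale * std::pow(theta_scale, col / 2.0f);
        const float theta_new = theta_old / freq_factor; // the new division
        std::printf("col=%d old=%.6f new=%.6f\n", col, theta_old, theta_new);
    }
    return 0;
}
```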
```diff
@@ -12413,7 +12465,7 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
                            const int32_t *pos, float freq_scale,
                            int p_delta_rows, float freq_base, float ext_factor,
                            float attn_factor, rope_corr_dims corr_dims,
-                           dpct::queue_ptr stream) {
+                           const float * freq_factors, dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % 2 == 0);
     const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
```
```diff
@@ -12423,38 +12475,48 @@
     const float inv_ndims = -1.0f / n_dims;
 
     if (pos == nullptr) {
-        /*
-        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
-
-
-
-
-
-
-
-
-
+        if (freq_factors == nullptr) {
+            stream->parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
+                                               p_delta_rows, ext_factor, attn_factor,
+                                               corr_dims, theta_scale, inv_ndims, freq_factors,
+                                               item_ct1);
+                });
+        } else {
+            stream->parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
+                                              p_delta_rows, ext_factor, attn_factor,
+                                              corr_dims, theta_scale, inv_ndims, freq_factors,
+                                              item_ct1);
+                });
+        }
     } else {
-        /*
-        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
-
-
-
-
-
-
-
+        if (freq_factors == nullptr) {
+            stream->parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
+                                              p_delta_rows, ext_factor, attn_factor,
+                                              corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
+                });
+        } else {
+            stream->parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
+                                             p_delta_rows, ext_factor, attn_factor,
+                                             corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
+                });
+        }
     }
 }
 
```
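The launcher now expands two runtime nullptr checks into four template instantiations, so `has_pos` and `has_freq_facs` are compile-time constants inside the kernel. A plain C++ sketch of the same dispatch shape (names hypothetical, not from the diff):

```cpp
#include <cstdio>

template <bool has_pos, bool has_freq_facs>
static void kernel(const int *pos, const float *freq_factors) {
    // The bool template parameters let the compiler drop the dead branch,
    // as in rope_neox<T, has_pos, has_freq_facs>.
    const int   p = has_pos       ? pos[0]          : 0;
    const float f = has_freq_facs ? freq_factors[0] : 1.0f;
    std::printf("p=%d f=%.1f\n", p, f);
}

static void launch(const int *pos, const float *freq_factors) {
    if (pos == nullptr) {
        if (freq_factors == nullptr) { kernel<false, false>(pos, freq_factors); }
        else                         { kernel<false, true >(pos, freq_factors); }
    } else {
        if (freq_factors == nullptr) { kernel<true,  false>(pos, freq_factors); }
        else                         { kernel<true,  true >(pos, freq_factors); }
    }
}

int main() {
    int pos = 7; float f = 2.0f;
    launch(nullptr, nullptr); // p=0 f=1.0
    launch(&pos,    &f);      // p=7 f=2.0
    return 0;
}
```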
```diff
@@ -13501,6 +13563,10 @@ inline void ggml_sycl_op_concat(const ggml_tensor *src0,
                                 const float *src0_dd, const float *src1_dd,
                                 float *dst_dd,
                                 const dpct::queue_ptr &main_stream) {
+#pragma message("TODO: generalize concat kernel for dim != 2")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7563")
+    int dim = dst->op_params[0];
+    GGML_ASSERT(dim == 2);
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
```
```diff
@@ -13986,9 +14052,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
-
-#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
-    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+    const ggml_tensor * src2 = dst->src[2];
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
```
```diff
@@ -14014,6 +14078,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     memcpy(&beta_fast, (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
+    const float * freq_factors = nullptr;
     const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {
         GGML_ASSERT(src1->type == GGML_TYPE_I32);
```
```diff
@@ -14024,6 +14089,16 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    if (is_neox) {
+        pos = (const int32_t *) src1_dd;
+
+        if (src2 != nullptr) {
+            freq_factors = (const float *) src2->data;
+        }
+    } else {
+        GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
+    }
+
     rope_corr_dims corr_dims;
     ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
 
```
```diff
@@ -14035,13 +14110,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     if (src0->type == GGML_TYPE_F32) {
         rope_neox_sycl(
             (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-            attn_factor, corr_dims, main_stream
+            attn_factor, corr_dims, freq_factors, main_stream
         );
     } else if (src0->type == GGML_TYPE_F16) {
         rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
                        ne00, n_dims, nrows, pos, freq_scale, ne01,
                        freq_base, ext_factor, attn_factor, corr_dims,
-                       main_stream);
+                       freq_factors, main_stream);
     } else {
         GGML_ASSERT(false);
     }
```
```diff
@@ -15108,7 +15183,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-    if (r2 == 1 && r3 == 1 && src0
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
             *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
```
```diff
@@ -15173,6 +15248,29 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
+inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
+    // TODO: accuracy issues in MMQ
+    return false;
+}
+
+bool ggml_sycl_supports_dmmv(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_F16:
+            return true;
+        default:
+            return false;
+    }
+}
 
 static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
```
```diff
@@ -15189,75 +15287,42 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-
-
-
-
-
+    // check data types and tensor shapes for custom matrix multiplication kernels:
+    bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
 
-
-
-
-    //
-    //
-
-
+    bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    // mmvq and mmq need the __dp4a instruction which is available for gen12+
+    // Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
+    use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
+#ifdef SYCL_USE_XMX
+    use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
+#endif // SYCL_USE_XMX
 
-    if (!split &&
+    if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
         ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split &&
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
         ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split &&
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
-        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
         ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
-    } else if (
-
-
-
-
-
-#ifdef GGML_SYCL_FORCE_DMMV
-        const bool use_mul_mat_vec_q = false;
-#else
-        bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
-        use_mul_mat_vec_q = use_mul_mat_vec_q ||
-            (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
-            (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
-            (src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) ||
-            (src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M);
-
-
-#endif // GGML_SYCL_FORCE_DMMV
-
-        if (use_mul_mat_vec_q) {
-            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
-            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
-        } else {
-            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n");
-            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-        }
-    } else {
-        bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
-
-        if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
-            use_mul_mat_q = false;
-        }
-
-        if (use_mul_mat_q) {
-            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
-            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
-        } else {
-            // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
-            ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
-        }
-    }
+    } else if (use_dequantize_mul_mat_vec) {
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
+    } else if (use_mul_mat_vec_q) {
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+    } else if (use_mul_mat_q) {
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
-
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
 
```
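After the F16 fast paths, the rewritten function reduces to a single if/else-if chain over booleans computed up front. A condensed CPU sketch of that tail of the dispatch, with the type and shape predicates replaced by simple parameters (everything here is a stand-in, not the real signatures):

```cpp
#include <cstdio>

enum class mm_path { dmmv, mmvq, mmq, sycl_gemm };

// batch stands in for src1->ne[1]; the bools stand in for the
// ggml_sycl_supports_* and ggml_is_quantized checks in the diff.
static mm_path pick(bool supports_dmmv, bool quantized, bool supports_mmq,
                    long batch, long mmvq_max, long mmq_max, bool xmx) {
    const bool use_dequantize_mul_mat_vec = supports_dmmv && batch == 1;
    const bool use_mul_mat_vec_q          = quantized && batch <= mmvq_max;
    bool use_mul_mat_q                    = supports_mmq;
    if (xmx) {
        use_mul_mat_q = use_mul_mat_q && batch <= mmq_max; // SYCL_USE_XMX cap
    }
    if (use_dequantize_mul_mat_vec) { return mm_path::dmmv; }
    if (use_mul_mat_vec_q)          { return mm_path::mmvq; }
    if (use_mul_mat_q)              { return mm_path::mmq; }
    return mm_path::sycl_gemm;                             // fallback GEMM path
}

int main() {
    // e.g. Q4_K at batch 1 -> dmmv; at batch 8 (MMQ currently disabled) -> mmvq.
    std::printf("%d\n", (int)pick(true, true, false, 1, 8, 32, true));
    std::printf("%d\n", (int)pick(true, true, false, 8, 8, 32, true));
    return 0;
}
```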
```diff
@@ -15434,22 +15499,86 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
 }
 #endif
 
+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+__dpct_inline__ static void k_copy_src1_to_contiguous(
+    const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
+    int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
+    const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+    int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
+    const sycl::nd_item<3> &item_ct1, int &src1_row) {
+    int32_t iid1 = item_ct1.get_group(2);
+    int32_t id = item_ct1.get_group(1);
+
+    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+    if (row_id_i != i02) {
+        return;
+    }
+
+    const int64_t i11 = id % ne11;
+    const int64_t i12 = iid1;
+
+    if (item_ct1.get_local_id(2) == 0) {
+        src1_row =
+            dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
+                cur_src1_row, 1);
+        row_mapping[src1_row] = {id, iid1};
+    }
+    /*
+    DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+    float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+#pragma unroll
+    for (int i = item_ct1.get_local_id(2); i < ne10;
+         i += item_ct1.get_local_range(2)) {
+        src1_row_contiguous[i] = src1_row_original[i];
+    }
+}
+
+__dpct_inline__ static void k_copy_dst_from_contiguous(
+    char *__restrict__ dst_original, const char *__restrict__ dst_contiguous,
+    const mmid_row_mapping *__restrict__ row_mapping, int64_t ne0, size_t nb1,
+    size_t nb2, const sycl::nd_item<3> &item_ct1) {
+    int32_t i = item_ct1.get_group(2);
+
+    const int32_t i1 = row_mapping[i].i1;
+    const int32_t i2 = row_mapping[i].i2;
+
+    const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+    float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+#pragma unroll
+    for (int j = item_ct1.get_local_id(2); j < ne0;
+         j += item_ct1.get_local_range(2)) {
+        dst_row_original[j] = dst_row_contiguous[j];
+    }
+}
+
 static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                                  const ggml_tensor *src1,
                                  ggml_tensor *dst) try {
-    GGML_ASSERT(src0->
-
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
+
     const ggml_tensor *ids = dst->src[2];
-
+    GGML_TENSOR_BINARY_OP_LOCALS
 
-    const
-    const size_t nb1 = dst->nb[1];
+    const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
 
-    const
-    const
+    const int64_t n_as = ne02;
+    const int64_t n_ids = ids->ne[0];
 
     std::vector<char> ids_host(ggml_nbytes(ids));
-    const char *ids_dev = (const char *)ids->data;
+    const char * ids_dev = (const char *) ids->data;
 
     SYCL_CHECK(CHECK_TRY_ERROR(
         stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
```
```diff
@@ -15489,24 +15618,40 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
 
         src0_row.ne[2] = 1;
         src0_row.ne[3] = 1;
-        src0_row.nb[3] =
-
-
-
-
-
-
-
-
+        src0_row.nb[3] = nb02;
+
+        src1_row.ne[1] = 1;
+        src1_row.ne[2] = 1;
+        src1_row.ne[3] = 1;
+        src1_row.nb[2] = nb11;
+        src1_row.nb[3] = nb11;
+
+        dst_row.ne[1] = 1;
+        dst_row.ne[2] = 1;
+        dst_row.ne[3] = 1;
+        dst_row.nb[2] = nb1;
+        dst_row.nb[3] = nb1;
+        if (ne12 == 1) {
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+                    GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+                    const int64_t i11 = id % ne11;
+                    const int64_t i12 = iid1;
+
+                    const int64_t i1 = id;
+                    const int64_t i2 = i12;
 
             src0_row_extra.data_device[g_main_device] =
-                src0_original +
+                src0_original + i02*nb02;
             src1_row_extra.data_device[g_main_device] =
-                src1_original +
+                src1_original + + i11*nb11 + i12*nb12;
             dst_row_extra.data_device[g_main_device] =
-                dst_original +
+                dst_original + i1*nb1 + i2*nb2;
 
             ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
+                }
         }
     } else {
         sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
```
```diff
@@ -15515,64 +15660,98 @@
         src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
         dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
 
-        for (
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
             int64_t num_src1_rows = 0;
-            for (int64_t
-
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
-
-                    continue;
-                }
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
 
-
+                    if (row_id_i != i02) {
+                        continue;
+                    }
 
-
-
-                src1_original + i01 * nb11, nb11)));
-                num_src1_rows++;
+                    num_src1_rows++;
+                }
             }
 
             if (num_src1_rows == 0) {
                 continue;
             }
 
-            src0_row_extra.data_device[g_main_device] =
-                src0_original + row_id * src0->nb[2];
 
+            sycl_pool_alloc<int> dev_cur_src1_row(1);
+            sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(num_src1_rows);
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
+
+            {
+                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
+                sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
+                stream->submit([&](sycl::handler &cgh) {
+                    sycl::local_accessor<int, 0> src1_row_acc(cgh);
+
+                    char *__restrict src1_contiguous_get =
+                        src1_contiguous.get();
+                    int *__restrict dev_cur_src1_row_get =
+                        dev_cur_src1_row.get();
+                    mmid_row_mapping *__restrict dev_row_mapping_get =
+                        dev_row_mapping.get();
+                    size_t ids_nb_ct6 = ids->nb[1];
+                    size_t ids_nb_ct7 = ids->nb[0];
+
+                    cgh.parallel_for(
+                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_copy_src1_to_contiguous(
+                                src1_original, src1_contiguous_get,
+                                dev_cur_src1_row_get,
+                                dev_row_mapping_get, ids_dev, i02,
+                                ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
+                                item_ct1, src1_row_acc);
+                        });
+                });
+            }
+
+            src0_row_extra.data_device[g_main_device] = src0_original + i02*nb02;
+
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);
             src1_row.ne[1] = num_src1_rows;
-            dst_row.ne[1] = num_src1_rows;
 
             src1_row.nb[1] = nb11;
             src1_row.nb[2] = num_src1_rows*nb11;
             src1_row.nb[3] = num_src1_rows*nb11;
 
+            dst_row.ne[1] = num_src1_rows;
             dst_row.nb[1] = nb1;
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
             ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            {
+                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
+                sycl::range<3> grid_dims(1, 1, num_src1_rows);
+                stream->submit([&](sycl::handler &cgh) {
+                    const char *__restrict dst_contiguous_get =
+                        dst_contiguous.get();
+                    const mmid_row_mapping *__restrict dev_row_mapping_get =
+                        dev_row_mapping.get();
+
+                    cgh.parallel_for(
+                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_copy_dst_from_contiguous(dst_original,
+                                                       dst_contiguous_get,
+                                                       dev_row_mapping_get,
+                                                       ne0, nb1, nb2, item_ct1);
+                        });
+                });
+            }
         }
     }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    }
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
```
```diff
@@ -16555,10 +16734,9 @@ GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backe
     UNUSED(buffer);
 }
 
-
-
-
-//}
+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
+}
 
 GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
```