llama_cpp 0.15.0 → 0.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -7
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +303 -23
- data/vendor/tmp/llama.cpp/ggml-impl.h +84 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +137 -133
- data/vendor/tmp/llama.cpp/ggml-metal.metal +87 -110
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +2220 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +35 -152
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +953 -268
- data/vendor/tmp/llama.cpp/ggml.c +1762 -681
- data/vendor/tmp/llama.cpp/ggml.h +43 -24
- data/vendor/tmp/llama.cpp/llama.cpp +533 -296
- data/vendor/tmp/llama.cpp/llama.h +10 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -1637
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +286 -176
- data/vendor/tmp/llama.cpp/unicode.h +44 -10
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml-rpc.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_RPC_MAX_SERVERS 16
+
+// backend API
+GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+
+GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+
+GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+#ifdef __cplusplus
+}
+#endif
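
The new RPC backend (ggml-rpc.h / ggml-rpc.cpp above) is the largest addition in this release; it appears to let graph computation be offloaded to a server reached over the network. A minimal client-side sketch using only the declarations shown in this header; the endpoint string, error handling and the main() scaffolding are illustrative assumptions, not part of the gem's Ruby API:

// Hypothetical client-side usage of the RPC backend declared in ggml-rpc.h.
// The "host:port" endpoint format is an assumption for illustration.
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-rpc.h"

int main() {
    const char * endpoint = "192.168.1.10:50052"; // assumed endpoint of a running RPC server
    // (a server process would have called start_rpc_server(local_backend, endpoint, free, total))

    ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
    if (backend == nullptr || !ggml_backend_is_rpc(backend)) {
        fprintf(stderr, "failed to connect to RPC server at %s\n", endpoint);
        return 1;
    }

    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
    printf("remote device memory: %zu free / %zu total bytes\n", free_mem, total_mem);

    // graphs would be scheduled on `backend` here, as with any other ggml backend

    ggml_backend_free(backend);
    return 0;
}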
data/vendor/tmp/llama.cpp/ggml-sycl.cpp
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
 #define SYCL_ROPE_BLOCK_SIZE 256
-#define SYCL_ALIBI_BLOCK_SIZE 32
 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
 #define SYCL_QUANTIZE_BLOCK_SIZE 256
 #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@@ -8330,24 +8329,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
     const int blocks_per_row = ncols / qk;
     const int blocks_per_warp = vdr * WARP_SIZE / qi;
 
-//
+    const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
+
+    // partial sum for each thread
     float tmp = 0.0f;
 
     const block_q_t * x = (const block_q_t *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    for (int i = item_ct1.get_local_id(2) /
+    for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
          i += blocks_per_warp) {
-
+        const int ibx = row * blocks_per_row + i; // x block index
 
-
+        const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
 
-
-
-
-
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) -
+             i * qi_vdr); // x block quant index when casting the quants to int
 
-
+        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
     }
 
     // sum up partial sums and write back result
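
The new qi_vdr helper makes the thread-to-quant-block mapping explicit: qi / vdr threads cooperate on each qk-sized block, and a thread's quant offset inside its block is recovered by subtracting i * qi_vdr from the thread id. A small host-side sketch of that initial mapping, with made-up example constants rather than values from any particular quantization type:

// Illustrative sketch of the thread-to-block mapping introduced by qi_vdr.
// WARP_SIZE, qi and vdr below are hypothetical example values.
#include <cstdio>

int main() {
    const int WARP_SIZE = 32;
    const int qi  = 4;            // ints of quantized data per block (example value)
    const int vdr = 2;            // ints handled per dot-product call (example value)
    const int qi_vdr = qi / vdr;  // threads cooperating on one qk block

    for (int tid = 0; tid < WARP_SIZE; ++tid) {
        const int i   = tid / qi_vdr;             // block index first handled by this thread
        const int iqs = vdr * (tid - i * qi_vdr); // quant offset of this thread inside the block
        printf("tid %2d -> block %2d, quant offset %d\n", tid, i, iqs);
    }
    return 0;
}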
@@ -9314,32 +9315,6 @@ static void rope_glm_f32(
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }
 
-static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                      const int n_heads_log2_floor, const float m0, const float m1,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = dpct::pow(m0, k + 1);
-    } else {
-        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
 static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                            const sycl::nd_item<3> &item_ct1) {
     const int row = item_ct1.get_group(1);
@@ -9441,7 +9416,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
 
 
 template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32(const float * x, const float * mask,
+static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
                          const int nrows_y, const float scale, const float max_bias, const float m0,
                          const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
     const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -9455,7 +9430,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
     const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
     const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
 
-    float slope =
+    float slope = 1.0f;
 
     // ALiBi
     if (max_bias > 0.0f) {
@@ -9480,7 +9455,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;
 
-        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f)
+        const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
 
         vals[col] = val;
         max_val = sycl::max(max_val, val);
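
With the separate ALiBi kernel gone, the bias is now applied inside the soft-max kernel itself: each row is soft-maxed over scale*x + slope*mask, with slope = 1 when ALiBi is disabled. A scalar reference of that per-row computation, written here only for illustration; the real kernel works on device memory with warp-level reductions:

// Scalar reference of the fused operation computed per row: softmax(scale * x + slope * mask).
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> soft_max_row_ref(const std::vector<float> & x,
                                    const std::vector<float> & mask,
                                    float scale, float slope) {
    std::vector<float> out(x.size());
    float max_val = -INFINITY;
    for (size_t i = 0; i < x.size(); ++i) {
        out[i] = x[i] * scale + (mask.empty() ? 0.0f : slope * mask[i]);
        max_val = std::max(max_val, out[i]);
    }
    float sum = 0.0f;
    for (float & v : out) { v = std::exp(v - max_val); sum += v; } // shift by max for stability
    for (float & v : out) { v /= sum; }
    return out;
}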
@@ -12962,20 +12937,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
         });
 }
 
-static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
-                           const int nrows, const int k_rows,
-                           const int n_heads_log2_floor, const float m0,
-                           const float m1, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
-    const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             alibi_f32(x, dst, ncols, k_rows,
-                                       n_heads_log2_floor, m0, m1, item_ct1);
-                         });
-}
-
 static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                               const int nrows, dpct::queue_ptr stream) {
     const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@@ -13056,7 +13017,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
 }
 
 template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32_submitter(const float * x, const float * mask,
+static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
                                    const int nrows_y, const float scale, const float max_bias, const float m0,
                                    const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                    const size_t n_local_scratch, dpct::queue_ptr stream) {
@@ -13066,7 +13027,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask,
+                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                              nrows_y, scale, max_bias, m0,
                                                                              m1, n_head_log2, item_ct1,
                                                                              local_buf_acc.get_pointer());
@@ -13074,7 +13035,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
     });
 }
 
-static void soft_max_f32_sycl(const float * x, const float * mask,
+static void soft_max_f32_sycl(const float * x, const float * mask,
                               float * dst, const int ncols_x, const int nrows_x,
                               const int nrows_y, const float scale, const float max_bias,
                               dpct::queue_ptr stream) {
@@ -13096,60 +13057,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
     const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
     if (n_local_scratch*sizeof(float) < local_mem_size) {
         if (ncols_x > max_block_size) {
-            soft_max_f32_submitter<true, 0, 0>(x, mask,
+            soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                max_bias, m0, m1, n_head_log2, block_nums,
                                                block_dims, n_local_scratch, stream);
             return;
         }
         switch (ncols_x) {
            case 32:
-                soft_max_f32_submitter<true, 32, 32>(x, mask,
+                soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
                                                      max_bias, m0, m1, n_head_log2, block_nums,
                                                      block_dims, n_local_scratch, stream);
                 break;
            case 64:
-                soft_max_f32_submitter<true, 64, 64>(x, mask,
+                soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
                                                      max_bias, m0, m1, n_head_log2, block_nums,
                                                      block_dims, n_local_scratch, stream);
                 break;
            case 128:
-                soft_max_f32_submitter<true, 128, 128>(x, mask,
+                soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
            case 256:
-                soft_max_f32_submitter<true, 256, 256>(x, mask,
+                soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
            case 512:
-                soft_max_f32_submitter<true, 512, 512>(x, mask,
+                soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
                                                        max_bias, m0, m1, n_head_log2, block_nums,
                                                        block_dims, n_local_scratch, stream);
                 break;
            case 1024:
-                soft_max_f32_submitter<true, 1024, 1024>(x, mask,
+                soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
            case 2048:
-                soft_max_f32_submitter<true, 2048, 1024>(x, mask,
+                soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
            case 4096:
-                soft_max_f32_submitter<true, 4096, 1024>(x, mask,
+                soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                          max_bias, m0, m1, n_head_log2, block_nums,
                                                          block_dims, n_local_scratch, stream);
                 break;
            default:
-                soft_max_f32_submitter<true, 0, 0>(x, mask,
+                soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                    max_bias, m0, m1, n_head_log2, block_nums,
                                                    block_dims, n_local_scratch, stream);
                 break;
         }
     } else {
-        soft_max_f32_submitter<false, 0, 0>(x, mask,
+        soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                             max_bias, m0, m1, n_head_log2, block_nums,
                                             block_dims, WARP_SIZE, stream);
     }
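
The launcher above picks a template specialization whose column count and block size are fixed at compile time, falling back to a generic <0, 0> instantiation for other sizes. A minimal sketch of that dispatch pattern, with purely illustrative names not taken from ggml-sycl.cpp:

// Compile-time dispatch sketch: a template parameter of 0 means "size known only at runtime";
// common power-of-two sizes get dedicated instantiations the compiler can unroll.
#include <cstdio>

template <int ncols_template>
void process_row(const float * row, int ncols_runtime) {
    const int ncols = ncols_template == 0 ? ncols_runtime : ncols_template;
    float sum = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        sum += row[i];
    }
    printf("ncols=%d sum=%f\n", ncols, sum);
}

void dispatch(const float * row, int ncols) {
    switch (ncols) {
        case 32:  process_row<32>(row, ncols);  break;
        case 64:  process_row<64>(row, ncols);  break;
        case 128: process_row<128>(row, ncols); break;
        default:  process_row<0>(row, ncols);   break; // runtime-sized fallback
    }
}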
@@ -14026,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
+#pragma message("TODO: generalize upscale operator")
+#pragma message(" https://github.com/ggerganov/ggml/pull/814")
+    GGML_ASSERT(false && "TODO: generalize upscale operator");
+
     const int scale_factor = dst->op_params[0];
 
     upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
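
For reference, a scalar sketch of an integer-factor, nearest-neighbour upscale over a 3D tensor, which is what the scale_factor parameter above suggests the pre-generalization operator computed; this is an assumption for illustration, not a copy of upscale_f32_sycl:

// Nearest-neighbour upscale of an ne0 x ne1 x ne2 tensor by an integer scale factor.
#include <vector>

std::vector<float> upscale_nearest(const std::vector<float> & src,
                                   int ne0, int ne1, int ne2, int scale_factor) {
    const int ne0_dst = ne0 * scale_factor;
    const int ne1_dst = ne1 * scale_factor;
    std::vector<float> dst(static_cast<size_t>(ne0_dst) * ne1_dst * ne2);
    for (int i2 = 0; i2 < ne2; ++i2) {
        for (int i1 = 0; i1 < ne1_dst; ++i1) {
            for (int i0 = 0; i0 < ne0_dst; ++i0) {
                // each destination element copies its nearest source element
                const float v = src[(i2 * ne1 + i1 / scale_factor) * ne0 + i0 / scale_factor];
                dst[(i2 * ne1_dst + i1) * ne0_dst + i0] = v;
            }
        }
    }
    return dst;
}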
@@ -14560,36 +14525,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
|
14560
14525
|
(void) src1_dd;
|
14561
14526
|
}
|
14562
14527
|
|
14563
|
-
inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
|
14564
|
-
ggml_tensor *dst, const float *src0_dd,
|
14565
|
-
const float *src1_dd, float *dst_dd,
|
14566
|
-
const dpct::queue_ptr &main_stream) {
|
14567
|
-
|
14568
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
14569
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
14570
|
-
|
14571
|
-
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
|
14572
|
-
const int64_t nrows = ggml_nrows(src0);
|
14573
|
-
|
14574
|
-
//const int n_past = ((int32_t *) dst->op_params)[0];
|
14575
|
-
const int n_head = ((int32_t *) dst->op_params)[1];
|
14576
|
-
float max_bias;
|
14577
|
-
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
14578
|
-
|
14579
|
-
//GGML_ASSERT(ne01 + n_past == ne00);
|
14580
|
-
GGML_ASSERT(n_head == ne02);
|
14581
|
-
|
14582
|
-
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
14583
|
-
|
14584
|
-
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
14585
|
-
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
14586
|
-
|
14587
|
-
alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
|
14588
|
-
|
14589
|
-
(void) src1;
|
14590
|
-
(void) src1_dd;
|
14591
|
-
}
|
14592
|
-
|
14593
14528
|
static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
|
14594
14529
|
const ggml_tensor *src1, ggml_tensor *dst,
|
14595
14530
|
const float *src0_dd, const float *src1_dd,
|
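
The per-head ALiBi slope that the removed op computed, restated as a single scalar function; the m0/m1 formulas and the head-index branch are taken directly from the deleted code above, and effectively the same computation is now folded into the soft-max path via max_bias and n_head_log2:

// Per-head ALiBi slope, as in the removed ggml_sycl_op_alibi / alibi_f32 code.
#include <cmath>

float alibi_slope(int head, int n_head, float max_bias) {
    const int n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -max_bias          / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
    return head < n_heads_log2_floor ? powf(m0, head + 1)
                                     : powf(m1, 2 * (head - n_heads_log2_floor) + 1);
}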
@@ -14744,12 +14679,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-
-#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-    GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
 
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
@@ -14761,25 +14693,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     memcpy(&scale, dst->op_params + 0, sizeof(float));
     memcpy(&max_bias, dst->op_params + 1, sizeof(float));
 
-
-    float * src2_dd = nullptr;
-    sycl_pool_alloc<float> src2_f;
-
-    const bool use_src2 = src2 != nullptr;
-
-    if (use_src2) {
-        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-        if (src2_on_device) {
-            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-            src2_dd = (float *) src2_extra->data_device[g_main_device];
-        } else {
-            src2_dd = src2_f.alloc(ggml_nelements(src2));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-        }
-    }
-
-    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
                       nrows_x, nrows_y, scale, max_bias, main_stream);
 }
 
@@ -15654,26 +15568,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-#if 0
-    // use syclGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                SYCL_CHECK(
-                    syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                        ne01, ne11, ne10,
-                        alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
-                               (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
-                        beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
-                        cu_compute_type,
-                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
     if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15685,7 +15579,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
             nb11 / nb10, nb12 / nb10, beta,
             (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
             ne12 * ne13, cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
     } else {
         const int ne23 = ne12*ne13;
 
@@ -15716,7 +15609,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                     nb02, nb03, nb12_scaled, nb13_scaled,
                     nbd2, nbd3, r2, r3, item_ct1);
                 });
-            })
+            });
         }
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
             *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15727,9 +15620,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
             dpct::library_data_t::real_half, nb11 / nb10, beta,
             (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
             cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
     }
-#endif
 
     if (no_mixed_dtypes) {
         const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
@@ -16230,10 +16121,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
 }
 
-static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
-}
-
 static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
 }
@@ -16610,9 +16497,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ROPE:
             func = ggml_sycl_rope;
            break;
-        case GGML_OP_ALIBI:
-            func = ggml_sycl_alibi;
-            break;
         case GGML_OP_IM2COL:
             func = ggml_sycl_im2col;
            break;
@@ -17742,7 +17626,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS: