llama_cpp 0.5.3 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3
@@ -24,12 +24,59 @@ typedef struct {
|
|
24
24
|
int8_t qs[QK8_0]; // quants
|
25
25
|
} block_q8_0;
|
26
26
|
|
27
|
+
// General-purpose kernel for element-wise addition of two f32 tensors:
//     dst = src0 + src1
//
// pros: works for non-contiguous tensors (every access goes through the byte
//       strides nb*), supports broadcast of src1 across dims 1, 2 and 3
// cons: not very efficient (one strided scalar load/store per element)
//
// Indexing, as done by the body:
//   - the threadgroup position (x, y, z) selects the row (i01, i02, i03) of src0 and dst
//   - the matching src1 row is that index modulo (ne11, ne12, ne13), which is the broadcast
//   - the threads of a threadgroup walk dim 0 of the row with a stride of ntg.x
// src1 is NOT broadcast along dim 0: element i0 of src1 is always paired with
// element i0 of src0.
//
// There is no bounds check on the threadgroup position.
// NOTE(review): presumably the host dispatches exactly ne01 x ne02 x ne03
// threadgroups - confirm against the host-side encoder.
//
// ne00..ne03, ne10 and ne1..ne3 are not read by the body.
// NOTE(review): they are kept so that the argument order (and with it the
// implicit buffer indices) stays as the caller expects - confirm before removing.
kernel void kernel_add(
        device const char * src0,
        device const char * src1,
        device       char * dst,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant  int64_t & ne03,
        constant  int64_t & nb00,
        constant  int64_t & nb01,
        constant  int64_t & nb02,
        constant  int64_t & nb03,
        constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
        constant  int64_t & nb10,
        constant  int64_t & nb11,
        constant  int64_t & nb12,
        constant  int64_t & nb13,
        constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant  int64_t & ne2,
        constant  int64_t & ne3,
        constant  int64_t & nb0,
        constant  int64_t & nb1,
        constant  int64_t & nb2,
        constant  int64_t & nb3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    // row handled by this threadgroup
    const int64_t i03 = tgpig.z;
    const int64_t i02 = tgpig.y;
    const int64_t i01 = tgpig.x;

    // broadcast: wrap the row index into the (possibly smaller) extents of src1
    const int64_t i13 = i03 % ne13;
    const int64_t i12 = i02 % ne12;
    const int64_t i11 = i01 % ne11;

    // byte addresses of element 0 of the selected rows
    device const char * src0_row = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    device const char * src1_row = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    device       char * dst_row  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    // 64-bit counter: ne0 is int64_t, so a 32-bit counter could wrap before reaching it
    for (int64_t i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
        // the sources are read-only, keep the const qualifier on the loads
        const float a = *((device const float *)(src0_row + i0*nb00));
        const float b = *((device const float *)(src1_row + i0*nb10));

        *((device float *)(dst_row + i0*nb0)) = a + b;
    }
}
|
34
81
|
|
35
82
|
// assumption: src1 is a row
//
// Adds src1 to src0 four floats at a time: thread `tpig` handles one float4 of
// src0/dst and pairs it with float4 number (tpig % nb) of src1, so src1 repeats
// every nb float4 elements.
// NOTE(review): nb is presumably the row length in float4 units - confirm with the caller.
//
// nb is bound explicitly at buffer index 27. That is the slot directly after the
// 3 tensor pointers and 24 shape/stride constants in kernel_add's argument list.
// NOTE(review): presumably this lets both kernels share one host-side argument
// encoding - confirm against the host code.
//
// There is no bounds check: the grid must contain exactly one thread per float4 of dst.
kernel void kernel_add_row(
        device const float4 * src0,
        device const float4 * src1,
        device       float4 * dst,
        constant   int64_t & nb [[buffer(27)]],
        uint tpig[[thread_position_in_grid]]) {
    // position inside the broadcast row
    const int64_t i_row = tpig % nb;

    const float4 lhs = src0[tpig];
    const float4 rhs = src1[i_row];

    dst[tpig] = lhs + rhs;
}
|
@@ -783,7 +830,9 @@ kernel void kernel_alibi_f32(
|
|
783
830
|
constant uint64_t & nb1,
|
784
831
|
constant uint64_t & nb2,
|
785
832
|
constant uint64_t & nb3,
|
786
|
-
constant
|
833
|
+
constant float & m0,
|
834
|
+
constant float & m1,
|
835
|
+
constant int & n_heads_log2_floor,
|
787
836
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
788
837
|
uint3 tpitg[[thread_position_in_threadgroup]],
|
789
838
|
uint3 ntg[[threads_per_threadgroup]]) {
|
@@ -799,37 +848,73 @@ kernel void kernel_alibi_f32(
|
|
799
848
|
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
800
849
|
|
801
850
|
device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
802
|
-
float m_k
|
851
|
+
float m_k;
|
852
|
+
if (i2 < n_heads_log2_floor) {
|
853
|
+
m_k = pow(m0, i2 + 1);
|
854
|
+
} else {
|
855
|
+
m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1);
|
856
|
+
}
|
803
857
|
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
804
858
|
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
805
859
|
dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
|
806
860
|
}
|
807
861
|
}
|
808
862
|
|
863
|
+
// Function type of the templated kernel_rope<T>. The explicit instantiations
// (`template [[host_name("...")]] kernel rope_t kernel_rope<T>;`) name this type,
// so the parameter list here must stay identical to the one of kernel_rope.
typedef void (rope_t)(
        // tensors
        device const    void * src0,        // input; reinterpreted as T by kernel_rope
        device const int32_t * src1,        // positions, read as pos[i2]
        device         float * dst,         // output; kernel_rope addresses it through byte strides
        // src0 extents (ne0x) and byte strides (nb0x)
        constant     int64_t & ne00,
        constant     int64_t & ne01,
        constant     int64_t & ne02,
        constant     int64_t & ne03,
        constant    uint64_t & nb00,
        constant    uint64_t & nb01,
        constant    uint64_t & nb02,
        constant    uint64_t & nb03,
        // dst extents (nex) and byte strides (nbx)
        constant     int64_t & ne0,
        constant     int64_t & ne1,
        constant     int64_t & ne2,
        constant     int64_t & ne3,
        constant    uint64_t & nb0,
        constant    uint64_t & nb1,
        constant    uint64_t & nb2,
        constant    uint64_t & nb3,
        // rope parameters
        constant         int & n_past,
        constant         int & n_dims,
        constant         int & mode,        // kernel_rope tests (mode & 2) to select the neox variant
        constant       float & freq_base,
        constant       float & freq_scale,  // multiplied with the position to get theta_0
        // thread coordinates
        uint  tiitg[[thread_index_in_threadgroup]],
        uint3 tptg[[threads_per_threadgroup]],
        uint3 tgpig[[threadgroup_position_in_grid]]);
|
891
|
+
|
892
|
+
template<typename T>
|
809
893
|
kernel void kernel_rope(
|
810
|
-
device const
|
811
|
-
device
|
812
|
-
|
813
|
-
constant
|
814
|
-
constant
|
815
|
-
constant
|
816
|
-
constant
|
817
|
-
constant
|
818
|
-
constant
|
819
|
-
constant
|
820
|
-
constant
|
821
|
-
constant
|
822
|
-
constant
|
823
|
-
constant
|
824
|
-
constant
|
825
|
-
constant
|
826
|
-
constant
|
827
|
-
constant
|
828
|
-
constant
|
829
|
-
constant
|
830
|
-
constant
|
831
|
-
constant
|
832
|
-
constant
|
894
|
+
device const void * src0,
|
895
|
+
device const int32_t * src1,
|
896
|
+
device float * dst,
|
897
|
+
constant int64_t & ne00,
|
898
|
+
constant int64_t & ne01,
|
899
|
+
constant int64_t & ne02,
|
900
|
+
constant int64_t & ne03,
|
901
|
+
constant uint64_t & nb00,
|
902
|
+
constant uint64_t & nb01,
|
903
|
+
constant uint64_t & nb02,
|
904
|
+
constant uint64_t & nb03,
|
905
|
+
constant int64_t & ne0,
|
906
|
+
constant int64_t & ne1,
|
907
|
+
constant int64_t & ne2,
|
908
|
+
constant int64_t & ne3,
|
909
|
+
constant uint64_t & nb0,
|
910
|
+
constant uint64_t & nb1,
|
911
|
+
constant uint64_t & nb2,
|
912
|
+
constant uint64_t & nb3,
|
913
|
+
constant int & n_past,
|
914
|
+
constant int & n_dims,
|
915
|
+
constant int & mode,
|
916
|
+
constant float & freq_base,
|
917
|
+
constant float & freq_scale,
|
833
918
|
uint tiitg[[thread_index_in_threadgroup]],
|
834
919
|
uint3 tptg[[threads_per_threadgroup]],
|
835
920
|
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
@@ -839,7 +924,9 @@ kernel void kernel_rope(
|
|
839
924
|
|
840
925
|
const bool is_neox = mode & 2;
|
841
926
|
|
842
|
-
const
|
927
|
+
device const int32_t * pos = src1;
|
928
|
+
|
929
|
+
const int64_t p = pos[i2];
|
843
930
|
|
844
931
|
const float theta_0 = freq_scale * (float)p;
|
845
932
|
const float inv_ndims = -1.f/n_dims;
|
@@ -851,11 +938,11 @@ kernel void kernel_rope(
|
|
851
938
|
const float cos_theta = cos(theta);
|
852
939
|
const float sin_theta = sin(theta);
|
853
940
|
|
854
|
-
device const
|
855
|
-
device
|
941
|
+
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
942
|
+
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
856
943
|
|
857
|
-
const
|
858
|
-
const
|
944
|
+
const T x0 = src[0];
|
945
|
+
const T x1 = src[1];
|
859
946
|
|
860
947
|
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
861
948
|
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
@@ -870,8 +957,8 @@ kernel void kernel_rope(
|
|
870
957
|
|
871
958
|
const int64_t i0 = ib*n_dims + ic/2;
|
872
959
|
|
873
|
-
device const
|
874
|
-
device
|
960
|
+
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
961
|
+
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
875
962
|
|
876
963
|
const float x0 = src[0];
|
877
964
|
const float x1 = src[n_dims/2];
|
@@ -883,6 +970,9 @@ kernel void kernel_rope(
|
|
883
970
|
}
|
884
971
|
}
|
885
972
|
|
973
|
+
template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
|
974
|
+
template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
|
975
|
+
|
886
976
|
kernel void kernel_cpy_f16_f16(
|
887
977
|
device const half * src0,
|
888
978
|
device half * dst,
|
@@ -1273,8 +1363,8 @@ kernel void kernel_mul_mat_q3_K_f32(
|
|
1273
1363
|
|
1274
1364
|
float yl[32];
|
1275
1365
|
|
1276
|
-
const uint16_t kmask1 = 0x3030;
|
1277
|
-
const uint16_t kmask2 = 0x0f0f;
|
1366
|
+
//const uint16_t kmask1 = 0x3030;
|
1367
|
+
//const uint16_t kmask2 = 0x0f0f;
|
1278
1368
|
|
1279
1369
|
const int tid = tiisg/4;
|
1280
1370
|
const int ix = tiisg%4;
|
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
|
|
202
202
|
|
203
203
|
__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
|
204
204
|
{
|
205
|
-
const int i = get_group_id(0);
|
205
|
+
const int i = get_group_id(0) + get_global_offset(0);
|
206
206
|
const int tid = get_local_id(0);
|
207
207
|
const int n = tid / 32;
|
208
208
|
const int l = tid - 32 * n;
|
209
209
|
const int is = 8 * n + l / 16;
|
210
210
|
|
211
211
|
const uint8_t q = x[i].qs[32 * n + l];
|
212
|
-
__global float *y = yy +
|
212
|
+
__global float *y = yy + get_group_id(0) * QK_K + 128 * n;
|
213
213
|
|
214
214
|
const float dall = vload_half(0, &x[i].d);
|
215
215
|
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
|
|
223
223
|
__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
|
224
224
|
{
|
225
225
|
int r = get_local_id(0) / 4;
|
226
|
-
int i = get_group_id(0);
|
226
|
+
int i = get_group_id(0) + get_global_offset(0);
|
227
227
|
int tid = r / 2;
|
228
228
|
int is0 = r % 2;
|
229
229
|
int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
|
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
|
241
241
|
float d_all = vload_half(0, &x[i].d);
|
242
242
|
float dl = d_all * (us - 32);
|
243
243
|
|
244
|
-
__global float *y = yy +
|
244
|
+
__global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
|
245
245
|
const __global uint8_t *q = x[i].qs + 32 * n;
|
246
246
|
const __global uint8_t *hm = x[i].hmask;
|
247
247
|
|
@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
|
251
251
|
|
252
252
|
__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
|
253
253
|
{
|
254
|
-
const int i = get_group_id(0);
|
254
|
+
const int i = get_group_id(0) + get_global_offset(0);
|
255
255
|
const int tid = get_local_id(0);
|
256
256
|
const int il = tid / 8;
|
257
257
|
const int ir = tid % 8;
|
258
258
|
const int is = 2 * il;
|
259
259
|
const int n = 4;
|
260
260
|
|
261
|
-
__global float *y = yy +
|
261
|
+
__global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
|
262
262
|
|
263
263
|
const float dall = vload_half(0, &x[i].d);
|
264
264
|
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
|
|
281
281
|
|
282
282
|
__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
|
283
283
|
{
|
284
|
-
const int i = get_group_id(0);
|
284
|
+
const int i = get_group_id(0) + get_global_offset(0);
|
285
285
|
const int tid = get_local_id(0);
|
286
286
|
const int il = tid / 16;
|
287
287
|
const int ir = tid % 16;
|
288
288
|
const int is = 2 * il;
|
289
289
|
|
290
|
-
__global float *y = yy +
|
290
|
+
__global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
|
291
291
|
|
292
292
|
const float dall = vload_half(0, &x[i].d);
|
293
293
|
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
|
|
313
313
|
|
314
314
|
__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
|
315
315
|
{
|
316
|
-
const int i = get_group_id(0);
|
316
|
+
const int i = get_group_id(0) + get_global_offset(0);
|
317
317
|
const int tid = get_local_id(0);
|
318
318
|
const int ip = tid / 32;
|
319
319
|
const int il = tid - 32 * ip;
|
320
320
|
const int is = 8 * ip + il / 16;
|
321
321
|
|
322
|
-
__global float *y = yy +
|
322
|
+
__global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
|
323
323
|
|
324
324
|
const float d = vload_half(0, &x[i].d);
|
325
325
|
|
@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
|
730
730
|
const uint qk = QUANT_K;
|
731
731
|
const uint qr = QUANT_R;
|
732
732
|
|
733
|
-
const int ib = i/qk; // block index
|
733
|
+
const int ib = i/qk + get_global_offset(0); // block index
|
734
734
|
const int iqs = (i%qk)/qr; // quant index
|
735
735
|
const int iybs = i - i%qk; // y block start index
|
736
736
|
const int y_offset = qr == 1 ? 1 : qk/2;
|
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
|
1349
1349
|
const enum ggml_type type = src->type;
|
1350
1350
|
const size_t ts = ggml_type_size(type);
|
1351
1351
|
const size_t bs = ggml_blck_size(type);
|
1352
|
+
const uint64_t row_size = ts*ne0/bs;
|
1352
1353
|
|
1353
|
-
const
|
1354
|
-
if (nb0 == ts && nb1 ==
|
1355
|
-
|
1356
|
-
return err;
|
1354
|
+
const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
|
1355
|
+
if (nb0 == ts && nb1 == row_size) {
|
1356
|
+
return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
|
1357
1357
|
}
|
1358
1358
|
if (nb0 == ts) {
|
1359
1359
|
const size_t buffer_origin[3] = { offset, 0, 0 };
|
1360
1360
|
const size_t host_origin[3] = { 0, 0, 0 };
|
1361
|
-
const size_t region[3] = {
|
1362
|
-
|
1363
|
-
return err;
|
1361
|
+
const size_t region[3] = { row_size, ne1, 1 };
|
1362
|
+
return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
|
1364
1363
|
}
|
1364
|
+
std::vector<cl_event> events;
|
1365
|
+
if (ev && ne1>1) events.reserve(ne1-1);
|
1365
1366
|
for (uint64_t i1 = 0; i1 < ne1; i1++) {
|
1366
1367
|
// pretend the row is a matrix with cols=1
|
1367
|
-
const size_t buffer_origin[3] = { offset
|
1368
|
+
const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
|
1368
1369
|
const size_t host_origin[3] = { 0, 0, 0 };
|
1369
|
-
const size_t region[3] = { ts
|
1370
|
-
|
1370
|
+
const size_t region[3] = { ts, ne0/bs, 1 };
|
1371
|
+
// if an event is requested, make the last write wait for all previous writes to complete
|
1372
|
+
if (ev && i1) {
|
1373
|
+
events.push_back(*ev);
|
1374
|
+
}
|
1375
|
+
cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
|
1376
|
+
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
|
1371
1377
|
if (err != CL_SUCCESS) {
|
1372
|
-
|
1378
|
+
for (auto event : events) {
|
1379
|
+
clReleaseEvent(event);
|
1380
|
+
}
|
1381
|
+
return err;
|
1373
1382
|
}
|
1374
1383
|
}
|
1375
|
-
|
1384
|
+
for (auto event : events) {
|
1385
|
+
CL_CHECK(clReleaseEvent(event));
|
1386
|
+
}
|
1387
|
+
return CL_SUCCESS;
|
1376
1388
|
}
|
1377
1389
|
|
1378
1390
|
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
@@ -1476,10 +1488,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1476
1488
|
|
1477
1489
|
const int64_t ne10 = src1->ne[0];
|
1478
1490
|
const int64_t ne11 = src1->ne[1];
|
1491
|
+
const int64_t ne12 = src1->ne[2];
|
1492
|
+
const int64_t ne13 = src1->ne[3];
|
1479
1493
|
|
1480
1494
|
const int nb2 = dst->nb[2];
|
1481
1495
|
const int nb3 = dst->nb[3];
|
1482
1496
|
|
1497
|
+
const int64_t r2 = ne12 / ne02;
|
1498
|
+
const int64_t r3 = ne13 / ne03;
|
1499
|
+
|
1483
1500
|
const float alpha = 1.0f;
|
1484
1501
|
const float beta = 0.0f;
|
1485
1502
|
const int x_ne = ne01 * ne00;
|
@@ -1498,13 +1515,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1498
1515
|
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
1499
1516
|
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
1500
1517
|
|
1501
|
-
|
1502
|
-
|
1518
|
+
size_t x_offset = 0;
|
1519
|
+
int64_t pi02 = -1;
|
1520
|
+
int64_t pi03 = -1;
|
1521
|
+
|
1522
|
+
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
1523
|
+
int64_t i03 = i13 / r3;
|
1524
|
+
|
1525
|
+
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
1526
|
+
int64_t i02 = i12 / r2;
|
1527
|
+
|
1503
1528
|
// copy data to device
|
1504
|
-
if (src0->backend
|
1529
|
+
if (src0->backend == GGML_BACKEND_GPU) {
|
1530
|
+
x_offset = (i03 * ne02 + i02) * x_ne;
|
1531
|
+
} else if (i02 != pi02 || i03 != pi03) {
|
1505
1532
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
1533
|
+
pi02 = i02;
|
1534
|
+
pi03 = i03;
|
1506
1535
|
}
|
1507
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
|
1536
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
1508
1537
|
|
1509
1538
|
CL_CHECK(clFinish(queue));
|
1510
1539
|
|
@@ -1514,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1514
1543
|
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
1515
1544
|
ne01, ne11, ne10,
|
1516
1545
|
alpha,
|
1517
|
-
d_X,
|
1546
|
+
d_X, x_offset, ne00,
|
1518
1547
|
d_Y, 0, ne10,
|
1519
1548
|
beta,
|
1520
1549
|
d_D, 0, ne01,
|
@@ -1525,7 +1554,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1525
1554
|
}
|
1526
1555
|
|
1527
1556
|
// copy dst to host
|
1528
|
-
float * d = (float *) ((char *) dst->data +
|
1557
|
+
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
1529
1558
|
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
1530
1559
|
}
|
1531
1560
|
}
|
@@ -1547,6 +1576,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1547
1576
|
|
1548
1577
|
const int64_t ne10 = src1->ne[0];
|
1549
1578
|
const int64_t ne11 = src1->ne[1];
|
1579
|
+
const int64_t ne12 = src1->ne[2];
|
1580
|
+
const int64_t ne13 = src1->ne[3];
|
1550
1581
|
|
1551
1582
|
const int nb10 = src1->nb[0];
|
1552
1583
|
const int nb11 = src1->nb[1];
|
@@ -1556,6 +1587,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1556
1587
|
const int nb2 = dst->nb[2];
|
1557
1588
|
const int nb3 = dst->nb[3];
|
1558
1589
|
|
1590
|
+
const int64_t r2 = ne12 / ne02;
|
1591
|
+
const int64_t r3 = ne13 / ne03;
|
1592
|
+
|
1559
1593
|
const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
|
1560
1594
|
const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
|
1561
1595
|
const int x_ne = ne01 * ne00;
|
@@ -1577,32 +1611,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1577
1611
|
bool src1_cont_rows = nb10 == sizeof(float);
|
1578
1612
|
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
|
1579
1613
|
|
1580
|
-
|
1581
|
-
|
1614
|
+
size_t x_offset = 0;
|
1615
|
+
int64_t pi02 = -1;
|
1616
|
+
int64_t pi03 = -1;
|
1617
|
+
|
1618
|
+
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
1619
|
+
int64_t i03 = i13 / r3;
|
1620
|
+
|
1621
|
+
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
1622
|
+
int64_t i02 = i12 / r2;
|
1623
|
+
|
1582
1624
|
// copy src0 to device
|
1583
|
-
if (src0->backend
|
1625
|
+
if (src0->backend == GGML_BACKEND_GPU) {
|
1626
|
+
x_offset = (i03 * ne02 + i02) * x_ne;
|
1627
|
+
} else if (i02 != pi02 || i03 != pi03) {
|
1584
1628
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
1629
|
+
pi02 = i02;
|
1630
|
+
pi03 = i03;
|
1585
1631
|
}
|
1586
1632
|
|
1587
1633
|
// convert src1 to fp16
|
1588
1634
|
// TODO: use multiple threads
|
1589
|
-
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (
|
1590
|
-
char * src1i = (char *) src1->data +
|
1635
|
+
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
|
1636
|
+
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
1591
1637
|
if (src1_cont_rows) {
|
1592
1638
|
if (src1_cont_cols) {
|
1593
1639
|
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
1594
1640
|
}
|
1595
1641
|
else {
|
1596
|
-
for (int64_t
|
1597
|
-
ggml_fp32_to_fp16_row((float *) (src1i +
|
1642
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
1643
|
+
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
|
1598
1644
|
}
|
1599
1645
|
}
|
1600
1646
|
}
|
1601
1647
|
else {
|
1602
|
-
for (int64_t
|
1603
|
-
for (int64_t
|
1648
|
+
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
1649
|
+
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
1604
1650
|
// very slow due to no inlining
|
1605
|
-
tmp[
|
1651
|
+
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
|
1606
1652
|
}
|
1607
1653
|
}
|
1608
1654
|
}
|
@@ -1618,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1618
1664
|
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
1619
1665
|
ne01, ne11, ne10,
|
1620
1666
|
alpha,
|
1621
|
-
d_X,
|
1667
|
+
d_X, x_offset, ne00,
|
1622
1668
|
d_Y, 0, ne10,
|
1623
1669
|
beta,
|
1624
1670
|
d_D, 0, ne01,
|
@@ -1631,7 +1677,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
1631
1677
|
// copy dst to host, then convert to float
|
1632
1678
|
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
1633
1679
|
|
1634
|
-
float * d = (float *) ((char *) dst->data +
|
1680
|
+
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
1635
1681
|
|
1636
1682
|
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
1637
1683
|
}
|
@@ -1652,18 +1698,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
1652
1698
|
|
1653
1699
|
const int64_t ne10 = src1->ne[0];
|
1654
1700
|
const int64_t ne11 = src1->ne[1];
|
1701
|
+
const int64_t ne12 = src1->ne[2];
|
1702
|
+
const int64_t ne13 = src1->ne[3];
|
1655
1703
|
|
1656
1704
|
const int nb2 = dst->nb[2];
|
1657
1705
|
const int nb3 = dst->nb[3];
|
1658
1706
|
const ggml_type type = src0->type;
|
1659
1707
|
const bool mul_mat_vec = ne11 == 1;
|
1660
1708
|
|
1709
|
+
const int64_t r2 = ne12 / ne02;
|
1710
|
+
const int64_t r3 = ne13 / ne03;
|
1711
|
+
|
1661
1712
|
const float alpha = 1.0f;
|
1662
1713
|
const float beta = 0.0f;
|
1663
1714
|
const int x_ne = ne01 * ne00;
|
1664
1715
|
const int y_ne = ne11 * ne10;
|
1665
1716
|
const int d_ne = ne11 * ne01;
|
1666
|
-
const
|
1717
|
+
const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
|
1718
|
+
const size_t q_sz = ggml_type_size(type) * x_bps;
|
1667
1719
|
|
1668
1720
|
size_t x_size;
|
1669
1721
|
size_t y_size;
|
@@ -1690,12 +1742,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
1690
1742
|
size_t ev_idx = 0;
|
1691
1743
|
std::vector<cl_event> events;
|
1692
1744
|
|
1693
|
-
|
1694
|
-
|
1745
|
+
int64_t pi02 = -1;
|
1746
|
+
int64_t pi03 = -1;
|
1747
|
+
|
1748
|
+
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
1749
|
+
int64_t i03 = i13 / r3;
|
1750
|
+
|
1751
|
+
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
1752
|
+
int64_t i02 = i12 / r2;
|
1753
|
+
|
1695
1754
|
// copy src0 to device if necessary
|
1696
1755
|
if (src0->backend == GGML_BACKEND_CPU) {
|
1697
|
-
|
1698
|
-
|
1756
|
+
if (i02 != pi02 || i03 != pi03) {
|
1757
|
+
events.emplace_back();
|
1758
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
1759
|
+
pi02 = i02;
|
1760
|
+
pi03 = i03;
|
1761
|
+
}
|
1699
1762
|
} else if (src0->backend == GGML_BACKEND_GPU) {
|
1700
1763
|
d_Q = (cl_mem) src0->extra;
|
1701
1764
|
} else {
|
@@ -1704,7 +1767,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
1704
1767
|
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
1705
1768
|
// copy src1 to device
|
1706
1769
|
events.emplace_back();
|
1707
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
|
1770
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
1708
1771
|
|
1709
1772
|
// compute
|
1710
1773
|
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
@@ -1720,12 +1783,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
1720
1783
|
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
1721
1784
|
// convert src0 to fp32 on device
|
1722
1785
|
const size_t global = x_ne / global_denom;
|
1786
|
+
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
1723
1787
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
1724
1788
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
1725
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
1789
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
1726
1790
|
|
1727
1791
|
// copy src1 to device
|
1728
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
|
1792
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
1729
1793
|
|
1730
1794
|
events.emplace_back();
|
1731
1795
|
|
@@ -1749,7 +1813,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
1749
1813
|
}
|
1750
1814
|
|
1751
1815
|
// copy dst to host
|
1752
|
-
float * d = (float *) ((char *) dst->data +
|
1816
|
+
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
1753
1817
|
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
1754
1818
|
for (auto *event : events) {
|
1755
1819
|
clReleaseEvent(event);
|
@@ -1844,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
|
1844
1908
|
const int64_t ne3 = tensor->ne[3];
|
1845
1909
|
|
1846
1910
|
const ggml_type type = tensor->type;
|
1847
|
-
const size_t
|
1911
|
+
const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
|
1912
|
+
const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
|
1848
1913
|
|
1849
1914
|
size_t q_size;
|
1850
1915
|
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
|
1851
1916
|
|
1852
1917
|
tensor->data = data;
|
1853
1918
|
// copy tensor to device
|
1919
|
+
size_t offset = 0;
|
1854
1920
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
1855
1921
|
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
1856
|
-
|
1857
|
-
|
1922
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
|
1923
|
+
offset += s_sz;
|
1858
1924
|
}
|
1859
1925
|
}
|
1860
1926
|
|