llama_cpp 0.1.4 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +29 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +235 -39
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +340 -109
- data/ext/llama_cpp/src/ggml.h +44 -6
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +484 -136
- data/ext/llama_cpp/src/llama.h +39 -8
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml-opencl.cpp:

@@ -3,6 +3,8 @@
 #include <array>
 #include <atomic>
 #include <sstream>
+#include <vector>
+#include <limits>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@@ -197,6 +199,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 }
 );
 
+std::string mul_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err) \
     do { \
         cl_int err_ = (err); \
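The new `mul_template` above is an element-wise multiply with row broadcasting: each work-item writes `x[x_offset + i] * y[y_offset + i%ky]`, so one row of `y` (length `ky`) is reused across every row of `x`. A minimal host-side C++ sketch of the same indexing; `mul_broadcast_ref` is a hypothetical helper for illustration, not part of the gem's sources:

```cpp
// Host-side reference for what the generated mul_f32 kernel computes per work-item:
// dst[i] = x[i] * y[i % ky], i.e. a row of y of length ky is broadcast over x.
#include <cstdio>
#include <vector>

static void mul_broadcast_ref(const std::vector<float> & x, const std::vector<float> & y,
                              std::vector<float> & dst, int ky) {
    for (size_t i = 0; i < x.size(); ++i) {
        dst[i] = x[i] * y[i % ky];   // same indexing as the OpenCL kernel body
    }
}

int main() {
    std::vector<float> x = {1, 2, 3, 4, 5, 6};   // two rows of three elements
    std::vector<float> y = {10, 20, 30};         // one row, broadcast to both rows of x
    std::vector<float> dst(x.size());
    mul_broadcast_ref(x, y, dst, 3);
    for (float v : dst) printf("%g ", v);        // prints: 10 40 90 40 100 180
    printf("\n");
    return 0;
}
```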
@@ -239,6 +253,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
     "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
 };
 
+std::array<std::string, 2> mul_str_keys = {
+    "KERNEL_NAME", "TYPE"
+};
+std::array<std::string, 2> mul_str_values = {
+    "mul_f32", "float"
+};
+
 std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -261,6 +282,13 @@ std::string generate_kernels() {
         src << dequant_kernel << '\n';
         src << dmmv_kernel << '\n';
     }
+    for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+        std::string mul_kernel = mul_template;
+        for (size_t j = 0; j < mul_str_keys.size(); j++) {
+            replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+        }
+        src << mul_kernel << '\n';
+    }
     return src.str();
 }
 
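`generate_kernels()` stamps concrete kernels out of the template by substituting every key in `mul_str_keys` with the matching entry from `mul_str_values`, so `mul_template` currently yields a single `mul_f32` kernel specialized for `float`. A standalone sketch of that substitution step, using a hypothetical `replace_all` helper and a shortened template string:

```cpp
// Template substitution: KERNEL_NAME and TYPE placeholders are replaced pairwise,
// producing the concrete OpenCL source that is later compiled by ggml_cl_init.
#include <cstdio>
#include <string>

static std::string & replace_all(std::string & s, const std::string & from, const std::string & to) {
    size_t pos = 0;
    while ((pos = s.find(from, pos)) != std::string::npos) {
        s.replace(pos, from.size(), to);
        pos += to.size();
    }
    return s;
}

int main() {
    std::string tmpl = "__kernel void KERNEL_NAME(__global TYPE* x) { /* ... */ }";
    const char * keys[]   = {"KERNEL_NAME", "TYPE"};
    const char * values[] = {"mul_f32", "float"};
    for (int j = 0; j < 2; ++j) {
        replace_all(tmpl, keys[j], values[j]);
    }
    // prints: __kernel void mul_f32(__global float* x) { /* ... */ }
    printf("%s\n", tmpl.c_str());
    return 0;
}
```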
@@ -272,6 +300,7 @@ static cl_program program;
 static cl_kernel convert_row_f16_cl;
 static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
 static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+static cl_kernel mul_f32_cl;
 static bool fp16_support;
 
 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -508,6 +537,9 @@ void ggml_cl_init(void) {
     CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
     CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
     CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+    // mul kernel
+    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
 }
 
 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -573,21 +605,44 @@ struct cl_buffer {
 static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
 static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
 
-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;
 
+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; //largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer&
-        if (b.size > 0 && b.size >= size)
-
-
-        b.size
-        return mem;
+        cl_buffer &b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
         }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
+        }
+    }
+    if(best_i!=-1) //found the smallest buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
     }
     cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context,
+    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
     *actual_size = size;
     return mem;
 }
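The reworked `ggml_cl_pool_malloc` now scans the whole pool: it prefers the smallest cached buffer that is large enough (best fit), and if nothing fits it releases the largest cached buffer before allocating a fresh one. A sketch of that policy reduced to plain sizes; `pool_pick` and the sample pool contents are hypothetical, for illustration only:

```cpp
// Best-fit-else-evict-largest pool policy, mirrored with plain size_t slots.
#include <cstdio>
#include <limits>

#define MAX_BUFFERS 4

static size_t pool[MAX_BUFFERS] = {64, 256, 0, 1024};   // 0 means "slot unused"

// returns the chosen slot on a best-fit hit, or -1 if a fresh allocation is needed
static int pool_pick(size_t size) {
    int    best_i     = -1;
    size_t best_size  = std::numeric_limits<size_t>::max();
    int    worst_i    = -1;
    size_t worst_size = 0;
    for (int i = 0; i < MAX_BUFFERS; ++i) {
        if (pool[i] > 0 && pool[i] >= size && pool[i] < best_size) { best_i = i; best_size = pool[i]; }
        if (pool[i] > 0 && pool[i] > worst_size)                   { worst_i = i; worst_size = pool[i]; }
    }
    if (best_i != -1) {
        pool[best_i] = 0;        // hand the cached buffer out
        return best_i;
    }
    if (worst_i != -1) {
        pool[worst_i] = 0;       // evict the largest cached buffer; caller allocates anew
    }
    return -1;
}

int main() {
    printf("%d\n", pool_pick(200));   // 1: the 256-byte buffer is the best fit
    printf("%d\n", pool_pick(4096));  // -1: nothing fits, the 1024-byte buffer is evicted
    return 0;
}
```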
@@ -607,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
     clReleaseMemObject(mem);
 }
 
+void ggml_cl_free_data(const struct ggml_tensor* tensor) {
+    if (tensor->backend != GGML_BACKEND_GPU) {
+        return;
+    }
+
+    cl_mem mem = (cl_mem)tensor->data;
+    clReleaseMemObject(mem);
+}
+
 static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
     cl_int err;
     const uint64_t ne0 = src->ne[0];
@@ -644,6 +708,99 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     return err;
 }
 
+static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[2];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int i1 = i13*ne12*ne11 + i12*ne11;
+
+                cl_int x_offset = 0;
+                cl_int y_offset = i1*ne10;
+                cl_int d_offset = 0;
+
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10;
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const int64_t i13 = i03%ne13;
+                    const int64_t i12 = i02%ne12;
+                    const int64_t i11 = i01%ne11;
+                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                    cl_int x_offset = i01*ne00;
+                    cl_int y_offset = i1*ne10;
+                    cl_int d_offset = i01*ne00;
+
+                    // compute
+                    size_t global = ne00;
+                    cl_int ky = ne10;
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                }
+            }
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_mul_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
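`ggml_cl_mul_f32` keeps `src1` resident on the device and broadcasts it over `src0` slice by slice: slice `(i02, i03)` of `src0` is paired with slice `(i02 % ne12, i03 % ne13)` of `src1`, and within a slice the row wraps with `i % ne10`. A CPU reference of that broadcasting with flat arrays standing in for the tensors (assumes `ne01 = ne11 = 1` to keep the index math short; hypothetical example, not the gem's code):

```cpp
// CPU reference for the higher-dimension broadcasting done by ggml_cl_mul_f32.
#include <cstdio>
#include <vector>

int main() {
    // src0: ne00=2, ne01=1, ne02=2, ne03=1 (flattened, innermost dimension first)
    // src1: ne10=2, ne12=1, ne13=1        -> broadcast across i02
    const int ne00 = 2, ne01 = 1, ne02 = 2, ne03 = 1;
    const int ne10 = 2, ne12 = 1, ne13 = 1;
    std::vector<float> src0 = {1, 2, 3, 4};
    std::vector<float> src1 = {10, 100};
    std::vector<float> dst(src0.size());

    for (int i03 = 0; i03 < ne03; ++i03) {
        for (int i02 = 0; i02 < ne02; ++i02) {
            const int i13 = i03 % ne13;                        // broadcast in dim 3
            const int i12 = i02 % ne12;                        // broadcast in dim 2
            for (int i = 0; i < ne00 * ne01; ++i) {
                const int xi = (i03 * ne02 + i02) * ne00 * ne01 + i;
                const int yi = (i13 * ne12 + i12) * ne10 + (i % ne10);  // row wraps like i % ky
                dst[xi] = src0[xi] * src1[yi];
            }
        }
    }
    for (float v : dst) printf("%g ", v);                      // prints: 10 200 30 400
    printf("\n");
    return 0;
}
```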
@@ -666,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend ==
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
-            if (src0->backend !=
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
@@ -706,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend !=
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -742,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend ==
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
 
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -756,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy src0 to device
-            if (src0->backend !=
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
 
@@ -813,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend !=
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -847,57 +1004,61 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t q_size;
     cl_mem d_X;
     if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }
 
     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
     cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
     GGML_ASSERT(to_fp32_cl != nullptr);
 
+    size_t ev_idx = 0;
+    std::vector<cl_event> events;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev_sgemm;
-
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-
-
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+            } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->data;
             } else {
                 GGML_ASSERT(false);
            }
            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                // copy src1 to device
-
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
                 const size_t local = CL_DMMV_BLOCK_SIZE;
                 const cl_int ncols = ne00;
+                events.emplace_back();
                 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
 
+                events.emplace_back();
+
                 // wait for conversion
                 CL_CHECK(clFinish(queue));
 
@@ -910,7 +1071,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                      d_Y, 0, ne10,
                                      beta,
                                      d_D, 0, ne01,
-                                     &queue,
+                                     &queue, events.data() + ev_idx++);
 
                 if (status != clblast::StatusCode::kSuccess) {
                     GGML_ASSERT(false);
@@ -919,8 +1080,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &
-
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+            for (auto *event : events) {
+                clReleaseEvent(event);
+            }
+
+            ev_idx = 0;
+            events.clear();
         }
     }
 
@@ -945,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend ==
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
         return true;
     }
 
@@ -1011,7 +1177,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -1024,5 +1190,35 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));
 
     tensor->data = dst;
-    tensor->backend =
+    tensor->backend = GGML_BACKEND_GPU;
+}
+
+void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+    cl_int err;
+    FILE * fp = fopen(fname, "rb");
+
+    const size_t size = ggml_nbytes(tensor);
+
+    cl_mem dst;
+    CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
+    void * buf_host = malloc(size);
+
+#ifdef _WIN32
+    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+    int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+    GGML_ASSERT(ret == 0); // same
+
+    size_t ret2 = fread(buf_host, size, 1, fp);
+    if (ret2 != 1) {
+        fprintf(stderr, "unexpectedly reached end of file");
+        exit(1);
+    }
+
+    clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
+
+    tensor->data = dst;
+    free(buf_host);
+    fclose(fp);
 }
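`ggml_cl_load_data` reads a tensor's bytes straight from the model file at a given offset (`_fseeki64` on Windows, `fseek` elsewhere, then one `fread` of `ggml_nbytes(tensor)` bytes) and uploads them with `clEnqueueWriteBuffer`. The file-reading half is sketched below without any OpenCL so it stays runnable; `read_at` and the temporary `blob.bin` file are hypothetical, for illustration only:

```cpp
// Portable "read size bytes at a file offset" step, mirroring ggml_cl_load_data.
#include <cstdio>
#include <cstdlib>
#include <vector>

static std::vector<char> read_at(const char * fname, size_t offset, size_t size) {
    FILE * fp = fopen(fname, "rb");
    if (!fp) { perror("fopen"); exit(1); }
#ifdef _WIN32
    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
#else
    int ret = fseek(fp, (long) offset, SEEK_SET);
#endif
    if (ret != 0) { perror("fseek"); exit(1); }
    std::vector<char> buf(size);
    if (fread(buf.data(), size, 1, fp) != 1) {       // one full-size read, like the diff
        fprintf(stderr, "unexpectedly reached end of file\n");
        exit(1);
    }
    fclose(fp);
    return buf;
}

int main() {
    FILE * out = fopen("blob.bin", "wb");            // stand-in for a model file
    fputs("0123456789abcdef", out);
    fclose(out);

    std::vector<char> bytes = read_at("blob.bin", 4, 6);
    printf("%.*s\n", (int) bytes.size(), bytes.data());   // prints: 456789
    return 0;
}
```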
data/ext/llama_cpp/src/ggml-opencl.h:

@@ -8,6 +8,7 @@ extern "C" {
 
 void ggml_cl_init(void);
 
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -15,7 +16,10 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
 void * ggml_cl_host_malloc(size_t size);
 void ggml_cl_host_free(void * ptr);
 
+void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
 void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);
 
 #ifdef __cplusplus
 }