llama_cpp 0.1.4 → 0.2.1
This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
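
The largest functional change visible in the ggml-opencl diff below is a new `mul_f32` OpenCL kernel that performs an element-wise multiply with broadcasting, computing `dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i % ky]`. The sketch below is only an illustrative CPU re-statement of that broadcasting rule; the helper name `mul_broadcast_f32` and the standalone setup are not part of the package.

```cpp
#include <cstdio>
#include <vector>

// Illustrative CPU reference for the broadcasting rule used by the new
// mul_f32 OpenCL kernel: every element of x is multiplied by y[i % ky],
// i.e. the length-ky vector y is repeated across x.
static void mul_broadcast_f32(const std::vector<float> &x,
                              const std::vector<float> &y, // length ky
                              std::vector<float> &dst) {
    const size_t ky = y.size();
    for (size_t i = 0; i < x.size(); ++i) {
        dst[i] = x[i] * y[i % ky];
    }
}

int main() {
    // A 2x4 "tensor" x scaled row-wise by a length-4 vector y.
    std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<float> y = {10, 100, 1000, 10000};
    std::vector<float> dst(x.size());
    mul_broadcast_f32(x, y, dst);
    for (float v : dst) {
        std::printf("%g ", v);
    }
    std::printf("\n"); // 10 200 3000 40000 50 600 7000 80000
    return 0;
}
```

In the backend itself, the same indexing runs inside the OpenCL kernel added to ggml-opencl.cpp and is dispatched per (i02, i03) slice by the new `ggml_cl_mul_f32` shown in the diff.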
data/ext/llama_cpp/src/ggml-opencl.cpp

@@ -3,6 +3,8 @@
 #include <array>
 #include <atomic>
 #include <sstream>
+#include <vector>
+#include <limits>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@@ -197,6 +199,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 }
 );
 
+std::string mul_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err) \
     do { \
         cl_int err_ = (err); \
@@ -239,6 +253,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
     "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
 };
 
+std::array<std::string, 2> mul_str_keys = {
+    "KERNEL_NAME", "TYPE"
+};
+std::array<std::string, 2> mul_str_values = {
+    "mul_f32", "float"
+};
+
 std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -261,6 +282,13 @@ std::string generate_kernels() {
         src << dequant_kernel << '\n';
         src << dmmv_kernel << '\n';
     }
+    for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+        std::string mul_kernel = mul_template;
+        for (size_t j = 0; j < mul_str_keys.size(); j++) {
+            replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+        }
+        src << mul_kernel << '\n';
+    }
     return src.str();
 }
 
@@ -272,6 +300,7 @@ static cl_program program;
 static cl_kernel convert_row_f16_cl;
 static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
 static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+static cl_kernel mul_f32_cl;
 static bool fp16_support;
 
 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -508,6 +537,9 @@ void ggml_cl_init(void) {
     CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
     CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
     CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+    // mul kernel
+    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
 }
 
 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -573,21 +605,44 @@ struct cl_buffer {
 static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
 static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
 
-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;
 
+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; //largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer&
-        if (b.size > 0 && b.size >= size)
-
-
-            b.size
-            return mem;
+        cl_buffer &b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
         }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
+        }
+    }
+    if(best_i!=-1) //found the smallest buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
     }
     cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context,
+    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
     *actual_size = size;
     return mem;
 }
@@ -607,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
     clReleaseMemObject(mem);
 }
 
+void ggml_cl_free_data(const struct ggml_tensor* tensor) {
+    if (tensor->backend != GGML_BACKEND_GPU) {
+        return;
+    }
+
+    cl_mem mem = (cl_mem)tensor->data;
+    clReleaseMemObject(mem);
+}
+
 static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
     cl_int err;
     const uint64_t ne0 = src->ne[0];
@@ -644,6 +708,99 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     return err;
 }
 
+static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[2];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int i1 = i13*ne12*ne11 + i12*ne11;
+
+                cl_int x_offset = 0;
+                cl_int y_offset = i1*ne10;
+                cl_int d_offset = 0;
+
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10;
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const int64_t i13 = i03%ne13;
+                    const int64_t i12 = i02%ne12;
+                    const int64_t i11 = i01%ne11;
+                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                    cl_int x_offset = i01*ne00;
+                    cl_int y_offset = i1*ne10;
+                    cl_int d_offset = i01*ne00;
+
+                    // compute
+                    size_t global = ne00;
+                    cl_int ky = ne10;
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                }
+            }
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_mul_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
@@ -666,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend ==
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
-            if (src0->backend !=
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
@@ -706,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend !=
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -742,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend ==
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
         d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
 
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -756,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy src0 to device
-            if (src0->backend !=
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
 
@@ -813,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend !=
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -847,57 +1004,61 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t q_size;
     cl_mem d_X;
     if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }
 
     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
     cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
     GGML_ASSERT(to_fp32_cl != nullptr);
 
+    size_t ev_idx = 0;
+    std::vector<cl_event> events;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev_sgemm;
-
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-
-
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+            } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->data;
            } else {
                 GGML_ASSERT(false);
             }
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
-
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
                 const size_t local = CL_DMMV_BLOCK_SIZE;
                 const cl_int ncols = ne00;
+                events.emplace_back();
                 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
 
+                events.emplace_back();
+
                 // wait for conversion
                 CL_CHECK(clFinish(queue));
 
@@ -910,7 +1071,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                                    d_Y, 0, ne10,
                                                    beta,
                                                    d_D, 0, ne01,
-                                                   &queue,
+                                                   &queue, events.data() + ev_idx++);
 
             if (status != clblast::StatusCode::kSuccess) {
                 GGML_ASSERT(false);
@@ -919,8 +1080,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &
-
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+            for (auto *event : events) {
+                clReleaseEvent(event);
+            }
+
+            ev_idx = 0;
+            events.clear();
         }
     }
 
@@ -945,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend ==
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
         return true;
     }
 
@@ -1001,7 +1167,7 @@ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g
     return 0;
 }
 
-void ggml_cl_transform_tensor(ggml_tensor * tensor) {
+void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne0 = tensor->ne[0];
     const int64_t ne1 = tensor->ne[1];
     const int64_t ne2 = tensor->ne[2];
@@ -1011,8 +1177,9 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
+    tensor->data = data;
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
@@ -1024,5 +1191,5 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));
 
     tensor->data = dst;
-    tensor->backend
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
data/ext/llama_cpp/src/ggml-opencl.h

@@ -8,6 +8,7 @@ extern "C" {
 
 void ggml_cl_init(void);
 
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -15,7 +16,9 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
 void * ggml_cl_host_malloc(size_t size);
 void ggml_cl_host_free(void * ptr);
 
-void
+void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
+void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
 }