llama_cpp 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -8
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +242 -52
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +835 -82
- data/ext/llama_cpp/src/ggml.h +64 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +489 -134
- data/ext/llama_cpp/src/llama.h +43 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
@@ -3,6 +3,8 @@
|
|
3
3
|
#include <array>
|
4
4
|
#include <atomic>
|
5
5
|
#include <sstream>
|
6
|
+
#include <vector>
|
7
|
+
#include <limits>
|
6
8
|
|
7
9
|
#define CL_TARGET_OPENCL_VERSION 110
|
8
10
|
#include <clblast.h>
|
@@ -197,6 +199,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
|
197
199
|
}
|
198
200
|
);
|
199
201
|
|
202
|
+
// OpenCL source template for the element-wise multiply kernel.
// KERNEL_NAME and TYPE are placeholders that are textually substituted (see
// mul_str_keys / mul_str_values) before the program is built.
// Each work item i writes dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i % ky],
// i.e. y is indexed cyclically with period ky.
std::string mul_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);

    if (i >= get_global_size(0)) {
        return;
    }

    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
}
);
|
213
|
+
|
200
214
|
#define CL_CHECK(err) \
|
201
215
|
do { \
|
202
216
|
cl_int err_ = (err); \
|
@@ -239,6 +253,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
|
239
253
|
"convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
|
240
254
|
};
|
241
255
|
|
256
|
+
// Placeholder tokens that appear in mul_template, in substitution order.
std::array<std::string, 2> mul_str_keys = {
    "KERNEL_NAME",
    "TYPE",
};

// Replacement text for mul_str_keys. Entries are laid out in consecutive
// groups of mul_str_keys.size() values, one group per generated kernel.
std::array<std::string, 2> mul_str_values = {
    "mul_f32",
    "float",
};
|
262
|
+
|
242
263
|
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
243
264
|
size_t pos = 0;
|
244
265
|
while ((pos = s.find(from, pos)) != std::string::npos) {
|
@@ -261,6 +282,13 @@ std::string generate_kernels() {
|
|
261
282
|
src << dequant_kernel << '\n';
|
262
283
|
src << dmmv_kernel << '\n';
|
263
284
|
}
|
285
|
+
for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
|
286
|
+
std::string mul_kernel = mul_template;
|
287
|
+
for (size_t j = 0; j < mul_str_keys.size(); j++) {
|
288
|
+
replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
|
289
|
+
}
|
290
|
+
src << mul_kernel << '\n';
|
291
|
+
}
|
264
292
|
return src.str();
|
265
293
|
}
|
266
294
|
|
@@ -272,6 +300,7 @@ static cl_program program;
|
|
272
300
|
static cl_kernel convert_row_f16_cl;
|
273
301
|
static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
|
274
302
|
static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
|
303
|
+
static cl_kernel mul_f32_cl;
|
275
304
|
static bool fp16_support;
|
276
305
|
|
277
306
|
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
@@ -469,16 +498,11 @@ void ggml_cl_init(void) {
|
|
469
498
|
|
470
499
|
size_t ext_str_size;
|
471
500
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
|
472
|
-
char*
|
501
|
+
char *ext_buffer = (char *)alloca(ext_str_size + 1);
|
473
502
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
503
|
+
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
|
474
504
|
// Check if ext_buffer contains cl_khr_fp16
|
475
|
-
|
476
|
-
if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
|
477
|
-
fp16_support = true;
|
478
|
-
break;
|
479
|
-
}
|
480
|
-
}
|
481
|
-
free(ext_buffer);
|
505
|
+
fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
482
506
|
fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
|
483
507
|
|
484
508
|
cl_context_properties properties[] = {
|
@@ -513,6 +537,9 @@ void ggml_cl_init(void) {
|
|
513
537
|
CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
|
514
538
|
CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
|
515
539
|
CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
|
540
|
+
|
541
|
+
// mul kernel
|
542
|
+
CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
|
516
543
|
}
|
517
544
|
|
518
545
|
static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
|
@@ -578,21 +605,44 @@ struct cl_buffer {
|
|
578
605
|
static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
|
579
606
|
static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
|
580
607
|
|
581
|
-
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size
|
608
|
+
// Returns a device buffer of at least `size` bytes and stores its real
// capacity in *actual_size.
//
// Strategy, under g_cl_pool_lock:
//   1. Reuse the smallest pooled buffer that is large enough (best fit).
//   2. Otherwise release the largest pooled buffer (if any) and fall through.
//   3. Create a fresh CL_MEM_READ_WRITE buffer of exactly `size` bytes.
// A pool slot with size == 0 is treated as empty.
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cl_pool_lock);
    cl_int err;

    int    fit_slot      = -1;                                  // tightest pooled buffer that satisfies the request
    size_t fit_bytes     = std::numeric_limits<size_t>::max();
    int    biggest_slot  = -1;                                  // largest pooled buffer regardless of fit
    size_t biggest_bytes = 0;

    for (int slot = 0; slot < MAX_CL_BUFFERS; ++slot) {
        cl_buffer & entry = g_cl_buffer_pool[slot];
        if (!(entry.size > 0)) {
            continue; // empty slot
        }
        if (entry.size >= size && entry.size < fit_bytes) {
            fit_slot  = slot;
            fit_bytes = entry.size;
        }
        if (entry.size > biggest_bytes) {
            biggest_slot  = slot;
            biggest_bytes = entry.size;
        }
    }

    if (fit_slot != -1) {
        // hand the pooled buffer to the caller and mark its slot as empty
        cl_buffer & entry = g_cl_buffer_pool[fit_slot];
        cl_mem reused = entry.mem;
        *actual_size = entry.size;
        entry.size = 0;
        return reused;
    }

    if (biggest_slot != -1) {
        // nothing fits: drop the largest pooled buffer before allocating a new one
        cl_buffer & entry = g_cl_buffer_pool[biggest_slot];
        cl_mem victim = entry.mem;
        entry.size = 0;
        clReleaseMemObject(victim);
    }

    cl_mem fresh;
    CL_CHECK((fresh = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
    *actual_size = size;
    return fresh;
}
|
@@ -612,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
|
|
612
662
|
clReleaseMemObject(mem);
|
613
663
|
}
|
614
664
|
|
665
|
+
void ggml_cl_free_data(const struct ggml_tensor* tensor) {
|
666
|
+
if (tensor->backend != GGML_BACKEND_GPU) {
|
667
|
+
return;
|
668
|
+
}
|
669
|
+
|
670
|
+
cl_mem mem = (cl_mem)tensor->data;
|
671
|
+
clReleaseMemObject(mem);
|
672
|
+
}
|
673
|
+
|
615
674
|
static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
|
616
675
|
cl_int err;
|
617
676
|
const uint64_t ne0 = src->ne[0];
|
@@ -649,6 +708,99 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
|
649
708
|
return err;
|
650
709
|
}
|
651
710
|
|
711
|
+
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
712
|
+
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
713
|
+
const int64_t ne00 = src0->ne[0];
|
714
|
+
const int64_t ne01 = src0->ne[1];
|
715
|
+
const int64_t ne02 = src0->ne[2];
|
716
|
+
const int64_t ne03 = src0->ne[2];
|
717
|
+
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
|
718
|
+
const int64_t ne10 = src1->ne[0];
|
719
|
+
const int64_t ne11 = src1->ne[1];
|
720
|
+
const int64_t ne12 = src1->ne[2];
|
721
|
+
const int64_t ne13 = src1->ne[3];
|
722
|
+
const int64_t nb10 = src1->nb[0];
|
723
|
+
const int nb2 = dst->nb[2];
|
724
|
+
const int nb3 = dst->nb[3];
|
725
|
+
size_t x_size;
|
726
|
+
size_t d_size;
|
727
|
+
|
728
|
+
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
|
729
|
+
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
|
730
|
+
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
|
731
|
+
|
732
|
+
|
733
|
+
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
734
|
+
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
735
|
+
const int i0 = i03*ne02 + i02;
|
736
|
+
|
737
|
+
cl_event ev;
|
738
|
+
|
739
|
+
// copy src0 to device
|
740
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
|
741
|
+
|
742
|
+
if (nb10 == sizeof(float)) {
|
743
|
+
// Contiguous, avoid overhead from queueing many kernel runs
|
744
|
+
const int64_t i13 = i03%ne13;
|
745
|
+
const int64_t i12 = i02%ne12;
|
746
|
+
const int i1 = i13*ne12*ne11 + i12*ne11;
|
747
|
+
|
748
|
+
cl_int x_offset = 0;
|
749
|
+
cl_int y_offset = i1*ne10;
|
750
|
+
cl_int d_offset = 0;
|
751
|
+
|
752
|
+
size_t global = ne00 * ne01;
|
753
|
+
cl_int ky = ne10;
|
754
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
|
755
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
|
756
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
|
757
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
|
758
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
|
759
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
|
760
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
|
761
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
|
762
|
+
} else {
|
763
|
+
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
764
|
+
const int64_t i13 = i03%ne13;
|
765
|
+
const int64_t i12 = i02%ne12;
|
766
|
+
const int64_t i11 = i01%ne11;
|
767
|
+
const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
|
768
|
+
|
769
|
+
cl_int x_offset = i01*ne00;
|
770
|
+
cl_int y_offset = i1*ne10;
|
771
|
+
cl_int d_offset = i01*ne00;
|
772
|
+
|
773
|
+
// compute
|
774
|
+
size_t global = ne00;
|
775
|
+
cl_int ky = ne10;
|
776
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
|
777
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
|
778
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
|
779
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
|
780
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
|
781
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
|
782
|
+
CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
|
783
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
|
784
|
+
}
|
785
|
+
}
|
786
|
+
|
787
|
+
CL_CHECK(clReleaseEvent(ev));
|
788
|
+
CL_CHECK(clFinish(queue));
|
789
|
+
|
790
|
+
// copy dst to host
|
791
|
+
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
792
|
+
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
|
793
|
+
}
|
794
|
+
}
|
795
|
+
ggml_cl_pool_free(d_X, x_size);
|
796
|
+
ggml_cl_pool_free(d_D, d_size);
|
797
|
+
}
|
798
|
+
|
799
|
+
void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
800
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
801
|
+
ggml_cl_mul_f32(src0, src1, dst);
|
802
|
+
}
|
803
|
+
|
652
804
|
static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
653
805
|
const int64_t ne00 = src0->ne[0];
|
654
806
|
const int64_t ne01 = src0->ne[1];
|
@@ -671,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
671
823
|
size_t y_size;
|
672
824
|
size_t d_size;
|
673
825
|
cl_mem d_X;
|
674
|
-
if (src0->backend ==
|
675
|
-
d_X =
|
826
|
+
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
827
|
+
d_X = (cl_mem) src0->data;
|
676
828
|
} else {
|
677
|
-
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size
|
829
|
+
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
|
678
830
|
}
|
679
|
-
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size
|
680
|
-
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size
|
831
|
+
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
832
|
+
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
681
833
|
|
682
834
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
683
835
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
684
836
|
// copy data to device
|
685
|
-
if (src0->backend !=
|
837
|
+
if (src0->backend != GGML_BACKEND_GPU) {
|
686
838
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
687
839
|
}
|
688
840
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
@@ -711,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
711
863
|
}
|
712
864
|
}
|
713
865
|
|
714
|
-
if (src0->backend !=
|
866
|
+
if (src0->backend != GGML_BACKEND_GPU) {
|
715
867
|
ggml_cl_pool_free(d_X, x_size);
|
716
868
|
}
|
717
869
|
ggml_cl_pool_free(d_Y, y_size);
|
@@ -747,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
747
899
|
size_t y_size;
|
748
900
|
size_t d_size;
|
749
901
|
cl_mem d_X;
|
750
|
-
if (src0->backend ==
|
751
|
-
d_X =
|
902
|
+
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
|
903
|
+
d_X = (cl_mem) src0->data;
|
752
904
|
} else {
|
753
|
-
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size
|
905
|
+
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
|
754
906
|
}
|
755
|
-
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size
|
756
|
-
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size
|
907
|
+
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
|
908
|
+
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
|
757
909
|
|
758
910
|
bool src1_cont_rows = nb10 == sizeof(float);
|
759
911
|
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
|
@@ -761,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
761
913
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
762
914
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
763
915
|
// copy src0 to device
|
764
|
-
if (src0->backend !=
|
916
|
+
if (src0->backend != GGML_BACKEND_GPU) {
|
765
917
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
766
918
|
}
|
767
919
|
|
@@ -818,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
818
970
|
}
|
819
971
|
}
|
820
972
|
|
821
|
-
if (src0->backend !=
|
973
|
+
if (src0->backend != GGML_BACKEND_GPU) {
|
822
974
|
ggml_cl_pool_free(d_X, x_size);
|
823
975
|
}
|
824
976
|
ggml_cl_pool_free(d_Y, y_size);
|
@@ -852,57 +1004,61 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
852
1004
|
size_t q_size;
|
853
1005
|
cl_mem d_X;
|
854
1006
|
if (!mul_mat_vec) {
|
855
|
-
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size
|
1007
|
+
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
|
856
1008
|
}
|
857
|
-
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size
|
858
|
-
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size
|
1009
|
+
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
1010
|
+
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
859
1011
|
cl_mem d_Q;
|
860
1012
|
if (src0->backend == GGML_BACKEND_CPU) {
|
861
|
-
d_Q = ggml_cl_pool_malloc(q_sz, &q_size
|
1013
|
+
d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
|
862
1014
|
}
|
863
1015
|
|
864
1016
|
cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
|
865
1017
|
cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
|
866
1018
|
GGML_ASSERT(to_fp32_cl != nullptr);
|
867
1019
|
|
1020
|
+
size_t ev_idx = 0;
|
1021
|
+
std::vector<cl_event> events;
|
1022
|
+
|
868
1023
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
869
1024
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
870
|
-
cl_event ev_sgemm;
|
871
|
-
|
872
1025
|
// copy src0 to device if necessary
|
873
1026
|
if (src0->backend == GGML_BACKEND_CPU) {
|
874
|
-
|
875
|
-
|
876
|
-
|
1027
|
+
events.emplace_back();
|
1028
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
1029
|
+
} else if (src0->backend == GGML_BACKEND_GPU) {
|
1030
|
+
d_Q = (cl_mem) src0->data;
|
877
1031
|
} else {
|
878
1032
|
GGML_ASSERT(false);
|
879
1033
|
}
|
880
1034
|
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
881
1035
|
// copy src1 to device
|
882
|
-
|
1036
|
+
events.emplace_back();
|
1037
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
|
883
1038
|
|
884
1039
|
// compute
|
885
1040
|
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
886
1041
|
const size_t local = CL_DMMV_BLOCK_SIZE;
|
887
1042
|
const cl_int ncols = ne00;
|
1043
|
+
events.emplace_back();
|
888
1044
|
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
889
1045
|
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
|
890
1046
|
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
|
891
1047
|
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
|
892
1048
|
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
|
893
|
-
CL_CHECK(
|
894
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
|
1049
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
895
1050
|
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
896
1051
|
// convert src0 to fp32 on device
|
897
1052
|
const size_t global = x_ne;
|
898
1053
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
899
1054
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
900
|
-
CL_CHECK(
|
901
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
|
1055
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
902
1056
|
|
903
1057
|
// copy src1 to device
|
904
1058
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
905
1059
|
|
1060
|
+
events.emplace_back();
|
1061
|
+
|
906
1062
|
// wait for conversion
|
907
1063
|
CL_CHECK(clFinish(queue));
|
908
1064
|
|
@@ -915,7 +1071,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
915
1071
|
d_Y, 0, ne10,
|
916
1072
|
beta,
|
917
1073
|
d_D, 0, ne01,
|
918
|
-
&queue,
|
1074
|
+
&queue, events.data() + ev_idx++);
|
919
1075
|
|
920
1076
|
if (status != clblast::StatusCode::kSuccess) {
|
921
1077
|
GGML_ASSERT(false);
|
@@ -924,8 +1080,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
924
1080
|
|
925
1081
|
// copy dst to host
|
926
1082
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
927
|
-
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &
|
928
|
-
|
1083
|
+
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
1084
|
+
for (auto *event : events) {
|
1085
|
+
clReleaseEvent(event);
|
1086
|
+
}
|
1087
|
+
|
1088
|
+
ev_idx = 0;
|
1089
|
+
events.clear();
|
929
1090
|
}
|
930
1091
|
}
|
931
1092
|
|
@@ -950,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
|
950
1111
|
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
951
1112
|
src1->type == GGML_TYPE_F32 &&
|
952
1113
|
dst->type == GGML_TYPE_F32 &&
|
953
|
-
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend ==
|
1114
|
+
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
|
954
1115
|
return true;
|
955
1116
|
}
|
956
1117
|
|
@@ -1016,19 +1177,48 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
|
|
1016
1177
|
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
1017
1178
|
|
1018
1179
|
size_t q_size;
|
1019
|
-
cl_mem
|
1020
|
-
*dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
1180
|
+
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
|
1021
1181
|
|
1022
1182
|
// copy tensor to device
|
1023
1183
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
1024
1184
|
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
1025
1185
|
int i = i3*ne2 + i2;
|
1026
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue,
|
1186
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
|
1027
1187
|
}
|
1028
1188
|
}
|
1029
1189
|
|
1030
1190
|
CL_CHECK(clFinish(queue));
|
1031
1191
|
|
1032
1192
|
tensor->data = dst;
|
1033
|
-
tensor->backend =
|
1193
|
+
tensor->backend = GGML_BACKEND_GPU;
|
1194
|
+
}
|
1195
|
+
|
1196
|
+
void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
|
1197
|
+
cl_int err;
|
1198
|
+
FILE * fp = fopen(fname, "rb");
|
1199
|
+
|
1200
|
+
const size_t size = ggml_nbytes(tensor);
|
1201
|
+
|
1202
|
+
cl_mem dst;
|
1203
|
+
CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
|
1204
|
+
void * buf_host = malloc(size);
|
1205
|
+
|
1206
|
+
#ifdef _WIN32
|
1207
|
+
int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
|
1208
|
+
#else
|
1209
|
+
int ret = fseek(fp, (long) offset, SEEK_SET);
|
1210
|
+
#endif
|
1211
|
+
GGML_ASSERT(ret == 0); // same
|
1212
|
+
|
1213
|
+
size_t ret2 = fread(buf_host, size, 1, fp);
|
1214
|
+
if (ret2 != 1) {
|
1215
|
+
fprintf(stderr, "unexpectedly reached end of file");
|
1216
|
+
exit(1);
|
1217
|
+
}
|
1218
|
+
|
1219
|
+
clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
|
1220
|
+
|
1221
|
+
tensor->data = dst;
|
1222
|
+
free(buf_host);
|
1223
|
+
fclose(fp);
|
1034
1224
|
}
|
@@ -8,6 +8,7 @@ extern "C" {
|
|
8
8
|
|
9
9
|
void ggml_cl_init(void);
|
10
10
|
|
11
|
+
void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
11
12
|
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
12
13
|
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
13
14
|
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
@@ -15,7 +16,10 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
|
|
15
16
|
void * ggml_cl_host_malloc(size_t size);
|
16
17
|
void ggml_cl_host_free(void * ptr);
|
17
18
|
|
19
|
+
void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
20
|
+
|
18
21
|
void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
|
22
|
+
void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);
|
19
23
|
|
20
24
|
#ifdef __cplusplus
|
21
25
|
}
|