llama_cpp 0.1.3 → 0.2.0
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -8
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +210 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +1916 -0
- data/ext/llama_cpp/src/ggml-cuda.h +15 -2
- data/ext/llama_cpp/src/ggml-metal.h +63 -0
- data/ext/llama_cpp/src/ggml-metal.m +783 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1133 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +242 -52
- data/ext/llama_cpp/src/ggml-opencl.h +4 -0
- data/ext/llama_cpp/src/ggml.c +835 -82
- data/ext/llama_cpp/src/ggml.h +64 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +489 -134
- data/ext/llama_cpp/src/llama.h +43 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +33 -1
- metadata +8 -2
data/ext/llama_cpp/src/ggml-opencl.cpp

@@ -3,6 +3,8 @@
 #include <array>
 #include <atomic>
 #include <sstream>
+#include <vector>
+#include <limits>

 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>

@@ -197,6 +199,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 }
 );

+std::string mul_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err) \
     do { \
         cl_int err_ = (err); \

@@ -239,6 +253,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
     "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
 };

+std::array<std::string, 2> mul_str_keys = {
+    "KERNEL_NAME", "TYPE"
+};
+std::array<std::string, 2> mul_str_values = {
+    "mul_f32", "float"
+};
+
 std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {

@@ -261,6 +282,13 @@ std::string generate_kernels() {
         src << dequant_kernel << '\n';
         src << dmmv_kernel << '\n';
     }
+    for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+        std::string mul_kernel = mul_template;
+        for (size_t j = 0; j < mul_str_keys.size(); j++) {
+            replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+        }
+        src << mul_kernel << '\n';
+    }
     return src.str();
 }

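For reference, once `generate_kernels()` runs the substitution loop above, `mul_template` expands with `KERNEL_NAME -> mul_f32` and `TYPE -> float`. The result should be roughly the following OpenCL source (a sketch of the generated kernel, reconstructed from the template shown in the diff; each work item multiplies one element of `x` by an element of `y` chosen modulo `ky`, which is what lets a shorter `y` be broadcast):

```c
// Sketch of the OpenCL source produced from mul_template with
// KERNEL_NAME -> "mul_f32" and TYPE -> "float".
__kernel void mul_f32(__global float* x, const int x_offset,
                      __global float* y, const int y_offset,
                      __global float* dst, const int dst_offset,
                      const int ky) {
    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);

    if (i >= get_global_size(0)) {
        return;
    }

    // y is indexed modulo ky, so a shorter y is reused (broadcast) across x
    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
}
```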
@@ -272,6 +300,7 @@ static cl_program program;
 static cl_kernel convert_row_f16_cl;
 static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
 static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+static cl_kernel mul_f32_cl;
 static bool fp16_support;

 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {

@@ -469,16 +498,11 @@ void ggml_cl_init(void) {

     size_t ext_str_size;
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
-    char*
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
     // Check if ext_buffer contains cl_khr_fp16
-
-        if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
-            fp16_support = true;
-            break;
-        }
-    }
-    free(ext_buffer);
+    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
     fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");

     cl_context_properties properties[] = {

@@ -513,6 +537,9 @@ void ggml_cl_init(void) {
     CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
     CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
     CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+    // mul kernel
+    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
 }

 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {

@@ -578,21 +605,44 @@ struct cl_buffer {
 static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
 static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;

-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;

+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; //largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer&
-        if (b.size > 0 && b.size >= size)
-
-
-        b.size
-
+        cl_buffer &b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
+        }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
         }
     }
+    if(best_i!=-1) //found the smallest buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
+    }
     cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context,
+    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
     *actual_size = size;
     return mem;
 }
@@ -612,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
     clReleaseMemObject(mem);
 }

+void ggml_cl_free_data(const struct ggml_tensor* tensor) {
+    if (tensor->backend != GGML_BACKEND_GPU) {
+        return;
+    }
+
+    cl_mem mem = (cl_mem)tensor->data;
+    clReleaseMemObject(mem);
+}
+
 static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
     cl_int err;
     const uint64_t ne0 = src->ne[0];

@@ -649,6 +708,99 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     return err;
 }

+static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[2];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int i1 = i13*ne12*ne11 + i12*ne11;
+
+                cl_int x_offset = 0;
+                cl_int y_offset = i1*ne10;
+                cl_int d_offset = 0;
+
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10;
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const int64_t i13 = i03%ne13;
+                    const int64_t i12 = i02%ne12;
+                    const int64_t i11 = i01%ne11;
+                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                    cl_int x_offset = i01*ne00;
+                    cl_int y_offset = i1*ne10;
+                    cl_int d_offset = i01*ne00;
+
+                    // compute
+                    size_t global = ne00;
+                    cl_int ky = ne10;
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                }
+            }
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_mul_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
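The new `ggml_cl_mul_f32` host loop and the `mul_f32` kernel implement ggml-style broadcasting: within a row, `y` is indexed `i % ky`, so a shorter `src1` row is reused across `src0`. A small, self-contained CPU reference of the same indexing, useful for checking expected results (the helper name `mul_broadcast` is illustrative, not from the gem):

```cpp
#include <cstdio>
#include <vector>

// CPU reference for the mul_f32 kernel's indexing: dst[i] = x[i] * y[i % ky].
// When ky == x.size() this is a plain element-wise product; when ky divides
// x.size() the shorter y is broadcast (repeated) across x.
std::vector<float> mul_broadcast(const std::vector<float> & x,
                                 const std::vector<float> & y) {
    const int ky = (int) y.size();
    std::vector<float> dst(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        dst[i] = x[i] * y[i % ky];
    }
    return dst;
}

int main() {
    // x has 6 elements, y has 2: y is reused three times across x.
    std::vector<float> x = {1, 2, 3, 4, 5, 6};
    std::vector<float> y = {10, 100};
    for (float v : mul_broadcast(x, y)) {
        std::printf("%g ", v);   // prints: 10 200 30 400 50 600
    }
    std::printf("\n");
}
```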
@@ -671,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend ==
-        d_X =
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+        d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
-            if (src0->backend !=
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

@@ -711,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend !=
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);

@@ -747,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend ==
-        d_X =
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+        d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);

     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

@@ -761,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy src0 to device
-            if (src0->backend !=
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }

@@ -818,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend !=
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);

@@ -852,57 +1004,61 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t q_size;
     cl_mem d_X;
     if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }

     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
     cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
     GGML_ASSERT(to_fp32_cl != nullptr);

+    size_t ev_idx = 0;
+    std::vector<cl_event> events;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev_sgemm;
-
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-
-
-
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+            } else if (src0->backend == GGML_BACKEND_GPU) {
+                d_Q = (cl_mem) src0->data;
             } else {
                 GGML_ASSERT(false);
             }
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
-
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));

                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
                 const size_t local = CL_DMMV_BLOCK_SIZE;
                 const cl_int ncols = ne00;
+                events.emplace_back();
                 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

+                events.emplace_back();
+
                 // wait for conversion
                 CL_CHECK(clFinish(queue));

@@ -915,7 +1071,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                         d_Y, 0, ne10,
                         beta,
                         d_D, 0, ne01,
-                        &queue,
+                        &queue, events.data() + ev_idx++);

                 if (status != clblast::StatusCode::kSuccess) {
                     GGML_ASSERT(false);

@@ -924,8 +1080,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &
-
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+            for (auto *event : events) {
+                clReleaseEvent(event);
+            }
+
+            ev_idx = 0;
+            events.clear();
         }
     }

@@ -950,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend ==
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
         return true;
     }

@@ -1016,19 +1177,48 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

     size_t q_size;
-    cl_mem
-    *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue,
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
         }
     }

     CL_CHECK(clFinish(queue));

     tensor->data = dst;
-    tensor->backend =
+    tensor->backend = GGML_BACKEND_GPU;
+}
+
+void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+    cl_int err;
+    FILE * fp = fopen(fname, "rb");
+
+    const size_t size = ggml_nbytes(tensor);
+
+    cl_mem dst;
+    CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
+    void * buf_host = malloc(size);
+
+#ifdef _WIN32
+    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+    int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+    GGML_ASSERT(ret == 0); // same
+
+    size_t ret2 = fread(buf_host, size, 1, fp);
+    if (ret2 != 1) {
+        fprintf(stderr, "unexpectedly reached end of file");
+        exit(1);
+    }
+
+    clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
+
+    tensor->data = dst;
+    free(buf_host);
+    fclose(fp);
 }

data/ext/llama_cpp/src/ggml-opencl.h

@@ -8,6 +8,7 @@ extern "C" {

 void ggml_cl_init(void);

+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

@@ -15,7 +16,10 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
 void * ggml_cl_host_malloc(size_t size);
 void ggml_cl_host_free(void * ptr);

+void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
 void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);

 #ifdef __cplusplus
 }
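Taken together, the new declarations expose element-wise multiply, device upload, and cleanup to the rest of the extension. The following is a hedged usage sketch only, assuming the gem's bundled ggml is built with the OpenCL (CLBlast) backend enabled; the tensor sizes and flow are illustrative and not taken from llama_cpp's own code:

```cpp
// Sketch only: how the new ggml-opencl entry points fit together, assuming a
// build with the OpenCL/CLBlast backend enabled. Not code from the gem itself.
#include <cstdio>
#include "ggml.h"
#include "ggml-opencl.h"

int main() {
    ggml_cl_init(); // set up the OpenCL context, queue and kernels

    struct ggml_init_params params = { /*.mem_size   =*/ 16 * 1024 * 1024,
                                       /*.mem_buffer =*/ NULL,
                                       /*.no_alloc   =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // src0 and dst stay on the host; src1 is moved to the GPU first,
    // as ggml_cl_mul() expects (it asserts src1->backend == GGML_BACKEND_GPU).
    struct ggml_tensor * src0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * src1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * dst  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(src0, 2.0f);
    ggml_set_f32(src1, 3.0f);

    ggml_cl_transform_tensor(src1);   // upload src1; its backend becomes GGML_BACKEND_GPU
    ggml_cl_mul(src0, src1, dst);     // dst[i] = src0[i] * src1[i], result read back to host

    std::printf("dst[0] = %f\n", ggml_get_f32_1d(dst, 0)); // expect 6.0
    ggml_cl_free_data(src1);          // release the device buffer behind src1
    ggml_free(ctx);
}
```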