llama_cpp 0.1.3 → 0.2.0: changes to the bundled ggml OpenCL backend (ggml-opencl.cpp and ggml-opencl.h)

@@ -3,6 +3,8 @@
  #include <array>
  #include <atomic>
  #include <sstream>
+ #include <vector>
+ #include <limits>

  #define CL_TARGET_OPENCL_VERSION 110
  #include <clblast.h>
@@ -197,6 +199,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
  }
  );

+ std::string mul_template = MULTILINE_QUOTE(
+ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+ const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+ if (i >= get_global_size(0)) {
+ return;
+ }
+
+ dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+ }
+ );
+
  #define CL_CHECK(err) \
  do { \
  cl_int err_ = (err); \
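For reference, a minimal CPU-side sketch of the arithmetic the new `mul_f32` kernel performs per work-item: each output element is `x[i]` times `y[i % ky]`, so `y` is reused (broadcast) every `ky` elements. The function name and plain-array signature below are illustrative, not part of the codebase.

```cpp
#include <cstddef>

// CPU reference of one mul_f32 launch, offsets omitted: y repeats every ky elements.
static void mul_broadcast_ref(const float * x, const float * y, float * dst,
                              size_t n, size_t ky) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = x[i] * y[i % ky];
    }
}
```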
@@ -239,6 +253,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
  "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
  };

+ std::array<std::string, 2> mul_str_keys = {
+ "KERNEL_NAME", "TYPE"
+ };
+ std::array<std::string, 2> mul_str_values = {
+ "mul_f32", "float"
+ };
+
  std::string& replace(std::string& s, const std::string& from, const std::string& to) {
  size_t pos = 0;
  while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -261,6 +282,13 @@ std::string generate_kernels() {
  src << dequant_kernel << '\n';
  src << dmmv_kernel << '\n';
  }
+ for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+ std::string mul_kernel = mul_template;
+ for (size_t j = 0; j < mul_str_keys.size(); j++) {
+ replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+ }
+ src << mul_kernel << '\n';
+ }
  return src.str();
  }
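As a standalone illustration of the substitution `generate_kernels()` performs with `mul_str_keys`/`mul_str_values`: each placeholder in the template is replaced by its value, producing the concrete `mul_f32` source that is later compiled. The shortened template string below is a stand-in, not the real kernel text.

```cpp
#include <iostream>
#include <string>

int main() {
    // Abbreviated stand-in for mul_template.
    std::string tmpl = "__kernel void KERNEL_NAME(__global TYPE* x, __global TYPE* y) { /* ... */ }";
    const std::string keys[]   = { "KERNEL_NAME", "TYPE" };
    const std::string values[] = { "mul_f32",     "float" };
    for (int j = 0; j < 2; ++j) {
        // Same find/replace loop the in-tree replace() helper implements.
        for (size_t pos = 0; (pos = tmpl.find(keys[j], pos)) != std::string::npos; pos += values[j].size()) {
            tmpl.replace(pos, keys[j].size(), values[j]);
        }
    }
    std::cout << tmpl << '\n'; // __kernel void mul_f32(__global float* x, __global float* y) { ... }
    return 0;
}
```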
 
@@ -272,6 +300,7 @@ static cl_program program;
  static cl_kernel convert_row_f16_cl;
  static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
  static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+ static cl_kernel mul_f32_cl;
  static bool fp16_support;

  static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -469,16 +498,11 @@ void ggml_cl_init(void) {

  size_t ext_str_size;
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
- char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size);
+ char *ext_buffer = (char *)alloca(ext_str_size + 1);
  clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+ ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
  // Check if ext_buffer contains cl_khr_fp16
- for (size_t i = 0; i < ext_str_size - 12; i++) {
- if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
- fp16_support = true;
- break;
- }
- }
- free(ext_buffer);
+ fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
  fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");

  cl_context_properties properties[] = {
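The rewritten FP16 check queries the extension string, null-terminates it, and searches it with `strstr()` instead of a hand-rolled `memcmp` scan. A hypothetical heap-allocating helper with the same shape (the name `device_supports_extension` is not part of the codebase):

```cpp
#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>

// Hypothetical helper: query CL_DEVICE_EXTENSIONS, null-terminate, search with strstr().
static int device_supports_extension(cl_device_id dev, const char * ext) {
    size_t n = 0;
    clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &n);
    char * buf = (char *) malloc(n + 1);
    if (buf == NULL) return 0;
    clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, n, buf, NULL);
    buf[n] = '\0';
    int found = strstr(buf, ext) != NULL;
    free(buf);
    return found;
}
```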
@@ -513,6 +537,9 @@ void ggml_cl_init(void) {
  CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
  CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
  CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+ // mul kernel
+ CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
  }

  static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -578,21 +605,44 @@ struct cl_buffer {
  static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
  static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;

- static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
  scoped_spin_lock lock(g_cl_pool_lock);
  cl_int err;

+ int best_i = -1;
+ size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+ int worst_i = -1;
+ size_t worst_size = 0; //largest unused buffer seen so far
  for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
- cl_buffer& b = g_cl_buffer_pool[i];
- if (b.size > 0 && b.size >= size) {
- cl_mem mem = b.mem;
- *actual_size = b.size;
- b.size = 0;
- return mem;
+ cl_buffer &b = g_cl_buffer_pool[i];
+ if (b.size > 0 && b.size >= size && b.size < best_size)
+ {
+ best_i = i;
+ best_size = b.size;
+ }
+ if (b.size > 0 && b.size > worst_size)
+ {
+ worst_i = i;
+ worst_size = b.size;
  }
  }
+ if(best_i!=-1) //found the smallest buffer that fits our needs
+ {
+ cl_buffer& b = g_cl_buffer_pool[best_i];
+ cl_mem mem = b.mem;
+ *actual_size = b.size;
+ b.size = 0;
+ return mem;
+ }
+ if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+ {
+ cl_buffer& b = g_cl_buffer_pool[worst_i];
+ cl_mem mem = b.mem;
+ b.size = 0;
+ clReleaseMemObject(mem);
+ }
  cl_mem mem;
- CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
+ CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
  *actual_size = size;
  return mem;
  }
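The pool allocator now does best-fit reuse, evicting the largest cached entry when nothing fits, instead of handing back the first buffer large enough. A minimal sketch of that selection policy, isolated from OpenCL (the `pool_entry` type and function name are simplified stand-ins for the real cl_buffer pool):

```cpp
#include <cstddef>
#include <limits>

struct pool_entry { void * mem; size_t size; };   // stand-in for cl_buffer

// Returns the index of the smallest cached entry that fits `want`, or -1.
// Also reports the largest cached entry so the caller can release it before
// allocating fresh memory when nothing fits.
static int pick_best_fit(const pool_entry * pool, int n, size_t want, int * worst_out) {
    int best = -1, worst = -1;
    size_t best_size = std::numeric_limits<size_t>::max();
    size_t worst_size = 0;
    for (int i = 0; i < n; ++i) {
        if (pool[i].size > 0 && pool[i].size >= want && pool[i].size < best_size) {
            best = i;
            best_size = pool[i].size;
        }
        if (pool[i].size > 0 && pool[i].size > worst_size) {
            worst = i;
            worst_size = pool[i].size;
        }
    }
    *worst_out = worst;
    return best;
}
```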
@@ -612,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
  clReleaseMemObject(mem);
  }

+ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
+ if (tensor->backend != GGML_BACKEND_GPU) {
+ return;
+ }
+
+ cl_mem mem = (cl_mem)tensor->data;
+ clReleaseMemObject(mem);
+ }
+
  static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
  cl_int err;
  const uint64_t ne0 = src->ne[0];
@@ -649,6 +708,99 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
  return err;
  }

+ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[2];
+ const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+ const int64_t nb10 = src1->nb[0];
+ const int nb2 = dst->nb[2];
+ const int nb3 = dst->nb[3];
+ size_t x_size;
+ size_t d_size;
+
+ cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+ cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+ cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ const int i0 = i03*ne02 + i02;
+
+ cl_event ev;
+
+ // copy src0 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+ if (nb10 == sizeof(float)) {
+ // Contiguous, avoid overhead from queueing many kernel runs
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int i1 = i13*ne12*ne11 + i12*ne11;
+
+ cl_int x_offset = 0;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = 0;
+
+ size_t global = ne00 * ne01;
+ cl_int ky = ne10;
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+ } else {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int64_t i11 = i01%ne11;
+ const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+ cl_int x_offset = i01*ne00;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = i01*ne00;
+
+ // compute
+ size_t global = ne00;
+ cl_int ky = ne10;
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+ }
+ }
+
+ CL_CHECK(clReleaseEvent(ev));
+ CL_CHECK(clFinish(queue));
+
+ // copy dst to host
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+ }
+ }
+ ggml_cl_pool_free(d_X, x_size);
+ ggml_cl_pool_free(d_D, d_size);
+ }
+
+ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cl_mul_f32(src0, src1, dst);
+ }
+
  static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
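The host loop in `ggml_cl_mul_f32` implements ggml-style broadcasting of `src1` over `src0` for MUL: any dimension where `src1` is smaller wraps around via a modulo. A CPU reference of that index mapping over plain arrays (a contiguous row-major layout is assumed for clarity instead of the real `nb[]`-based strides):

```cpp
#include <cstdint>

// d = x * broadcast(y); contiguous layout assumed, modulo repeats y along smaller dims.
static void mul_broadcast4d_ref(const float * x, const float * y, float * d,
                                int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
                                int64_t ne10, int64_t ne11, int64_t ne12, int64_t ne13) {
    for (int64_t i03 = 0; i03 < ne03; ++i03)
    for (int64_t i02 = 0; i02 < ne02; ++i02)
    for (int64_t i01 = 0; i01 < ne01; ++i01)
    for (int64_t i00 = 0; i00 < ne00; ++i00) {
        const int64_t xi = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
        const int64_t yi = (((i03 % ne13)*ne12 + (i02 % ne12))*ne11 + (i01 % ne11))*ne10 + (i00 % ne10);
        d[xi] = x[xi] * y[yi];
    }
}
```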
@@ -671,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  size_t y_size;
  size_t d_size;
  cl_mem d_X;
- if (src0->backend == GGML_BACKEND_CL) {
- d_X = *(cl_mem*) src0->data;
+ if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+ d_X = (cl_mem) src0->data;
  } else {
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  // copy data to device
- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
  }
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
@@ -711,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  }
  }

- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  ggml_cl_pool_free(d_X, x_size);
  }
  ggml_cl_pool_free(d_Y, y_size);
@@ -747,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  size_t y_size;
  size_t d_size;
  cl_mem d_X;
- if (src0->backend == GGML_BACKEND_CL) {
- d_X = *(cl_mem*) src0->data;
+ if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+ d_X = (cl_mem) src0->data;
  } else {
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);

  bool src1_cont_rows = nb10 == sizeof(float);
  bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -761,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  // copy src0 to device
- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
  }
 
@@ -818,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  }
  }

- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  ggml_cl_pool_free(d_X, x_size);
  }
  ggml_cl_pool_free(d_Y, y_size);
@@ -852,57 +1004,61 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  size_t q_size;
  cl_mem d_X;
  if (!mul_mat_vec) {
- d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
+ d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
  cl_mem d_Q;
  if (src0->backend == GGML_BACKEND_CPU) {
- d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+ d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
  }

  cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
  cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
  GGML_ASSERT(to_fp32_cl != nullptr);

+ size_t ev_idx = 0;
+ std::vector<cl_event> events;
+
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
- cl_event ev_sgemm;
-
  // copy src0 to device if necessary
  if (src0->backend == GGML_BACKEND_CPU) {
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
- } else if (src0->backend == GGML_BACKEND_CL) {
- d_Q = *(cl_mem*) src0->data;
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ } else if (src0->backend == GGML_BACKEND_GPU) {
+ d_Q = (cl_mem) src0->data;
  } else {
  GGML_ASSERT(false);
  }
  if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
  // copy src1 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));

  // compute
  const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
  const size_t local = CL_DMMV_BLOCK_SIZE;
  const cl_int ncols = ne00;
+ events.emplace_back();
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
  CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
  CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
  CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clFinish(queue));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
  } else { // general dequantization kernel + CLBlast matrix matrix multiplication
  // convert src0 to fp32 on device
  const size_t global = x_ne;
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
- CL_CHECK(clFinish(queue));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

  // copy src1 to device
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

+ events.emplace_back();
+
  // wait for conversion
  CL_CHECK(clFinish(queue));
 
@@ -915,7 +1071,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
- &queue, &ev_sgemm);
+ &queue, events.data() + ev_idx++);

  if (status != clblast::StatusCode::kSuccess) {
  GGML_ASSERT(false);
@@ -924,8 +1080,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

  // copy dst to host
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
- clReleaseEvent(ev_sgemm);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+ for (auto *event : events) {
+ clReleaseEvent(event);
+ }
+
+ ev_idx = 0;
+ events.clear();
  }
  }
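The quantized mul_mat path now chains OpenCL events instead of calling `clFinish()` between every step, so uploads and compute kernels can be queued back to back and the host blocks only where a result is actually needed. A generic sketch of that pattern with two hypothetical kernels (`k1`, `k2`, and `queue` are assumed to exist; they are not names from the codebase):

```cpp
#include <CL/cl.h>

// Enqueue k1, make k2 wait on it via an event wait list, block only on the final event.
static void chained_enqueue(cl_command_queue queue, cl_kernel k1, cl_kernel k2, size_t global) {
    cl_event ev1, ev2;
    clEnqueueNDRangeKernel(queue, k1, 1, NULL, &global, NULL, 0, NULL, &ev1);
    clEnqueueNDRangeKernel(queue, k2, 1, NULL, &global, NULL, 1, &ev1, &ev2);
    clWaitForEvents(1, &ev2);
    clReleaseEvent(ev1);
    clReleaseEvent(ev2);
}
```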
 
@@ -950,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
  src1->type == GGML_TYPE_F32 &&
  dst->type == GGML_TYPE_F32 &&
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
  return true;
  }
 
@@ -1016,19 +1177,48 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
  const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

  size_t q_size;
- cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
- *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+ cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

  // copy tensor to device
  for (int64_t i3 = 0; i3 < ne3; i3++) {
  for (int64_t i2 = 0; i2 < ne2; i2++) {
  int i = i3*ne2 + i2;
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
  }
  }

  CL_CHECK(clFinish(queue));

  tensor->data = dst;
- tensor->backend = GGML_BACKEND_CL;
+ tensor->backend = GGML_BACKEND_GPU;
+ }
+
+ void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+ cl_int err;
+ FILE * fp = fopen(fname, "rb");
+
+ const size_t size = ggml_nbytes(tensor);
+
+ cl_mem dst;
+ CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
+ void * buf_host = malloc(size);
+
+ #ifdef _WIN32
+ int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+ #else
+ int ret = fseek(fp, (long) offset, SEEK_SET);
+ #endif
+ GGML_ASSERT(ret == 0); // same
+
+ size_t ret2 = fread(buf_host, size, 1, fp);
+ if (ret2 != 1) {
+ fprintf(stderr, "unexpectedly reached end of file");
+ exit(1);
+ }
+
+ clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
+
+ tensor->data = dst;
+ free(buf_host);
+ fclose(fp);
  }
@@ -8,6 +8,7 @@ extern "C" {

  void ggml_cl_init(void);

+ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -15,7 +16,10 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
  void * ggml_cl_host_malloc(size_t size);
  void ggml_cl_host_free(void * ptr);

+ void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
  void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+ void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);

  #ifdef __cplusplus
  }
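A hypothetical sketch of how the new entry points declared in the backend header fit together on the caller side; the real call sites live in the vendored llama.cpp model loader and graph evaluation, so treat this purely as an illustration (in particular, marking the tensor as GPU-resident here is an assumption about the caller's responsibility):

```cpp
#include "ggml.h"
#include "ggml-opencl.h"

static void offload_tensor(struct ggml_tensor * t, const char * fname, size_t offset) {
    ggml_cl_init();                      // compile kernels, pick platform/device
    ggml_cl_load_data(fname, t, offset); // read the tensor's bytes and upload them into a cl_mem
    t->backend = GGML_BACKEND_GPU;       // assumption: caller marks the tensor GPU-resident
    // ... graph evaluation can now route MUL / MUL_MAT involving t through
    //     ggml_cl_mul / ggml_cl_mul_mat ...
    ggml_cl_free_data(t);                // release the cl_mem stored in t->data
}
```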