llama_cpp 0.1.4 → 0.2.1

This diff shows the changes between two publicly released versions of the package as they appear in its public registry. It is provided for informational purposes only.
@@ -3,6 +3,8 @@
  #include <array>
  #include <atomic>
  #include <sstream>
+ #include <vector>
+ #include <limits>

  #define CL_TARGET_OPENCL_VERSION 110
  #include <clblast.h>
@@ -197,6 +199,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
  }
  );

+ std::string mul_template = MULTILINE_QUOTE(
+ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+ const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+ if (i >= get_global_size(0)) {
+ return;
+ }
+
+ dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+ }
+ );
+
  #define CL_CHECK(err) \
  do { \
  cl_int err_ = (err); \
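The mul_template added above is a plain element-wise multiply kernel in which the second operand is broadcast: each work item writes one output element and indexes y modulo ky, so a buffer of ky floats can be reused across every row of x. Below is a minimal host-side C++ model of that indexing, with invented sizes and values, just to illustrate the i % ky broadcast; it is not part of the package.

```cpp
#include <cstdio>
#include <vector>

// Host-side model of the mul_f32 kernel body: dst[i] = x[i] * y[i % ky].
// ky plays the role of src1's row length; the values here are illustrative only.
int main() {
    const int ky = 4;                        // length of the broadcast operand
    std::vector<float> x(8, 2.0f);           // two "rows" of four elements
    std::vector<float> y = {1, 10, 100, 1000};
    std::vector<float> dst(x.size());

    for (size_t i = 0; i < x.size(); ++i) {
        dst[i] = x[i] * y[i % ky];           // same indexing as the OpenCL kernel
    }

    for (float v : dst) printf("%g ", v);    // 2 20 200 2000 2 20 200 2000
    printf("\n");
}
```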
@@ -239,6 +253,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
  "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
  };

+ std::array<std::string, 2> mul_str_keys = {
+ "KERNEL_NAME", "TYPE"
+ };
+ std::array<std::string, 2> mul_str_values = {
+ "mul_f32", "float"
+ };
+
  std::string& replace(std::string& s, const std::string& from, const std::string& to) {
  size_t pos = 0;
  while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -261,6 +282,13 @@ std::string generate_kernels() {
  src << dequant_kernel << '\n';
  src << dmmv_kernel << '\n';
  }
+ for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+ std::string mul_kernel = mul_template;
+ for (size_t j = 0; j < mul_str_keys.size(); j++) {
+ replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+ }
+ src << mul_kernel << '\n';
+ }
  return src.str();
  }
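generate_kernels() now also stamps the multiply kernel out of mul_template by substituting the KERNEL_NAME and TYPE placeholders from mul_str_keys/mul_str_values, producing a single mul_f32 variant for float. The sketch below models that substitution step in isolation; replace_all and the trimmed template are stand-ins, not the package's replace() helper or the full kernel source.

```cpp
#include <iostream>
#include <string>

// Stand-in for the file's replace() helper: substitute every occurrence of `from`.
static void replace_all(std::string& s, const std::string& from, const std::string& to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    // Trimmed-down version of mul_template with the same placeholders.
    std::string mul_template =
        "__kernel void KERNEL_NAME(__global TYPE* x, __global TYPE* y, __global TYPE* dst, const int ky) {\n"
        "    const int i = get_global_id(0);\n"
        "    dst[i] = x[i] * y[i % ky];\n"
        "}\n";

    std::string kernel = mul_template;
    replace_all(kernel, "KERNEL_NAME", "mul_f32");  // mul_str_values[0]
    replace_all(kernel, "TYPE", "float");           // mul_str_values[1]
    std::cout << kernel;  // this generated source is what clCreateKernel(..., "mul_f32", ...) later looks up
}
```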
 
@@ -272,6 +300,7 @@ static cl_program program;
  static cl_kernel convert_row_f16_cl;
  static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
  static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+ static cl_kernel mul_f32_cl;
  static bool fp16_support;

  static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -508,6 +537,9 @@ void ggml_cl_init(void) {
  CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
  CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
  CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+ // mul kernel
+ CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
  }

  static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -573,21 +605,44 @@ struct cl_buffer {
  static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
  static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;

- static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
  scoped_spin_lock lock(g_cl_pool_lock);
  cl_int err;

+ int best_i = -1;
+ size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+ int worst_i = -1;
+ size_t worst_size = 0; //largest unused buffer seen so far
  for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
- cl_buffer& b = g_cl_buffer_pool[i];
- if (b.size > 0 && b.size >= size) {
- cl_mem mem = b.mem;
- *actual_size = b.size;
- b.size = 0;
- return mem;
+ cl_buffer &b = g_cl_buffer_pool[i];
+ if (b.size > 0 && b.size >= size && b.size < best_size)
+ {
+ best_i = i;
+ best_size = b.size;
  }
+ if (b.size > 0 && b.size > worst_size)
+ {
+ worst_i = i;
+ worst_size = b.size;
+ }
+ }
+ if(best_i!=-1) //found the smallest buffer that fits our needs
+ {
+ cl_buffer& b = g_cl_buffer_pool[best_i];
+ cl_mem mem = b.mem;
+ *actual_size = b.size;
+ b.size = 0;
+ return mem;
+ }
+ if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+ {
+ cl_buffer& b = g_cl_buffer_pool[worst_i];
+ cl_mem mem = b.mem;
+ b.size = 0;
+ clReleaseMemObject(mem);
  }
  cl_mem mem;
- CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
+ CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
  *actual_size = size;
  return mem;
  }
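ggml_cl_pool_malloc drops its cl_mem_flags parameter (pooled buffers are now always created CL_MEM_READ_WRITE) and changes its reuse policy from first-fit to best-fit: it hands out the smallest free buffer that is large enough, and when nothing fits it releases the largest free buffer before allocating a fresh one, which keeps the pool from accumulating oversized allocations. A simplified, OpenCL-free model of that selection logic follows; the Pool type and the sizes in it are invented for illustration.

```cpp
#include <cstdio>
#include <cstddef>
#include <limits>

// Toy model of the pool's selection policy; "buffers" are just recorded sizes.
struct Pool {
    static constexpr int N = 4;
    size_t free_size[N] = {256, 64, 1024, 0};  // 0 means slot unused

    // Returns the size of the buffer handed out, mimicking ggml_cl_pool_malloc.
    size_t alloc(size_t want) {
        int best_i = -1, worst_i = -1;
        size_t best = std::numeric_limits<size_t>::max(), worst = 0;
        for (int i = 0; i < N; ++i) {
            size_t s = free_size[i];
            if (s > 0 && s >= want && s < best) { best_i = i; best = s; }
            if (s > 0 && s > worst)             { worst_i = i; worst = s; }
        }
        if (best_i != -1) {             // smallest free buffer that fits: reuse it
            free_size[best_i] = 0;
            return best;
        }
        if (worst_i != -1) {            // nothing fits: drop the largest free buffer
            free_size[worst_i] = 0;     // (the real code clReleaseMemObject()s it)
        }
        return want;                    // then allocate exactly `want` bytes fresh
    }
};

int main() {
    Pool p;
    printf("%zu\n", p.alloc(100));   // 256: smallest free buffer >= 100
    printf("%zu\n", p.alloc(2048));  // 2048: nothing fits, the 1024 buffer is evicted
}
```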
@@ -607,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
  clReleaseMemObject(mem);
  }

+ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
+ if (tensor->backend != GGML_BACKEND_GPU) {
+ return;
+ }
+
+ cl_mem mem = (cl_mem)tensor->data;
+ clReleaseMemObject(mem);
+ }
+
  static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
  cl_int err;
  const uint64_t ne0 = src->ne[0];
@@ -644,6 +708,99 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
  return err;
  }

+ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[2];
+ const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+ const int64_t nb10 = src1->nb[0];
+ const int nb2 = dst->nb[2];
+ const int nb3 = dst->nb[3];
+ size_t x_size;
+ size_t d_size;
+
+ cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+ cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+ cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ const int i0 = i03*ne02 + i02;
+
+ cl_event ev;
+
+ // copy src0 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+ if (nb10 == sizeof(float)) {
+ // Contiguous, avoid overhead from queueing many kernel runs
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int i1 = i13*ne12*ne11 + i12*ne11;
+
+ cl_int x_offset = 0;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = 0;
+
+ size_t global = ne00 * ne01;
+ cl_int ky = ne10;
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+ } else {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int64_t i11 = i01%ne11;
+ const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+ cl_int x_offset = i01*ne00;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = i01*ne00;
+
+ // compute
+ size_t global = ne00;
+ cl_int ky = ne10;
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+ }
+ }
+
+ CL_CHECK(clReleaseEvent(ev));
+ CL_CHECK(clFinish(queue));
+
+ // copy dst to host
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+ }
+ }
+ ggml_cl_pool_free(d_X, x_size);
+ ggml_cl_pool_free(d_D, d_size);
+ }
+
+ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cl_mul_f32(src0, src1, dst);
+ }
+
  static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
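ggml_cl_mul / ggml_cl_mul_f32 add element-wise multiplication for F32 tensors where src1 already resides on the GPU and is broadcast against src0 via modulo indexing (i03%ne13, i02%ne12, i%ky); src0 is uploaded slice by slice, multiplied by the mul_f32 kernel, and the result is read back into dst. The following CPU-only sketch reproduces the broadcast arithmetic for a single 2D slice with invented shapes and values; it is an illustration, not the package's code path.

```cpp
#include <cstdio>
#include <vector>

// CPU model of the broadcast used by the OpenCL path: each row of src0 is
// multiplied element-wise by src1, whose row index wraps via modulo.
int main() {
    const int ne00 = 4, ne01 = 3;   // src0: 3 rows of 4 floats (invented shape)
    const int ne10 = 4, ne11 = 1;   // src1: a single row, broadcast to all rows
    std::vector<float> src0(ne00 * ne01, 3.0f);
    std::vector<float> src1 = {1, 2, 3, 4};
    std::vector<float> dst(src0.size());

    for (int i01 = 0; i01 < ne01; ++i01) {
        const int i11 = i01 % ne11;  // broadcast: every src0 row maps to src1 row 0
        for (int i00 = 0; i00 < ne00; ++i00) {
            dst[i01*ne00 + i00] = src0[i01*ne00 + i00] * src1[i11*ne10 + i00 % ne10];
        }
    }

    for (int i01 = 0; i01 < ne01; ++i01) {
        for (int i00 = 0; i00 < ne00; ++i00) printf("%g ", dst[i01*ne00 + i00]);
        printf("\n");  // each row prints: 3 6 9 12
    }
}
```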
@@ -666,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  size_t y_size;
  size_t d_size;
  cl_mem d_X;
- if (src0->backend == GGML_BACKEND_CL) {
+ if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
  d_X = (cl_mem) src0->data;
  } else {
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  // copy data to device
- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
  }
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
@@ -706,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  }
  }

- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  ggml_cl_pool_free(d_X, x_size);
  }
  ggml_cl_pool_free(d_Y, y_size);
@@ -742,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  size_t y_size;
  size_t d_size;
  cl_mem d_X;
- if (src0->backend == GGML_BACKEND_CL) {
+ if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
  d_X = (cl_mem) src0->data;
  } else {
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);

  bool src1_cont_rows = nb10 == sizeof(float);
  bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -756,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  // copy src0 to device
- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
  }
 
@@ -813,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  }
  }

- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  ggml_cl_pool_free(d_X, x_size);
  }
  ggml_cl_pool_free(d_Y, y_size);
@@ -847,57 +1004,61 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  size_t q_size;
  cl_mem d_X;
  if (!mul_mat_vec) {
- d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
+ d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
  cl_mem d_Q;
  if (src0->backend == GGML_BACKEND_CPU) {
- d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+ d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
  }

  cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
  cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
  GGML_ASSERT(to_fp32_cl != nullptr);

+ size_t ev_idx = 0;
+ std::vector<cl_event> events;
+
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
- cl_event ev_sgemm;
-
  // copy src0 to device if necessary
  if (src0->backend == GGML_BACKEND_CPU) {
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
- } else if (src0->backend == GGML_BACKEND_CL) {
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ } else if (src0->backend == GGML_BACKEND_GPU) {
  d_Q = (cl_mem) src0->data;
  } else {
  GGML_ASSERT(false);
  }
  if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
  // copy src1 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));

  // compute
  const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
  const size_t local = CL_DMMV_BLOCK_SIZE;
  const cl_int ncols = ne00;
+ events.emplace_back();
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
  CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
  CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
  CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clFinish(queue));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
  } else { // general dequantization kernel + CLBlast matrix matrix multiplication
  // convert src0 to fp32 on device
  const size_t global = x_ne;
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
- CL_CHECK(clFinish(queue));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

  // copy src1 to device
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

+ events.emplace_back();
+
  // wait for conversion
  CL_CHECK(clFinish(queue));
 
@@ -910,7 +1071,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
- &queue, &ev_sgemm);
+ &queue, events.data() + ev_idx++);

  if (status != clblast::StatusCode::kSuccess) {
  GGML_ASSERT(false);
@@ -919,8 +1080,13 @@

  // copy dst to host
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
- clReleaseEvent(ev_sgemm);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+ for (auto *event : events) {
+ clReleaseEvent(event);
+ }
+
+ ev_idx = 0;
+ events.clear();
  }
  }
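In ggml_cl_mul_mat_q_f32 the per-iteration clFinish() calls are replaced by a std::vector<cl_event> chain: the host-to-device copies record events, the dequantize/dmmv kernels wait on them, the blocking read waits on the last kernel event, and all events are released and the vector cleared at the end of each iteration. The fragment below sketches that upload, kernel, readback dependency pattern with the plain OpenCL C API; it assumes an already-initialized queue, kernel, and buffers, and is not the package's function.

```cpp
#include <CL/cl.h>
#include <vector>

// Sketch: chain upload -> kernel -> readback with events instead of clFinish().
// `queue`, `kernel`, `d_src`, `d_dst`, `host_src`, `host_dst`, `n` are assumed
// to be set up elsewhere; this only illustrates the wait-list plumbing.
void run_chained(cl_command_queue queue, cl_kernel kernel,
                 cl_mem d_src, cl_mem d_dst,
                 const float* host_src, float* host_dst, size_t n) {
    std::vector<cl_event> events(2);

    // 1. Non-blocking upload, records events[0].
    clEnqueueWriteBuffer(queue, d_src, CL_FALSE, 0, n * sizeof(float),
                         host_src, 0, NULL, &events[0]);

    // 2. Kernel waits on the upload and records events[1].
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_src);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_dst);
    size_t global = n;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL,
                           1, &events[0], &events[1]);

    // 3. Blocking read waits on the kernel event, so no clFinish() is needed.
    clEnqueueReadBuffer(queue, d_dst, CL_TRUE, 0, n * sizeof(float),
                        host_dst, 1, &events[1], NULL);

    for (cl_event ev : events) {
        clReleaseEvent(ev);
    }
}
```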
 
@@ -945,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
  src1->type == GGML_TYPE_F32 &&
  dst->type == GGML_TYPE_F32 &&
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
  return true;
  }
 
@@ -1001,7 +1167,7 @@ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g
  return 0;
  }

- void ggml_cl_transform_tensor(ggml_tensor * tensor) {
+ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
  const int64_t ne0 = tensor->ne[0];
  const int64_t ne1 = tensor->ne[1];
  const int64_t ne2 = tensor->ne[2];
@@ -1011,8 +1177,9 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
  const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

  size_t q_size;
- cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+ cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

+ tensor->data = data;
  // copy tensor to device
  for (int64_t i3 = 0; i3 < ne3; i3++) {
  for (int64_t i2 = 0; i2 < ne2; i2++) {
@@ -1024,5 +1191,5 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
  CL_CHECK(clFinish(queue));

  tensor->data = dst;
- tensor->backend = GGML_BACKEND_CL;
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
  }
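ggml_cl_transform_tensor now takes the host data pointer explicitly and asserts that the caller has already tagged the tensor as GGML_BACKEND_GPU instead of setting the backend itself, and the new ggml_cl_free_data gives callers a way to release the device copy. The two remaining hunks apply these signature changes to the accompanying C header. Below is a hypothetical caller-side sketch with a stripped-down tensor type; the struct, enum, and helper names are stand-ins for ggml's real ggml_tensor API and are not the package's code.

```cpp
#include <cstdio>

// Stripped-down stand-ins for ggml's tensor/backend types, for illustration only.
enum backend_t { BACKEND_CPU, BACKEND_GPU };
struct tensor_t {
    backend_t backend;
    void*     data;   // host pointer before upload, device handle afterwards
};

// Hypothetical upload helper mirroring the new calling convention:
// the caller passes the host data and has already set backend = GPU.
void transform_tensor(void* data, tensor_t* t) {
    if (t->backend != BACKEND_GPU) {          // mirrors the GGML_ASSERT in 0.2.x
        printf("backend must be GPU before upload\n");
        return;
    }
    // real code: copy `data` into a cl_mem, then point t->data at the device buffer
    t->data = data;                            // placeholder for the cl_mem handle
}

void free_data(tensor_t* t) {
    if (t->backend != BACKEND_GPU) return;     // mirrors ggml_cl_free_data's guard
    // real code: clReleaseMemObject((cl_mem) t->data)
}

int main() {
    float weights[4] = {1, 2, 3, 4};
    tensor_t t{BACKEND_GPU, nullptr};  // caller sets the backend first (new contract)
    transform_tensor(weights, &t);     // then hands over the host data
    free_data(&t);                     // and later releases the device copy explicitly
}
```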
@@ -8,6 +8,7 @@ extern "C" {

  void ggml_cl_init(void);

+ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -15,7 +16,9 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
  void * ggml_cl_host_malloc(size_t size);
  void ggml_cl_host_free(void * ptr);

- void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+ void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
+ void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

  #ifdef __cplusplus
  }