llama_cpp 0.1.4 → 0.2.0

@@ -3,6 +3,8 @@
  #include <array>
  #include <atomic>
  #include <sstream>
+ #include <vector>
+ #include <limits>

  #define CL_TARGET_OPENCL_VERSION 110
  #include <clblast.h>
@@ -197,6 +199,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
  }
  );

+ std::string mul_template = MULTILINE_QUOTE(
+ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+ const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+ if (i >= get_global_size(0)) {
+ return;
+ }
+
+ dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+ }
+ );
+
  #define CL_CHECK(err) \
  do { \
  cl_int err_ = (err); \
@@ -239,6 +253,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
  "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
  };

+ std::array<std::string, 2> mul_str_keys = {
+ "KERNEL_NAME", "TYPE"
+ };
+ std::array<std::string, 2> mul_str_values = {
+ "mul_f32", "float"
+ };
+
  std::string& replace(std::string& s, const std::string& from, const std::string& to) {
  size_t pos = 0;
  while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -261,6 +282,13 @@ std::string generate_kernels() {
  src << dequant_kernel << '\n';
  src << dmmv_kernel << '\n';
  }
+ for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+ std::string mul_kernel = mul_template;
+ for (size_t j = 0; j < mul_str_keys.size(); j++) {
+ replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+ }
+ src << mul_kernel << '\n';
+ }
  return src.str();
  }

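With the template and its key/value tables wired into generate_kernels(), the loop above performs a plain string substitution (KERNEL_NAME → mul_f32, TYPE → float), so the generated program source gains, modulo MULTILINE_QUOTE whitespace, one additional element-wise multiply kernel in which src1 is broadcast through the i%ky index:

    __kernel void mul_f32(__global float* x, const int x_offset, __global float* y, const int y_offset, __global float* dst, const int dst_offset, const int ky) {
        const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);

        if (i >= get_global_size(0)) {
            return;
        }

        dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
    }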
@@ -272,6 +300,7 @@ static cl_program program;
  static cl_kernel convert_row_f16_cl;
  static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
  static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+ static cl_kernel mul_f32_cl;
  static bool fp16_support;

  static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -508,6 +537,9 @@ void ggml_cl_init(void) {
  CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
  CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
  CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+ // mul kernel
+ CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
  }

  static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -573,21 +605,44 @@ struct cl_buffer {
  static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
  static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;

- static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
  scoped_spin_lock lock(g_cl_pool_lock);
  cl_int err;

+ int best_i = -1;
+ size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+ int worst_i = -1;
+ size_t worst_size = 0; //largest unused buffer seen so far
  for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
- cl_buffer& b = g_cl_buffer_pool[i];
- if (b.size > 0 && b.size >= size) {
- cl_mem mem = b.mem;
- *actual_size = b.size;
- b.size = 0;
- return mem;
+ cl_buffer &b = g_cl_buffer_pool[i];
+ if (b.size > 0 && b.size >= size && b.size < best_size)
+ {
+ best_i = i;
+ best_size = b.size;
  }
+ if (b.size > 0 && b.size > worst_size)
+ {
+ worst_i = i;
+ worst_size = b.size;
+ }
+ }
+ if(best_i!=-1) //found the smallest buffer that fits our needs
+ {
+ cl_buffer& b = g_cl_buffer_pool[best_i];
+ cl_mem mem = b.mem;
+ *actual_size = b.size;
+ b.size = 0;
+ return mem;
+ }
+ if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+ {
+ cl_buffer& b = g_cl_buffer_pool[worst_i];
+ cl_mem mem = b.mem;
+ b.size = 0;
+ clReleaseMemObject(mem);
  }
  cl_mem mem;
- CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
+ CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
  *actual_size = size;
  return mem;
  }
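The reworked ggml_cl_pool_malloc drops the per-call cl_mem_flags argument (pooled buffers are now always created CL_MEM_READ_WRITE) and replaces first-fit reuse with a two-step policy: hand out the smallest free buffer that is large enough; if none fits, release the largest free buffer and fall through to a fresh allocation of the requested size. A standalone sketch of the best-fit step, with simplified names, for illustration only:

    #include <cstddef>
    #include <limits>

    struct pool_slot { void * mem; size_t size; };  // size > 0 means the slot holds a free buffer

    // Return the index of the smallest free slot that can hold `size`, or -1 if none fits.
    static int best_fit(const pool_slot * pool, int n, size_t size) {
        int    best_i    = -1;
        size_t best_size = std::numeric_limits<size_t>::max();
        for (int i = 0; i < n; ++i) {
            if (pool[i].size > 0 && pool[i].size >= size && pool[i].size < best_size) {
                best_i    = i;
                best_size = pool[i].size;
            }
        }
        return best_i;
    }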
@@ -607,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
  clReleaseMemObject(mem);
  }

+ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
+ if (tensor->backend != GGML_BACKEND_GPU) {
+ return;
+ }
+
+ cl_mem mem = (cl_mem)tensor->data;
+ clReleaseMemObject(mem);
+ }
+
  static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
  cl_int err;
  const uint64_t ne0 = src->ne[0];
@@ -644,6 +708,99 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
  return err;
  }

+ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[2];
+ const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];
+ const int64_t nb10 = src1->nb[0];
+ const int nb2 = dst->nb[2];
+ const int nb3 = dst->nb[3];
+ size_t x_size;
+ size_t d_size;
+
+ cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+ cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+ cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ const int i0 = i03*ne02 + i02;
+
+ cl_event ev;
+
+ // copy src0 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+ if (nb10 == sizeof(float)) {
+ // Contiguous, avoid overhead from queueing many kernel runs
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int i1 = i13*ne12*ne11 + i12*ne11;
+
+ cl_int x_offset = 0;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = 0;
+
+ size_t global = ne00 * ne01;
+ cl_int ky = ne10;
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+ } else {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const int64_t i13 = i03%ne13;
+ const int64_t i12 = i02%ne12;
+ const int64_t i11 = i01%ne11;
+ const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+ cl_int x_offset = i01*ne00;
+ cl_int y_offset = i1*ne10;
+ cl_int d_offset = i01*ne00;
+
+ // compute
+ size_t global = ne00;
+ cl_int ky = ne10;
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+ }
+ }
+
+ CL_CHECK(clReleaseEvent(ev));
+ CL_CHECK(clFinish(queue));
+
+ // copy dst to host
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+ }
+ }
+ ggml_cl_pool_free(d_X, x_size);
+ ggml_cl_pool_free(d_D, d_size);
+ }
+
+ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cl_mul_f32(src0, src1, dst);
+ }
+
  static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
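ggml_cl_mul is the new public entry point for element-wise multiplication. ggml_cl_mul_f32 requires the broadcast operand src1 to already be resident on the GPU (the GGML_ASSERT above) and applies it through the kernel's i%ky indexing. A hypothetical call-site sketch, not code from this gem, of how a dispatcher might route the operation:

    // Hypothetical dispatcher; the function name is illustrative.
    static void mul_forward(const struct ggml_tensor * src0,
                            const struct ggml_tensor * src1,
                            struct ggml_tensor * dst) {
        if (src1->backend == GGML_BACKEND_GPU) {
            // ggml_cl_mul asserts that all three tensors are F32
            ggml_cl_mul(src0, src1, dst);
            return;
        }
        // ... otherwise fall back to the CPU implementation ...
    }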
@@ -666,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  size_t y_size;
  size_t d_size;
  cl_mem d_X;
- if (src0->backend == GGML_BACKEND_CL) {
+ if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
  d_X = (cl_mem) src0->data;
  } else {
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  // copy data to device
- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
  }
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
@@ -706,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  }
  }

- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  ggml_cl_pool_free(d_X, x_size);
  }
  ggml_cl_pool_free(d_Y, y_size);
@@ -742,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  size_t y_size;
  size_t d_size;
  cl_mem d_X;
- if (src0->backend == GGML_BACKEND_CL) {
+ if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
  d_X = (cl_mem) src0->data;
  } else {
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);

  bool src1_cont_rows = nb10 == sizeof(float);
  bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -756,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
  // copy src0 to device
- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
  }

@@ -813,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  }
  }

- if (src0->backend != GGML_BACKEND_CL) {
+ if (src0->backend != GGML_BACKEND_GPU) {
  ggml_cl_pool_free(d_X, x_size);
  }
  ggml_cl_pool_free(d_Y, y_size);
@@ -847,57 +1004,61 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  size_t q_size;
  cl_mem d_X;
  if (!mul_mat_vec) {
- d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
+ d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
  }
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
  cl_mem d_Q;
  if (src0->backend == GGML_BACKEND_CPU) {
- d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+ d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
  }

  cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
  cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
  GGML_ASSERT(to_fp32_cl != nullptr);

+ size_t ev_idx = 0;
+ std::vector<cl_event> events;
+
  for (int64_t i03 = 0; i03 < ne03; i03++) {
  for (int64_t i02 = 0; i02 < ne02; i02++) {
- cl_event ev_sgemm;
-
  // copy src0 to device if necessary
  if (src0->backend == GGML_BACKEND_CPU) {
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
- } else if (src0->backend == GGML_BACKEND_CL) {
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ } else if (src0->backend == GGML_BACKEND_GPU) {
  d_Q = (cl_mem) src0->data;
  } else {
  GGML_ASSERT(false);
  }
  if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
  // copy src1 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));

  // compute
  const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
  const size_t local = CL_DMMV_BLOCK_SIZE;
  const cl_int ncols = ne00;
+ events.emplace_back();
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
  CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
  CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
  CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clFinish(queue));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
  } else { // general dequantization kernel + CLBlast matrix matrix multiplication
  // convert src0 to fp32 on device
  const size_t global = x_ne;
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
- CL_CHECK(clFinish(queue));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

  // copy src1 to device
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

+ events.emplace_back();
+
  // wait for conversion
  CL_CHECK(clFinish(queue));

@@ -910,7 +1071,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
- &queue, &ev_sgemm);
+ &queue, events.data() + ev_idx++);

  if (status != clblast::StatusCode::kSuccess) {
  GGML_ASSERT(false);
@@ -919,8 +1080,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

  // copy dst to host
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
- clReleaseEvent(ev_sgemm);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+ for (auto *event : events) {
+ clReleaseEvent(event);
+ }
+
+ ev_idx = 0;
+ events.clear();
  }
  }

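The quantized mul_mat path now chains its commands with OpenCL events instead of calling clFinish between every step: each enqueue records an event, the following command names it in its wait list, and all events are released once the blocking read of the result has completed. A minimal standalone sketch of that pattern, assuming the CL_CHECK macro defined earlier, kernel arguments already set, and illustrative buffer and size names:

    static void run_with_events(cl_command_queue queue, cl_kernel kernel,
                                cl_mem d_src, cl_mem d_dst,
                                const void * host_src, void * host_dst,
                                size_t nbytes, size_t global) {
        cl_event upload_ev, kernel_ev;
        // asynchronous host-to-device copy, recording an event
        CL_CHECK(clEnqueueWriteBuffer(queue, d_src, CL_FALSE, 0, nbytes, host_src, 0, NULL, &upload_ev));
        // the kernel waits on the upload through its wait list, no clFinish needed
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 1, &upload_ev, &kernel_ev));
        // blocking read waits on the kernel; when it returns, all prior work is done
        CL_CHECK(clEnqueueReadBuffer(queue, d_dst, CL_TRUE, 0, nbytes, host_dst, 1, &kernel_ev, NULL));
        CL_CHECK(clReleaseEvent(upload_ev));
        CL_CHECK(clReleaseEvent(kernel_ev));
    }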
@@ -945,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
  src1->type == GGML_TYPE_F32 &&
  dst->type == GGML_TYPE_F32 &&
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
  return true;
  }

@@ -1011,7 +1177,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
  const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

  size_t q_size;
- cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+ cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

  // copy tensor to device
  for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -1024,5 +1190,35 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
  CL_CHECK(clFinish(queue));

  tensor->data = dst;
- tensor->backend = GGML_BACKEND_CL;
+ tensor->backend = GGML_BACKEND_GPU;
+ }
+
+ void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+ cl_int err;
+ FILE * fp = fopen(fname, "rb");
+
+ const size_t size = ggml_nbytes(tensor);
+
+ cl_mem dst;
+ CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
+ void * buf_host = malloc(size);
+
+ #ifdef _WIN32
+ int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+ #else
+ int ret = fseek(fp, (long) offset, SEEK_SET);
+ #endif
+ GGML_ASSERT(ret == 0); // same
+
+ size_t ret2 = fread(buf_host, size, 1, fp);
+ if (ret2 != 1) {
+ fprintf(stderr, "unexpectedly reached end of file");
+ exit(1);
+ }
+
+ clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
+
+ tensor->data = dst;
+ free(buf_host);
+ fclose(fp);
  }
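ggml_cl_load_data reads ggml_nbytes(tensor) bytes from the file at the given offset and leaves the resulting cl_mem in tensor->data, so model weights can be streamed to the device without going through host-side tensor data and ggml_cl_transform_tensor. A hypothetical loader-side sketch; the wrapper name and the explicit backend tagging are assumptions, since ggml_cl_load_data itself does not set tensor->backend:

    // Illustrative wrapper, not part of the gem.
    static void load_tensor_to_gpu(const char * model_path, struct ggml_tensor * t, size_t file_offset) {
        ggml_cl_load_data(model_path, t, file_offset);  // t->data now holds a cl_mem
        t->backend = GGML_BACKEND_GPU;                  // assumed: caller marks the tensor as GPU-resident
    }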
@@ -8,6 +8,7 @@ extern "C" {

  void ggml_cl_init(void);

+ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -15,7 +16,10 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
  void * ggml_cl_host_malloc(size_t size);
  void ggml_cl_host_free(void * ptr);

+ void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
  void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+ void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);

  #ifdef __cplusplus
  }
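On the cleanup side, ggml_cl_free_data releases the cl_mem stored in tensor->data for GPU-backed tensors and returns early for everything else, so a teardown path can call it unconditionally. A small hypothetical sketch; the loop and its variable names are illustrative:

    // Hypothetical teardown loop; `tensors` and `n_tensors` are assumptions.
    static void free_gpu_tensor_data(struct ggml_tensor ** tensors, int n_tensors) {
        for (int i = 0; i < n_tensors; ++i) {
            // no-op unless tensors[i]->backend == GGML_BACKEND_GPU
            ggml_cl_free_data(tensors[i]);
        }
    }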