llama_cpp 0.1.3 → 0.2.0

This diff compares the contents of publicly available package versions as released to their respective public registries; it is provided for informational purposes only.
@@ -3,6 +3,8 @@
 #include <array>
 #include <atomic>
 #include <sstream>
+#include <vector>
+#include <limits>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@@ -197,6 +199,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 }
 );
 
+std::string mul_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err) \
     do { \
         cl_int err_ = (err); \
@@ -239,6 +253,13 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
     "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
 };
 
+std::array<std::string, 2> mul_str_keys = {
+    "KERNEL_NAME", "TYPE"
+};
+std::array<std::string, 2> mul_str_values = {
+    "mul_f32", "float"
+};
+
 std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -261,6 +282,13 @@ std::string generate_kernels() {
         src << dequant_kernel << '\n';
         src << dmmv_kernel << '\n';
     }
+    for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
+        std::string mul_kernel = mul_template;
+        for (size_t j = 0; j < mul_str_keys.size(); j++) {
+            replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
+        }
+        src << mul_kernel << '\n';
+    }
     return src.str();
 }
 
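The new mul kernel is generated the same way as the existing dequantization kernels: mul_template holds OpenCL source with KERNEL_NAME and TYPE placeholders, and generate_kernels() substitutes the mul_str_keys/mul_str_values pairs to emit a concrete mul_f32 kernel for float. A minimal standalone sketch of that substitution step (the replace_all helper and the shortened template below are illustrative, not the package's code):

    #include <array>
    #include <iostream>
    #include <string>

    // Illustrative stand-in for the package's replace() helper.
    static std::string& replace_all(std::string& s, const std::string& from, const std::string& to) {
        size_t pos = 0;
        while ((pos = s.find(from, pos)) != std::string::npos) {
            s.replace(pos, from.length(), to);
            pos += to.length();
        }
        return s;
    }

    int main() {
        // Shortened stand-in for mul_template.
        const std::string mul_template =
            "__kernel void KERNEL_NAME(__global TYPE* x, __global TYPE* y, __global TYPE* dst, const int ky) {\n"
            "    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);\n"
            "    dst[i] = x[i] * y[i%ky];\n"
            "}\n";

        const std::array<std::string, 2> keys   = { "KERNEL_NAME", "TYPE" };
        const std::array<std::string, 2> values = { "mul_f32", "float" };

        std::string kernel = mul_template;
        for (size_t j = 0; j < keys.size(); j++) {
            replace_all(kernel, keys[j], values[j]);
        }
        std::cout << kernel;  // prints the concrete mul_f32 kernel source
        return 0;
    }
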
@@ -272,6 +300,7 @@ static cl_program program;
 static cl_kernel convert_row_f16_cl;
 static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
 static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+static cl_kernel mul_f32_cl;
 static bool fp16_support;
 
 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -469,16 +498,11 @@ void ggml_cl_init(void) {
 
     size_t ext_str_size;
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
-    char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size);
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
    // Check if ext_buffer contains cl_khr_fp16
-    for (size_t i = 0; i < ext_str_size - 12; i++) {
-        if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
-            fp16_support = true;
-            break;
-        }
-    }
-    free(ext_buffer);
+    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
    fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
 
    cl_context_properties properties[] = {
@@ -513,6 +537,9 @@ void ggml_cl_init(void) {
     CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
     CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
     CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+
+    // mul kernel
+    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
 }
 
 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -578,21 +605,44 @@ struct cl_buffer {
 static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
 static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
 
-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;
 
+    int best_i = -1;
+    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
+    int worst_i = -1;
+    size_t worst_size = 0; //largest unused buffer seen so far
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer& b = g_cl_buffer_pool[i];
-        if (b.size > 0 && b.size >= size) {
-            cl_mem mem = b.mem;
-            *actual_size = b.size;
-            b.size = 0;
-            return mem;
+        cl_buffer &b = g_cl_buffer_pool[i];
+        if (b.size > 0 && b.size >= size && b.size < best_size)
+        {
+            best_i = i;
+            best_size = b.size;
+        }
+        if (b.size > 0 && b.size > worst_size)
+        {
+            worst_i = i;
+            worst_size = b.size;
         }
     }
+    if(best_i!=-1) //found the smallest buffer that fits our needs
+    {
+        cl_buffer& b = g_cl_buffer_pool[best_i];
+        cl_mem mem = b.mem;
+        *actual_size = b.size;
+        b.size = 0;
+        return mem;
+    }
+    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
+    {
+        cl_buffer& b = g_cl_buffer_pool[worst_i];
+        cl_mem mem = b.mem;
+        b.size = 0;
+        clReleaseMemObject(mem);
+    }
     cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
+    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
     *actual_size = size;
     return mem;
 }
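ggml_cl_pool_malloc drops its cl_mem_flags parameter (every pooled buffer is now created CL_MEM_READ_WRITE) and moves from first-fit to best-fit: it reuses the smallest cached buffer that is large enough, and when nothing fits it releases the largest cached buffer before allocating fresh, which keeps the pool from hoarding oversized allocations. A standalone sketch of the same selection policy over a plain size array (the Buffer type and the sizes below are hypothetical, not the package's types):

    #include <cstddef>
    #include <iostream>
    #include <limits>

    // Hypothetical simplified pool slot: size == 0 means the slot is unused.
    struct Buffer { size_t size; };

    // Returns the index of the buffer to reuse, or -1 if a fresh allocation
    // is needed (after evicting the largest cached buffer, if any).
    int pick_buffer(Buffer* pool, int n, size_t wanted) {
        int best_i = -1, worst_i = -1;
        size_t best_size = std::numeric_limits<size_t>::max(); // smallest buffer that fits
        size_t worst_size = 0;                                 // largest cached buffer
        for (int i = 0; i < n; ++i) {
            if (pool[i].size > 0 && pool[i].size >= wanted && pool[i].size < best_size) {
                best_i = i;
                best_size = pool[i].size;
            }
            if (pool[i].size > 0 && pool[i].size > worst_size) {
                worst_i = i;
                worst_size = pool[i].size;
            }
        }
        if (best_i != -1) {
            return best_i;          // reuse the tightest fit
        }
        if (worst_i != -1) {
            pool[worst_i].size = 0; // evict the largest buffer (clReleaseMemObject in the real code)
        }
        return -1;                  // caller allocates a new buffer of exactly the requested size
    }

    int main() {
        Buffer pool[4] = { {256}, {64}, {1024}, {0} };
        std::cout << pick_buffer(pool, 4, 100) << '\n';  // 0: 256 is the smallest that fits
        std::cout << pick_buffer(pool, 4, 4096) << '\n'; // -1: nothing fits; the 1024 slot is evicted
        return 0;
    }
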
@@ -612,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
     clReleaseMemObject(mem);
 }
 
+void ggml_cl_free_data(const struct ggml_tensor* tensor) {
+    if (tensor->backend != GGML_BACKEND_GPU) {
+        return;
+    }
+
+    cl_mem mem = (cl_mem)tensor->data;
+    clReleaseMemObject(mem);
+}
+
 static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
     cl_int err;
     const uint64_t ne0 = src->ne[0];
@@ -649,6 +708,99 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     return err;
 }
 
+static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[2];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
+
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int i1 = i13*ne12*ne11 + i12*ne11;
+
+                cl_int x_offset = 0;
+                cl_int y_offset = i1*ne10;
+                cl_int d_offset = 0;
+
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10;
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const int64_t i13 = i03%ne13;
+                    const int64_t i12 = i02%ne12;
+                    const int64_t i11 = i01%ne11;
+                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                    cl_int x_offset = i01*ne00;
+                    cl_int y_offset = i1*ne10;
+                    cl_int d_offset = i01*ne00;
+
+                    // compute
+                    size_t global = ne00;
+                    cl_int ky = ne10;
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                }
+            }
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_mul_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
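The new ggml_cl_mul entry point is an element-wise multiply: src1 must already live on the device, src0 is uploaded slice by slice, the mul_f32 kernel computes dst[i] = x[i] * y[i % ky] (so a shorter src1 is broadcast across src0), and the result is read back to the host. A host-side reference of that broadcast semantics (plain C++; the sizes below are made up for illustration):

    #include <iostream>
    #include <vector>

    // Host-side reference of the mul_f32 kernel body:
    // dst[i] = x[i] * y[i % ky], broadcasting y over x.
    std::vector<float> mul_broadcast(const std::vector<float>& x,
                                     const std::vector<float>& y) {
        const size_t ky = y.size();
        std::vector<float> dst(x.size());
        for (size_t i = 0; i < x.size(); i++) {
            dst[i] = x[i] * y[i % ky];
        }
        return dst;
    }

    int main() {
        // A 2x4 "tensor" x scaled row-wise by a 4-element y.
        std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};
        std::vector<float> y = {10, 20, 30, 40};
        for (float v : mul_broadcast(x, y)) {
            std::cout << v << ' ';  // 10 40 90 160 50 120 210 320
        }
        std::cout << '\n';
        return 0;
    }
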
@@ -671,18 +823,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_CL) {
-        d_X = *(cl_mem*) src0->data;
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+        d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
-            if (src0->backend != GGML_BACKEND_CL) {
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
@@ -711,7 +863,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend != GGML_BACKEND_CL) {
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -747,13 +899,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_CL) {
-        d_X = *(cl_mem*) src0->data;
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+        d_X = (cl_mem) src0->data;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
 
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -761,7 +913,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_CL) {
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
 
@@ -818,7 +970,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend != GGML_BACKEND_CL) {
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -852,57 +1004,61 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t q_size;
     cl_mem d_X;
     if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
     if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }
 
     cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
     cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
     GGML_ASSERT(to_fp32_cl != nullptr);
 
+    size_t ev_idx = 0;
+    std::vector<cl_event> events;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev_sgemm;
-
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
-            } else if (src0->backend == GGML_BACKEND_CL) {
-                d_Q = *(cl_mem*) src0->data;
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+            } else if (src0->backend == GGML_BACKEND_GPU) {
+                d_Q = (cl_mem) src0->data;
             } else {
                 GGML_ASSERT(false);
            }
            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
 
                // compute
                const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
                const size_t local = CL_DMMV_BLOCK_SIZE;
                const cl_int ncols = ne00;
+                events.emplace_back();
                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clFinish(queue));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                // convert src0 to fp32 on device
                const size_t global = x_ne;
                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clFinish(queue));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                // copy src1 to device
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
 
+                events.emplace_back();
+
                // wait for conversion
                CL_CHECK(clFinish(queue));
 
@@ -915,7 +1071,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                                            d_Y, 0, ne10,
                                                            beta,
                                                            d_D, 0, ne01,
-                                                           &queue, &ev_sgemm);
+                                                           &queue, events.data() + ev_idx++);
 
                 if (status != clblast::StatusCode::kSuccess) {
                     GGML_ASSERT(false);
@@ -924,8 +1080,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
-            clReleaseEvent(ev_sgemm);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+            for (auto *event : events) {
+                clReleaseEvent(event);
+            }
+
+            ev_idx = 0;
+            events.clear();
         }
     }
 
@@ -950,7 +1111,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
         return true;
     }
 
@@ -1016,19 +1177,48 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
-    *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
         }
     }
 
     CL_CHECK(clFinish(queue));
 
     tensor->data = dst;
-    tensor->backend = GGML_BACKEND_CL;
+    tensor->backend = GGML_BACKEND_GPU;
+}
+
+void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+    cl_int err;
+    FILE * fp = fopen(fname, "rb");
+
+    const size_t size = ggml_nbytes(tensor);
+
+    cl_mem dst;
+    CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
+    void * buf_host = malloc(size);
+
+#ifdef _WIN32
+    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+    int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+    GGML_ASSERT(ret == 0); // same
+
+    size_t ret2 = fread(buf_host, size, 1, fp);
+    if (ret2 != 1) {
+        fprintf(stderr, "unexpectedly reached end of file");
+        exit(1);
+    }
+
+    clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
+
+    tensor->data = dst;
+    free(buf_host);
+    fclose(fp);
 }
@@ -8,6 +8,7 @@ extern "C" {
 
 void ggml_cl_init(void);
 
+void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -15,7 +16,10 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
 void * ggml_cl_host_malloc(size_t size);
 void ggml_cl_host_free(void * ptr);
 
+void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
 void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);
 
 #ifdef __cplusplus
 }