llama_cpp 0.5.0 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -63,7 +63,9 @@ struct ggml_metal_context {
63
63
  GGML_METAL_DECL_KERNEL(relu);
64
64
  GGML_METAL_DECL_KERNEL(gelu);
65
65
  GGML_METAL_DECL_KERNEL(soft_max);
66
+ GGML_METAL_DECL_KERNEL(soft_max_4);
66
67
  GGML_METAL_DECL_KERNEL(diag_mask_inf);
68
+ GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
67
69
  GGML_METAL_DECL_KERNEL(get_rows_f16);
68
70
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
69
71
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -76,6 +78,8 @@ struct ggml_metal_context {
76
78
  GGML_METAL_DECL_KERNEL(rms_norm);
77
79
  GGML_METAL_DECL_KERNEL(norm);
78
80
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
81
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
82
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
79
83
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
80
84
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
81
85
  GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -116,22 +120,47 @@ static NSString * const msl_library_source = @"see metal.metal";
116
120
  struct ggml_metal_context * ggml_metal_init(int n_cb) {
117
121
  metal_printf("%s: allocating\n", __func__);
118
122
 
119
- struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
123
+ id <MTLDevice> device;
124
+ NSString * s;
125
+
126
+ #if TARGET_OS_OSX
127
+ // Show all the Metal device instances in the system
128
+ NSArray * devices = MTLCopyAllDevices();
129
+ for (device in devices) {
130
+ s = [device name];
131
+ metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
132
+ }
133
+ #endif
120
134
 
135
+ // Pick and show default Metal device
136
+ device = MTLCreateSystemDefaultDevice();
137
+ s = [device name];
138
+ metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
139
+
140
+ // Configure context
141
+ struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
142
+ ctx->device = device;
121
143
  ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
122
- ctx->device = MTLCreateSystemDefaultDevice();
123
144
  ctx->queue = [ctx->device newCommandQueue];
124
145
  ctx->n_buffers = 0;
125
146
  ctx->concur_list_len = 0;
126
147
 
127
148
  ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
128
149
 
129
- #if 0
130
- // compile from source string and show compile log
150
+ #ifdef GGML_SWIFT
151
+ // load the default.metallib file
131
152
  {
132
153
  NSError * error = nil;
133
154
 
134
- ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
155
+ NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
156
+ NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
157
+ NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
158
+ NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
159
+ NSURL * libURL = [NSURL fileURLWithPath:libPath];
160
+
161
+ // Load the metallib file into a Metal library
162
+ ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
163
+
135
164
  if (error) {
136
165
  metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
137
166
  return NULL;
@@ -192,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
192
221
  GGML_METAL_ADD_KERNEL(relu);
193
222
  GGML_METAL_ADD_KERNEL(gelu);
194
223
  GGML_METAL_ADD_KERNEL(soft_max);
224
+ GGML_METAL_ADD_KERNEL(soft_max_4);
195
225
  GGML_METAL_ADD_KERNEL(diag_mask_inf);
226
+ GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
196
227
  GGML_METAL_ADD_KERNEL(get_rows_f16);
197
228
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
198
229
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -205,6 +236,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
205
236
  GGML_METAL_ADD_KERNEL(rms_norm);
206
237
  GGML_METAL_ADD_KERNEL(norm);
207
238
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
239
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
240
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
208
241
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
209
242
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
210
243
  GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -231,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
231
264
  #undef GGML_METAL_ADD_KERNEL
232
265
  }
233
266
 
234
- metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
235
267
  metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
268
+ #if TARGET_OS_OSX
269
+ metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
236
270
  if (ctx->device.maxTransferRate != 0) {
237
271
  metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
238
272
  } else {
239
273
  metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
240
274
  }
275
+ #endif
241
276
 
242
277
  return ctx;
243
278
  }
@@ -257,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
257
292
  GGML_METAL_DEL_KERNEL(relu);
258
293
  GGML_METAL_DEL_KERNEL(gelu);
259
294
  GGML_METAL_DEL_KERNEL(soft_max);
260
- GGML_METAL_DEL_KERNEL(diag_mask_inf);
295
+ GGML_METAL_DEL_KERNEL(soft_max_4);
296
+ GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
261
297
  GGML_METAL_DEL_KERNEL(get_rows_f16);
262
298
  GGML_METAL_DEL_KERNEL(get_rows_q4_0);
263
299
  GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -270,6 +306,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
270
306
  GGML_METAL_DEL_KERNEL(rms_norm);
271
307
  GGML_METAL_DEL_KERNEL(norm);
272
308
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
309
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
310
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
273
311
  GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
274
312
  GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
275
313
  GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -310,7 +348,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
310
348
 
311
349
  void * ggml_metal_host_malloc(size_t n) {
312
350
  void * data = NULL;
313
- const int result = posix_memalign((void **) &data, getpagesize(), n);
351
+ const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
314
352
  if (result != 0) {
315
353
  metal_printf("%s: error: posix_memalign failed\n", __func__);
316
354
  return NULL;
@@ -384,7 +422,7 @@ bool ggml_metal_add_buffer(
384
422
  }
385
423
  }
386
424
 
387
- const size_t size_page = getpagesize();
425
+ const size_t size_page = sysconf(_SC_PAGESIZE);
388
426
 
389
427
  size_t size_aligned = size;
390
428
  if ((size_aligned % size_page) != 0) {
@@ -437,6 +475,7 @@ bool ggml_metal_add_buffer(
437
475
  }
438
476
  }
439
477
 
478
+ #if TARGET_OS_OSX
440
479
  metal_printf(", (%8.2f / %8.2f)",
441
480
  ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
442
481
  ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -446,6 +485,9 @@ bool ggml_metal_add_buffer(
446
485
  } else {
447
486
  metal_printf("\n");
448
487
  }
488
+ #else
489
+ metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
490
+ #endif
449
491
  }
450
492
 
451
493
  return true;
@@ -733,7 +775,7 @@ void ggml_metal_graph_compute(
733
775
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
734
776
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
735
777
 
736
- const int64_t n = ggml_nelements(dst);
778
+ const int64_t n = ggml_nelements(dst)/4;
737
779
 
738
780
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
739
781
  } break;
@@ -745,7 +787,7 @@ void ggml_metal_graph_compute(
745
787
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
746
788
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
747
789
 
748
- const int64_t n = ggml_nelements(dst);
790
+ const int64_t n = ggml_nelements(dst)/4;
749
791
 
750
792
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
751
793
  } break;
@@ -765,7 +807,7 @@ void ggml_metal_graph_compute(
765
807
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
766
808
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
767
809
 
768
- const int64_t n = ggml_nelements(dst);
810
+ const int64_t n = ggml_nelements(dst)/4;
769
811
 
770
812
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
771
813
  } break;
@@ -779,13 +821,16 @@ void ggml_metal_graph_compute(
779
821
  {
780
822
  const int nth = 32;
781
823
 
782
- [encoder setComputePipelineState:ctx->pipeline_soft_max];
824
+ if (ne00%4 == 0) {
825
+ [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
826
+ } else {
827
+ [encoder setComputePipelineState:ctx->pipeline_soft_max];
828
+ }
783
829
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
784
830
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
785
831
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
786
832
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
787
833
  [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
788
- [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
789
834
 
790
835
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
791
836
  } break;
@@ -793,14 +838,23 @@ void ggml_metal_graph_compute(
793
838
  {
794
839
  const int n_past = ((int32_t *)(dst->op_params))[0];
795
840
 
796
- [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
841
+ if (ne00%8 == 0) {
842
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
843
+ } else {
844
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
845
+ }
797
846
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
798
847
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
799
848
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
800
849
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
801
850
  [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
802
851
 
803
- [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
852
+ if (ne00%8 == 0) {
853
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
854
+ }
855
+ else {
856
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
857
+ }
804
858
  } break;
805
859
  case GGML_OP_MUL_MAT:
806
860
  {
@@ -847,6 +901,7 @@ void ggml_metal_graph_compute(
847
901
  } else {
848
902
  int nth0 = 32;
849
903
  int nth1 = 1;
904
+ int nrows = 1;
850
905
 
851
906
  // use custom matrix x vector kernel
852
907
  switch (src0t) {
@@ -854,7 +909,15 @@ void ggml_metal_graph_compute(
854
909
  {
855
910
  nth0 = 32;
856
911
  nth1 = 1;
857
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
912
+ if (ne11 * ne12 < 4) {
913
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
914
+ } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
915
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
916
+ nrows = ne11;
917
+ } else {
918
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
919
+ nrows = 4;
920
+ }
858
921
  } break;
859
922
  case GGML_TYPE_Q4_0:
860
923
  {
@@ -906,8 +969,8 @@ void ggml_metal_graph_compute(
906
969
  GGML_ASSERT(ne02 == 1);
907
970
  GGML_ASSERT(ne12 == 1);
908
971
 
909
- nth0 = 2;
910
- nth1 = 32;
972
+ nth0 = 4; //1;
973
+ nth1 = 8; //32;
911
974
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
912
975
  } break;
913
976
  case GGML_TYPE_Q5_K:
@@ -955,9 +1018,12 @@ void ggml_metal_graph_compute(
955
1018
  [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
956
1019
 
957
1020
  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
958
- src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
1021
+ src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
959
1022
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
960
1023
  }
1024
+ else if (src0t == GGML_TYPE_Q4_K) {
1025
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1026
+ }
961
1027
  else if (src0t == GGML_TYPE_Q3_K) {
962
1028
  #ifdef GGML_QKK_64
963
1029
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -971,8 +1037,8 @@ void ggml_metal_graph_compute(
971
1037
  else if (src0t == GGML_TYPE_Q6_K) {
972
1038
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
973
1039
  } else {
974
- [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
975
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1040
+ int64_t ny = (ne11 + nrows - 1)/nrows;
1041
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
976
1042
  }
977
1043
  }
978
1044
  } break;
@@ -1117,7 +1183,7 @@ void ggml_metal_graph_compute(
1117
1183
  [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
1118
1184
  [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
1119
1185
 
1120
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
1186
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
1121
1187
  } break;
1122
1188
  case GGML_OP_DUP:
1123
1189
  case GGML_OP_CPY: