llama_cpp 0.5.0 → 0.5.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -63,7 +63,9 @@ struct ggml_metal_context {
63
63
  GGML_METAL_DECL_KERNEL(relu);
64
64
  GGML_METAL_DECL_KERNEL(gelu);
65
65
  GGML_METAL_DECL_KERNEL(soft_max);
66
+ GGML_METAL_DECL_KERNEL(soft_max_4);
66
67
  GGML_METAL_DECL_KERNEL(diag_mask_inf);
68
+ GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
67
69
  GGML_METAL_DECL_KERNEL(get_rows_f16);
68
70
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
69
71
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -76,6 +78,8 @@ struct ggml_metal_context {
76
78
  GGML_METAL_DECL_KERNEL(rms_norm);
77
79
  GGML_METAL_DECL_KERNEL(norm);
78
80
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
81
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
82
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
79
83
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
80
84
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
81
85
  GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -116,22 +120,47 @@ static NSString * const msl_library_source = @"see metal.metal";
116
120
  struct ggml_metal_context * ggml_metal_init(int n_cb) {
117
121
  metal_printf("%s: allocating\n", __func__);
118
122
 
119
- struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
123
+ id <MTLDevice> device;
124
+ NSString * s;
125
+
126
+ #if TARGET_OS_OSX
127
+ // Show all the Metal device instances in the system
128
+ NSArray * devices = MTLCopyAllDevices();
129
+ for (device in devices) {
130
+ s = [device name];
131
+ metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
132
+ }
133
+ #endif
120
134
 
135
+ // Pick and show default Metal device
136
+ device = MTLCreateSystemDefaultDevice();
137
+ s = [device name];
138
+ metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
139
+
140
+ // Configure context
141
+ struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
142
+ ctx->device = device;
121
143
  ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
122
- ctx->device = MTLCreateSystemDefaultDevice();
123
144
  ctx->queue = [ctx->device newCommandQueue];
124
145
  ctx->n_buffers = 0;
125
146
  ctx->concur_list_len = 0;
126
147
 
127
148
  ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
128
149
 
129
- #if 0
130
- // compile from source string and show compile log
150
+ #ifdef GGML_SWIFT
151
+ // load the default.metallib file
131
152
  {
132
153
  NSError * error = nil;
133
154
 
134
- ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
155
+ NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
156
+ NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
157
+ NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
158
+ NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
159
+ NSURL * libURL = [NSURL fileURLWithPath:libPath];
160
+
161
+ // Load the metallib file into a Metal library
162
+ ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
163
+
135
164
  if (error) {
136
165
  metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
137
166
  return NULL;
@@ -192,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
192
221
  GGML_METAL_ADD_KERNEL(relu);
193
222
  GGML_METAL_ADD_KERNEL(gelu);
194
223
  GGML_METAL_ADD_KERNEL(soft_max);
224
+ GGML_METAL_ADD_KERNEL(soft_max_4);
195
225
  GGML_METAL_ADD_KERNEL(diag_mask_inf);
226
+ GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
196
227
  GGML_METAL_ADD_KERNEL(get_rows_f16);
197
228
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
198
229
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -205,6 +236,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
205
236
  GGML_METAL_ADD_KERNEL(rms_norm);
206
237
  GGML_METAL_ADD_KERNEL(norm);
207
238
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
239
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
240
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
208
241
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
209
242
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
210
243
  GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -231,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
231
264
  #undef GGML_METAL_ADD_KERNEL
232
265
  }
233
266
 
234
- metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
235
267
  metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
268
+ #if TARGET_OS_OSX
269
+ metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
236
270
  if (ctx->device.maxTransferRate != 0) {
237
271
  metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
238
272
  } else {
239
273
  metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
240
274
  }
275
+ #endif
241
276
 
242
277
  return ctx;
243
278
  }
@@ -257,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
257
292
  GGML_METAL_DEL_KERNEL(relu);
258
293
  GGML_METAL_DEL_KERNEL(gelu);
259
294
  GGML_METAL_DEL_KERNEL(soft_max);
260
- GGML_METAL_DEL_KERNEL(diag_mask_inf);
295
+ GGML_METAL_DEL_KERNEL(soft_max_4);
296
+ GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
261
297
  GGML_METAL_DEL_KERNEL(get_rows_f16);
262
298
  GGML_METAL_DEL_KERNEL(get_rows_q4_0);
263
299
  GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -270,6 +306,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
270
306
  GGML_METAL_DEL_KERNEL(rms_norm);
271
307
  GGML_METAL_DEL_KERNEL(norm);
272
308
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
309
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
310
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
273
311
  GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
274
312
  GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
275
313
  GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -310,7 +348,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
310
348
 
311
349
  void * ggml_metal_host_malloc(size_t n) {
312
350
  void * data = NULL;
313
- const int result = posix_memalign((void **) &data, getpagesize(), n);
351
+ const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
314
352
  if (result != 0) {
315
353
  metal_printf("%s: error: posix_memalign failed\n", __func__);
316
354
  return NULL;
@@ -384,7 +422,7 @@ bool ggml_metal_add_buffer(
384
422
  }
385
423
  }
386
424
 
387
- const size_t size_page = getpagesize();
425
+ const size_t size_page = sysconf(_SC_PAGESIZE);
388
426
 
389
427
  size_t size_aligned = size;
390
428
  if ((size_aligned % size_page) != 0) {
@@ -437,6 +475,7 @@ bool ggml_metal_add_buffer(
437
475
  }
438
476
  }
439
477
 
478
+ #if TARGET_OS_OSX
440
479
  metal_printf(", (%8.2f / %8.2f)",
441
480
  ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
442
481
  ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -446,6 +485,9 @@ bool ggml_metal_add_buffer(
446
485
  } else {
447
486
  metal_printf("\n");
448
487
  }
488
+ #else
489
+ metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
490
+ #endif
449
491
  }
450
492
 
451
493
  return true;
@@ -733,7 +775,7 @@ void ggml_metal_graph_compute(
733
775
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
734
776
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
735
777
 
736
- const int64_t n = ggml_nelements(dst);
778
+ const int64_t n = ggml_nelements(dst)/4;
737
779
 
738
780
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
739
781
  } break;
@@ -745,7 +787,7 @@ void ggml_metal_graph_compute(
745
787
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
746
788
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
747
789
 
748
- const int64_t n = ggml_nelements(dst);
790
+ const int64_t n = ggml_nelements(dst)/4;
749
791
 
750
792
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
751
793
  } break;
@@ -765,7 +807,7 @@ void ggml_metal_graph_compute(
765
807
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
766
808
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
767
809
 
768
- const int64_t n = ggml_nelements(dst);
810
+ const int64_t n = ggml_nelements(dst)/4;
769
811
 
770
812
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
771
813
  } break;
@@ -779,13 +821,16 @@ void ggml_metal_graph_compute(
779
821
  {
780
822
  const int nth = 32;
781
823
 
782
- [encoder setComputePipelineState:ctx->pipeline_soft_max];
824
+ if (ne00%4 == 0) {
825
+ [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
826
+ } else {
827
+ [encoder setComputePipelineState:ctx->pipeline_soft_max];
828
+ }
783
829
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
784
830
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
785
831
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
786
832
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
787
833
  [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
788
- [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
789
834
 
790
835
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
791
836
  } break;
@@ -793,14 +838,23 @@ void ggml_metal_graph_compute(
793
838
  {
794
839
  const int n_past = ((int32_t *)(dst->op_params))[0];
795
840
 
796
- [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
841
+ if (ne00%8 == 0) {
842
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
843
+ } else {
844
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
845
+ }
797
846
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
798
847
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
799
848
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
800
849
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
801
850
  [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
802
851
 
803
- [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
852
+ if (ne00%8 == 0) {
853
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
854
+ }
855
+ else {
856
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
857
+ }
804
858
  } break;
805
859
  case GGML_OP_MUL_MAT:
806
860
  {
@@ -847,6 +901,7 @@ void ggml_metal_graph_compute(
847
901
  } else {
848
902
  int nth0 = 32;
849
903
  int nth1 = 1;
904
+ int nrows = 1;
850
905
 
851
906
  // use custom matrix x vector kernel
852
907
  switch (src0t) {
@@ -854,7 +909,15 @@ void ggml_metal_graph_compute(
854
909
  {
855
910
  nth0 = 32;
856
911
  nth1 = 1;
857
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
912
+ if (ne11 * ne12 < 4) {
913
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
914
+ } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
915
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
916
+ nrows = ne11;
917
+ } else {
918
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
919
+ nrows = 4;
920
+ }
858
921
  } break;
859
922
  case GGML_TYPE_Q4_0:
860
923
  {
@@ -906,8 +969,8 @@ void ggml_metal_graph_compute(
906
969
  GGML_ASSERT(ne02 == 1);
907
970
  GGML_ASSERT(ne12 == 1);
908
971
 
909
- nth0 = 2;
910
- nth1 = 32;
972
+ nth0 = 4; //1;
973
+ nth1 = 8; //32;
911
974
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
912
975
  } break;
913
976
  case GGML_TYPE_Q5_K:
@@ -955,9 +1018,12 @@ void ggml_metal_graph_compute(
955
1018
  [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
956
1019
 
957
1020
  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
958
- src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
1021
+ src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
959
1022
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
960
1023
  }
1024
+ else if (src0t == GGML_TYPE_Q4_K) {
1025
+ [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1026
+ }
961
1027
  else if (src0t == GGML_TYPE_Q3_K) {
962
1028
  #ifdef GGML_QKK_64
963
1029
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -971,8 +1037,8 @@ void ggml_metal_graph_compute(
971
1037
  else if (src0t == GGML_TYPE_Q6_K) {
972
1038
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
973
1039
  } else {
974
- [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
975
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1040
+ int64_t ny = (ne11 + nrows - 1)/nrows;
1041
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
976
1042
  }
977
1043
  }
978
1044
  } break;
@@ -1117,7 +1183,7 @@ void ggml_metal_graph_compute(
1117
1183
  [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
1118
1184
  [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
1119
1185
 
1120
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
1186
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
1121
1187
  } break;
1122
1188
  case GGML_OP_DUP:
1123
1189
  case GGML_OP_CPY: