llama_cpp 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to gain access to all of the features.
@@ -63,7 +63,9 @@ struct ggml_metal_context {
63
63
  GGML_METAL_DECL_KERNEL(relu);
64
64
  GGML_METAL_DECL_KERNEL(gelu);
65
65
  GGML_METAL_DECL_KERNEL(soft_max);
66
+ GGML_METAL_DECL_KERNEL(soft_max_4);
66
67
  GGML_METAL_DECL_KERNEL(diag_mask_inf);
68
+ GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
67
69
  GGML_METAL_DECL_KERNEL(get_rows_f16);
68
70
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
69
71
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -77,6 +79,7 @@ struct ggml_metal_context {
77
79
  GGML_METAL_DECL_KERNEL(norm);
78
80
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
79
81
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
82
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
80
83
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
81
84
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
82
85
  GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -117,14 +120,17 @@ static NSString * const msl_library_source = @"see metal.metal";
117
120
  struct ggml_metal_context * ggml_metal_init(int n_cb) {
118
121
  metal_printf("%s: allocating\n", __func__);
119
122
 
120
- // Show all the Metal device instances in the system
121
- NSArray * devices = MTLCopyAllDevices();
122
123
  id <MTLDevice> device;
123
124
  NSString * s;
125
+
126
+ #if TARGET_OS_OSX
127
+ // Show all the Metal device instances in the system
128
+ NSArray * devices = MTLCopyAllDevices();
124
129
  for (device in devices) {
125
130
  s = [device name];
126
131
  metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
127
132
  }
133
+ #endif
128
134
 
129
135
  // Pick and show default Metal device
130
136
  device = MTLCreateSystemDefaultDevice();
@@ -141,12 +147,20 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
141
147
 
142
148
  ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
143
149
 
144
- #if 0
145
- // compile from source string and show compile log
150
+ #ifdef GGML_SWIFT
151
+ // load the default.metallib file
146
152
  {
147
153
  NSError * error = nil;
148
154
 
149
- ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
155
+ NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
156
+ NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
157
+ NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
158
+ NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
159
+ NSURL * libURL = [NSURL fileURLWithPath:libPath];
160
+
161
+ // Load the metallib file into a Metal library
162
+ ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
163
+
150
164
  if (error) {
151
165
  metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
152
166
  return NULL;
@@ -207,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
207
221
  GGML_METAL_ADD_KERNEL(relu);
208
222
  GGML_METAL_ADD_KERNEL(gelu);
209
223
  GGML_METAL_ADD_KERNEL(soft_max);
224
+ GGML_METAL_ADD_KERNEL(soft_max_4);
210
225
  GGML_METAL_ADD_KERNEL(diag_mask_inf);
226
+ GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
211
227
  GGML_METAL_ADD_KERNEL(get_rows_f16);
212
228
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
213
229
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -221,6 +237,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
221
237
  GGML_METAL_ADD_KERNEL(norm);
222
238
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
223
239
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
240
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
224
241
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
225
242
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
226
243
  GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -247,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
247
264
  #undef GGML_METAL_ADD_KERNEL
248
265
  }
249
266
 
250
- metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
251
267
  metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
268
+ #if TARGET_OS_OSX
269
+ metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
252
270
  if (ctx->device.maxTransferRate != 0) {
253
271
  metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
254
272
  } else {
255
273
  metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
256
274
  }
275
+ #endif
257
276
 
258
277
  return ctx;
259
278
  }
@@ -273,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
273
292
  GGML_METAL_DEL_KERNEL(relu);
274
293
  GGML_METAL_DEL_KERNEL(gelu);
275
294
  GGML_METAL_DEL_KERNEL(soft_max);
276
- GGML_METAL_DEL_KERNEL(diag_mask_inf);
295
+ GGML_METAL_DEL_KERNEL(soft_max_4);
296
+ GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
277
297
  GGML_METAL_DEL_KERNEL(get_rows_f16);
278
298
  GGML_METAL_DEL_KERNEL(get_rows_q4_0);
279
299
  GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -287,6 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
287
307
  GGML_METAL_DEL_KERNEL(norm);
288
308
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
289
309
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
310
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
290
311
  GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
291
312
  GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
292
313
  GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -454,6 +475,7 @@ bool ggml_metal_add_buffer(
454
475
  }
455
476
  }
456
477
 
478
+ #if TARGET_OS_OSX
457
479
  metal_printf(", (%8.2f / %8.2f)",
458
480
  ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
459
481
  ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -463,6 +485,9 @@ bool ggml_metal_add_buffer(
463
485
  } else {
464
486
  metal_printf("\n");
465
487
  }
488
+ #else
489
+ metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
490
+ #endif
466
491
  }
467
492
 
468
493
  return true;
@@ -750,7 +775,7 @@ void ggml_metal_graph_compute(
750
775
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
751
776
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
752
777
 
753
- const int64_t n = ggml_nelements(dst);
778
+ const int64_t n = ggml_nelements(dst)/4;
754
779
 
755
780
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
756
781
  } break;
@@ -762,7 +787,7 @@ void ggml_metal_graph_compute(
762
787
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
763
788
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
764
789
 
765
- const int64_t n = ggml_nelements(dst);
790
+ const int64_t n = ggml_nelements(dst)/4;
766
791
 
767
792
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
768
793
  } break;
@@ -782,7 +807,7 @@ void ggml_metal_graph_compute(
782
807
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
783
808
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
784
809
 
785
- const int64_t n = ggml_nelements(dst);
810
+ const int64_t n = ggml_nelements(dst)/4;
786
811
 
787
812
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
788
813
  } break;
@@ -796,13 +821,16 @@ void ggml_metal_graph_compute(
796
821
  {
797
822
  const int nth = 32;
798
823
 
799
- [encoder setComputePipelineState:ctx->pipeline_soft_max];
824
+ if (ne00%4 == 0) {
825
+ [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
826
+ } else {
827
+ [encoder setComputePipelineState:ctx->pipeline_soft_max];
828
+ }
800
829
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
801
830
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
802
831
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
803
832
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
804
833
  [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
805
- [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
806
834
 
807
835
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
808
836
  } break;
@@ -810,14 +838,23 @@ void ggml_metal_graph_compute(
810
838
  {
811
839
  const int n_past = ((int32_t *)(dst->op_params))[0];
812
840
 
813
- [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
841
+ if (ne00%8 == 0) {
842
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
843
+ } else {
844
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
845
+ }
814
846
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
815
847
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
816
848
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
817
849
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
818
850
  [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
819
851
 
820
- [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
852
+ if (ne00%8 == 0) {
853
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
854
+ }
855
+ else {
856
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
857
+ }
821
858
  } break;
822
859
  case GGML_OP_MUL_MAT:
823
860
  {
@@ -864,6 +901,7 @@ void ggml_metal_graph_compute(
864
901
  } else {
865
902
  int nth0 = 32;
866
903
  int nth1 = 1;
904
+ int nrows = 1;
867
905
 
868
906
  // use custom matrix x vector kernel
869
907
  switch (src0t) {
@@ -873,8 +911,12 @@ void ggml_metal_graph_compute(
873
911
  nth1 = 1;
874
912
  if (ne11 * ne12 < 4) {
875
913
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
914
+ } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
915
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
916
+ nrows = ne11;
876
917
  } else {
877
918
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
919
+ nrows = 4;
878
920
  }
879
921
  } break;
880
922
  case GGML_TYPE_Q4_0:
@@ -995,7 +1037,7 @@ void ggml_metal_graph_compute(
995
1037
  else if (src0t == GGML_TYPE_Q6_K) {
996
1038
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
997
1039
  } else {
998
- int64_t ny = (ne11 + 3)/4;
1040
+ int64_t ny = (ne11 + nrows - 1)/nrows;
999
1041
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1000
1042
  }
1001
1043
  }