llama_cpp 0.5.1 → 0.5.2

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -63,7 +63,9 @@ struct ggml_metal_context {
63
63
  GGML_METAL_DECL_KERNEL(relu);
64
64
  GGML_METAL_DECL_KERNEL(gelu);
65
65
  GGML_METAL_DECL_KERNEL(soft_max);
66
+ GGML_METAL_DECL_KERNEL(soft_max_4);
66
67
  GGML_METAL_DECL_KERNEL(diag_mask_inf);
68
+ GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
67
69
  GGML_METAL_DECL_KERNEL(get_rows_f16);
68
70
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
69
71
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -77,6 +79,7 @@ struct ggml_metal_context {
77
79
  GGML_METAL_DECL_KERNEL(norm);
78
80
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
79
81
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
82
+ GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
80
83
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
81
84
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
82
85
  GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -117,14 +120,17 @@ static NSString * const msl_library_source = @"see metal.metal";
117
120
  struct ggml_metal_context * ggml_metal_init(int n_cb) {
118
121
  metal_printf("%s: allocating\n", __func__);
119
122
 
120
- // Show all the Metal device instances in the system
121
- NSArray * devices = MTLCopyAllDevices();
122
123
  id <MTLDevice> device;
123
124
  NSString * s;
125
+
126
+ #if TARGET_OS_OSX
127
+ // Show all the Metal device instances in the system
128
+ NSArray * devices = MTLCopyAllDevices();
124
129
  for (device in devices) {
125
130
  s = [device name];
126
131
  metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
127
132
  }
133
+ #endif
128
134
 
129
135
  // Pick and show default Metal device
130
136
  device = MTLCreateSystemDefaultDevice();
@@ -141,12 +147,20 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
141
147
 
142
148
  ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
143
149
 
144
- #if 0
145
- // compile from source string and show compile log
150
+ #ifdef GGML_SWIFT
151
+ // load the default.metallib file
146
152
  {
147
153
  NSError * error = nil;
148
154
 
149
- ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
155
+ NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
156
+ NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
157
+ NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
158
+ NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
159
+ NSURL * libURL = [NSURL fileURLWithPath:libPath];
160
+
161
+ // Load the metallib file into a Metal library
162
+ ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
163
+
150
164
  if (error) {
151
165
  metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
152
166
  return NULL;
@@ -207,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
207
221
  GGML_METAL_ADD_KERNEL(relu);
208
222
  GGML_METAL_ADD_KERNEL(gelu);
209
223
  GGML_METAL_ADD_KERNEL(soft_max);
224
+ GGML_METAL_ADD_KERNEL(soft_max_4);
210
225
  GGML_METAL_ADD_KERNEL(diag_mask_inf);
226
+ GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
211
227
  GGML_METAL_ADD_KERNEL(get_rows_f16);
212
228
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
213
229
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -221,6 +237,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
221
237
  GGML_METAL_ADD_KERNEL(norm);
222
238
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
223
239
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
240
+ GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
224
241
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
225
242
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
226
243
  GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -247,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
247
264
  #undef GGML_METAL_ADD_KERNEL
248
265
  }
249
266
 
250
- metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
251
267
  metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
268
+ #if TARGET_OS_OSX
269
+ metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
252
270
  if (ctx->device.maxTransferRate != 0) {
253
271
  metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
254
272
  } else {
255
273
  metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
256
274
  }
275
+ #endif
257
276
 
258
277
  return ctx;
259
278
  }
@@ -273,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
273
292
  GGML_METAL_DEL_KERNEL(relu);
274
293
  GGML_METAL_DEL_KERNEL(gelu);
275
294
  GGML_METAL_DEL_KERNEL(soft_max);
276
- GGML_METAL_DEL_KERNEL(diag_mask_inf);
295
+ GGML_METAL_DEL_KERNEL(soft_max_4);
296
+ GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
277
297
  GGML_METAL_DEL_KERNEL(get_rows_f16);
278
298
  GGML_METAL_DEL_KERNEL(get_rows_q4_0);
279
299
  GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -287,6 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
287
307
  GGML_METAL_DEL_KERNEL(norm);
288
308
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
289
309
  GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
310
+ GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
290
311
  GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
291
312
  GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
292
313
  GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -454,6 +475,7 @@ bool ggml_metal_add_buffer(
454
475
  }
455
476
  }
456
477
 
478
+ #if TARGET_OS_OSX
457
479
  metal_printf(", (%8.2f / %8.2f)",
458
480
  ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
459
481
  ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -463,6 +485,9 @@ bool ggml_metal_add_buffer(
463
485
  } else {
464
486
  metal_printf("\n");
465
487
  }
488
+ #else
489
+ metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
490
+ #endif
466
491
  }
467
492
 
468
493
  return true;
@@ -750,7 +775,7 @@ void ggml_metal_graph_compute(
750
775
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
751
776
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
752
777
 
753
- const int64_t n = ggml_nelements(dst);
778
+ const int64_t n = ggml_nelements(dst)/4;
754
779
 
755
780
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
756
781
  } break;
@@ -762,7 +787,7 @@ void ggml_metal_graph_compute(
762
787
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
763
788
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
764
789
 
765
- const int64_t n = ggml_nelements(dst);
790
+ const int64_t n = ggml_nelements(dst)/4;
766
791
 
767
792
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
768
793
  } break;
@@ -782,7 +807,7 @@ void ggml_metal_graph_compute(
782
807
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
783
808
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
784
809
 
785
- const int64_t n = ggml_nelements(dst);
810
+ const int64_t n = ggml_nelements(dst)/4;
786
811
 
787
812
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
788
813
  } break;
@@ -796,13 +821,16 @@ void ggml_metal_graph_compute(
796
821
  {
797
822
  const int nth = 32;
798
823
 
799
- [encoder setComputePipelineState:ctx->pipeline_soft_max];
824
+ if (ne00%4 == 0) {
825
+ [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
826
+ } else {
827
+ [encoder setComputePipelineState:ctx->pipeline_soft_max];
828
+ }
800
829
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
801
830
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
802
831
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
803
832
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
804
833
  [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
805
- [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
806
834
 
807
835
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
808
836
  } break;
@@ -810,14 +838,23 @@ void ggml_metal_graph_compute(
810
838
  {
811
839
  const int n_past = ((int32_t *)(dst->op_params))[0];
812
840
 
813
- [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
841
+ if (ne00%8 == 0) {
842
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
843
+ } else {
844
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
845
+ }
814
846
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
815
847
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
816
848
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
817
849
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
818
850
  [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
819
851
 
820
- [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
852
+ if (ne00%8 == 0) {
853
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
854
+ }
855
+ else {
856
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
857
+ }
821
858
  } break;
822
859
  case GGML_OP_MUL_MAT:
823
860
  {
@@ -864,6 +901,7 @@ void ggml_metal_graph_compute(
864
901
  } else {
865
902
  int nth0 = 32;
866
903
  int nth1 = 1;
904
+ int nrows = 1;
867
905
 
868
906
  // use custom matrix x vector kernel
869
907
  switch (src0t) {
@@ -873,8 +911,12 @@ void ggml_metal_graph_compute(
873
911
  nth1 = 1;
874
912
  if (ne11 * ne12 < 4) {
875
913
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
914
+ } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
915
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
916
+ nrows = ne11;
876
917
  } else {
877
918
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
919
+ nrows = 4;
878
920
  }
879
921
  } break;
880
922
  case GGML_TYPE_Q4_0:
@@ -995,7 +1037,7 @@ void ggml_metal_graph_compute(
995
1037
  else if (src0t == GGML_TYPE_Q6_K) {
996
1038
  [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
997
1039
  } else {
998
- int64_t ny = (ne11 + 3)/4;
1040
+ int64_t ny = (ne11 + nrows - 1)/nrows;
999
1041
  [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
1000
1042
  }
1001
1043
  }