llama_cpp 0.5.0 → 0.5.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-metal.m

@@ -63,7 +63,9 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(relu);
     GGML_METAL_DECL_KERNEL(gelu);
     GGML_METAL_DECL_KERNEL(soft_max);
+    GGML_METAL_DECL_KERNEL(soft_max_4);
     GGML_METAL_DECL_KERNEL(diag_mask_inf);
+    GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -76,6 +78,8 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -116,22 +120,47 @@ static NSString * const msl_library_source = @"see metal.metal";
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
     metal_printf("%s: allocating\n", __func__);
 
-    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    id <MTLDevice> device;
+    NSString * s;
+
+#if TARGET_OS_OSX
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    for (device in devices) {
+        s = [device name];
+        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+    }
+#endif
 
+    // Pick and show default Metal device
+    device = MTLCreateSystemDefaultDevice();
+    s = [device name];
+    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+
+    // Configure context
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    ctx->device = device;
     ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-    ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
     ctx->concur_list_len = 0;
 
     ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
 
-#if 0
-    // compile from source string and show compile log
+#ifdef GGML_SWIFT
+    // load the default.metallib file
     {
         NSError * error = nil;
 
-
+        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+        NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
+        NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
+        NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
+        NSURL * libURL = [NSURL fileURLWithPath:libPath];
+
+        // Load the metallib file into a Metal library
+        ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+
         if (error) {
             metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
@@ -192,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(relu);
         GGML_METAL_ADD_KERNEL(gelu);
         GGML_METAL_ADD_KERNEL(soft_max);
+        GGML_METAL_ADD_KERNEL(soft_max_4);
         GGML_METAL_ADD_KERNEL(diag_mask_inf);
+        GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
         GGML_METAL_ADD_KERNEL(get_rows_f16);
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -205,6 +236,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(rms_norm);
         GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -231,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
     }
 
-    metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+#if TARGET_OS_OSX
+    metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
         metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
         metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
     }
+#endif
 
     return ctx;
 }
@@ -257,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(relu);
     GGML_METAL_DEL_KERNEL(gelu);
     GGML_METAL_DEL_KERNEL(soft_max);
-    GGML_METAL_DEL_KERNEL(diag_mask_inf);
+    GGML_METAL_DEL_KERNEL(soft_max_4);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
     GGML_METAL_DEL_KERNEL(get_rows_f16);
     GGML_METAL_DEL_KERNEL(get_rows_q4_0);
     GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -270,6 +306,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(rms_norm);
     GGML_METAL_DEL_KERNEL(norm);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -310,7 +348,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 
 void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
-    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
         metal_printf("%s: error: posix_memalign failed\n", __func__);
         return NULL;
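
The ggml_metal_host_malloc hunk above swaps the deprecated getpagesize() call for sysconf(_SC_PAGESIZE) when asking posix_memalign for page-aligned host memory. A minimal standalone C sketch of the same allocation pattern (plain POSIX, not the gem's API; the helper name is invented):

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    // Allocate n bytes aligned to the system page size, as the Metal backend
    // does so the buffer can later be wrapped without copying.
    static void * page_aligned_malloc(size_t n) {
        void * data = NULL;
        const long page = sysconf(_SC_PAGESIZE); // portable replacement for getpagesize()
        const int  rc   = posix_memalign(&data, (size_t) page, n);
        if (rc != 0) {
            fprintf(stderr, "posix_memalign failed (%d)\n", rc);
            return NULL;
        }
        return data;
    }

    int main(void) {
        void * buf = page_aligned_malloc(1 << 20);
        printf("page size = %ld, buffer = %p\n", sysconf(_SC_PAGESIZE), buf);
        free(buf);
        return 0;
    }
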
@@ -384,7 +422,7 @@ bool ggml_metal_add_buffer(
         }
     }
 
-    const size_t size_page = getpagesize();
+    const size_t size_page = sysconf(_SC_PAGESIZE);
 
     size_t size_aligned = size;
     if ((size_aligned % size_page) != 0) {
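
ggml_metal_add_buffer likewise reads the page size via sysconf(_SC_PAGESIZE) and, in the lines following this hunk, rounds the requested buffer size up to a whole number of pages. A small illustrative sketch of that rounding step, assuming the same remainder-based formula:

    #include <stdio.h>
    #include <unistd.h>

    // Round size up to the next multiple of the page size, mirroring the
    // size_aligned computation in ggml_metal_add_buffer.
    static size_t align_to_page(size_t size) {
        const size_t page = (size_t) sysconf(_SC_PAGESIZE);
        if (size % page != 0) {
            size += page - (size % page);
        }
        return size;
    }

    int main(void) {
        printf("%zu -> %zu\n", (size_t) 100000, align_to_page(100000));
        return 0;
    }
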
@@ -437,6 +475,7 @@ bool ggml_metal_add_buffer(
         }
     }
 
+#if TARGET_OS_OSX
     metal_printf(", (%8.2f / %8.2f)",
         ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
         ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -446,6 +485,9 @@ bool ggml_metal_add_buffer(
     } else {
         metal_printf("\n");
     }
+#else
+    metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
     }
 
     return true;
@@ -733,7 +775,7 @@ void ggml_metal_graph_compute(
                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;
 
                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
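
Dispatching ggml_nelements(dst)/4 threadgroups here (and in the next two hunks for the SILU and GELU cases) matches kernels that now process four values per invocation, so the grid shrinks by the same factor. A tiny C sketch of the arithmetic, assuming the element count is already a multiple of 4 (names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    // With a 4-wide kernel, each threadgroup handles 4 elements, so the grid
    // is a quarter of the element count.
    int main(void) {
        const int64_t n_elements = 4096;           // total values in dst
        const int64_t n_groups   = n_elements / 4; // one 4-wide work item per threadgroup
        printf("dispatch %lld threadgroups for %lld elements\n",
               (long long) n_groups, (long long) n_elements);
        return 0;
    }
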
@@ -745,7 +787,7 @@ void ggml_metal_graph_compute(
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
 
-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;
 
                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
@@ -765,7 +807,7 @@ void ggml_metal_graph_compute(
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
 
-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;
 
                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
@@ -779,13 +821,16 @@ void ggml_metal_graph_compute(
            {
                const int nth = 32;
 
-                [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                if (ne00%4 == 0) {
+                    [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
+                } else {
+                    [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                }
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
 
                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
            } break;
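
The soft max case now selects the 4-wide pipeline only when the row length ne00 is divisible by 4 and falls back to the scalar kernel otherwise; the setThreadgroupMemoryLength call is dropped, presumably because the updated kernels no longer need that scratch space. A plain C sketch of the divisibility-based selection (the enum and function are invented for illustration):

    #include <stdio.h>

    typedef enum { PIPELINE_SOFT_MAX, PIPELINE_SOFT_MAX_4 } pipeline_t;

    // Choose the 4-wide soft max kernel only when a row is a whole number of 4-element chunks.
    static pipeline_t pick_soft_max_pipeline(int ne00) {
        return (ne00 % 4 == 0) ? PIPELINE_SOFT_MAX_4 : PIPELINE_SOFT_MAX;
    }

    int main(void) {
        printf("ne00=4096 -> %s\n", pick_soft_max_pipeline(4096) == PIPELINE_SOFT_MAX_4 ? "soft_max_4" : "soft_max");
        printf("ne00=4099 -> %s\n", pick_soft_max_pipeline(4099) == PIPELINE_SOFT_MAX_4 ? "soft_max_4" : "soft_max");
        return 0;
    }
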
@@ -793,14 +838,23 @@ void ggml_metal_graph_compute(
            {
                const int n_past = ((int32_t *)(dst->op_params))[0];
 
-                [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                if (ne00%8 == 0) {
+                    [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
+                } else {
+                    [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                }
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
 
-                [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                if (ne00%8 == 0) {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
+                else {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
            } break;
        case GGML_OP_MUL_MAT:
            {
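
For GGML_OP_DIAG_MASK_INF the 8-wide kernel covers eight elements per threadgroup, so the launch collapses from a (ne00, ne01, ne02) grid to a flat grid of ne00*ne01*ne02/8 groups. A short sketch of that grid-size calculation with made-up extents:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne00 = 512, ne01 = 32, ne02 = 1; // example tensor extents

        if (ne00 % 8 == 0) {
            // 8-wide kernel: one flat grid with 1/8th as many threadgroups
            printf("diag_mask_inf_8: %lld threadgroups\n",
                   (long long) (ne00 * ne01 * ne02 / 8));
        } else {
            // scalar kernel: one threadgroup per element, 3-D grid
            printf("diag_mask_inf: %lld x %lld x %lld threadgroups\n",
                   (long long) ne00, (long long) ne01, (long long) ne02);
        }
        return 0;
    }
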
@@ -847,6 +901,7 @@ void ggml_metal_graph_compute(
            } else {
                int nth0 = 32;
                int nth1 = 1;
+                int nrows = 1;
 
                // use custom matrix x vector kernel
                switch (src0t) {
@@ -854,7 +909,15 @@ void ggml_metal_graph_compute(
                    {
                        nth0 = 32;
                        nth1 = 1;
-                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                        if (ne11 * ne12 < 4) {
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                        } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
+                            nrows = ne11;
+                        } else {
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                            nrows = 4;
+                        }
                    } break;
                case GGML_TYPE_Q4_0:
                    {
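
For F16 x F32 matrix-vector products the encoder now chooses between three kernels: a single-row variant when the batch ne11*ne12 is very small, an l4 variant when rows are long (ne00 >= 128), numerous enough (ne01 >= 8) and a multiple of 4, and the original kernel otherwise; nrows feeds the dispatch further down, which divides the ne11 dimension by it. A hedged C sketch of that heuristic (the enum and helper are invented, the thresholds come from the hunk above):

    #include <stdio.h>

    typedef enum { MM_F16_F32_1ROW, MM_F16_F32_L4, MM_F16_F32 } f16_kernel_t;

    // Mirror of the selection logic in the diff: small batches take the 1-row
    // kernel, long aligned rows take the l4 kernel, everything else the default.
    static f16_kernel_t pick_f16_kernel(int ne00, int ne01, int ne11, int ne12, int * nrows) {
        *nrows = 1;
        if (ne11 * ne12 < 4) {
            return MM_F16_F32_1ROW;
        } else if (ne00 >= 128 && ne01 >= 8 && ne00 % 4 == 0) {
            *nrows = ne11;   // as in the diff: the l4 path sets nrows to ne11
            return MM_F16_F32_L4;
        } else {
            *nrows = 4;      // as in the diff: the default path sets nrows to 4
            return MM_F16_F32;
        }
    }

    int main(void) {
        int nrows = 0;
        f16_kernel_t k = pick_f16_kernel(4096, 4096, 1, 1, &nrows);
        printf("kernel=%d nrows=%d\n", (int) k, nrows);
        return 0;
    }
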
@@ -906,8 +969,8 @@ void ggml_metal_graph_compute(
                        GGML_ASSERT(ne02 == 1);
                        GGML_ASSERT(ne12 == 1);
 
-                        nth0 =
-                        nth1 = 32;
+                        nth0 = 4; //1;
+                        nth1 = 8; //32;
                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                    } break;
                case GGML_TYPE_Q5_K:
@@ -955,9 +1018,12 @@ void ggml_metal_graph_compute(
                [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
 
                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
-                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
+                    src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                }
+                else if (src0t == GGML_TYPE_Q4_K) {
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                }
                else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
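
Q4_K is split out of the shared dispatch branch: with nth0 = 4 and nth1 = 8 each threadgroup now appears to cover four output rows, so the grid along the row dimension becomes (ne01 + 3)/4 instead of the (ne01 + 7)/8 used by the 8-row kernels. A quick arithmetic sketch of the two layouts (example numbers only):

    #include <stdio.h>

    // Ceiling division: threadgroups needed when each one produces
    // rows_per_tg output rows.
    static int groups_for_rows(int ne01, int rows_per_tg) {
        return (ne01 + rows_per_tg - 1) / rows_per_tg;
    }

    int main(void) {
        const int ne01 = 4096; // output rows
        printf("Q4_0-style (8 rows/threadgroup): %d groups\n", groups_for_rows(ne01, 8));
        printf("Q4_K      (4 rows/threadgroup): %d groups\n", groups_for_rows(ne01, 4));
        return 0;
    }
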
@@ -971,8 +1037,8 @@ void ggml_metal_graph_compute(
                else if (src0t == GGML_TYPE_Q6_K) {
                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                } else {
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                    int64_t ny = (ne11 + nrows - 1)/nrows;
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                }
            }
        } break;
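
The fallback dispatch now computes ny = (ne11 + nrows - 1)/nrows, a ceiling division, so the 1-row, l4 and default f16 kernels all launch the right number of threadgroups along y for their respective nrows values. A tiny standalone sketch of that computation (values chosen only for illustration):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne11 = 7;                     // batch rows in src1
        const int64_t nrows_options[3] = {1, 4, 7}; // 1row kernel, default f16 kernel, l4 kernel (nrows = ne11)

        for (int i = 0; i < 3; ++i) {
            const int64_t nrows = nrows_options[i];
            const int64_t ny = (ne11 + nrows - 1) / nrows; // ceiling division, as in the new dispatch
            printf("nrows=%lld -> ny=%lld threadgroups along y\n",
                   (long long) nrows, (long long) ny);
        }
        return 0;
    }
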
@@ -1117,7 +1183,7 @@ void ggml_metal_graph_compute(
                [encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
                [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
 
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
            } break;
        case GGML_OP_DUP:
        case GGML_OP_CPY: