llama_cpp 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
@@ -63,7 +63,9 @@ struct ggml_metal_context {
|
|
63
63
|
GGML_METAL_DECL_KERNEL(relu);
|
64
64
|
GGML_METAL_DECL_KERNEL(gelu);
|
65
65
|
GGML_METAL_DECL_KERNEL(soft_max);
|
66
|
+
GGML_METAL_DECL_KERNEL(soft_max_4);
|
66
67
|
GGML_METAL_DECL_KERNEL(diag_mask_inf);
|
68
|
+
GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
|
67
69
|
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
68
70
|
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
69
71
|
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
@@ -76,6 +78,8 @@ struct ggml_metal_context {
|
|
76
78
|
GGML_METAL_DECL_KERNEL(rms_norm);
|
77
79
|
GGML_METAL_DECL_KERNEL(norm);
|
78
80
|
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
81
|
+
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
82
|
+
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
|
79
83
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
80
84
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
81
85
|
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
@@ -116,22 +120,47 @@ static NSString * const msl_library_source = @"see metal.metal";
|
|
116
120
|
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
117
121
|
metal_printf("%s: allocating\n", __func__);
|
118
122
|
|
119
|
-
|
123
|
+
id <MTLDevice> device;
|
124
|
+
NSString * s;
|
125
|
+
|
126
|
+
#if TARGET_OS_OSX
|
127
|
+
// Show all the Metal device instances in the system
|
128
|
+
NSArray * devices = MTLCopyAllDevices();
|
129
|
+
for (device in devices) {
|
130
|
+
s = [device name];
|
131
|
+
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
|
132
|
+
}
|
133
|
+
#endif
|
120
134
|
|
135
|
+
// Pick and show default Metal device
|
136
|
+
device = MTLCreateSystemDefaultDevice();
|
137
|
+
s = [device name];
|
138
|
+
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
139
|
+
|
140
|
+
// Configure context
|
141
|
+
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
142
|
+
ctx->device = device;
|
121
143
|
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
122
|
-
ctx->device = MTLCreateSystemDefaultDevice();
|
123
144
|
ctx->queue = [ctx->device newCommandQueue];
|
124
145
|
ctx->n_buffers = 0;
|
125
146
|
ctx->concur_list_len = 0;
|
126
147
|
|
127
148
|
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
128
149
|
|
129
|
-
#
|
130
|
-
//
|
150
|
+
#ifdef GGML_SWIFT
|
151
|
+
// load the default.metallib file
|
131
152
|
{
|
132
153
|
NSError * error = nil;
|
133
154
|
|
134
|
-
|
155
|
+
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
156
|
+
NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
|
157
|
+
NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
|
158
|
+
NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
|
159
|
+
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
160
|
+
|
161
|
+
// Load the metallib file into a Metal library
|
162
|
+
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
163
|
+
|
135
164
|
if (error) {
|
136
165
|
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
137
166
|
return NULL;
|
@@ -192,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
192
221
|
GGML_METAL_ADD_KERNEL(relu);
|
193
222
|
GGML_METAL_ADD_KERNEL(gelu);
|
194
223
|
GGML_METAL_ADD_KERNEL(soft_max);
|
224
|
+
GGML_METAL_ADD_KERNEL(soft_max_4);
|
195
225
|
GGML_METAL_ADD_KERNEL(diag_mask_inf);
|
226
|
+
GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
|
196
227
|
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
197
228
|
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
198
229
|
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
@@ -205,6 +236,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
205
236
|
GGML_METAL_ADD_KERNEL(rms_norm);
|
206
237
|
GGML_METAL_ADD_KERNEL(norm);
|
207
238
|
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
239
|
+
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
240
|
+
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
|
208
241
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
209
242
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
210
243
|
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
@@ -231,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
231
264
|
#undef GGML_METAL_ADD_KERNEL
|
232
265
|
}
|
233
266
|
|
234
|
-
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
235
267
|
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
268
|
+
#if TARGET_OS_OSX
|
269
|
+
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
236
270
|
if (ctx->device.maxTransferRate != 0) {
|
237
271
|
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
238
272
|
} else {
|
239
273
|
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
|
240
274
|
}
|
275
|
+
#endif
|
241
276
|
|
242
277
|
return ctx;
|
243
278
|
}
|
@@ -257,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
257
292
|
GGML_METAL_DEL_KERNEL(relu);
|
258
293
|
GGML_METAL_DEL_KERNEL(gelu);
|
259
294
|
GGML_METAL_DEL_KERNEL(soft_max);
|
260
|
-
GGML_METAL_DEL_KERNEL(
|
295
|
+
GGML_METAL_DEL_KERNEL(soft_max_4);
|
296
|
+
GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
|
261
297
|
GGML_METAL_DEL_KERNEL(get_rows_f16);
|
262
298
|
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
|
263
299
|
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
|
@@ -270,6 +306,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
270
306
|
GGML_METAL_DEL_KERNEL(rms_norm);
|
271
307
|
GGML_METAL_DEL_KERNEL(norm);
|
272
308
|
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
309
|
+
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
310
|
+
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
|
273
311
|
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
|
274
312
|
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
|
275
313
|
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
|
@@ -310,7 +348,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
310
348
|
|
311
349
|
void * ggml_metal_host_malloc(size_t n) {
|
312
350
|
void * data = NULL;
|
313
|
-
const int result = posix_memalign((void **) &data,
|
351
|
+
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
|
314
352
|
if (result != 0) {
|
315
353
|
metal_printf("%s: error: posix_memalign failed\n", __func__);
|
316
354
|
return NULL;
|
@@ -384,7 +422,7 @@ bool ggml_metal_add_buffer(
|
|
384
422
|
}
|
385
423
|
}
|
386
424
|
|
387
|
-
const size_t size_page =
|
425
|
+
const size_t size_page = sysconf(_SC_PAGESIZE);
|
388
426
|
|
389
427
|
size_t size_aligned = size;
|
390
428
|
if ((size_aligned % size_page) != 0) {
|
@@ -437,6 +475,7 @@ bool ggml_metal_add_buffer(
|
|
437
475
|
}
|
438
476
|
}
|
439
477
|
|
478
|
+
#if TARGET_OS_OSX
|
440
479
|
metal_printf(", (%8.2f / %8.2f)",
|
441
480
|
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
442
481
|
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
@@ -446,6 +485,9 @@ bool ggml_metal_add_buffer(
|
|
446
485
|
} else {
|
447
486
|
metal_printf("\n");
|
448
487
|
}
|
488
|
+
#else
|
489
|
+
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
490
|
+
#endif
|
449
491
|
}
|
450
492
|
|
451
493
|
return true;
|
@@ -733,7 +775,7 @@ void ggml_metal_graph_compute(
|
|
733
775
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
734
776
|
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
735
777
|
|
736
|
-
const int64_t n = ggml_nelements(dst);
|
778
|
+
const int64_t n = ggml_nelements(dst)/4;
|
737
779
|
|
738
780
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
739
781
|
} break;
|
@@ -745,7 +787,7 @@ void ggml_metal_graph_compute(
|
|
745
787
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
746
788
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
747
789
|
|
748
|
-
const int64_t n = ggml_nelements(dst);
|
790
|
+
const int64_t n = ggml_nelements(dst)/4;
|
749
791
|
|
750
792
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
751
793
|
} break;
|
@@ -765,7 +807,7 @@ void ggml_metal_graph_compute(
|
|
765
807
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
766
808
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
767
809
|
|
768
|
-
const int64_t n = ggml_nelements(dst);
|
810
|
+
const int64_t n = ggml_nelements(dst)/4;
|
769
811
|
|
770
812
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
771
813
|
} break;
|
@@ -779,13 +821,16 @@ void ggml_metal_graph_compute(
|
|
779
821
|
{
|
780
822
|
const int nth = 32;
|
781
823
|
|
782
|
-
|
824
|
+
if (ne00%4 == 0) {
|
825
|
+
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
|
826
|
+
} else {
|
827
|
+
[encoder setComputePipelineState:ctx->pipeline_soft_max];
|
828
|
+
}
|
783
829
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
784
830
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
785
831
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
786
832
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
787
833
|
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
788
|
-
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
789
834
|
|
790
835
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
791
836
|
} break;
|
@@ -793,14 +838,23 @@ void ggml_metal_graph_compute(
|
|
793
838
|
{
|
794
839
|
const int n_past = ((int32_t *)(dst->op_params))[0];
|
795
840
|
|
796
|
-
|
841
|
+
if (ne00%8 == 0) {
|
842
|
+
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
|
843
|
+
} else {
|
844
|
+
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
|
845
|
+
}
|
797
846
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
798
847
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
799
848
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
800
849
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
801
850
|
[encoder setBytes:&n_past length:sizeof(int) atIndex:4];
|
802
851
|
|
803
|
-
|
852
|
+
if (ne00%8 == 0) {
|
853
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
854
|
+
}
|
855
|
+
else {
|
856
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
857
|
+
}
|
804
858
|
} break;
|
805
859
|
case GGML_OP_MUL_MAT:
|
806
860
|
{
|
@@ -847,6 +901,7 @@ void ggml_metal_graph_compute(
|
|
847
901
|
} else {
|
848
902
|
int nth0 = 32;
|
849
903
|
int nth1 = 1;
|
904
|
+
int nrows = 1;
|
850
905
|
|
851
906
|
// use custom matrix x vector kernel
|
852
907
|
switch (src0t) {
|
@@ -854,7 +909,15 @@ void ggml_metal_graph_compute(
|
|
854
909
|
{
|
855
910
|
nth0 = 32;
|
856
911
|
nth1 = 1;
|
857
|
-
|
912
|
+
if (ne11 * ne12 < 4) {
|
913
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
914
|
+
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
915
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
|
916
|
+
nrows = ne11;
|
917
|
+
} else {
|
918
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
919
|
+
nrows = 4;
|
920
|
+
}
|
858
921
|
} break;
|
859
922
|
case GGML_TYPE_Q4_0:
|
860
923
|
{
|
@@ -906,8 +969,8 @@ void ggml_metal_graph_compute(
|
|
906
969
|
GGML_ASSERT(ne02 == 1);
|
907
970
|
GGML_ASSERT(ne12 == 1);
|
908
971
|
|
909
|
-
nth0 =
|
910
|
-
nth1 = 32;
|
972
|
+
nth0 = 4; //1;
|
973
|
+
nth1 = 8; //32;
|
911
974
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
912
975
|
} break;
|
913
976
|
case GGML_TYPE_Q5_K:
|
@@ -955,9 +1018,12 @@ void ggml_metal_graph_compute(
|
|
955
1018
|
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
956
1019
|
|
957
1020
|
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
|
958
|
-
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
1021
|
+
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
|
959
1022
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
960
1023
|
}
|
1024
|
+
else if (src0t == GGML_TYPE_Q4_K) {
|
1025
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1026
|
+
}
|
961
1027
|
else if (src0t == GGML_TYPE_Q3_K) {
|
962
1028
|
#ifdef GGML_QKK_64
|
963
1029
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
@@ -971,8 +1037,8 @@ void ggml_metal_graph_compute(
|
|
971
1037
|
else if (src0t == GGML_TYPE_Q6_K) {
|
972
1038
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
973
1039
|
} else {
|
974
|
-
|
975
|
-
[encoder dispatchThreadgroups:MTLSizeMake(ne01,
|
1040
|
+
int64_t ny = (ne11 + nrows - 1)/nrows;
|
1041
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
976
1042
|
}
|
977
1043
|
}
|
978
1044
|
} break;
|
@@ -1117,7 +1183,7 @@ void ggml_metal_graph_compute(
|
|
1117
1183
|
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
1118
1184
|
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
1119
1185
|
|
1120
|
-
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(
|
1186
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
|
1121
1187
|
} break;
|
1122
1188
|
case GGML_OP_DUP:
|
1123
1189
|
case GGML_OP_CPY:
|