llama_cpp 0.5.1 → 0.5.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +0 -5
- data/ext/llama_cpp/src/ggml-cuda.cu +1011 -655
- data/ext/llama_cpp/src/ggml-metal.m +57 -15
- data/ext/llama_cpp/src/ggml-metal.metal +271 -137
- data/ext/llama_cpp/src/ggml.c +7 -3
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +617 -141
- data/ext/llama_cpp/src/llama.h +8 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-metal.m
@@ -63,7 +63,9 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(relu);
     GGML_METAL_DECL_KERNEL(gelu);
     GGML_METAL_DECL_KERNEL(soft_max);
+    GGML_METAL_DECL_KERNEL(soft_max_4);
     GGML_METAL_DECL_KERNEL(diag_mask_inf);
+    GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@@ -77,6 +79,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@@ -117,14 +120,17 @@ static NSString * const msl_library_source = @"see metal.metal";
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
     metal_printf("%s: allocating\n", __func__);
 
-    // Show all the Metal device instances in the system
-    NSArray * devices = MTLCopyAllDevices();
     id <MTLDevice> device;
     NSString * s;
+
+#if TARGET_OS_OSX
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
     for (device in devices) {
         s = [device name];
         metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
     }
+#endif
 
     // Pick and show default Metal device
     device = MTLCreateSystemDefaultDevice();
@@ -141,12 +147,20 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
     ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
 
-#if 0
-    // compile from source string and show compile log
+#ifdef GGML_SWIFT
+    // load the default.metallib file
     {
         NSError * error = nil;
 
-        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
+        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+        NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
+        NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
+        NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
+        NSURL * libURL = [NSURL fileURLWithPath:libPath];
+
+        // Load the metallib file into a Metal library
+        ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+
         if (error) {
             metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
@@ -207,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(relu);
         GGML_METAL_ADD_KERNEL(gelu);
         GGML_METAL_ADD_KERNEL(soft_max);
+        GGML_METAL_ADD_KERNEL(soft_max_4);
         GGML_METAL_ADD_KERNEL(diag_mask_inf);
+        GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
         GGML_METAL_ADD_KERNEL(get_rows_f16);
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@@ -221,6 +237,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@@ -247,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
     }
 
-    metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+#if TARGET_OS_OSX
+    metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
         metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
         metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
     }
+#endif
 
     return ctx;
 }
@@ -273,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(relu);
     GGML_METAL_DEL_KERNEL(gelu);
     GGML_METAL_DEL_KERNEL(soft_max);
-    GGML_METAL_DEL_KERNEL(diag_mask_inf);
+    GGML_METAL_DEL_KERNEL(soft_max_4);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
     GGML_METAL_DEL_KERNEL(get_rows_f16);
     GGML_METAL_DEL_KERNEL(get_rows_q4_0);
     GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@@ -287,6 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(norm);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@@ -454,6 +475,7 @@ bool ggml_metal_add_buffer(
             }
         }
 
+#if TARGET_OS_OSX
         metal_printf(", (%8.2f / %8.2f)",
             ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
             ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@@ -463,6 +485,9 @@ bool ggml_metal_add_buffer(
         } else {
             metal_printf("\n");
         }
+#else
+        metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
     }
 
     return true;
@@ -750,7 +775,7 @@ void ggml_metal_graph_compute(
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                 [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;
 
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
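This hunk and the two that follow shrink the dispatch grid for the element-wise ops from ggml_nelements(dst) threadgroups to ggml_nelements(dst)/4, matching kernels that now process four values per invocation. Below is a minimal C sketch of that grid-size arithmetic, assuming the element count is 4-aligned as the unconditional division implies; the helper names are illustrative, not part of ggml's API.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for ggml_nelements(): total element count over the 4 tensor dims. */
    static int64_t total_elements(const int64_t ne[4]) {
        return ne[0] * ne[1] * ne[2] * ne[3];
    }

    /* One threadgroup per group of 4 elements, as in the updated SCALE path. */
    static int64_t elementwise_threadgroups(const int64_t ne[4]) {
        const int64_t n = total_elements(ne);
        assert(n % 4 == 0 && "the vectorized kernel assumes a 4-aligned element count");
        return n / 4;
    }

    int main(void) {
        const int64_t ne[4] = {4096, 32, 1, 1};
        printf("threadgroups = %lld\n", (long long) elementwise_threadgroups(ne));
        return 0;
    }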
@@ -762,7 +787,7 @@ void ggml_metal_graph_compute(
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
 
-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;
 
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
@@ -782,7 +807,7 @@ void ggml_metal_graph_compute(
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
 
-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;
 
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
@@ -796,13 +821,16 @@ void ggml_metal_graph_compute(
             {
                 const int nth = 32;
 
-                [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                if (ne00%4 == 0) {
+                    [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
+                } else {
+                    [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                }
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                 [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
 
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
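The soft_max hunk above routes rows whose length ne00 is a multiple of 4 to a new soft_max_4 pipeline, and drops the explicit setThreadgroupMemoryLength call, which suggests the rewritten kernels in ggml-metal.metal manage their own scratch storage. A small C sketch of the selection rule; the enum and helper are ours, for illustration only.

    #include <stdio.h>

    /* Illustrative tags for the two pipelines picked in the hunk above. */
    enum softmax_kernel { SOFT_MAX, SOFT_MAX_4 };

    /* Use the 4-wide kernel only when the row length divides evenly by 4. */
    static enum softmax_kernel pick_softmax_kernel(int ne00) {
        return (ne00 % 4 == 0) ? SOFT_MAX_4 : SOFT_MAX;
    }

    int main(void) {
        printf("ne00=4096 -> %s\n", pick_softmax_kernel(4096) == SOFT_MAX_4 ? "soft_max_4" : "soft_max");
        printf("ne00=4097 -> %s\n", pick_softmax_kernel(4097) == SOFT_MAX_4 ? "soft_max_4" : "soft_max");
        return 0;
    }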
@@ -810,14 +838,23 @@ void ggml_metal_graph_compute(
             {
                 const int n_past = ((int32_t *)(dst->op_params))[0];
 
-                [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                if (ne00%8 == 0) {
+                    [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
+                } else {
+                    [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                }
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                 [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
 
-                [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                if (ne00%8 == 0) {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
+                else {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
             } break;
         case GGML_OP_MUL_MAT:
            {
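Likewise, the diag_mask_inf hunk above picks an 8-wide kernel whenever ne00 is divisible by 8 and flattens the dispatch to one threadgroup per 8 elements; otherwise it keeps the original per-element (ne00, ne01, ne02) grid. A C sketch of the grid choice, with a hypothetical helper name:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirror of the grid-size branch in the hunk above. */
    static void diag_mask_grid(int64_t ne00, int64_t ne01, int64_t ne02, int64_t grid[3]) {
        if (ne00 % 8 == 0) {
            /* 8-wide kernel: one flat threadgroup per 8 elements */
            grid[0] = ne00 * ne01 * ne02 / 8;
            grid[1] = 1;
            grid[2] = 1;
        } else {
            /* scalar kernel: one threadgroup per element, original layout */
            grid[0] = ne00;
            grid[1] = ne01;
            grid[2] = ne02;
        }
    }

    int main(void) {
        int64_t g[3];
        diag_mask_grid(4096, 32, 1, g);
        printf("grid = (%lld, %lld, %lld)\n", (long long) g[0], (long long) g[1], (long long) g[2]);
        return 0;
    }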
@@ -864,6 +901,7 @@ void ggml_metal_graph_compute(
             } else {
                 int nth0 = 32;
                 int nth1 = 1;
+                int nrows = 1;
 
                 // use custom matrix x vector kernel
                 switch (src0t) {
@@ -873,8 +911,12 @@ void ggml_metal_graph_compute(
                         nth1 = 1;
                         if (ne11 * ne12 < 4) {
                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                        } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
+                            nrows = ne11;
                         } else {
                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                            nrows = 4;
                         }
                     } break;
                 case GGML_TYPE_Q4_0:
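The f16 x f32 mat-vec path above now chooses between three kernels: the existing 1-row kernel for tiny batches, a new _l4 kernel for long, 4-aligned rows with at least 8 output rows (one threadgroup then covers all ne11 rows), and the default kernel, which now covers 4 rows per threadgroup. A C sketch of that decision, with illustrative names only:

    #include <stdio.h>

    /* Illustrative tags for the three pipelines chosen in the hunk above. */
    enum f16_mv_kernel { MV_F16_1ROW, MV_F16_L4, MV_F16_DEFAULT };

    /* Mirrors the added branch; also reports how many rows one threadgroup
       covers, which feeds the ny computation further down in the diff. */
    static enum f16_mv_kernel pick_f16_mv_kernel(int ne00, int ne01, int ne11, int ne12, int *nrows) {
        *nrows = 1;
        if (ne11 * ne12 < 4) {
            return MV_F16_1ROW;
        } else if (ne00 >= 128 && ne01 >= 8 && ne00 % 4 == 0) {
            *nrows = ne11;          /* one threadgroup handles every row of src1 */
            return MV_F16_L4;
        } else {
            *nrows = 4;             /* default kernel now covers 4 rows at a time */
            return MV_F16_DEFAULT;
        }
    }

    int main(void) {
        int nrows = 0;
        enum f16_mv_kernel k = pick_f16_mv_kernel(4096, 4096, 512, 1, &nrows);
        printf("kernel = %d, nrows = %d\n", (int) k, nrows);
        return 0;
    }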
@@ -995,7 +1037,7 @@ void ggml_metal_graph_compute(
             else if (src0t == GGML_TYPE_Q6_K) {
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             } else {
-                int64_t ny = (ne11 + 3)/4;
+                int64_t ny = (ne11 + nrows - 1)/nrows;
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             }
         }
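The final hunk replaces the fixed divisor with a ceiling division over nrows, so the threadgroup count along ne11 always covers a partial last group. The arithmetic, in a tiny self-contained C example:

    #include <stdint.h>
    #include <stdio.h>

    /* ny = (ne11 + nrows - 1)/nrows rounds up: every row gets a threadgroup
       even when ne11 is not a multiple of nrows. */
    static int64_t ceil_div(int64_t n, int64_t d) {
        return (n + d - 1) / d;
    }

    int main(void) {
        printf("ne11=512, nrows=4 -> ny=%lld\n", (long long) ceil_div(512, 4));
        printf("ne11=513, nrows=4 -> ny=%lld\n", (long long) ceil_div(513, 4));
        return 0;
    }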