llama_cpp 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +0 -5
- data/ext/llama_cpp/src/ggml-cuda.cu +1011 -655
- data/ext/llama_cpp/src/ggml-metal.m +57 -15
- data/ext/llama_cpp/src/ggml-metal.metal +271 -137
- data/ext/llama_cpp/src/ggml.c +7 -3
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +617 -141
- data/ext/llama_cpp/src/llama.h +8 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
@@ -63,7 +63,9 @@ struct ggml_metal_context {
|
|
63
63
|
GGML_METAL_DECL_KERNEL(relu);
|
64
64
|
GGML_METAL_DECL_KERNEL(gelu);
|
65
65
|
GGML_METAL_DECL_KERNEL(soft_max);
|
66
|
+
GGML_METAL_DECL_KERNEL(soft_max_4);
|
66
67
|
GGML_METAL_DECL_KERNEL(diag_mask_inf);
|
68
|
+
GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
|
67
69
|
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
68
70
|
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
69
71
|
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
@@ -77,6 +79,7 @@ struct ggml_metal_context {
|
|
77
79
|
GGML_METAL_DECL_KERNEL(norm);
|
78
80
|
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
79
81
|
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
82
|
+
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
|
80
83
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
81
84
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
82
85
|
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
@@ -117,14 +120,17 @@ static NSString * const msl_library_source = @"see metal.metal";
|
|
117
120
|
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
118
121
|
metal_printf("%s: allocating\n", __func__);
|
119
122
|
|
120
|
-
// Show all the Metal device instances in the system
|
121
|
-
NSArray * devices = MTLCopyAllDevices();
|
122
123
|
id <MTLDevice> device;
|
123
124
|
NSString * s;
|
125
|
+
|
126
|
+
#if TARGET_OS_OSX
|
127
|
+
// Show all the Metal device instances in the system
|
128
|
+
NSArray * devices = MTLCopyAllDevices();
|
124
129
|
for (device in devices) {
|
125
130
|
s = [device name];
|
126
131
|
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
|
127
132
|
}
|
133
|
+
#endif
|
128
134
|
|
129
135
|
// Pick and show default Metal device
|
130
136
|
device = MTLCreateSystemDefaultDevice();
|
@@ -141,12 +147,20 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
141
147
|
|
142
148
|
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
143
149
|
|
144
|
-
#if 0
|
145
|
-
// compile from source string and show compile log
|
150
|
+
#ifdef GGML_SWIFT
|
151
|
+
// load the default.metallib file
|
146
152
|
{
|
147
153
|
NSError * error = nil;
|
148
154
|
|
149
|
-
|
155
|
+
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
156
|
+
NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
|
157
|
+
NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
|
158
|
+
NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
|
159
|
+
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
160
|
+
|
161
|
+
// Load the metallib file into a Metal library
|
162
|
+
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
163
|
+
|
150
164
|
if (error) {
|
151
165
|
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
152
166
|
return NULL;
|
@@ -207,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
207
221
|
GGML_METAL_ADD_KERNEL(relu);
|
208
222
|
GGML_METAL_ADD_KERNEL(gelu);
|
209
223
|
GGML_METAL_ADD_KERNEL(soft_max);
|
224
|
+
GGML_METAL_ADD_KERNEL(soft_max_4);
|
210
225
|
GGML_METAL_ADD_KERNEL(diag_mask_inf);
|
226
|
+
GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
|
211
227
|
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
212
228
|
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
213
229
|
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
@@ -221,6 +237,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
221
237
|
GGML_METAL_ADD_KERNEL(norm);
|
222
238
|
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
223
239
|
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
240
|
+
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
|
224
241
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
225
242
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
226
243
|
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
@@ -247,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
247
264
|
#undef GGML_METAL_ADD_KERNEL
|
248
265
|
}
|
249
266
|
|
250
|
-
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
251
267
|
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
268
|
+
#if TARGET_OS_OSX
|
269
|
+
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
252
270
|
if (ctx->device.maxTransferRate != 0) {
|
253
271
|
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
254
272
|
} else {
|
255
273
|
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
|
256
274
|
}
|
275
|
+
#endif
|
257
276
|
|
258
277
|
return ctx;
|
259
278
|
}
|
@@ -273,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
273
292
|
GGML_METAL_DEL_KERNEL(relu);
|
274
293
|
GGML_METAL_DEL_KERNEL(gelu);
|
275
294
|
GGML_METAL_DEL_KERNEL(soft_max);
|
276
|
-
GGML_METAL_DEL_KERNEL(diag_mask_inf);
|
295
|
+
GGML_METAL_DEL_KERNEL(soft_max_4);
|
296
|
+
GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
|
277
297
|
GGML_METAL_DEL_KERNEL(get_rows_f16);
|
278
298
|
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
|
279
299
|
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
|
@@ -287,6 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
287
307
|
GGML_METAL_DEL_KERNEL(norm);
|
288
308
|
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
289
309
|
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
310
|
+
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
|
290
311
|
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
|
291
312
|
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
|
292
313
|
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
|
@@ -454,6 +475,7 @@ bool ggml_metal_add_buffer(
|
|
454
475
|
}
|
455
476
|
}
|
456
477
|
|
478
|
+
#if TARGET_OS_OSX
|
457
479
|
metal_printf(", (%8.2f / %8.2f)",
|
458
480
|
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
459
481
|
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
@@ -463,6 +485,9 @@ bool ggml_metal_add_buffer(
|
|
463
485
|
} else {
|
464
486
|
metal_printf("\n");
|
465
487
|
}
|
488
|
+
#else
|
489
|
+
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
490
|
+
#endif
|
466
491
|
}
|
467
492
|
|
468
493
|
return true;
|
@@ -750,7 +775,7 @@ void ggml_metal_graph_compute(
|
|
750
775
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
751
776
|
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
752
777
|
|
753
|
-
const int64_t n = ggml_nelements(dst);
|
778
|
+
const int64_t n = ggml_nelements(dst)/4;
|
754
779
|
|
755
780
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
756
781
|
} break;
|
@@ -762,7 +787,7 @@ void ggml_metal_graph_compute(
|
|
762
787
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
763
788
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
764
789
|
|
765
|
-
const int64_t n = ggml_nelements(dst);
|
790
|
+
const int64_t n = ggml_nelements(dst)/4;
|
766
791
|
|
767
792
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
768
793
|
} break;
|
@@ -782,7 +807,7 @@ void ggml_metal_graph_compute(
|
|
782
807
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
783
808
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
784
809
|
|
785
|
-
const int64_t n = ggml_nelements(dst);
|
810
|
+
const int64_t n = ggml_nelements(dst)/4;
|
786
811
|
|
787
812
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
788
813
|
} break;
|
@@ -796,13 +821,16 @@ void ggml_metal_graph_compute(
|
|
796
821
|
{
|
797
822
|
const int nth = 32;
|
798
823
|
|
799
|
-
|
824
|
+
if (ne00%4 == 0) {
|
825
|
+
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
|
826
|
+
} else {
|
827
|
+
[encoder setComputePipelineState:ctx->pipeline_soft_max];
|
828
|
+
}
|
800
829
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
801
830
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
802
831
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
803
832
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
804
833
|
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
805
|
-
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
806
834
|
|
807
835
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
808
836
|
} break;
|
@@ -810,14 +838,23 @@ void ggml_metal_graph_compute(
|
|
810
838
|
{
|
811
839
|
const int n_past = ((int32_t *)(dst->op_params))[0];
|
812
840
|
|
813
|
-
|
841
|
+
if (ne00%8 == 0) {
|
842
|
+
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
|
843
|
+
} else {
|
844
|
+
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
|
845
|
+
}
|
814
846
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
815
847
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
816
848
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
817
849
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
818
850
|
[encoder setBytes:&n_past length:sizeof(int) atIndex:4];
|
819
851
|
|
820
|
-
|
852
|
+
if (ne00%8 == 0) {
|
853
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
854
|
+
}
|
855
|
+
else {
|
856
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
857
|
+
}
|
821
858
|
} break;
|
822
859
|
case GGML_OP_MUL_MAT:
|
823
860
|
{
|
@@ -864,6 +901,7 @@ void ggml_metal_graph_compute(
|
|
864
901
|
} else {
|
865
902
|
int nth0 = 32;
|
866
903
|
int nth1 = 1;
|
904
|
+
int nrows = 1;
|
867
905
|
|
868
906
|
// use custom matrix x vector kernel
|
869
907
|
switch (src0t) {
|
@@ -873,8 +911,12 @@ void ggml_metal_graph_compute(
|
|
873
911
|
nth1 = 1;
|
874
912
|
if (ne11 * ne12 < 4) {
|
875
913
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
914
|
+
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
915
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
|
916
|
+
nrows = ne11;
|
876
917
|
} else {
|
877
918
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
919
|
+
nrows = 4;
|
878
920
|
}
|
879
921
|
} break;
|
880
922
|
case GGML_TYPE_Q4_0:
|
@@ -995,7 +1037,7 @@ void ggml_metal_graph_compute(
|
|
995
1037
|
else if (src0t == GGML_TYPE_Q6_K) {
|
996
1038
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
997
1039
|
} else {
|
998
|
-
int64_t ny = (ne11 + 3)/4;
|
1040
|
+
int64_t ny = (ne11 + nrows - 1)/nrows;
|
999
1041
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1000
1042
|
}
|
1001
1043
|
}
|