llama_cpp 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
@@ -34,9 +34,13 @@ extern "C" {
|
|
34
34
|
|
35
35
|
struct ggml_metal_context;
|
36
36
|
|
37
|
-
|
37
|
+
// number of command buffers to use
|
38
|
+
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
38
39
|
void ggml_metal_free(struct ggml_metal_context * ctx);
|
39
40
|
|
41
|
+
// set the number of command buffers to use
|
42
|
+
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
|
43
|
+
|
40
44
|
// creates a mapping between a host memory buffer and a device memory buffer
|
41
45
|
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
42
46
|
// - the mapping is used during computation to determine the arguments of the compute kernels
|
@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
|
|
25
25
|
};
|
26
26
|
|
27
27
|
struct ggml_metal_context {
|
28
|
+
int n_cb;
|
29
|
+
|
28
30
|
float * logits;
|
29
31
|
|
30
32
|
id<MTLDevice> device;
|
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
|
|
86
88
|
@implementation GGMLMetalClass
|
87
89
|
@end
|
88
90
|
|
89
|
-
struct ggml_metal_context * ggml_metal_init(
|
91
|
+
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
90
92
|
fprintf(stderr, "%s: allocating\n", __func__);
|
91
93
|
|
92
94
|
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
93
95
|
|
96
|
+
ctx->n_cb = n_cb;
|
94
97
|
ctx->device = MTLCreateSystemDefaultDevice();
|
95
98
|
ctx->queue = [ctx->device newCommandQueue];
|
96
99
|
ctx->n_buffers = 0;
|
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
208
211
|
free(ctx);
|
209
212
|
}
|
210
213
|
|
214
|
+
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
215
|
+
ctx->n_cb = n_cb;
|
216
|
+
}
|
217
|
+
|
211
218
|
// finds the Metal buffer that contains the tensor data on the GPU device
|
212
219
|
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
213
220
|
// Metal buffer based on the host memory pointer
|
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
|
|
354
361
|
// create multiple command buffers and enqueue them
|
355
362
|
// then, we encode the graph into the command buffers in parallel
|
356
363
|
|
357
|
-
const int n_cb =
|
364
|
+
const int n_cb = ctx->n_cb;
|
358
365
|
|
359
366
|
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
|
360
367
|
|
@@ -386,8 +393,8 @@ void ggml_metal_graph_compute(
|
|
386
393
|
for (int i = node_start; i < node_end; ++i) {
|
387
394
|
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
388
395
|
|
389
|
-
struct ggml_tensor * src0 = gf->nodes[i]->
|
390
|
-
struct ggml_tensor * src1 = gf->nodes[i]->
|
396
|
+
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
397
|
+
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
|
391
398
|
struct ggml_tensor * dst = gf->nodes[i];
|
392
399
|
|
393
400
|
const int64_t ne00 = src0 ? src0->ne[0] : 0;
|
@@ -443,6 +450,7 @@ void ggml_metal_graph_compute(
|
|
443
450
|
//}
|
444
451
|
|
445
452
|
switch (dst->op) {
|
453
|
+
case GGML_OP_NONE:
|
446
454
|
case GGML_OP_RESHAPE:
|
447
455
|
case GGML_OP_VIEW:
|
448
456
|
case GGML_OP_TRANSPOSE:
|
@@ -668,8 +676,8 @@ void ggml_metal_graph_compute(
|
|
668
676
|
GGML_ASSERT(ne02 == 1);
|
669
677
|
GGML_ASSERT(ne12 == 1);
|
670
678
|
|
671
|
-
nth0 =
|
672
|
-
nth1 =
|
679
|
+
nth0 = 2;
|
680
|
+
nth1 = 32;
|
673
681
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
674
682
|
} break;
|
675
683
|
case GGML_TYPE_Q3_K:
|
@@ -677,8 +685,8 @@ void ggml_metal_graph_compute(
|
|
677
685
|
GGML_ASSERT(ne02 == 1);
|
678
686
|
GGML_ASSERT(ne12 == 1);
|
679
687
|
|
680
|
-
nth0 =
|
681
|
-
nth1 =
|
688
|
+
nth0 = 2;
|
689
|
+
nth1 = 32;
|
682
690
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
683
691
|
} break;
|
684
692
|
case GGML_TYPE_Q4_K:
|
@@ -686,8 +694,8 @@ void ggml_metal_graph_compute(
|
|
686
694
|
GGML_ASSERT(ne02 == 1);
|
687
695
|
GGML_ASSERT(ne12 == 1);
|
688
696
|
|
689
|
-
nth0 =
|
690
|
-
nth1 =
|
697
|
+
nth0 = 2;
|
698
|
+
nth1 = 32;
|
691
699
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
692
700
|
} break;
|
693
701
|
case GGML_TYPE_Q5_K:
|
@@ -695,8 +703,8 @@ void ggml_metal_graph_compute(
|
|
695
703
|
GGML_ASSERT(ne02 == 1);
|
696
704
|
GGML_ASSERT(ne12 == 1);
|
697
705
|
|
698
|
-
nth0 =
|
699
|
-
nth1 =
|
706
|
+
nth0 = 2;
|
707
|
+
nth1 = 32;
|
700
708
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
701
709
|
} break;
|
702
710
|
case GGML_TYPE_Q6_K:
|
@@ -704,8 +712,8 @@ void ggml_metal_graph_compute(
|
|
704
712
|
GGML_ASSERT(ne02 == 1);
|
705
713
|
GGML_ASSERT(ne12 == 1);
|
706
714
|
|
707
|
-
nth0 =
|
708
|
-
nth1 =
|
715
|
+
nth0 = 2;
|
716
|
+
nth1 = 32;
|
709
717
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
710
718
|
} break;
|
711
719
|
default:
|
@@ -731,17 +739,22 @@ void ggml_metal_graph_compute(
|
|
731
739
|
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
732
740
|
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
733
741
|
|
734
|
-
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1
|
735
|
-
|
736
|
-
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
742
|
+
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
743
|
+
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
744
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
737
745
|
}
|
738
|
-
else if (src0t ==
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
746
|
+
else if (src0t == GGML_TYPE_Q3_K) {
|
747
|
+
#ifdef GGML_QKK_64
|
748
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
749
|
+
#else
|
750
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
751
|
+
#endif
|
752
|
+
}
|
753
|
+
else if (src0t == GGML_TYPE_Q5_K) {
|
754
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
755
|
+
}
|
756
|
+
else if (src0t == GGML_TYPE_Q6_K) {
|
757
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
745
758
|
} else {
|
746
759
|
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
|
747
760
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
@@ -785,7 +798,7 @@ void ggml_metal_graph_compute(
|
|
785
798
|
|
786
799
|
const float eps = 1e-6f;
|
787
800
|
|
788
|
-
const int nth =
|
801
|
+
const int nth = 512;
|
789
802
|
|
790
803
|
[encoder setComputePipelineState:ctx->pipeline_rms_norm];
|
791
804
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
@@ -793,7 +806,7 @@ void ggml_metal_graph_compute(
|
|
793
806
|
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
794
807
|
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
|
795
808
|
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
|
796
|
-
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
809
|
+
[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
|
797
810
|
|
798
811
|
const int64_t nrows = ggml_nrows(src0);
|
799
812
|
|
@@ -874,28 +887,35 @@ void ggml_metal_graph_compute(
|
|
874
887
|
|
875
888
|
const int n_past = ((int32_t *)(src1->data))[0];
|
876
889
|
|
890
|
+
float freq_base;
|
891
|
+
float freq_scale;
|
892
|
+
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
893
|
+
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
894
|
+
|
877
895
|
[encoder setComputePipelineState:ctx->pipeline_rope];
|
878
896
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
879
897
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
880
|
-
[encoder setBytes:&ne00
|
881
|
-
[encoder setBytes:&ne01
|
882
|
-
[encoder setBytes:&ne02
|
883
|
-
[encoder setBytes:&ne03
|
884
|
-
[encoder setBytes:&nb00
|
885
|
-
[encoder setBytes:&nb01
|
886
|
-
[encoder setBytes:&nb02
|
887
|
-
[encoder setBytes:&nb03
|
888
|
-
[encoder setBytes:&ne0
|
889
|
-
[encoder setBytes:&ne1
|
890
|
-
[encoder setBytes:&ne2
|
891
|
-
[encoder setBytes:&ne3
|
892
|
-
[encoder setBytes:&nb0
|
893
|
-
[encoder setBytes:&nb1
|
894
|
-
[encoder setBytes:&nb2
|
895
|
-
[encoder setBytes:&nb3
|
896
|
-
[encoder setBytes:&n_past
|
897
|
-
[encoder setBytes:&n_dims
|
898
|
-
[encoder setBytes:&mode
|
898
|
+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
899
|
+
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
900
|
+
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
901
|
+
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
902
|
+
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
903
|
+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
904
|
+
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
905
|
+
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
906
|
+
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
907
|
+
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
908
|
+
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
909
|
+
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
910
|
+
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
911
|
+
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
912
|
+
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
913
|
+
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
914
|
+
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
|
915
|
+
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
|
916
|
+
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
|
917
|
+
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
918
|
+
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
899
919
|
|
900
920
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
901
921
|
} break;
|