llama_cpp 0.3.5 → 0.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +549 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2526 -430
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +56 -34
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +445 -176
- data/ext/llama_cpp/src/ggml.h +125 -33
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +409 -210
- data/ext/llama_cpp/src/llama.h +19 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
|
27
27
|
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
28
28
|
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
29
29
|
void ggml_cuda_set_main_device(int main_device);
|
30
|
+
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
30
31
|
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
31
32
|
void ggml_cuda_free_scratch(void);
|
32
33
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
@@ -7,6 +7,11 @@
|
|
7
7
|
#import <Metal/Metal.h>
|
8
8
|
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
|
9
9
|
|
10
|
+
#undef MIN
|
11
|
+
#undef MAX
|
12
|
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
13
|
+
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
14
|
+
|
10
15
|
#ifdef GGML_METAL_NDEBUG
|
11
16
|
#define metal_printf(...)
|
12
17
|
#else
|
@@ -15,6 +20,8 @@
|
|
15
20
|
|
16
21
|
#define UNUSED(x) (void)(x)
|
17
22
|
|
23
|
+
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
24
|
+
|
18
25
|
struct ggml_metal_buffer {
|
19
26
|
const char * name;
|
20
27
|
|
@@ -36,7 +43,7 @@ struct ggml_metal_context {
|
|
36
43
|
int n_buffers;
|
37
44
|
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
38
45
|
|
39
|
-
int concur_list[
|
46
|
+
int concur_list[GGML_MAX_CONCUR];
|
40
47
|
int concur_list_len;
|
41
48
|
|
42
49
|
// custom kernels
|
@@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency(
|
|
370
377
|
struct ggml_metal_context * ctx,
|
371
378
|
struct ggml_cgraph * gf) {
|
372
379
|
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
|
373
|
-
int nodes_unused[
|
380
|
+
int nodes_unused[GGML_MAX_CONCUR];
|
374
381
|
|
375
|
-
for (int i = 0; i <
|
376
|
-
for (int i = 0; i < gf->n_nodes;
|
382
|
+
for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
|
383
|
+
for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; }
|
377
384
|
ctx->concur_list_len = 0;
|
378
385
|
|
379
|
-
int n_left
|
380
|
-
int n_start
|
381
|
-
int level_pos = 0;
|
386
|
+
int n_left = gf->n_nodes;
|
387
|
+
int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
|
388
|
+
int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
|
382
389
|
|
383
390
|
while (n_left > 0) {
|
384
391
|
// number of nodes at a layer (that can be issued concurrently)
|
@@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency(
|
|
386
393
|
for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
|
387
394
|
if (nodes_unused[i]) {
|
388
395
|
// if the requirements for gf->nodes[i] are satisfied
|
389
|
-
int exe_flag=1;
|
396
|
+
int exe_flag = 1;
|
397
|
+
|
390
398
|
// scan all srcs
|
391
399
|
for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
|
392
400
|
struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
|
393
401
|
if (src_cur) {
|
394
402
|
// if is leaf nodes it's satisfied.
|
395
|
-
|
403
|
+
// TODO: ggml_is_leaf()
|
404
|
+
if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
|
405
|
+
continue;
|
406
|
+
}
|
396
407
|
|
397
408
|
// otherwise this src should be the output from previous nodes.
|
398
409
|
int is_found = 0;
|
410
|
+
|
399
411
|
// scan 2*search_depth back because we inserted barrier.
|
400
|
-
for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
|
401
|
-
|
412
|
+
//for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
|
413
|
+
for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
|
414
|
+
if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
|
415
|
+
is_found = 1;
|
416
|
+
break;
|
417
|
+
}
|
418
|
+
}
|
419
|
+
if (is_found == 0) {
|
420
|
+
exe_flag = 0;
|
421
|
+
break;
|
402
422
|
}
|
403
|
-
if (is_found == 0) {exe_flag = 0; break;}
|
404
423
|
}
|
405
424
|
}
|
406
425
|
if (exe_flag) {
|
407
426
|
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
|
408
427
|
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
|
409
428
|
int64_t data_start = (int64_t) gf->nodes[i]->data;
|
410
|
-
int64_t length
|
429
|
+
int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
|
411
430
|
for (int j = n_start; j < i; j++) {
|
412
431
|
if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
|
413
432
|
&& gf->nodes[j]->op != GGML_OP_VIEW \
|
@@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency(
|
|
416
435
|
if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
|
417
436
|
((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
|
418
437
|
continue;
|
419
|
-
} else {
|
420
|
-
exe_flag = 0;
|
421
438
|
}
|
439
|
+
|
440
|
+
exe_flag = 0;
|
422
441
|
}
|
423
442
|
}
|
424
443
|
}
|
@@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency(
|
|
435
454
|
ctx->concur_list[level_pos + concurrency] = -1;
|
436
455
|
ctx->concur_list_len++;
|
437
456
|
// jump all sorted nodes at nodes_bak
|
438
|
-
while (!nodes_unused[n_start]) {
|
457
|
+
while (!nodes_unused[n_start]) {
|
458
|
+
n_start++;
|
459
|
+
}
|
439
460
|
level_pos += concurrency + 1;
|
440
461
|
}
|
441
462
|
|
442
|
-
if (ctx->concur_list_len >
|
463
|
+
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
|
443
464
|
fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
|
444
465
|
}
|
445
466
|
}
|
@@ -453,7 +474,7 @@ void ggml_metal_graph_compute(
|
|
453
474
|
// else fallback to serial dispatch
|
454
475
|
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
|
455
476
|
|
456
|
-
const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <=
|
477
|
+
const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
|
457
478
|
|
458
479
|
const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
|
459
480
|
edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
|
@@ -718,7 +739,8 @@ void ggml_metal_graph_compute(
|
|
718
739
|
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
|
719
740
|
|
720
741
|
GGML_ASSERT(ne00 == ne10);
|
721
|
-
GGML_ASSERT(ne02 == ne12);
|
742
|
+
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
|
743
|
+
GGML_ASSERT(ne03 == ne13);
|
722
744
|
|
723
745
|
if (ggml_is_contiguous(src0) &&
|
724
746
|
ggml_is_contiguous(src1) &&
|
@@ -746,11 +768,11 @@ void ggml_metal_graph_compute(
|
|
746
768
|
initWithDevice:ctx->device transposeLeft:false transposeRight:true
|
747
769
|
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
|
748
770
|
|
749
|
-
// we need to do
|
771
|
+
// we need to do ne12 multiplications
|
750
772
|
// TODO: is there a way to do this in parallel - currently very slow ..
|
751
773
|
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
|
752
|
-
for (int64_t i02 = 0; i02 <
|
753
|
-
size_t offs_src0_cur = offs_src0 + i02*nb02;
|
774
|
+
for (int64_t i02 = 0; i02 < ne12; ++i02) {
|
775
|
+
size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
|
754
776
|
size_t offs_src1_cur = offs_src1 + i02*nb12;
|
755
777
|
size_t offs_dst_cur = offs_dst + i02*nb2;
|
756
778
|
|
@@ -772,8 +794,6 @@ void ggml_metal_graph_compute(
|
|
772
794
|
switch (src0t) {
|
773
795
|
case GGML_TYPE_F16:
|
774
796
|
{
|
775
|
-
GGML_ASSERT(ne02 == ne12);
|
776
|
-
|
777
797
|
nth0 = 64;
|
778
798
|
nth1 = 1;
|
779
799
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
@@ -853,16 +873,18 @@ void ggml_metal_graph_compute(
|
|
853
873
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
854
874
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
855
875
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
856
|
-
[encoder setBytes:&
|
857
|
-
[encoder setBytes:&
|
858
|
-
[encoder setBytes:&
|
859
|
-
[encoder setBytes:&
|
860
|
-
[encoder setBytes:&
|
861
|
-
[encoder setBytes:&
|
862
|
-
[encoder setBytes:&
|
863
|
-
[encoder setBytes:&
|
864
|
-
[encoder setBytes:&
|
865
|
-
[encoder setBytes:&
|
876
|
+
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
877
|
+
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
878
|
+
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
879
|
+
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
880
|
+
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
|
881
|
+
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
|
882
|
+
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
|
883
|
+
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
|
884
|
+
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
|
885
|
+
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
|
886
|
+
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
|
887
|
+
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
|
866
888
|
|
867
889
|
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
868
890
|
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
|
|
509
509
|
device float * dst,
|
510
510
|
constant int64_t & ne00,
|
511
511
|
constant int64_t & ne01,
|
512
|
+
constant int64_t & ne02,
|
512
513
|
constant uint64_t & nb00,
|
513
514
|
constant uint64_t & nb01,
|
514
515
|
constant uint64_t & nb02,
|
515
516
|
constant int64_t & ne10,
|
516
517
|
constant int64_t & ne11,
|
518
|
+
constant int64_t & ne12,
|
517
519
|
constant uint64_t & nb10,
|
518
520
|
constant uint64_t & nb11,
|
519
521
|
constant uint64_t & nb12,
|
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
|
|
529
531
|
const int64_t r1 = tgpig.y;
|
530
532
|
const int64_t im = tgpig.z;
|
531
533
|
|
532
|
-
device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
|
534
|
+
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
533
535
|
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
534
536
|
|
535
537
|
sum[tpitg.x] = 0.0f;
|
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
|
|
552
554
|
}
|
553
555
|
}
|
554
556
|
|
557
|
+
|
555
558
|
kernel void kernel_alibi_f32(
|
556
559
|
device const float * src0,
|
557
560
|
device float * dst,
|