llama_cpp 0.3.5 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +18 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +22 -8
- data/ext/llama_cpp/src/ggml-alloc.c +549 -0
- data/ext/llama_cpp/src/ggml-alloc.h +22 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +2526 -430
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +56 -34
- data/ext/llama_cpp/src/ggml-metal.metal +4 -1
- data/ext/llama_cpp/src/ggml.c +445 -176
- data/ext/llama_cpp/src/ggml.h +125 -33
- data/ext/llama_cpp/src/k_quants.c +32 -30
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +409 -210
- data/ext/llama_cpp/src/llama.h +19 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +4 -2
data/ext/llama_cpp/src/ggml-cuda.h

@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
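The only change to ggml-cuda.h is the new `ggml_cuda_set_mul_mat_q` setter, which (judging by its name) selects the quantized matrix-multiplication path in the CUDA backend. A minimal sketch of how a host application might call it alongside the setters already declared in this header; the `configure_cuda` helper and the choice of arguments are illustrative, not part of the gem:

```c
#include <stdbool.h>
#include <stddef.h>
#include "ggml-cuda.h"

// Hypothetical helper: configure the CUDA backend once, before building graphs.
// All three setters are declared in the header shown above.
static void configure_cuda(bool use_mmq, size_t scratch_bytes) {
    ggml_cuda_set_main_device(0);
    ggml_cuda_set_mul_mat_q(use_mmq);      // new in this release
    ggml_cuda_set_scratch_size(scratch_bytes);
}
```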
data/ext/llama_cpp/src/ggml-metal.m

@@ -7,6 +7,11 @@
 #import <Metal/Metal.h>
 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>
 
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 #ifdef GGML_METAL_NDEBUG
 #define metal_printf(...)
 #else
@@ -15,6 +20,8 @@
 
 #define UNUSED(x) (void)(x)
 
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
 struct ggml_metal_buffer {
     const char * name;
 
@@ -36,7 +43,7 @@ struct ggml_metal_context {
     int n_buffers;
     struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
 
-    int concur_list[GGML_MAX_NODES];
+    int concur_list[GGML_MAX_CONCUR];
     int concur_list_len;
 
     // custom kernels
@@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
     int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
-    int nodes_unused[GGML_MAX_NODES];
+    int nodes_unused[GGML_MAX_CONCUR];
 
-    for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
-    for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
+    for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
+    for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
     ctx->concur_list_len = 0;
 
-    int n_left = gf->n_nodes;
-    int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
-    int level_pos = 0;  // at ctx->concur_list, the last layer (level) ends at level_pos
+    int n_left    = gf->n_nodes;
+    int n_start   = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
+    int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
 
     while (n_left > 0) {
         // number of nodes at a layer (that can be issued concurrently)
@@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency(
         for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
             if (nodes_unused[i]) {
                 // if the requirements for gf->nodes[i] are satisfied
-                int exe_flag=1;
+                int exe_flag = 1;
+
                 // scan all srcs
                 for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
                     struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
                     if (src_cur) {
                         // if is leaf nodes it's satisfied.
-                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
+                        // TODO: ggml_is_leaf()
+                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
+                            continue;
+                        }
 
                         // otherwise this src should be the output from previous nodes.
                         int is_found = 0;
+
                         // scan 2*search_depth back because we inserted barrier.
-                        for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
-                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
+                        //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+                        for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
+                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
+                                is_found = 1;
+                                break;
+                            }
+                        }
+                        if (is_found == 0) {
+                            exe_flag = 0;
+                            break;
                         }
-                        if (is_found == 0) {exe_flag = 0; break;}
                     }
                 }
                 if (exe_flag) {
                     // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                     // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                     int64_t data_start = (int64_t) gf->nodes[i]->data;
-                    int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
+                    int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
                     for (int j = n_start; j < i; j++) {
                         if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
                                             && gf->nodes[j]->op != GGML_OP_VIEW \
@@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency(
                             if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
                                 ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
                                 continue;
-                            } else {
-                                exe_flag = 0;
                             }
+
+                            exe_flag = 0;
                         }
                     }
                 }
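The core of the rewritten block above is an interval-overlap test: a node may join the current concurrent layer only if its output bytes do not overlap the output of an earlier, not-yet-issued node. A small stand-alone sketch of that predicate, describing each buffer by a start address and a byte length exactly as the diff does:

```c
#include <stdint.h>
#include <stdbool.h>

// True when the half-open byte ranges [a_start, a_start + a_len) and
// [b_start, b_start + b_len) share at least one byte. This mirrors the check
// in ggml_metal_graph_find_concurrency: the branch `continue`s (no conflict)
// exactly when one range ends before the other begins; otherwise exe_flag = 0.
static bool ranges_overlap(int64_t a_start, int64_t a_len,
                           int64_t b_start, int64_t b_len) {
    if (b_start >= a_start + a_len || b_start + b_len <= a_start) {
        return false; // disjoint -> the two nodes may be issued concurrently
    }
    return true;      // overlap  -> keep the original ordering
}
```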
@@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency(
         ctx->concur_list[level_pos + concurrency] = -1;
         ctx->concur_list_len++;
         // jump all sorted nodes at nodes_bak
-        while (!nodes_unused[n_start]) {n_start++;}
+        while (!nodes_unused[n_start]) {
+            n_start++;
+        }
         level_pos += concurrency + 1;
     }
 
-    if (ctx->concur_list_len > GGML_MAX_NODES) {
+    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
        fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
     }
 }
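For context: the concur_list built above is a flat array of node indices in which each concurrent layer is terminated by a -1 sentinel (`concur_list[level_pos + concurrency] = -1`), which is presumably why it is sized GGML_MAX_CONCUR = 2*GGML_MAX_NODES, the worst case of one node plus one sentinel per layer. A minimal sketch of walking such a list, assuming only that encoding; the example indices are made up:

```c
#include <stdio.h>

// Walk a flattened concurrency list in which each layer of independently
// issuable node indices is terminated by a -1 sentinel.
static void print_layers(const int * concur_list, int concur_list_len) {
    int layer = 0;
    printf("layer %d:", layer);
    for (int i = 0; i < concur_list_len; ++i) {
        if (concur_list[i] == -1) {
            printf("\n");
            if (i + 1 < concur_list_len) {
                printf("layer %d:", ++layer);
            }
        } else {
            printf(" %d", concur_list[i]);
        }
    }
}

int main(void) {
    // hypothetical graph: nodes 0 and 1 can run together, then node 2 alone
    const int list[] = { 0, 1, -1, 2, -1 };
    print_layers(list, 5);
    return 0;
}
```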
@@ -453,7 +474,7 @@ void ggml_metal_graph_compute(
     // else fallback to serial dispatch
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
 
-    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
+    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
 
     const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
     edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
@@ -718,7 +739,8 @@ void ggml_metal_graph_compute(
                             // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
 
                             GGML_ASSERT(ne00 == ne10);
-                            GGML_ASSERT(ne02 == ne12);
+                            // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+                            GGML_ASSERT(ne03 == ne13);
 
                             if (ggml_is_contiguous(src0) &&
                                 ggml_is_contiguous(src1) &&
@@ -746,11 +768,11 @@ void ggml_metal_graph_compute(
                                     initWithDevice:ctx->device transposeLeft:false transposeRight:true
                                         resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
 
-                                // we need to do ne02 multiplications
+                                // we need to do ne12 multiplications
                                 // TODO: is there a way to do this in parallel - currently very slow ..
                                 // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                                for (int64_t i02 = 0; i02 < ne02; ++i02) {
-                                    size_t offs_src0_cur = offs_src0 + i02*nb02;
+                                for (int64_t i02 = 0; i02 < ne12; ++i02) {
+                                    size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
                                     size_t offs_src1_cur = offs_src1 + i02*nb12;
                                     size_t offs_dst_cur = offs_dst + i02*nb2;
 
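The new `i02/(ne12/ne02)` indexing broadcasts src0 over the third dimension of src1 when src1 has more slices than src0 (the situation the `// gqa not used for now` comment alludes to). A small worked sketch of the mapping, assuming as the kernel does that ne12 is an integer multiple of ne02; the concrete shapes below are made up:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Example shapes: src0 has ne02 = 4 slices, src1 has ne12 = 8, so every
    // src0 slice is reused for ne12/ne02 = 2 consecutive src1 slices.
    const int64_t ne02 = 4;
    const int64_t ne12 = 8;

    for (int64_t i02 = 0; i02 < ne12; ++i02) {
        // same expression as in the diff: which src0 slice pairs with src1 slice i02
        const int64_t src0_slice = i02 / (ne12 / ne02);
        printf("src1 slice %2lld -> src0 slice %lld\n",
               (long long) i02, (long long) src0_slice);
    }
    return 0;
}
```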
@@ -772,8 +794,6 @@ void ggml_metal_graph_compute(
                                 switch (src0t) {
                                     case GGML_TYPE_F16:
                                         {
-                                            GGML_ASSERT(ne02 == ne12);
-
                                             nth0 = 64;
                                             nth1 = 1;
                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +873,18 @@ void ggml_metal_graph_compute(
                                 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                                 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                                 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-                                [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
-                                [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
+                                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                                [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+                                [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
 
                                 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                                     src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
data/ext/llama_cpp/src/ggml-metal.metal

@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
        device float * dst,
        constant int64_t & ne00,
        constant int64_t & ne01,
+       constant int64_t & ne02,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant int64_t & ne10,
        constant int64_t & ne11,
+       constant int64_t & ne12,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;
 
-    device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
     sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }
 
+
 kernel void kernel_alibi_f32(
        device const float * src0,
        device float * dst,