llama_cpp 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/examples/README.md +32 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +553 -313
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +157 -19
- data/ext/llama_cpp/src/ggml-metal.metal +149 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +736 -98
- data/ext/llama_cpp/src/ggml.h +140 -9
- data/ext/llama_cpp/src/llama.cpp +58 -31
- data/ext/llama_cpp/src/llama.h +8 -9
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -303,6 +303,7 @@ extern "C" {
|
|
303
303
|
GGML_OP_STEP,
|
304
304
|
GGML_OP_RELU,
|
305
305
|
GGML_OP_GELU,
|
306
|
+
GGML_OP_GELU_QUICK,
|
306
307
|
GGML_OP_SILU,
|
307
308
|
GGML_OP_SILU_BACK,
|
308
309
|
GGML_OP_NORM, // normalize
|
@@ -331,12 +332,15 @@ extern "C" {
|
|
331
332
|
GGML_OP_ROPE_BACK,
|
332
333
|
GGML_OP_ALIBI,
|
333
334
|
GGML_OP_CLAMP,
|
334
|
-
|
335
|
-
|
335
|
+
GGML_OP_CONV_1D_S1_PH,
|
336
|
+
GGML_OP_CONV_1D_S2_PH,
|
337
|
+
GGML_OP_CONV_2D_SK_P0,
|
336
338
|
|
337
339
|
GGML_OP_FLASH_ATTN,
|
338
340
|
GGML_OP_FLASH_FF,
|
339
341
|
GGML_OP_FLASH_ATTN_BACK,
|
342
|
+
GGML_OP_WIN_PART,
|
343
|
+
GGML_OP_WIN_UNPART,
|
340
344
|
|
341
345
|
GGML_OP_MAP_UNARY,
|
342
346
|
GGML_OP_MAP_BINARY,
|
@@ -500,8 +504,9 @@ extern "C" {
|
|
500
504
|
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
501
505
|
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
502
506
|
|
503
|
-
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
|
504
|
-
GGML_API size_t ggml_get_mem_size(struct ggml_context * ctx);
|
507
|
+
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
|
508
|
+
GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
|
509
|
+
GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
|
505
510
|
|
506
511
|
GGML_API struct ggml_tensor * ggml_new_tensor(
|
507
512
|
struct ggml_context * ctx,
|
@@ -556,8 +561,8 @@ extern "C" {
|
|
556
561
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
557
562
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
558
563
|
|
559
|
-
GGML_API const char *
|
560
|
-
GGML_API
|
564
|
+
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
|
565
|
+
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
|
561
566
|
|
562
567
|
//
|
563
568
|
// operations on tensors with backpropagation
|
@@ -610,24 +615,47 @@ extern "C" {
|
|
610
615
|
struct ggml_tensor * a,
|
611
616
|
struct ggml_tensor * b);
|
612
617
|
|
618
|
+
GGML_API struct ggml_tensor * ggml_sub_inplace(
|
619
|
+
struct ggml_context * ctx,
|
620
|
+
struct ggml_tensor * a,
|
621
|
+
struct ggml_tensor * b);
|
622
|
+
|
613
623
|
GGML_API struct ggml_tensor * ggml_mul(
|
614
624
|
struct ggml_context * ctx,
|
615
625
|
struct ggml_tensor * a,
|
616
626
|
struct ggml_tensor * b);
|
617
627
|
|
628
|
+
GGML_API struct ggml_tensor * ggml_mul_inplace(
|
629
|
+
struct ggml_context * ctx,
|
630
|
+
struct ggml_tensor * a,
|
631
|
+
struct ggml_tensor * b);
|
632
|
+
|
618
633
|
GGML_API struct ggml_tensor * ggml_div(
|
619
634
|
struct ggml_context * ctx,
|
620
635
|
struct ggml_tensor * a,
|
621
636
|
struct ggml_tensor * b);
|
622
637
|
|
638
|
+
GGML_API struct ggml_tensor * ggml_div_inplace(
|
639
|
+
struct ggml_context * ctx,
|
640
|
+
struct ggml_tensor * a,
|
641
|
+
struct ggml_tensor * b);
|
642
|
+
|
623
643
|
GGML_API struct ggml_tensor * ggml_sqr(
|
624
644
|
struct ggml_context * ctx,
|
625
645
|
struct ggml_tensor * a);
|
626
646
|
|
647
|
+
GGML_API struct ggml_tensor * ggml_sqr_inplace(
|
648
|
+
struct ggml_context * ctx,
|
649
|
+
struct ggml_tensor * a);
|
650
|
+
|
627
651
|
GGML_API struct ggml_tensor * ggml_sqrt(
|
628
652
|
struct ggml_context * ctx,
|
629
653
|
struct ggml_tensor * a);
|
630
654
|
|
655
|
+
GGML_API struct ggml_tensor * ggml_sqrt_inplace(
|
656
|
+
struct ggml_context * ctx,
|
657
|
+
struct ggml_tensor * a);
|
658
|
+
|
631
659
|
GGML_API struct ggml_tensor * ggml_log(
|
632
660
|
struct ggml_context * ctx,
|
633
661
|
struct ggml_tensor * a);
|
@@ -667,31 +695,67 @@ extern "C" {
|
|
667
695
|
struct ggml_context * ctx,
|
668
696
|
struct ggml_tensor * a);
|
669
697
|
|
698
|
+
GGML_API struct ggml_tensor * ggml_abs_inplace(
|
699
|
+
struct ggml_context * ctx,
|
700
|
+
struct ggml_tensor * a);
|
701
|
+
|
670
702
|
GGML_API struct ggml_tensor * ggml_sgn(
|
671
703
|
struct ggml_context * ctx,
|
672
704
|
struct ggml_tensor * a);
|
673
705
|
|
706
|
+
GGML_API struct ggml_tensor * ggml_sgn_inplace(
|
707
|
+
struct ggml_context * ctx,
|
708
|
+
struct ggml_tensor * a);
|
709
|
+
|
674
710
|
GGML_API struct ggml_tensor * ggml_neg(
|
675
711
|
struct ggml_context * ctx,
|
676
712
|
struct ggml_tensor * a);
|
677
713
|
|
714
|
+
GGML_API struct ggml_tensor * ggml_neg_inplace(
|
715
|
+
struct ggml_context * ctx,
|
716
|
+
struct ggml_tensor * a);
|
717
|
+
|
678
718
|
GGML_API struct ggml_tensor * ggml_step(
|
679
719
|
struct ggml_context * ctx,
|
680
720
|
struct ggml_tensor * a);
|
681
721
|
|
722
|
+
GGML_API struct ggml_tensor * ggml_step_inplace(
|
723
|
+
struct ggml_context * ctx,
|
724
|
+
struct ggml_tensor * a);
|
725
|
+
|
682
726
|
GGML_API struct ggml_tensor * ggml_relu(
|
683
727
|
struct ggml_context * ctx,
|
684
728
|
struct ggml_tensor * a);
|
685
729
|
|
730
|
+
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
731
|
+
struct ggml_context * ctx,
|
732
|
+
struct ggml_tensor * a);
|
733
|
+
|
686
734
|
// TODO: double-check this computation is correct
|
687
735
|
GGML_API struct ggml_tensor * ggml_gelu(
|
688
736
|
struct ggml_context * ctx,
|
689
737
|
struct ggml_tensor * a);
|
690
738
|
|
739
|
+
GGML_API struct ggml_tensor * ggml_gelu_inplace(
|
740
|
+
struct ggml_context * ctx,
|
741
|
+
struct ggml_tensor * a);
|
742
|
+
|
743
|
+
GGML_API struct ggml_tensor * ggml_gelu_quick(
|
744
|
+
struct ggml_context * ctx,
|
745
|
+
struct ggml_tensor * a);
|
746
|
+
|
747
|
+
GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
|
748
|
+
struct ggml_context * ctx,
|
749
|
+
struct ggml_tensor * a);
|
750
|
+
|
691
751
|
GGML_API struct ggml_tensor * ggml_silu(
|
692
752
|
struct ggml_context * ctx,
|
693
753
|
struct ggml_tensor * a);
|
694
754
|
|
755
|
+
GGML_API struct ggml_tensor * ggml_silu_inplace(
|
756
|
+
struct ggml_context * ctx,
|
757
|
+
struct ggml_tensor * a);
|
758
|
+
|
695
759
|
// a - x
|
696
760
|
// b - dy
|
697
761
|
GGML_API struct ggml_tensor * ggml_silu_back(
|
@@ -705,10 +769,18 @@ extern "C" {
|
|
705
769
|
struct ggml_context * ctx,
|
706
770
|
struct ggml_tensor * a);
|
707
771
|
|
772
|
+
GGML_API struct ggml_tensor * ggml_norm_inplace(
|
773
|
+
struct ggml_context * ctx,
|
774
|
+
struct ggml_tensor * a);
|
775
|
+
|
708
776
|
GGML_API struct ggml_tensor * ggml_rms_norm(
|
709
777
|
struct ggml_context * ctx,
|
710
778
|
struct ggml_tensor * a);
|
711
779
|
|
780
|
+
GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
|
781
|
+
struct ggml_context * ctx,
|
782
|
+
struct ggml_tensor * a);
|
783
|
+
|
712
784
|
// a - x
|
713
785
|
// b - dy
|
714
786
|
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
@@ -998,16 +1070,55 @@ extern "C" {
|
|
998
1070
|
float min,
|
999
1071
|
float max);
|
1000
1072
|
|
1001
|
-
//
|
1073
|
+
// TODO: implement general-purpose convolutions
|
1074
|
+
// GGML_API struct ggml_tensor * ggml_conv_1d(
|
1075
|
+
// struct ggml_context * ctx,
|
1076
|
+
// struct ggml_tensor * a,
|
1077
|
+
// struct ggml_tensor * b,
|
1078
|
+
// int s0
|
1079
|
+
// int p0,
|
1080
|
+
// int d0);
|
1081
|
+
//
|
1082
|
+
// GGML_API struct ggml_tensor * ggml_conv_2d(
|
1083
|
+
// struct ggml_context * ctx,
|
1084
|
+
// struct ggml_tensor * a,
|
1085
|
+
// struct ggml_tensor * b,
|
1086
|
+
// int s0,
|
1087
|
+
// int s1,
|
1088
|
+
// int p0,
|
1089
|
+
// int p1,
|
1090
|
+
// int d0,
|
1091
|
+
// int d1);
|
1092
|
+
|
1093
|
+
// padding = half
|
1002
1094
|
// TODO: we don't support extra parameters for now
|
1003
1095
|
// that's why we are hard-coding the stride, padding, and dilation
|
1004
1096
|
// not great ..
|
1005
|
-
|
1097
|
+
// example:
|
1098
|
+
// a: 3 80 768 1
|
1099
|
+
// b: 3000 80 1 1
|
1100
|
+
// res: 3000 768 1 1
|
1101
|
+
// used in whisper
|
1102
|
+
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
|
1006
1103
|
struct ggml_context * ctx,
|
1007
1104
|
struct ggml_tensor * a,
|
1008
1105
|
struct ggml_tensor * b);
|
1009
1106
|
|
1010
|
-
|
1107
|
+
// used in whisper
|
1108
|
+
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
|
1109
|
+
struct ggml_context * ctx,
|
1110
|
+
struct ggml_tensor * a,
|
1111
|
+
struct ggml_tensor * b);
|
1112
|
+
|
1113
|
+
// kernel size is a->ne[0] x a->ne[1]
|
1114
|
+
// stride is equal to kernel size
|
1115
|
+
// padding is zero
|
1116
|
+
// example:
|
1117
|
+
// a: 16 16 3 768
|
1118
|
+
// b: 1024 1024 3 1
|
1119
|
+
// res: 64 64 768 1
|
1120
|
+
// used in sam
|
1121
|
+
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
1011
1122
|
struct ggml_context * ctx,
|
1012
1123
|
struct ggml_tensor * a,
|
1013
1124
|
struct ggml_tensor * b);
|
@@ -1035,6 +1146,26 @@ extern "C" {
|
|
1035
1146
|
struct ggml_tensor * c0,
|
1036
1147
|
struct ggml_tensor * c1);
|
1037
1148
|
|
1149
|
+
// partition into non-overlapping windows with padding if needed
|
1150
|
+
// example:
|
1151
|
+
// a: 768 64 64 1
|
1152
|
+
// w: 14
|
1153
|
+
// res: 768 14 14 25
|
1154
|
+
// used in sam
|
1155
|
+
GGML_API struct ggml_tensor * ggml_win_part(
|
1156
|
+
struct ggml_context * ctx,
|
1157
|
+
struct ggml_tensor * a,
|
1158
|
+
int w);
|
1159
|
+
|
1160
|
+
// reverse of ggml_win_part
|
1161
|
+
// used in sam
|
1162
|
+
GGML_API struct ggml_tensor * ggml_win_unpart(
|
1163
|
+
struct ggml_context * ctx,
|
1164
|
+
struct ggml_tensor * a,
|
1165
|
+
int w0,
|
1166
|
+
int h0,
|
1167
|
+
int w);
|
1168
|
+
|
1038
1169
|
// Mapping operations
|
1039
1170
|
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
1040
1171
|
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -19,6 +19,11 @@
|
|
19
19
|
#ifdef GGML_USE_METAL
|
20
20
|
#include "ggml-metal.h"
|
21
21
|
#endif
|
22
|
+
#ifdef GGML_USE_K_QUANTS
|
23
|
+
#ifndef QK_K
|
24
|
+
#define QK_K 256
|
25
|
+
#endif
|
26
|
+
#endif
|
22
27
|
|
23
28
|
#include <array>
|
24
29
|
#include <ctime>
|
@@ -40,6 +45,10 @@
|
|
40
45
|
#include <sstream>
|
41
46
|
#include <numeric>
|
42
47
|
|
48
|
+
#if defined(_MSC_VER)
|
49
|
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
50
|
+
#endif
|
51
|
+
|
43
52
|
#define LLAMA_USE_SCRATCH
|
44
53
|
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
45
54
|
|
@@ -882,6 +891,7 @@ static bool kv_cache_init(
|
|
882
891
|
const int64_t n_elements = n_embd*n_mem;
|
883
892
|
|
884
893
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
894
|
+
cache.n = 0;
|
885
895
|
|
886
896
|
struct ggml_init_params params;
|
887
897
|
params.mem_size = cache.buf.size;
|
@@ -900,6 +910,7 @@ static bool kv_cache_init(
|
|
900
910
|
ggml_set_name(cache.k, "cache_k");
|
901
911
|
ggml_set_name(cache.v, "cache_v");
|
902
912
|
|
913
|
+
(void) n_gpu_layers;
|
903
914
|
#ifdef GGML_USE_CUBLAS
|
904
915
|
if (n_gpu_layers > n_layer + 1) {
|
905
916
|
ggml_cuda_assign_buffers_no_scratch(cache.v);
|
@@ -914,21 +925,21 @@ static bool kv_cache_init(
|
|
914
925
|
|
915
926
|
struct llama_context_params llama_context_default_params() {
|
916
927
|
struct llama_context_params result = {
|
928
|
+
/*.seed =*/ -1,
|
917
929
|
/*.n_ctx =*/ 512,
|
918
930
|
/*.n_batch =*/ 512,
|
919
931
|
/*.gpu_layers =*/ 0,
|
920
932
|
/*.main_gpu =*/ 0,
|
921
933
|
/*.tensor_split =*/ {0},
|
934
|
+
/*.progress_callback =*/ nullptr,
|
935
|
+
/*.progress_callback_user_data =*/ nullptr,
|
922
936
|
/*.low_vram =*/ false,
|
923
|
-
/*.seed =*/ -1,
|
924
937
|
/*.f16_kv =*/ true,
|
925
938
|
/*.logits_all =*/ false,
|
926
939
|
/*.vocab_only =*/ false,
|
927
940
|
/*.use_mmap =*/ true,
|
928
941
|
/*.use_mlock =*/ false,
|
929
942
|
/*.embedding =*/ false,
|
930
|
-
/*.progress_callback =*/ nullptr,
|
931
|
-
/*.progress_callback_user_data =*/ nullptr,
|
932
943
|
};
|
933
944
|
|
934
945
|
return result;
|
@@ -1249,7 +1260,7 @@ static void llama_model_load_internal(
|
|
1249
1260
|
vram_scratch = n_batch * MB;
|
1250
1261
|
ggml_cuda_set_scratch_size(vram_scratch);
|
1251
1262
|
if (n_gpu_layers > 0) {
|
1252
|
-
fprintf(stderr, "%s: allocating batch_size x 1 MB = %
|
1263
|
+
fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
|
1253
1264
|
__func__, vram_scratch / MB);
|
1254
1265
|
}
|
1255
1266
|
}
|
@@ -1609,7 +1620,7 @@ static bool llama_eval_internal(
|
|
1609
1620
|
model.layers[il].w1,
|
1610
1621
|
cur);
|
1611
1622
|
offload_func(cur);
|
1612
|
-
ggml_set_name(cur, "
|
1623
|
+
ggml_set_name(cur, "result_w1");
|
1613
1624
|
|
1614
1625
|
// SILU activation
|
1615
1626
|
cur = ggml_silu(ctx0, cur);
|
@@ -1646,15 +1657,11 @@ static bool llama_eval_internal(
|
|
1646
1657
|
{
|
1647
1658
|
cur = ggml_rms_norm(ctx0, inpL);
|
1648
1659
|
offload_func_nr(cur);
|
1649
|
-
ggml_set_name(cur, "
|
1650
|
-
|
1651
|
-
cur = ggml_rms_norm(ctx0, cur);
|
1652
|
-
offload_func_nr(cur);
|
1653
|
-
ggml_set_name(cur, "rms_norm_after");
|
1660
|
+
ggml_set_name(cur, "rms_norm_2");
|
1654
1661
|
|
1655
1662
|
// cur = cur*norm(broadcasted)
|
1656
1663
|
cur = ggml_mul(ctx0, cur, model.norm);
|
1657
|
-
offload_func_nr(cur);
|
1664
|
+
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
1658
1665
|
ggml_set_name(cur, "result_norm");
|
1659
1666
|
|
1660
1667
|
embeddings = cur;
|
@@ -2485,8 +2492,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2485
2492
|
} else {
|
2486
2493
|
new_type = quantized_type;
|
2487
2494
|
#ifdef GGML_USE_K_QUANTS
|
2495
|
+
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
|
2496
|
+
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
|
2497
|
+
int nx = tensor.ne.at(0);
|
2498
|
+
int ny = tensor.ne.at(1);
|
2499
|
+
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2500
|
+
fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
|
2501
|
+
fprintf(stderr, "This is required to be able to use k-quants for now!\n");
|
2502
|
+
fprintf(stderr, "========================================================================================\n\n");
|
2503
|
+
throw std::runtime_error("Unsupported tensor size encountered\n");
|
2504
|
+
}
|
2505
|
+
}
|
2488
2506
|
if (tensor.name == "output.weight") {
|
2489
|
-
|
2507
|
+
int nx = tensor.ne.at(0);
|
2508
|
+
int ny = tensor.ne.at(1);
|
2509
|
+
if (nx % QK_K == 0 && ny % QK_K == 0) {
|
2510
|
+
new_type = GGML_TYPE_Q6_K;
|
2511
|
+
}
|
2490
2512
|
} else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
|
2491
2513
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2492
2514
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
@@ -2690,16 +2712,21 @@ struct llama_context * llama_init_from_file(
|
|
2690
2712
|
// this allocates all Metal resources and memory buffers
|
2691
2713
|
ctx->ctx_metal = ggml_metal_init();
|
2692
2714
|
|
2693
|
-
void *data_ptr
|
2715
|
+
void * data_ptr = NULL;
|
2694
2716
|
size_t data_size = 0;
|
2717
|
+
|
2695
2718
|
if (params.use_mmap) {
|
2696
|
-
data_ptr
|
2697
|
-
data_size= ctx->model.mapping->size;
|
2719
|
+
data_ptr = ctx->model.mapping->addr;
|
2720
|
+
data_size = ctx->model.mapping->size;
|
2698
2721
|
} else {
|
2699
|
-
data_ptr
|
2700
|
-
data_size= ggml_get_mem_size(ctx->model.ctx);
|
2722
|
+
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
2723
|
+
data_size = ggml_get_mem_size (ctx->model.ctx);
|
2701
2724
|
}
|
2702
2725
|
|
2726
|
+
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
2727
|
+
|
2728
|
+
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
2729
|
+
|
2703
2730
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
2704
2731
|
if (!(result)) { \
|
2705
2732
|
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
|
@@ -2707,12 +2734,13 @@ struct llama_context * llama_init_from_file(
|
|
2707
2734
|
return NULL; \
|
2708
2735
|
}
|
2709
2736
|
|
2710
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
|
2711
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
|
2737
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
2712
2738
|
|
2713
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
|
2714
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
|
2715
|
-
|
2739
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
|
2740
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
|
2741
|
+
|
2742
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
|
2743
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
|
2716
2744
|
#undef LLAMA_METAL_CHECK_BUF
|
2717
2745
|
}
|
2718
2746
|
#endif
|
@@ -3098,9 +3126,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3098
3126
|
if (kv_size) {
|
3099
3127
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
3100
3128
|
|
3101
|
-
|
3102
|
-
|
3103
|
-
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
3129
|
+
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
3104
3130
|
ggml_cgraph gf{};
|
3105
3131
|
gf.n_threads = 1;
|
3106
3132
|
|
@@ -3206,9 +3232,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
3206
3232
|
|
3207
3233
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
3208
3234
|
|
3209
|
-
|
3210
|
-
|
3211
|
-
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
3235
|
+
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
3212
3236
|
ggml_cgraph gf{};
|
3213
3237
|
gf.n_threads = 1;
|
3214
3238
|
|
@@ -3443,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) {
|
|
3443
3467
|
|
3444
3468
|
fprintf(stderr, "\n");
|
3445
3469
|
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
3446
|
-
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token
|
3447
|
-
|
3448
|
-
fprintf(stderr, "%s:
|
3470
|
+
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
3471
|
+
__func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
|
3472
|
+
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
3473
|
+
__func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
|
3474
|
+
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
3475
|
+
__func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
|
3449
3476
|
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
|
3450
3477
|
}
|
3451
3478
|
|
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -71,28 +71,27 @@ extern "C" {
|
|
71
71
|
|
72
72
|
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
73
73
|
|
74
|
-
|
74
|
+
struct llama_context_params {
|
75
|
+
int seed; // RNG seed, -1 for random
|
75
76
|
int n_ctx; // text context
|
76
77
|
int n_batch; // prompt processing batch size
|
77
78
|
int n_gpu_layers; // number of layers to store in VRAM
|
78
79
|
int main_gpu; // the GPU that is used for scratch and small tensors
|
79
80
|
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
80
|
-
|
81
|
-
|
81
|
+
// called with a progress value between 0 and 1, pass NULL to disable
|
82
|
+
llama_progress_callback progress_callback;
|
83
|
+
// context pointer passed to the progress callback
|
84
|
+
void * progress_callback_user_data;
|
82
85
|
|
86
|
+
// Keep the booleans together to avoid misalignment during copy-by-value.
|
87
|
+
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
83
88
|
bool f16_kv; // use fp16 for KV cache
|
84
89
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
85
90
|
bool vocab_only; // only load the vocabulary, no weights
|
86
91
|
bool use_mmap; // use mmap if possible
|
87
92
|
bool use_mlock; // force system to keep model in RAM
|
88
93
|
bool embedding; // embedding mode only
|
89
|
-
|
90
|
-
// called with a progress value between 0 and 1, pass NULL to disable
|
91
|
-
llama_progress_callback progress_callback;
|
92
|
-
// context pointer passed to the progress callback
|
93
|
-
void * progress_callback_user_data;
|
94
94
|
};
|
95
|
-
|
96
95
|
// model file types
|
97
96
|
enum llama_ftype {
|
98
97
|
LLAMA_FTYPE_ALL_F32 = 0,
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.2.1'
|
6
|
+
VERSION = '0.2.2'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'master-
|
9
|
+
LLAMA_CPP_VERSION = 'master-7487137'
|
10
10
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-06-
|
11
|
+
date: 2023-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -24,6 +24,7 @@ files:
|
|
24
24
|
- README.md
|
25
25
|
- examples/README.md
|
26
26
|
- examples/chat.rb
|
27
|
+
- examples/embedding.rb
|
27
28
|
- ext/llama_cpp/extconf.rb
|
28
29
|
- ext/llama_cpp/llama_cpp.cpp
|
29
30
|
- ext/llama_cpp/llama_cpp.h
|