llama_cpp 0.2.1 → 0.2.2
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/examples/README.md +32 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +553 -313
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +157 -19
- data/ext/llama_cpp/src/ggml-metal.metal +149 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +736 -98
- data/ext/llama_cpp/src/ggml.h +140 -9
- data/ext/llama_cpp/src/llama.cpp +58 -31
- data/ext/llama_cpp/src/llama.h +8 -9
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
```diff
@@ -303,6 +303,7 @@ extern "C" {
         GGML_OP_STEP,
         GGML_OP_RELU,
         GGML_OP_GELU,
+        GGML_OP_GELU_QUICK,
         GGML_OP_SILU,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
@@ -331,12 +332,15 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_1S,
-        GGML_OP_CONV_1D_2S,
+        GGML_OP_CONV_1D_S1_PH,
+        GGML_OP_CONV_1D_S2_PH,
+        GGML_OP_CONV_2D_SK_P0,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
+        GGML_OP_WIN_PART,
+        GGML_OP_WIN_UNPART,
 
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
@@ -500,8 +504,9 @@ extern "C" {
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
-    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
-    GGML_API size_t ggml_get_mem_size(struct ggml_context * ctx);
+    GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
+    GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
@@ -556,8 +561,8 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API const char * ggml_get_name(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
 
     //
     // operations on tensors with backpropagation
@@ -610,24 +615,47 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_sub_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_mul(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_mul_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_div(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_div_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     GGML_API struct ggml_tensor * ggml_sqr(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_sqr_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_sqrt(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_sqrt_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_log(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
@@ -667,31 +695,67 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_abs_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_sgn(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_sgn_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_neg(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_neg_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_step(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_step_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_relu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // TODO: double-check this computation is correct
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_gelu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_silu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_silu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_silu_back(
@@ -705,10 +769,18 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -998,16 +1070,55 @@ extern "C" {
             float min,
             float max);
 
-    //
+    // TODO: implement general-purpose convolutions
+    // GGML_API struct ggml_tensor * ggml_conv_1d(
+    //        struct ggml_context * ctx,
+    //        struct ggml_tensor * a,
+    //        struct ggml_tensor * b,
+    //        int s0
+    //        int p0,
+    //        int d0);
+    //
+    // GGML_API struct ggml_tensor * ggml_conv_2d(
+    //        struct ggml_context * ctx,
+    //        struct ggml_tensor * a,
+    //        struct ggml_tensor * b,
+    //        int s0,
+    //        int s1,
+    //        int p0,
+    //        int p1,
+    //        int d0,
+    //        int d1);
+
+    // padding = half
     // TODO: we don't support extra parameters for now
     // that's why we are hard-coding the stride, padding, and dilation
     // not great ..
-    GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+    // example:
+    // a: 3 80 768 1
+    // b: 3000 80 1 1
+    // res: 3000 768 1 1
+    // used in whisper
+    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+    // used in whisper
+    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a: 16 16 3 768
+    // b: 1024 1024 3 1
+    // res: 64 64 768 1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
@@ -1035,6 +1146,26 @@ extern "C" {
             struct ggml_tensor * c0,
             struct ggml_tensor * c1);
 
+    // partition into non-overlapping windows with padding if needed
+    // example:
+    // a:   768 64 64 1
+    // w:   14
+    // res: 768 14 14 25
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_win_part(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int w);
+
+    // reverse of ggml_win_part
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_win_unpart(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int w0,
+            int h0,
+            int w);
+
     // Mapping operations
     typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
     typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
```
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -19,6 +19,11 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_K_QUANTS
+#ifndef QK_K
+#define QK_K 256
+#endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -40,6 +45,10 @@
 #include <sstream>
 #include <numeric>
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -882,6 +891,7 @@ static bool kv_cache_init(
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.n = 0;
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -900,6 +910,7 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");
 
+    (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
@@ -914,21 +925,21 @@
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
+        /*.seed =*/ -1,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ {0},
+        /*.progress_callback =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
-        /*.seed =*/ -1,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
-        /*.progress_callback =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
     };
 
     return result;
@@ -1249,7 +1260,7 @@ static void llama_model_load_internal(
             vram_scratch = n_batch * MB;
             ggml_cuda_set_scratch_size(vram_scratch);
             if (n_gpu_layers > 0) {
-                fprintf(stderr, "%s: allocating batch_size x 1 MB = %
+                fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
                         __func__, vram_scratch / MB);
             }
         }
@@ -1609,7 +1620,7 @@ static bool llama_eval_internal(
                     model.layers[il].w1,
                     cur);
             offload_func(cur);
-            ggml_set_name(cur, "
+            ggml_set_name(cur, "result_w1");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
@@ -1646,15 +1657,11 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);
             offload_func_nr(cur);
-            ggml_set_name(cur, "
-
-            cur = ggml_rms_norm(ctx0, cur);
-            offload_func_nr(cur);
-            ggml_set_name(cur, "rms_norm_after");
+            ggml_set_name(cur, "rms_norm_2");
 
             // cur = cur*norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.norm);
-            offload_func_nr(cur);
+            // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
             ggml_set_name(cur, "result_norm");
 
             embeddings = cur;
@@ -2485,8 +2492,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
+                    fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+                    fprintf(stderr, "========================================================================================\n\n");
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
             if (tensor.name == "output.weight") {
-                new_type = GGML_TYPE_Q6_K;
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K == 0 && ny % QK_K == 0) {
+                    new_type = GGML_TYPE_Q6_K;
+                }
             } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -2690,16 +2712,21 @@ struct llama_context * llama_init_from_file(
         // this allocates all Metal resources and memory buffers
         ctx->ctx_metal = ggml_metal_init();
 
-        void *data_ptr
+        void * data_ptr = NULL;
         size_t data_size = 0;
+
         if (params.use_mmap) {
-            data_ptr
-            data_size= ctx->model.mapping->size;
+            data_ptr  = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
         } else {
-            data_ptr
-            data_size= ggml_get_mem_size(ctx->model.ctx);
+            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size  (ctx->model.ctx);
         }
 
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
 #define LLAMA_METAL_CHECK_BUF(result) \
         if (!(result)) { \
             fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2707,12 +2734,13 @@ struct llama_context * llama_init_from_file(
             return NULL; \
         }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
 #undef LLAMA_METAL_CHECK_BUF
         }
 #endif
@@ -3098,9 +3126,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
     if (kv_size) {
         const size_t elt_size = ggml_element_size(kv_self.k);
 
-
-
-        ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
         gf.n_threads = 1;
 
@@ -3206,9 +3232,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     const size_t elt_size = ggml_element_size(kv_self.k);
 
-
-
-    ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
+    ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
     ggml_cgraph gf{};
     gf.n_threads = 1;
 
@@ -3443,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token
-
-    fprintf(stderr, "%s:
+    fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+    fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
     fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
 
```
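The quantization hunk above makes the k-quant formats conditional on tensor shape: both of the first two dimensions must be divisible by QK_K (256 unless overridden), and output.weight is only promoted to GGML_TYPE_Q6_K when that holds. Below is a small stand-alone sketch of the same divisibility rule; the helper name is hypothetical and the code is not taken from llama.cpp:

```cpp
#include <cstdint>
#include <cstdio>

// Mirror of the default k-quant super-block size (see the QK_K define above).
static const int kQkK = 256;

// True when a 2-D weight of shape nx x ny can be k-quantized,
// i.e. both dimensions are multiples of the super-block size.
static bool can_use_k_quants(int64_t nx, int64_t ny) {
    return nx % kQkK == 0 && ny % kQkK == 0;
}

int main() {
    // LLaMA-7B output.weight is 4096 x 32000: both multiples of 256 -> eligible for Q6_K.
    printf("4096 x 32000 -> %s\n", can_use_k_quants(4096, 32000) ? "ok" : "not divisible");
    // A hypothetical 4096 x 32001 tensor would be rejected with the new error message.
    printf("4096 x 32001 -> %s\n", can_use_k_quants(4096, 32001) ? "ok" : "not divisible");
    return 0;
}
```

Models whose dimensions fail this check now stop early with the error message added above rather than proceeding with an unsupported k-quant layout.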
data/ext/llama_cpp/src/llama.h
CHANGED
```diff
@@ -71,28 +71,27 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-
+    struct llama_context_params {
+        int seed; // RNG seed, -1 for random
         int n_ctx; // text context
         int n_batch; // prompt processing batch size
         int n_gpu_layers; // number of layers to store in VRAM
         int main_gpu; // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-
-
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
 
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram; // if true, reduce VRAM usage at the cost of performance
         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap; // use mmap if possible
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
```
data/lib/llama_cpp/version.rb
CHANGED
```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.2.1'
+  VERSION = '0.2.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-7487137'
 end
```
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-
+date: 2023-06-23 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -24,6 +24,7 @@ files:
 - README.md
 - examples/README.md
 - examples/chat.rb
+- examples/embedding.rb
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
```