llama_cpp 0.3.3 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +146 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +485 -67
- data/ext/llama_cpp/src/ggml-metal.m +52 -43
- data/ext/llama_cpp/src/ggml-metal.metal +587 -470
- data/ext/llama_cpp/src/ggml.c +105 -79
- data/ext/llama_cpp/src/ggml.h +13 -1
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +123 -66
- data/ext/llama_cpp/src/llama.h +34 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +12 -1
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-
-        {
-        {
-        {
-        {
+        /* empirical scaling, still a guess */
+        { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
     };
     return k_sizes;
 }
@@ -140,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
+        { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
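For a sense of scale, the two tables above now grow with the context size instead of being fixed per model (the values are in MB). The standalone sketch below just replays the MODEL_7B rows for a hypothetical n_ctx of 2048, a value chosen purely for illustration and not fixed anywhere in this release:

    // Replays the 7B rows of MEM_REQ_SCRATCH0 / MEM_REQ_EVAL from the diff above.
    // The n_ctx value is hypothetical.
    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t MB    = 1024 * 1024;
        const std::size_t n_ctx = 2048;

        const std::size_t scratch0_7b = (n_ctx / 16  + 256) * MB; // MEM_REQ_SCRATCH0, MODEL_7B
        const std::size_t eval_7b     = (n_ctx / 256 + 768) * MB; // MEM_REQ_EVAL, MODEL_7B

        std::printf("scratch0(7B): %zu MB\n", scratch0_7b / MB); // 384 MB
        std::printf("eval(7B):     %zu MB\n", eval_7b / MB);     // 776 MB
        return 0;
    }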
@@ -189,6 +190,10 @@ struct llama_hparams {
     uint32_t n_head = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot = 64;
+
+    float rope_freq_base = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -303,7 +308,7 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
@@ -324,7 +329,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
 
     const llama_model & model;
-    const llama_vocab & vocab;
 
     bool model_owner = false;
 
@@ -551,7 +555,9 @@ struct llama_file_loader {
         }
 
         // skip to the next multiple of 32 bytes
-
+        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        }
 
         tensor.file_off = file.tell();
         tensor.name = name;
@@ -648,7 +654,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }
 
@@ -843,7 +849,9 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
-        /*.tensor_split =*/
+        /*.tensor_split =*/ nullptr,
+        /*.rope_freq_base =*/ 10000.0f,
+        /*.rope_freq_scale =*/ 1.0f,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
@@ -869,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -967,6 +979,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1001,22 +1015,27 @@ static void llama_model_load_internal(
         }
 
         hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
-        fprintf(stderr, "%s: format = %s\n",
-        fprintf(stderr, "%s: n_vocab = %u\n",
-        fprintf(stderr, "%s: n_ctx = %u\n",
-        fprintf(stderr, "%s: n_embd = %u\n",
-        fprintf(stderr, "%s: n_mult = %u\n",
-        fprintf(stderr, "%s: n_head = %u\n",
-        fprintf(stderr, "%s: n_layer = %u\n",
-        fprintf(stderr, "%s: n_rot = %u\n",
+        fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff = %u\n",
-        fprintf(stderr, "%s: model size = %s\n",
+        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }
 
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1165,9 +1184,9 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1270,7 +1289,9 @@ static bool llama_model_load(
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1279,7 +1300,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1331,6 +1352,9 @@ static bool llama_eval_internal(
     const int n_rot = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const float freq_base = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
 
@@ -1428,11 +1452,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur =
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur =
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
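The K/Q rotations above now go through ggml_rope_custom_inplace, fed by the new rope_freq_base / rope_freq_scale hyperparameters (llama.cpp PR #2054, referenced in llama.h below). As a hedged reading of what those two knobs control, the sketch below computes the conventional scaled-RoPE rotation angle; that ggml_rope_custom_inplace uses exactly this convention is an assumption here, not something the diff itself shows:

    // Assumed convention: the token position is multiplied by freq_scale and the
    // per-dimension frequency is derived from freq_base (stock RoPE: 10000, 1.0).
    #include <cmath>
    #include <cstdio>

    // angle applied to dimension pair i (0 <= i < d/2) at token position p
    static float rope_theta(int p, int i, int d, float freq_base, float freq_scale) {
        return freq_scale * (float) p * std::pow(freq_base, -2.0f * (float) i / (float) d);
    }

    int main() {
        std::printf("%.1f\n", rope_theta(10, 0, 128, 10000.0f, 1.0f)); // 10.0 (defaults: stock RoPE)
        std::printf("%.1f\n", rope_theta(10, 0, 128, 10000.0f, 0.5f)); // 5.0 (positions compressed 2x)
        return 0;
    }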
@@ -2006,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Normalize the second derivatives
-
-
-
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
 
     float cum_sum = 0.0f;
@@ -2185,9 +2218,8 @@ void llama_sample_classifier_free_guidance(
         struct llama_context * ctx,
         llama_token_data_array * candidates,
         struct llama_context * guidance_ctx,
-        float scale
-
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+        float scale) {
+    int64_t t_start_sample_us = ggml_time_us();
 
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
@@ -2207,16 +2239,7 @@ void llama_sample_classifier_free_guidance(
     for (int i = 0; i < n_vocab; ++i) {
         float logit_guidance = logits_guidance[i];
         float logit_base = logits_base[i];
-
-    }
-
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_base = logits_base[i];
-        float logit_guidance = logits_guidance[i];
-
-        candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
     }
 
     if (ctx) {
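The rewritten sampling loop above replaces the old two-pass smooth_factor blend with the usual classifier-free-guidance combination, logit_guidance + scale * (logit_base - logit_guidance): scale = 1.0 returns the base logits unchanged, and larger values extrapolate away from the guidance (negative-prompt) distribution, which is why the smooth_factor argument disappears from the public API (see llama.h below). A toy numeric check with made-up logit values:

    #include <cstdio>
    #include <initializer_list>

    int main() {
        const float logit_base     = 2.0f; // from the main context
        const float logit_guidance = 0.5f; // from the guidance (negative-prompt) context
        for (float scale : {1.0f, 1.5f, 2.0f}) {
            // same expression as the updated llama_sample_classifier_free_guidance loop
            const float blended = scale * (logit_base - logit_guidance) + logit_guidance;
            std::printf("scale = %.1f -> %.2f\n", scale, blended); // 2.00, 2.75, 3.50
        }
        return 0;
    }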
@@ -2675,8 +2698,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.
-                params.
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2697,7 +2721,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
-    llama_context * ctx = new llama_context(*model
+    llama_context * ctx = new llama_context(*model);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -2751,9 +2775,9 @@ struct llama_context * llama_new_context_with_model(
         ctx->embedding.resize(hparams.n_embd);
     }
 
-    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+    ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
 
-    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+    ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
 }
 
@@ -3535,13 +3559,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }
 
-int
-
+int llama_tokenize_with_model(
+        const struct llama_model * model,
         const char * text,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize(
+    auto res = llama_tokenize(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3555,8 +3579,29 @@ int llama_tokenize(
     return res.size();
 }
 
+int llama_tokenize(
+        struct llama_context * ctx,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3567,19 +3612,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
-int
-        const struct
+int llama_get_vocab_from_model(
+        const struct llama_model * model,
         const char * * strings,
         float * scores,
         int capacity) {
-    int n = std::min(capacity, (int)
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i<n; ++i) {
-        strings[i] =
-        scores[i] =
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i] = model->vocab.id_to_token[i].score;
     }
     return n;
 }
 
+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
@@ -3588,12 +3641,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char *
-    if (token >=
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
 
-    return
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }
 
 llama_token llama_token_bos() {
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -88,7 +88,13 @@ extern "C" {
         int32_t n_batch; // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu; // the GPU that is used for scratch and small tensors
-
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base; // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
@@ -148,6 +154,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -270,10 +278,21 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);
 
+    LLAMA_API int llama_tokenize_with_model(
+            const struct llama_model * model,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -282,6 +301,12 @@ extern "C" {
             float * scores,
             int capacity);
 
+    LLAMA_API int llama_get_vocab_from_model(
+            const struct llama_model * model,
+            const char * * strings,
+            float * scores,
+            int capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -294,7 +319,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+            llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+            const struct llama_model * model,
+            llama_token token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
@@ -313,13 +344,11 @@ extern "C" {
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
     /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
     LLAMA_API void llama_sample_classifier_free_guidance(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
             struct llama_context * guidance_ctx,
-            float scale
-            float smooth_factor);
+            float scale);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
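Taken together, the header changes add per-context RoPE tuning and a family of model-level accessors that work without a llama_context. A minimal usage sketch in C++ follows; the model path and parameter values are placeholders, error handling is skipped, and the loading/cleanup calls (llama_load_model_from_file, llama_free_model) are pre-existing entry points from the same header rather than part of this release:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    int main() {
        std::printf("max devices: %d\n", llama_max_devices());

        llama_context_params params = llama_context_default_params();
        params.n_ctx           = 4096;
        params.rope_freq_base  = 10000.0f; // default RoPE base frequency
        params.rope_freq_scale = 0.5f;     // illustrative linear scaling for a longer context

        llama_model * model = llama_load_model_from_file("model.bin", params); // placeholder path
        if (model == nullptr) {
            return 1;
        }

        // model-level accessors: no llama_context required
        std::vector<llama_token> tokens(64);
        const int n = llama_tokenize_with_model(model, "Hello world", tokens.data(),
                                                (int) tokens.size(), /*add_bos=*/true);
        for (int i = 0; i < n; ++i) {
            std::printf("%6d -> '%s'\n", (int) tokens[i], llama_token_to_str_with_model(model, tokens[i]));
        }
        std::printf("n_vocab = %d, n_ctx = %d, n_embd = %d\n",
                    llama_n_vocab_from_model(model),
                    llama_n_ctx_from_model(model),
                    llama_n_embd_from_model(model));

        llama_free_model(model);
        return 0;
    }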
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.3'
+  VERSION = '0.3.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-d924522'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -39,6 +39,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer
 
   class TokenData
     public
@@ -69,6 +70,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end
 
   class Timings
@@ -109,7 +116,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
-    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -140,6 +147,10 @@ module LLaMACpp
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
+    def rope_freq_base=: (Float) -> Float
+    def rope_freq_base: () -> Float
+    def rope_freq_scale=: (Float) -> Float
+    def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
     def seed: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-
+date: 2023-07-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: