llama_cpp 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/ext/llama_cpp/llama_cpp.cpp +146 -9
- data/ext/llama_cpp/src/ggml-cuda.cu +485 -67
- data/ext/llama_cpp/src/ggml-metal.m +52 -43
- data/ext/llama_cpp/src/ggml-metal.metal +587 -470
- data/ext/llama_cpp/src/ggml.c +105 -79
- data/ext/llama_cpp/src/ggml.h +13 -1
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +123 -66
- data/ext/llama_cpp/src/llama.h +34 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -0
- data/sig/llama_cpp.rbs +12 -1
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //

-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-
-        {
-        {
-        {
-        {
+        /* empirical scaling, still a guess */
+        { MODEL_3B,  ((size_t) n_ctx / 16ull + 128ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 16ull + 256ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
     };
     return k_sizes;
 }
@@ -140,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()

 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,
-        { MODEL_7B,
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull + 512ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull + 768ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
```
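Both scratch-size tables now grow with the context length instead of returning fixed constants. A standalone sketch of the arithmetic for the 7B rows above (helper names are illustrative only; llama.cpp multiplies these values by its `MB` constant of 1024*1024 bytes):

```cpp
#include <cstdio>

// Mirrors the 7B formulas from the two hunks above (sketch only), in MB.
static unsigned long long scratch0_7b_mb(unsigned long long n_ctx) { return n_ctx / 16 + 256; }
static unsigned long long eval_7b_mb    (unsigned long long n_ctx) { return n_ctx / 256 + 768; }

int main() {
    const unsigned long long ctxs[] = {512, 2048, 8192};
    for (unsigned long long n_ctx : ctxs) {
        // e.g. n_ctx = 2048 -> scratch0 = 384 MB, eval buffer = 776 MB
        std::printf("n_ctx=%5llu  scratch0=%4llu MB  eval=%4llu MB\n",
                    n_ctx, scratch0_7b_mb(n_ctx), eval_7b_mb(n_ctx));
    }
    return 0;
}
```

The same `n_ctx` is threaded through `llama_model_load_internal` and `llama_new_context_with_model` in the later hunks.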
```diff
@@ -189,6 +190,10 @@ struct llama_hparams {
     uint32_t n_head = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot = 64;
+
+    float rope_freq_base = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
@@ -303,7 +308,7 @@ struct llama_model {
 };

 struct llama_context {
-    llama_context(const llama_model & model
+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 #ifdef GGML_USE_METAL
     ~llama_context() {
         if (ctx_metal) {
@@ -324,7 +329,6 @@ struct llama_context {
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

     const llama_model & model;
-    const llama_vocab & vocab;

     bool model_owner = false;

@@ -551,7 +555,9 @@ struct llama_file_loader {
         }

         // skip to the next multiple of 32 bytes
-
+        if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        }

         tensor.file_off = file.tell();
         tensor.name = name;
```
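The seek expression relies on `-offset & 31` being the number of bytes needed to pad `offset` up to the next multiple of 32 (and 0 when it is already aligned). A tiny self-contained check of that bit trick, separate from the file I/O:

```cpp
#include <cstdio>

// Same "-off & 31" trick as the guarded seek above: padding to the next 32-byte boundary.
static long long pad_to_32(long long off) {
    return -off & 31;
}

int main() {
    const long long offsets[] = {0, 1, 31, 32, 33, 100};
    for (long long off : offsets) {
        // e.g. 100 needs 28 bytes, because 100 + 28 = 128 is a multiple of 32
        std::printf("offset %3lld -> pad %2lld\n", off, pad_to_32(off));
    }
    return 0;
}
```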
```diff
@@ -648,7 +654,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }

@@ -843,7 +849,9 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
-        /*.tensor_split =*/
+        /*.tensor_split =*/ nullptr,
+        /*.rope_freq_base =*/ 10000.0f,
+        /*.rope_freq_scale =*/ 1.0f,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
@@ -869,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }

+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
```
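The new `llama_max_devices()` simply exposes the compile-time `LLAMA_MAX_DEVICES` constant, which is also the expected length of the `tensor_split` array declared in the llama.h hunks further down. A minimal usage sketch (the all-on-one-device split is just a placeholder):

```cpp
#include <vector>
#include "llama.h"

int main() {
    // tensor_split is expected to hold llama_max_devices() ratios.
    std::vector<float> split(llama_max_devices(), 0.0f);
    if (!split.empty()) {
        split[0] = 1.0f; // placeholder: keep everything on the first device
    }

    llama_context_params params = llama_context_default_params();
    params.tensor_split = split.data();
    // params can now be passed to llama_load_model_from_file() as usual.
    return 0;
}
```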
```diff
@@ -967,6 +979,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1001,22 +1015,27 @@ static void llama_model_load_internal(
     }

         hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }

     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

     {
-        fprintf(stderr, "%s: format = %s\n",
-        fprintf(stderr, "%s: n_vocab = %u\n",
-        fprintf(stderr, "%s: n_ctx = %u\n",
-        fprintf(stderr, "%s: n_embd = %u\n",
-        fprintf(stderr, "%s: n_mult = %u\n",
-        fprintf(stderr, "%s: n_head = %u\n",
-        fprintf(stderr, "%s: n_layer = %u\n",
-        fprintf(stderr, "%s: n_rot = %u\n",
+        fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff = %u\n",
-        fprintf(stderr, "%s: model size = %s\n",
+        fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
+        fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
     }

     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1165,9 +1184,9 @@ static void llama_model_load_internal(
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);

         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1270,7 +1289,9 @@ static bool llama_model_load(
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1279,7 +1300,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1331,6 +1352,9 @@ static bool llama_eval_internal(
     const int n_rot = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;

+    const float freq_base = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;

@@ -1428,11 +1452,11 @@ static bool llama_eval_internal(
                 offload_func_kq(tmpq);
                 ggml_set_name(tmpq, "tmpq");

-                struct ggml_tensor * Kcur =
+                struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
                 offload_func_kq(Kcur);
                 ggml_set_name(Kcur, "Kcur");

-                struct ggml_tensor * Qcur =
+                struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
                 offload_func_kq(Qcur);
                 ggml_set_name(Qcur, "Qcur");

```
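Kcur and Qcur now go through `ggml_rope_custom_inplace` so the per-model `freq_base` and `freq_scale` reach the rotary embedding. Below is a rough sketch of how the two parameters are conventionally interpreted (linear position scaling with a configurable base); the authoritative computation is ggml's rope kernel, not this snippet:

```cpp
#include <cmath>
#include <cstdio>

// Conventional reading of the two RoPE parameters: positions are multiplied by
// freq_scale and the per-dimension frequency is derived from freq_base.
// This is an illustrative approximation, not ggml's actual kernel.
static float rope_angle(int pos, int i, int n_dims, float freq_base, float freq_scale) {
    const float scaled_pos = freq_scale * (float) pos;
    const float inv_freq   = std::pow(freq_base, -2.0f * (float) i / (float) n_dims);
    return scaled_pos * inv_freq;
}

int main() {
    // With freq_scale = 0.5f, position 4096 gets the same angle as position 2048
    // does with the default scale, which is the usual way to stretch the context.
    std::printf("pos 4096, scale 1.0: %.1f\n", rope_angle(4096, 0, 128, 10000.0f, 1.0f));
    std::printf("pos 4096, scale 0.5: %.1f\n", rope_angle(4096, 0, 128, 10000.0f, 0.5f));
    std::printf("pos 2048, scale 1.0: %.1f\n", rope_angle(2048, 0, 128, 10000.0f, 1.0f));
    return 0;
}
```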
```diff
@@ -2006,9 +2030,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }

     // Normalize the second derivatives
-
-
-
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }

     float cum_sum = 0.0f;
@@ -2185,9 +2218,8 @@ void llama_sample_classifier_free_guidance(
         struct llama_context * ctx,
         llama_token_data_array * candidates,
         struct llama_context * guidance_ctx,
-        float scale
-
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+        float scale) {
+    int64_t t_start_sample_us = ggml_time_us();

     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
@@ -2207,16 +2239,7 @@ void llama_sample_classifier_free_guidance(
     for (int i = 0; i < n_vocab; ++i) {
         float logit_guidance = logits_guidance[i];
         float logit_base = logits_base[i];
-
-    }
-
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_base = logits_base[i];
-        float logit_guidance = logits_guidance[i];
-
-        candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
+        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
     }

     if (ctx) {
```
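The classifier-free-guidance blend is reduced to a single pass: `scale * (logit_base - logit_guidance) + logit_guidance`, so `scale = 1.0` leaves the base logits untouched and larger values extrapolate away from the guidance (negative-prompt) distribution. A quick numeric check of the new formula with made-up logits:

```cpp
#include <cstdio>

// The blend from the hunk above.
static float cfg_blend(float logit_base, float logit_guidance, float scale) {
    return scale * (logit_base - logit_guidance) + logit_guidance;
}

int main() {
    const float base = 2.0f, guidance = 0.5f; // made-up values
    std::printf("scale 1.0 -> %.2f\n", cfg_blend(base, guidance, 1.0f)); // 2.00, unchanged
    std::printf("scale 1.5 -> %.2f\n", cfg_blend(base, guidance, 1.5f)); // 2.75
    std::printf("scale 2.0 -> %.2f\n", cfg_blend(base, guidance, 2.0f)); // 3.50
    return 0;
}
```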
```diff
@@ -2675,8 +2698,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.
-                params.
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2697,7 +2721,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }

-    llama_context * ctx = new llama_context(*model
+    llama_context * ctx = new llama_context(*model);

     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -2751,9 +2775,9 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }

-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));

-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

@@ -3535,13 +3559,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     return 0;
 }

-int
-
+int llama_tokenize_with_model(
+        const struct llama_model * model,
         const char * text,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize(
+    auto res = llama_tokenize(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3555,8 +3579,29 @@ int llama_tokenize(
     return res.size();
 }

+int llama_tokenize(
+        struct llama_context * ctx,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->vocab.id_to_token.size();
+    return ctx->model.vocab.id_to_token.size();
 }

 int llama_n_ctx(const struct llama_context * ctx) {
@@ -3567,19 +3612,27 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }

-int
-        const struct
+int llama_get_vocab_from_model(
+        const struct llama_model * model,
         const char * * strings,
         float * scores,
         int capacity) {
-    int n = std::min(capacity, (int)
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
     for (int i = 0; i<n; ++i) {
-        strings[i] =
-        scores[i] =
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i] = model->vocab.id_to_token[i].score;
     }
     return n;
 }

+int llama_get_vocab(
+        const struct llama_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }
@@ -3588,12 +3641,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }

-const char *
-    if (token >=
+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+    if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }

-    return
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    return llama_token_to_str_with_model(&ctx->model, token);
 }

 llama_token llama_token_bos() {
```
data/ext/llama_cpp/src/llama.h
CHANGED
```diff
@@ -88,7 +88,13 @@ extern "C" {
         int32_t n_batch; // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu; // the GPU that is used for scratch and small tensors
-
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base; // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
```
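A minimal sketch of setting the new RoPE fields through the public C API (placeholder model path, error handling trimmed; `rope_freq_scale = 0.5f` is only an example value for running at a longer context than the model was trained for):

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();

    params.n_ctx           = 4096;     // larger context window
    params.rope_freq_base  = 10000.0f; // default base frequency
    params.rope_freq_scale = 0.5f;     // compress positions (linear RoPE scaling)

    llama_model * model = llama_load_model_from_file("model.bin", params); // placeholder path
    if (model == NULL) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, params);
    // ... evaluate and sample as usual ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```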
```diff
@@ -148,6 +154,8 @@ extern "C" {
         int32_t n_eval;
     };

+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -270,10 +278,21 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);

+    LLAMA_API int llama_tokenize_with_model(
+            const struct llama_model * model,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -282,6 +301,12 @@ extern "C" {
             float * scores,
             int capacity);

+    LLAMA_API int llama_get_vocab_from_model(
+            const struct llama_model * model,
+            const char * * strings,
+            float * scores,
+            int capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -294,7 +319,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API const char * llama_token_to_str(
+            const struct llama_context * ctx,
+            llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+            const struct llama_model * model,
+            llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
```
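The `*_with_model` variants let callers inspect the vocabulary and tokenize text from a bare `llama_model`, without creating a `llama_context` first. A short sketch under the same placeholder assumptions as above:

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    llama_model * model = llama_load_model_from_file("model.bin", params); // placeholder path
    if (model == NULL) {
        return 1;
    }

    std::printf("n_vocab=%d n_ctx=%d n_embd=%d\n",
                llama_n_vocab_from_model(model),
                llama_n_ctx_from_model(model),
                llama_n_embd_from_model(model));

    // Tokenize directly against the model's vocabulary.
    std::vector<llama_token> tokens(64);
    const int n = llama_tokenize_with_model(model, "Hello world", tokens.data(),
                                            (int) tokens.size(), /*add_bos=*/ true);
    for (int i = 0; i < n; ++i) {
        std::printf("%6d -> %s\n", tokens[i], llama_token_to_str_with_model(model, tokens[i]));
    }

    llama_free_model(model);
    return 0;
}
```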
```diff
@@ -313,13 +344,11 @@ extern "C" {
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
     /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
     /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
     LLAMA_API void llama_sample_classifier_free_guidance(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
             struct llama_context * guidance_ctx,
-            float scale
-            float smooth_factor);
+            float scale);

     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
```
data/lib/llama_cpp/version.rb
CHANGED
```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.3'
+  VERSION = '0.3.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-d924522'
 end
```
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
```diff
@@ -39,6 +39,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer

   class TokenData
     public
@@ -69,6 +70,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end

   class Timings
@@ -109,7 +116,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
-    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -140,6 +147,10 @@ module LLaMACpp
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
+    def rope_freq_base=: (Float) -> Float
+    def rope_freq_base: () -> Float
+    def rope_freq_scale=: (Float) -> Float
+    def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
     def seed: () -> Integer
```
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-
+date: 2023-07-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
```