llama_cpp 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +13 -50
- data/ext/llama_cpp/src/ggml-cuda.cu +23 -11
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +130 -61
- data/ext/llama_cpp/src/ggml-metal.metal +44 -26
- data/ext/llama_cpp/src/ggml.c +637 -328
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +2 -2
- data/ext/llama_cpp/src/llama.cpp +426 -97
- data/ext/llama_cpp/src/llama.h +51 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,9 +1,6 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
 #endif
 
 #include "llama.h"
@@ -62,6 +59,9 @@
 #include <cinttypes>
 #include <climits>
 #include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
@@ -114,12 +114,17 @@ static size_t utf8_len(char src) {
 }
 
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-
-
-
-
-
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
     }
+    s = std::move(result);
 }
 
 static void zeros(std::ofstream & file, size_t n) {
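
Note on the hunk above: replace_all now builds the result string in a single pass over s and then moves it back into place. A quick illustrative check of the behaviour that find_bpe_rank (further down in this diff) relies on:

    std::string s = "a b c";
    replace_all(s, " ", "\u0120");   // every space becomes U+0120, the GPT-2 "Ġ" marker
    // s == "a\u0120b\u0120c"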
@@ -796,12 +801,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens =
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check =
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -955,10 +960,10 @@ struct llama_vocab {
     id linefeed_id = 13;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left, " ", "
-        replace_all(token_left, "\n", "
-        replace_all(token_right, " ", "
-        replace_all(token_right, "\n", "
+        replace_all(token_left, " ", "\u0120");
+        replace_all(token_left, "\n", "\u010A");
+        replace_all(token_right, " ", "\u0120");
+        replace_all(token_right, "\n", "\u010A");
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -1144,11 +1149,13 @@ static bool llama_kv_cache_init(
 
 enum llama_fver {
     GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
 };
 
 static const char * llama_file_version_name(llama_fver version) {
     switch (version) {
-        case GGUF_FILE_VERSION_V1: return "GGUF V1 (
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
     }
 
     return "unknown";
@@ -1635,7 +1642,8 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
         llama_model_loader & ml,
@@ -1737,7 +1745,11 @@ static void llm_load_vocab(
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-
+    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+    } else {
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+    }
 
     // special tokens
     GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -2635,18 +2647,20 @@ static struct ggml_cgraph * llm_build_falcon(
 
         const size_t wsize = ggml_type_size(cur->type);
 
-
+        // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+        // non-contiguous views is added for the rope operator
+        struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
             ctx0, cur, n_embd_head, n_head, N,
             wsize * n_embd_head,
             wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            0);
+            0));
         offload_func_kq(tmpq);
 
-        struct ggml_tensor * tmpk = ggml_view_3d(
+        struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
             ctx0, cur, n_embd_head, n_head_kv, N,
             wsize * n_embd_head,
             wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            wsize * n_embd_head * n_head);
+            wsize * n_embd_head * n_head));
         offload_func_kq(tmpk);
 
         struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2831,7 +2845,6 @@ static bool llama_eval_internal(
 
     GGML_ASSERT(n_tokens > 0);
     GGML_ASSERT(n_past >= 0);
-    GGML_ASSERT(n_threads > 0);
     // TODO: keep the values of n_batch and n_ctx
     // GGML_ASSERT(n_tokens <= n_batch);
     // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2842,6 +2855,8 @@ static bool llama_eval_internal(
     ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif
 
+    GGML_ASSERT(n_threads > 0);
+
     const int N = n_tokens;
 
     const auto & model = lctx.model;
@@ -3026,16 +3041,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     return vocab.token_to_id.at(buf);
 }
 
-static
-
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
-    return result;
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
 }
 
 static void llama_unescape_whitespace(std::string & word) {
@@ -3204,7 +3211,7 @@ private:
 
 struct llm_bigram_bpe {
     struct comparator {
-        bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
             return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
         }
     };
@@ -3219,7 +3226,7 @@ struct llm_bigram_bpe {
 };
 
 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
@@ -3352,26 +3359,23 @@ private:
     }
 
     // probably not 100% correct
-
-    static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+    static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
         std::vector<std::string> words;
 
         // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
         const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
         const std::regex re(pattern);
-        std::smatch m;
 
-
-
-
-
-
+        auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+        auto words_end = std::sregex_iterator();
+        auto n_words = std::distance(words_begin, words_end);
+        words.reserve(n_words);
+        for (auto it = words_begin; it != words_end; ++it) {
+            words.push_back(it->str());
         }
-
         return words;
-    }
 
-
+    }
 
     const llama_vocab & vocab;
 
@@ -3381,9 +3385,18 @@ private:
     llm_bigram_bpe::queue work_queue;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
     std::vector<llama_vocab::id> output;
 
+    // OG tokenizer behavior:
+    //
+    // tokenizer.encode('', add_bos=True) returns [1]
+    // tokenizer.encode('', add_bos=False) returns []
+
+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
     if (raw_text.empty()) {
         return output;
     }
@@ -3391,29 +3404,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-
+                // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                raw_text = " " + raw_text;
 
-
-
-
-
-                std::string text;
-                if (escape) {
-                    text = llama_escape_whitespace(raw_text);
-                } else {
-                    text = raw_text;
-                }
-
-                tokenizer.tokenize(text, output);
+                llm_tokenizer_spm tokenizer(vocab);
+                llama_escape_whitespace(raw_text);
+                tokenizer.tokenize(raw_text, output);
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab
-
-                if (bos && vocab.special_bos_id != -1) {
-                    output.push_back(vocab.special_bos_id);
-                }
-
+                llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
     };
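
Note on the two tokenizer hunks above: BOS insertion now happens inside llama_tokenize_internal for both the SPM and BPE vocabularies, matching the original tokenizer behaviour documented in the added comment (encoding an empty string with add_bos=true yields exactly one token). A hypothetical check through the public C API of this version (signature assumed from the llama.h shipped with 0.5.0):

    std::vector<llama_token> tokens(8);
    const int n = llama_tokenize(ctx, "", tokens.data(), tokens.size(), /*add_bos*/ true);
    // for a SPM (LLaMA) vocab: n == 1 and tokens[0] == llama_token_bos(ctx)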
@@ -3908,7 +3908,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
 
     // Calculate absolute value of second derivatives
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = abs(second_derivatives[i]);
+        second_derivatives[i] = std::abs(second_derivatives[i]);
     }
 
     // Normalize the second derivatives
@@ -4099,16 +4099,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     std::vector<llama_grammar_candidate> candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id
-        const std::string
+        const llama_token id = candidates->data[i].id;
+        const std::string piece = llama_token_to_str(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
             }
-        } else if (
+        } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(
+            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -4312,10 +4312,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string
+    const std::string piece = llama_token_to_str(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(
+    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4326,6 +4326,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
+//
+// Beam search
+//
+
+struct llama_beam {
+    std::vector<llama_token> tokens;
+    float p; // Cumulative beam probability (renormalized relative to all beams)
+    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+    // Sort beams by probability. In case of ties, prefer beams at eob.
+    bool operator<(const llama_beam & rhs) const {
+        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+    }
+    // Shift off first n tokens and discard them.
+    void shift_tokens(const size_t n) {
+        if (n) {
+            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+            tokens.resize(tokens.size() - n);
+        }
+    }
+    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+};
+
+// A struct for calculating logit-related info.
+struct llama_logit_info {
+    const float * const logits;
+    const int n_vocab;
+    const float max_l;
+    const float normalizer;
+    struct sum_exp {
+        float max_l;
+        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+    };
+    llama_logit_info(llama_context * ctx)
+      : logits(llama_get_logits(ctx))
+      , n_vocab(llama_n_vocab(ctx))
+      , max_l(*std::max_element(logits, logits + n_vocab))
+      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+      { }
+    llama_token_data get_token_data(const llama_token token_id) const {
+        constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+        return {token_id, logits[token_id], p};
+    }
+    // Return top k token_data by logit.
+    std::vector<llama_token_data> top_k(size_t k) {
+        std::vector<llama_token_data> min_heap; // min-heap by logit
+        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+        min_heap.reserve(k_min);
+        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+            min_heap.push_back(get_token_data(token_id));
+        }
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+        std::make_heap(min_heap.begin(), min_heap.end(), comp);
+        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+            if (min_heap.front().logit < logits[token_id]) {
+                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+                min_heap.back().id = token_id;
+                min_heap.back().logit = logits[token_id];
+                std::push_heap(min_heap.begin(), min_heap.end(), comp);
+            }
+        }
+        return min_heap;
+    }
+    float probability_from_logit(float logit) {
+        return normalizer * std::exp(logit - max_l);
+    }
+};
+
+struct llama_beam_search_data {
+    llama_context * ctx;
+    size_t n_beams;
+    int n_past;
+    int n_predict;
+    int n_threads;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
+
+    // Re-calculated on each loop iteration
+    size_t common_prefix_length;
+
+    // Used to communicate to/from callback on beams state.
+    std::vector<llama_beam_view> beam_views;
+
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , n_beams(n_beams)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads)
+      , beam_views(n_beams) {
+        beams.reserve(n_beams);
+        next_beams.reserve(n_beams);
+    }
+
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(const size_t beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
+        }
+        beams.resize(1);
+    }
+
+    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+    // The repetative patterns below reflect the 2 stages of heaps:
+    //  * Gather elements until the vector is full, then call std::make_heap() on it.
+    //  * If the heap is full and a new element is found that should be included, pop the
+    //    least element to the back(), replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+        // Min-heaps use a greater-than comparator.
+        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+        if (beam.eob) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+            if (next_beams.size() < n_beams) {
+                next_beams.push_back(std::move(beam));
+                if (next_beams.size() == n_beams) {
+                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            } else if (next_beams.front().p < beam.p) {
+                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                next_beams.back() = std::move(beam);
+                std::push_heap(next_beams.begin(), next_beams.end(), comp);
+            }
+        } else {
+            // beam is not at end-of-sentence, so branch with next top_k tokens.
+            if (!beam.tokens.empty()) {
+                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+            }
+            llama_logit_info logit_info(ctx);
+            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+            size_t i=0;
+            if (next_beams.size() < n_beams) {
+                for (; next_beams.size() < n_beams ; ++i) {
+                    llama_beam next_beam = beam;
+                    next_beam.tokens.push_back(next_tokens[i].id);
+                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    next_beams.push_back(std::move(next_beam));
+                }
+                std::make_heap(next_beams.begin(), next_beams.end(), comp);
+            } else {
+                for (; next_beams.front().p == 0.0f ; ++i) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+            for (; i < n_beams ; ++i) {
+                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                if (next_beams.front().p < next_p) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p = next_p;
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+        }
+    }
+
+    // Find common_prefix_length based on beams.
+    // Requires beams is not empty.
+    size_t find_common_prefix_length() {
+        size_t common_prefix_length = beams[0].tokens.size();
+        for (size_t i = 1 ; i < beams.size() ; ++i) {
+            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+                if (beams[0].tokens[j] != beams[i].tokens[j]) {
+                    common_prefix_length = j;
+                    break;
+                }
+            }
+        }
+        return common_prefix_length;
+    }
+
+    // Construct beams_state to send back to caller via the callback function.
+    // Side effect: set common_prefix_length = find_common_prefix_length();
+    llama_beams_state get_beams_state(const bool last_call) {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beam_views[i] = beams[i].view();
+        }
+        common_prefix_length = find_common_prefix_length();
+        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+    }
+
+    // Loop:
+    //  * while i < n_predict, AND
+    //  * any of the beams have not yet reached end-of-beam (eob), AND
+    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+    //    (since all other beam probabilities can only decrease)
+    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+        beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
+        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+                       !beams[top_beam_index()].eob ; ++i) {
+            callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
+            update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
+            if (common_prefix_length) {
+                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                n_past += common_prefix_length;
+            }
+            // Zero-out next_beam probabilities to place them last in following min-heap.
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
+                beam.shift_tokens(common_prefix_length);
+                fill_next_beams_by_top_probabilities(beam);
+            }
+            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+            beams.swap(next_beams);
+            renormalize_beam_probabilities(beams);
+        }
+        collapse_beams(top_beam_index());
+        callback(callback_data, get_beams_state(true));
+    }
+
+    // As beams grow, the cumulative probabilities decrease.
+    // Renormalize them to avoid floating point underflow.
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+    }
+
+    // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+    size_t top_beam_index() {
+        return std::max_element(beams.begin(), beams.end()) - beams.begin();
+    }
+
+    // Copy (p,eob) for each beam which may have been changed by the callback.
+    void update_beams_from_beam_views() {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beams[i].p = beam_views[i].p;
+            beams[i].eob = beam_views[i].eob;
+        }
+    }
+};
+
+void llama_beam_search(llama_context * ctx,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
+                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+    beam_search_data.loop(callback, callback_data);
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+}
+
 //
 // quantization
 //
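
Note on the hunk above: it introduces the new public llama_beam_search entry point together with the callback-driven beam state types (llama_beam_view, llama_beams_state) that it passes back to the caller. A minimal, hypothetical C++ caller is sketched below; only the llama_* symbols come from this diff and the accompanying llama.h changes, while the callback name, the state struct and the EOS-based end-of-beam rule are illustrative assumptions.

    #include <vector>
    #include "llama.h"

    struct beam_search_state {
        llama_context * ctx;
        std::vector<llama_token> response; // tokens of the winning beam
    };

    // Mark beams whose last token is EOS as ended; the library copies p/eob back
    // via update_beams_from_beam_views(). On the final call, beam_views[0] holds
    // the collapsed top beam.
    static void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
        auto & state = *static_cast<beam_search_state *>(callback_data);
        for (size_t i = 0; i < beams_state.n_beams; ++i) {
            llama_beam_view & bv = beams_state.beam_views[i];
            if (!bv.eob && bv.n_tokens > 0 && bv.tokens[bv.n_tokens - 1] == llama_token_eos(state.ctx)) {
                bv.eob = true;
            }
        }
        if (beams_state.last_call && beams_state.n_beams > 0) {
            const llama_beam_view & best = beams_state.beam_views[0];
            state.response.assign(best.tokens, best.tokens + best.n_tokens);
        }
    }

    // Usage, after the prompt has been evaluated so that n_past sits at its end:
    //   beam_search_state state{ctx, {}};
    //   llama_beam_search(ctx, beam_search_callback, &state,
    //                     /*n_beams*/ 4, n_past, /*n_predict*/ 64, /*n_threads*/ 4);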
@@ -4423,6 +4674,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    llama_model model;
+    llm_load_arch(*ml, model);
+    llm_load_hparams(*ml, model, 0, 0, 0);
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4448,6 +4703,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++n_feed_forward_w2;
         }
     }
+    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+    }
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
@@ -4524,8 +4783,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
             int nx = tensor->ne[0];
-
-
+            if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
                 new_type = GGML_TYPE_Q6_K;
             }
         } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4539,21 +4800,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
            else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+            if (model.type == MODEL_70B) {
+                // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+                // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+                // nearly negligible increase in model size by quantizing this tensor with more bits:
+                if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            }
            ++i_attention_wv;
        } else if (name.find("ffn_down.weight") != std::string::npos) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                         : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                         : GGML_TYPE_Q3_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                if (model.arch == LLM_ARCH_FALCON) {
+                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                               use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                } else {
+                    if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                }
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+                new_type = GGML_TYPE_Q5_K;
            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                     use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
            ++i_feed_forward_w2;
        } else if (name.find("attn_output.weight") != std::string::npos) {
-            if
-
-
+            if (model.arch != LLM_ARCH_FALCON) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            } else {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+            }
+        }
+        else if (name.find("attn_qkv.weight") != std::string::npos) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
        }
        else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4568,8 +4857,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
            int nx = tensor->ne[0];
            int ny = tensor->ne[1];
-            if (nx % QK_K != 0
-
+            if (nx % QK_K != 0) {
+                LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
                convert_incompatible_tensor = true;
            }
        }
@@ -4998,7 +5287,7 @@ struct llama_context_params llama_context_default_params() {
        /*.progress_callback =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram =*/ false,
-        /*.mul_mat_q =*/
+        /*.mul_mat_q =*/ true,
        /*.f16_kv =*/ true,
        /*.logits_all =*/ false,
        /*.vocab_only =*/ false,
@@ -5297,13 +5586,29 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
-int
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             model->name.c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
 
+uint64_t llama_model_size(const struct llama_model * model) {
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second);
+    }
+    return size;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second);
+    }
+    return nparams;
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
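
Note on the hunk above: llama_model_desc, llama_model_size and llama_model_n_params are new public accessors in 0.5.0. A small, hypothetical reporting helper built only on these three calls might look like:

    #include <cstdio>
    #include "llama.h"

    static void print_model_stats(const struct llama_model * model) {
        char desc[128];
        llama_model_desc(model, desc, sizeof(desc)); // "<name> <type> <ftype>", e.g. a 7B mostly-Q4_K description
        printf("%s\n", desc);
        printf("model size   = %.2f GiB\n", llama_model_size(model) / 1024.0 / 1024.0 / 1024.0);
        printf("model params = %.2f B\n", llama_model_n_params(model) / 1e9);
    }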
@@ -5828,8 +6133,7 @@ int llama_tokenize_with_model(
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5843,12 +6147,12 @@ int llama_tokenize_with_model(
     return res.size();
 }
 
-int
-    return
+int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
 }
 
-// does not write null-terminator to
-int
+// does not write null-terminator to buf
+int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;
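
Note on the hunk above: llama_token_to_piece / llama_token_to_piece_with_model are the new public token-to-text calls. The calling convention mirrors the internal llama_token_to_str helper earlier in this diff: a negative return value is the negated required buffer size. A hedged sketch of the intended usage:

    #include <cassert>
    #include <string>
    #include <vector>
    #include "llama.h"

    static std::string token_to_piece(const struct llama_context * ctx, llama_token token) {
        std::vector<char> buf(8, 0);
        int n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
        if (n < 0) {
            buf.resize(-n); // grow to the size reported by the first call
            n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
            assert(n == (int) buf.size());
        }
        return std::string(buf.data(), n); // the API does not write a null terminator
    }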
@@ -5936,11 +6240,40 @@ const char * llama_print_system_info(void) {
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
     s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
 
     return s.c_str();
 }
 
+void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+    fprintf(stream, "\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "# Timings #\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+    fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+    fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+    fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+    fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+    fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
+    fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
+    fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
+    fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+    fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us);
+    fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+            1.0e6 * ctx->n_eval / ctx->t_eval_us);
+    fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+    fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+            1.0e6 * ctx->n_sample / ctx->t_sample_us);
+}
+
 // For internal test use
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
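
Note on the hunk above: llama_dump_timing_info_yaml is new public API that writes the context's timing counters as YAML to a caller-supplied stream. The assumed usage is a single call after generation finishes:

    llama_dump_timing_info_yaml(stderr, ctx); // or any open FILE *, e.g. a run log

The mst_* keys are milliseconds per token, the ts_* keys are tokens per second, both derived from the t_*_us / n_* counters shown in the function body.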
@@ -5951,10 +6284,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 }
 
-#if defined(_MSC_VER) && !defined(vsnprintf)
-#define vsnprintf _vsnprintf
-#endif
-
 static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);