llama_cpp 0.4.0 → 0.5.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +13 -50
- data/ext/llama_cpp/src/ggml-cuda.cu +23 -11
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +130 -61
- data/ext/llama_cpp/src/ggml-metal.metal +44 -26
- data/ext/llama_cpp/src/ggml.c +637 -328
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +2 -2
- data/ext/llama_cpp/src/llama.cpp +426 -97
- data/ext/llama_cpp/src/llama.h +51 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,9 +1,6 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
 #endif

 #include "llama.h"
@@ -62,6 +59,9 @@
 #include <cinttypes>
 #include <climits>
 #include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
@@ -114,12 +114,17 @@ static size_t utf8_len(char src) {
 }

 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-
-
-
-
-
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
     }
+    s = std::move(result);
 }

 static void zeros(std::ofstream & file, size_t n) {
@@ -796,12 +801,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }

-static std::string
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens =
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check =
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -955,10 +960,10 @@ struct llama_vocab {
     id linefeed_id = 13;

     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left, " ", "
-        replace_all(token_left, "\n", "
-        replace_all(token_right, " ", "
-        replace_all(token_right, "\n", "
+        replace_all(token_left, " ", "\u0120");
+        replace_all(token_left, "\n", "\u010A");
+        replace_all(token_right, " ", "\u0120");
+        replace_all(token_right, "\n", "\u010A");

         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -1144,11 +1149,13 @@ static bool llama_kv_cache_init(

 enum llama_fver {
     GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
 };

 static const char * llama_file_version_name(llama_fver version) {
     switch (version) {
-        case GGUF_FILE_VERSION_V1: return "GGUF V1 (
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
     }

     return "unknown";
@@ -1635,7 +1642,8 @@ static void llm_load_hparams(
 }

 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

 static void llm_load_vocab(
         llama_model_loader & ml,
@@ -1737,7 +1745,11 @@ static void llm_load_vocab(
     }

     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-
+    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+    } else {
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+    }

     // special tokens
     GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -2635,18 +2647,20 @@ static struct ggml_cgraph * llm_build_falcon(

             const size_t wsize = ggml_type_size(cur->type);

-
+            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+            // non-contiguous views is added for the rope operator
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                0);
+                0));
             offload_func_kq(tmpq);

-            struct ggml_tensor * tmpk = ggml_view_3d(
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head_kv, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * n_head);
+                wsize * n_embd_head * n_head));
             offload_func_kq(tmpk);

             struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2831,7 +2845,6 @@ static bool llama_eval_internal(

     GGML_ASSERT(n_tokens > 0);
     GGML_ASSERT(n_past >= 0);
-    GGML_ASSERT(n_threads > 0);
     // TODO: keep the values of n_batch and n_ctx
     // GGML_ASSERT(n_tokens <= n_batch);
     // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2842,6 +2855,8 @@ static bool llama_eval_internal(
     ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif

+    GGML_ASSERT(n_threads > 0);
+
     const int N = n_tokens;

     const auto & model = lctx.model;
@@ -3026,16 +3041,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     return vocab.token_to_id.at(buf);
 }

-static
-
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
-    return result;
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
 }

 static void llama_unescape_whitespace(std::string & word) {
@@ -3204,7 +3211,7 @@ private:

 struct llm_bigram_bpe {
     struct comparator {
-        bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
             return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
         }
     };
@@ -3219,7 +3226,7 @@ struct llm_bigram_bpe {
 };

 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}

     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
@@ -3352,26 +3359,23 @@ private:
     }

     // probably not 100% correct
-
-    static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+    static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
         std::vector<std::string> words;

         // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
         const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
         const std::regex re(pattern);
-        std::smatch m;

-
-
-
-
-
+        auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+        auto words_end = std::sregex_iterator();
+        auto n_words = std::distance(words_begin, words_end);
+        words.reserve(n_words);
+        for (auto it = words_begin; it != words_end; ++it) {
+            words.push_back(it->str());
         }
-
         return words;
-    }

-
+    }

     const llama_vocab & vocab;

@@ -3381,9 +3385,18 @@ private:
     llm_bigram_bpe::queue work_queue;
 };

-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
     std::vector<llama_vocab::id> output;

+    // OG tokenizer behavior:
+    //
+    // tokenizer.encode('', add_bos=True) returns [1]
+    // tokenizer.encode('', add_bos=False) returns []
+
+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
     if (raw_text.empty()) {
         return output;
     }
@@ -3391,29 +3404,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-
+                // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                raw_text = " " + raw_text;

-
-
-
-
-                std::string text;
-                if (escape) {
-                    text = llama_escape_whitespace(raw_text);
-                } else {
-                    text = raw_text;
-                }
-
-                tokenizer.tokenize(text, output);
+                llm_tokenizer_spm tokenizer(vocab);
+                llama_escape_whitespace(raw_text);
+                tokenizer.tokenize(raw_text, output);
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab
-
-                if (bos && vocab.special_bos_id != -1) {
-                    output.push_back(vocab.special_bos_id);
-                }
-
+                llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
     };
@@ -3908,7 +3908,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *

     // Calculate absolute value of second derivatives
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = abs(second_derivatives[i]);
+        second_derivatives[i] = std::abs(second_derivatives[i]);
     }

     // Normalize the second derivatives
@@ -4099,16 +4099,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     std::vector<llama_grammar_candidate> candidates_grammar;

     for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id
-        const std::string
+        const llama_token id = candidates->data[i].id;
+        const std::string piece = llama_token_to_str(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
             }
-        } else if (
+        } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(
+            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -4312,10 +4312,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string
+    const std::string piece = llama_token_to_str(ctx, token);

     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(
+    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4326,6 +4326,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }

+//
+// Beam search
+//
+
+struct llama_beam {
+    std::vector<llama_token> tokens;
+    float p;  // Cumulative beam probability (renormalized relative to all beams)
+    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+    // Sort beams by probability. In case of ties, prefer beams at eob.
+    bool operator<(const llama_beam & rhs) const {
+        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+    }
+    // Shift off first n tokens and discard them.
+    void shift_tokens(const size_t n) {
+        if (n) {
+            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+            tokens.resize(tokens.size() - n);
+        }
+    }
+    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+};
+
+// A struct for calculating logit-related info.
+struct llama_logit_info {
+    const float * const logits;
+    const int n_vocab;
+    const float max_l;
+    const float normalizer;
+    struct sum_exp {
+        float max_l;
+        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+    };
+    llama_logit_info(llama_context * ctx)
+      : logits(llama_get_logits(ctx))
+      , n_vocab(llama_n_vocab(ctx))
+      , max_l(*std::max_element(logits, logits + n_vocab))
+      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+      { }
+    llama_token_data get_token_data(const llama_token token_id) const {
+        constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+        return {token_id, logits[token_id], p};
+    }
+    // Return top k token_data by logit.
+    std::vector<llama_token_data> top_k(size_t k) {
+        std::vector<llama_token_data> min_heap; // min-heap by logit
+        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+        min_heap.reserve(k_min);
+        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+            min_heap.push_back(get_token_data(token_id));
+        }
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+        std::make_heap(min_heap.begin(), min_heap.end(), comp);
+        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+            if (min_heap.front().logit < logits[token_id]) {
+                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+                min_heap.back().id = token_id;
+                min_heap.back().logit = logits[token_id];
+                std::push_heap(min_heap.begin(), min_heap.end(), comp);
+            }
+        }
+        return min_heap;
+    }
+    float probability_from_logit(float logit) {
+        return normalizer * std::exp(logit - max_l);
+    }
+};
+
+struct llama_beam_search_data {
+    llama_context * ctx;
+    size_t n_beams;
+    int n_past;
+    int n_predict;
+    int n_threads;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
+
+    // Re-calculated on each loop iteration
+    size_t common_prefix_length;
+
+    // Used to communicate to/from callback on beams state.
+    std::vector<llama_beam_view> beam_views;
+
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , n_beams(n_beams)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads)
+      , beam_views(n_beams) {
+        beams.reserve(n_beams);
+        next_beams.reserve(n_beams);
+    }
+
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(const size_t beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
+        }
+        beams.resize(1);
+    }
+
+    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+    // The repetative patterns below reflect the 2 stages of heaps:
+    //  * Gather elements until the vector is full, then call std::make_heap() on it.
+    //  * If the heap is full and a new element is found that should be included, pop the
+    //    least element to the back(), replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+        // Min-heaps use a greater-than comparator.
+        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+        if (beam.eob) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+            if (next_beams.size() < n_beams) {
+                next_beams.push_back(std::move(beam));
+                if (next_beams.size() == n_beams) {
+                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            } else if (next_beams.front().p < beam.p) {
+                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                next_beams.back() = std::move(beam);
+                std::push_heap(next_beams.begin(), next_beams.end(), comp);
+            }
+        } else {
+            // beam is not at end-of-sentence, so branch with next top_k tokens.
+            if (!beam.tokens.empty()) {
+                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+            }
+            llama_logit_info logit_info(ctx);
+            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+            size_t i=0;
+            if (next_beams.size() < n_beams) {
+                for (; next_beams.size() < n_beams ; ++i) {
+                    llama_beam next_beam = beam;
+                    next_beam.tokens.push_back(next_tokens[i].id);
+                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    next_beams.push_back(std::move(next_beam));
+                }
+                std::make_heap(next_beams.begin(), next_beams.end(), comp);
+            } else {
+                for (; next_beams.front().p == 0.0f ; ++i) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+            for (; i < n_beams ; ++i) {
+                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                if (next_beams.front().p < next_p) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p = next_p;
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+        }
+    }
+
+    // Find common_prefix_length based on beams.
+    // Requires beams is not empty.
+    size_t find_common_prefix_length() {
+        size_t common_prefix_length = beams[0].tokens.size();
+        for (size_t i = 1 ; i < beams.size() ; ++i) {
+            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+                if (beams[0].tokens[j] != beams[i].tokens[j]) {
+                    common_prefix_length = j;
+                    break;
+                }
+            }
+        }
+        return common_prefix_length;
+    }
+
+    // Construct beams_state to send back to caller via the callback function.
+    // Side effect: set common_prefix_length = find_common_prefix_length();
+    llama_beams_state get_beams_state(const bool last_call) {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beam_views[i] = beams[i].view();
+        }
+        common_prefix_length = find_common_prefix_length();
+        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+    }
+
+    // Loop:
+    //  * while i < n_predict, AND
+    //  * any of the beams have not yet reached end-of-beam (eob), AND
+    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+    //    (since all other beam probabilities can only decrease)
+    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+        beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
+        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+                       !beams[top_beam_index()].eob ; ++i) {
+            callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
+            update_beams_from_beam_views();                  // Update values (p,eob) that callback may have changed.
+            if (common_prefix_length) {
+                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                n_past += common_prefix_length;
+            }
+            // Zero-out next_beam probabilities to place them last in following min-heap.
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
+                beam.shift_tokens(common_prefix_length);
+                fill_next_beams_by_top_probabilities(beam);
+            }
+            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+            beams.swap(next_beams);
+            renormalize_beam_probabilities(beams);
+        }
+        collapse_beams(top_beam_index());
+        callback(callback_data, get_beams_state(true));
+    }
+
+    // As beams grow, the cumulative probabilities decrease.
+    // Renormalize them to avoid floating point underflow.
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+    }
+
+    // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+    size_t top_beam_index() {
+        return std::max_element(beams.begin(), beams.end()) - beams.begin();
+    }
+
+    // Copy (p,eob) for each beam which may have been changed by the callback.
+    void update_beams_from_beam_views() {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beams[i].p = beam_views[i].p;
+            beams[i].eob = beam_views[i].eob;
+        }
+    }
+};
+
+void llama_beam_search(llama_context * ctx,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
+                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+    beam_search_data.loop(callback, callback_data);
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+}
+
 //
 // quantization
 //
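The beam-search block above is driven entirely through the new llama_beam_search entry point and its callback. Below is a minimal sketch of a caller; the field names of llama_beam_view and llama_beams_state (tokens, n_tokens, p, eob, beam_views, n_beams, common_prefix_length, last_call) are assumptions inferred from how view() and get_beams_state() construct them in the diff, so check the llama.h shipped with this release for the exact declarations.

// Hypothetical caller of the new beam-search API; struct field names are assumed from the diff above.
#include <cstdio>
#include <vector>
#include "llama.h"

struct beam_search_state {
    std::vector<llama_token> accepted; // tokens committed to the output so far
};

// Callback invoked once per iteration, and one final time with last_call == true.
static void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
    auto & state = *static_cast<beam_search_state *>(callback_data);
    // Tokens shared by all beams (the common prefix) are shifted off before the next
    // iteration, so they can be committed to the output here.
    const llama_beam_view & top = beams_state.beam_views[0];
    for (size_t i = 0; i < beams_state.common_prefix_length; ++i) {
        state.accepted.push_back(top.tokens[i]);
    }
    // A real callback would also set beam_views[i].eob when a beam ends with an EOS token.
    fputs(beams_state.last_call ? "done\n" : ".", stderr);
}

// After evaluating the prompt (n_past tokens already fed to ctx):
//   beam_search_state state;
//   llama_beam_search(ctx, beam_search_callback, &state,
//                     /*n_beams*/ 4, n_past, /*n_predict*/ 64, /*n_threads*/ 4);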
@@ -4423,6 +4674,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));

+    llama_model model;
+    llm_load_arch(*ml, model);
+    llm_load_hparams(*ml, model, 0, 0, 0);
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();

@@ -4448,6 +4703,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++n_feed_forward_w2;
         }
     }
+    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+    }

     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
@@ -4524,8 +4783,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

         if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
             int nx = tensor->ne[0];
-
-
+            if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
                 new_type = GGML_TYPE_Q6_K;
             }
         } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4539,21 +4800,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
             else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                     (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+            if (model.type == MODEL_70B) {
+                // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+                // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+                // nearly negligible increase in model size by quantizing this tensor with more bits:
+                if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            }
            ++i_attention_wv;
         } else if (name.find("ffn_down.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                         : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                         : GGML_TYPE_Q3_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                if (model.arch == LLM_ARCH_FALCON) {
+                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                               use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                } else {
+                    if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                }
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+                new_type = GGML_TYPE_Q5_K;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-            else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                     use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
             ++i_feed_forward_w2;
         } else if (name.find("attn_output.weight") != std::string::npos) {
-            if
-
-
+            if (model.arch != LLM_ARCH_FALCON) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            } else {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+            }
+        }
+        else if (name.find("attn_qkv.weight") != std::string::npos) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         }
         else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4568,8 +4857,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
             int nx = tensor->ne[0];
             int ny = tensor->ne[1];
-            if (nx % QK_K != 0
-
+            if (nx % QK_K != 0) {
+                LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
                 convert_incompatible_tensor = true;
             }
         }
@@ -4998,7 +5287,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram =*/ false,
-        /*.mul_mat_q =*/
+        /*.mul_mat_q =*/ true,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
@@ -5297,13 +5586,29 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

-int
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             model->name.c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }

+uint64_t llama_model_size(const struct llama_model * model) {
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second);
+    }
+    return size;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second);
+    }
+    return nparams;
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
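The three introspection helpers added above (llama_model_desc, llama_model_size, llama_model_n_params) are easy to combine once a model has been loaded. A usage sketch, not part of the diff:

// Usage sketch for the new model-introspection helpers; assumes `model` was loaded elsewhere.
#include <cstdint>
#include <cstdio>
#include "llama.h"

static void print_model_info(const struct llama_model * model) {
    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));            // e.g. "<name> <model type> <ftype>"

    const uint64_t n_params = llama_model_n_params(model);  // sum of ggml_nelements() over all tensors
    const uint64_t n_bytes  = llama_model_size(model);      // sum of ggml_nbytes() over all tensors

    fprintf(stderr, "%s: %.2fB params, %.2f GiB (%.2f bits per weight)\n",
            desc,
            n_params / 1.0e9,
            n_bytes / (1024.0 * 1024.0 * 1024.0),
            n_params ? 8.0 * n_bytes / n_params : 0.0);
}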
@@ -5828,8 +6133,7 @@ int llama_tokenize_with_model(
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+    auto res = llama_tokenize_internal(model->vocab, text, add_bos);

     if (n_max_tokens < (int) res.size()) {
         LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5843,12 +6147,12 @@ int llama_tokenize_with_model(
     return res.size();
 }

-int
-    return
+int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
 }

-// does not write null-terminator to
-int
+// does not write null-terminator to buf
+int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;
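The two public functions above follow the same convention as the internal llama_token_to_str helper near the top of this diff: a negative return value is the negated buffer size that would have been required. A small sketch of the resize-and-retry pattern a caller is expected to use:

// Sketch of the resize-and-retry convention for llama_token_to_piece (mirrors llama_token_to_str above).
#include <string>
#include <vector>
#include "llama.h"

static std::string token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
    if (n < 0) {
        buf.resize(-n);   // -n is the number of bytes actually needed
        n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
    }
    return std::string(buf.data(), n); // note: the API does not write a null terminator
}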
@@ -5936,11 +6240,40 @@ const char * llama_print_system_info(void) {
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
     s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

     return s.c_str();
 }

+void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+    fprintf(stream, "\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "# Timings #\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
+            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+    fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
+            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+    fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
+            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+    fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+    fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+    fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
+    fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
+    fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
+    fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+    fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us);
+    fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
+            1.0e6 * ctx->n_eval / ctx->t_eval_us);
+    fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
+            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+    fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
+            1.0e6 * ctx->n_sample / ctx->t_sample_us);
+}
+
 // For internal test use
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
@@ -5951,10 +6284,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 }

-#if defined(_MSC_VER) && !defined(vsnprintf)
-#define vsnprintf _vsnprintf
-#endif
-
 static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);