llama_cpp 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,9 +1,6 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
 #endif
 
 #include "llama.h"
@@ -62,6 +59,9 @@
 #include <cinttypes>
 #include <climits>
 #include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
@@ -114,13 +114,21 @@ static size_t utf8_len(char src) {
 }
 
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-
-
-
-
-
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
     }
+    s = std::move(result);
 }
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
@@ -320,6 +328,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -407,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 #elif GGML_USE_METAL
 # define llama_host_malloc(n) ggml_metal_host_malloc(n)
 # define llama_host_free(data) ggml_metal_host_free(data)
+#elif GGML_USE_CPU_HBM
+# define llama_host_malloc(n) hbw_malloc(n)
+# define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
 # define llama_host_malloc(n) malloc(n)
 # define llama_host_free(data) free(data)
@@ -563,16 +612,16 @@ struct llama_mmap {
 
         if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (
-                fprintf(stderr, "warning:
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
        if (numa) {
            // advise the kernel not to use readahead
            // (because the next page might not belong on the same node)
-            if (
-                fprintf(stderr, "warning:
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                        strerror(errno));
            }
        }
@@ -609,7 +658,9 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
+
             WIN32_MEMORY_RANGE_ENTRY range;
+
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -796,12 +847,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens =
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check =
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -955,10 +1006,10 @@ struct llama_vocab {
     id linefeed_id = 13;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left, " ", "
-        replace_all(token_left, "\n", "
-        replace_all(token_right, " ", "
-        replace_all(token_right, "\n", "
+        replace_all(token_left, " ", "\u0120");
+        replace_all(token_left, "\n", "\u010A");
+        replace_all(token_right, " ", "\u0120");
+        replace_all(token_right, "\n", "\u010A");
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -1144,11 +1195,13 @@ static bool llama_kv_cache_init(
 
 enum llama_fver {
     GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
 };
 
 static const char * llama_file_version_name(llama_fver version) {
     switch (version) {
-        case GGUF_FILE_VERSION_V1: return "GGUF V1 (
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
     }
 
     return "unknown";
@@ -1439,7 +1492,11 @@ struct llama_model_loader {
         // allocate temp buffer if not using mmap
         if (!use_mmap && cur->data == NULL) {
             GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-
+            #ifdef GGML_USE_CPU_HBM
+            cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+            #else
+            cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+            #endif
         }
 
         load_data_for(cur);
@@ -1593,9 +1650,13 @@ static void llm_load_hparams(
 
         GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
-        if (
-
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+            }
         }
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs
@@ -1635,7 +1696,8 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
         llama_model_loader & ml,
@@ -1737,7 +1799,11 @@ static void llm_load_vocab(
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-
+    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+    } else {
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+    }
 
     // special tokens
     GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -2635,18 +2701,20 @@ static struct ggml_cgraph * llm_build_falcon(
 
             const size_t wsize = ggml_type_size(cur->type);
 
-
+            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+            // non-contiguous views is added for the rope operator
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                0);
+                0));
             offload_func_kq(tmpq);
 
-            struct ggml_tensor * tmpk = ggml_view_3d(
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                 ctx0, cur, n_embd_head, n_head_kv, N,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * n_head);
+                wsize * n_embd_head * n_head));
             offload_func_kq(tmpk);
 
             struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2831,7 +2899,6 @@ static bool llama_eval_internal(
 
     GGML_ASSERT(n_tokens > 0);
     GGML_ASSERT(n_past >= 0);
-    GGML_ASSERT(n_threads > 0);
     // TODO: keep the values of n_batch and n_ctx
     // GGML_ASSERT(n_tokens <= n_batch);
     // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2842,6 +2909,8 @@ static bool llama_eval_internal(
     ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif
 
+    GGML_ASSERT(n_threads > 0);
+
     const int N = n_tokens;
 
     const auto & model = lctx.model;
@@ -2880,7 +2949,12 @@ static bool llama_eval_internal(
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
+    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+    //       with the BLAS calls. need a better solution
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+        n_threads = std::min(4, n_threads);
+    }
 
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -2985,33 +3059,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -3026,16 +3077,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     return vocab.token_to_id.at(buf);
 }
 
-static
-
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
-    return result;
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
 }
 
 static void llama_unescape_whitespace(std::string & word) {
@@ -3204,7 +3247,7 @@ private:
 
 struct llm_bigram_bpe {
     struct comparator {
-        bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
        }
    };
@@ -3219,7 +3262,7 @@ struct llm_bigram_bpe {
 };
 
 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
@@ -3312,9 +3355,15 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-
+                        try {
+                            llama_token token_byte = llama_byte_to_token(vocab, *j);
+                            output.push_back(token_byte);
+                        } catch (const std::out_of_range & err) {
+                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        }
+                    } else {
+                        output.push_back((*token_multibyte).second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -3352,26 +3401,23 @@ private:
     }
 
     // probably not 100% correct
-
-    static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+    static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
         std::vector<std::string> words;
 
         // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
         const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
         const std::regex re(pattern);
-        std::smatch m;
 
-
-
-
-
-
+        auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+        auto words_end = std::sregex_iterator();
+        auto n_words = std::distance(words_begin, words_end);
+        words.reserve(n_words);
+        for (auto it = words_begin; it != words_end; ++it) {
+            words.push_back(it->str());
        }
-
        return words;
-    }
 
-
+    }
 
     const llama_vocab & vocab;
 
@@ -3381,9 +3427,18 @@ private:
     llm_bigram_bpe::queue work_queue;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
     std::vector<llama_vocab::id> output;
 
+    // OG tokenizer behavior:
+    //
+    // tokenizer.encode('', add_bos=True) returns [1]
+    // tokenizer.encode('', add_bos=False) returns []
+
+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
     if (raw_text.empty()) {
         return output;
     }
@@ -3391,29 +3446,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-
-
-                if (bos) {
-                    output.push_back(vocab.special_bos_id);
-                }
+                // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                raw_text = " " + raw_text;
 
-
-
-
-                } else {
-                    text = raw_text;
-                }
-
-                tokenizer.tokenize(text, output);
+                llm_tokenizer_spm tokenizer(vocab);
+                llama_escape_whitespace(raw_text);
+                tokenizer.tokenize(raw_text, output);
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab
-
-                if (bos && vocab.special_bos_id != -1) {
-                    output.push_back(vocab.special_bos_id);
-                }
-
+                llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
     };
@@ -3595,7 +3637,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.
+        new_stacks.emplace_back(stack);
         return;
     }
 
@@ -3632,7 +3674,7 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
        case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.
+            new_stacks.emplace_back(stack);
            break;
        default:
            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +3839,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
 //
 // sampling
 //
@@ -3908,7 +3969,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
 
     // Calculate absolute value of second derivatives
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = abs(second_derivatives[i]);
+        second_derivatives[i] = std::abs(second_derivatives[i]);
     }
 
     // Normalize the second derivatives
@@ -4099,16 +4160,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     std::vector<llama_grammar_candidate>              candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id
-        const std::string
+        const llama_token id = candidates->data[i].id;
+        const std::string piece = llama_token_to_str(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
             }
-        } else if (
+        } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(
+            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -4312,10 +4373,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string
+    const std::string piece = llama_token_to_str(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(
+    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4326,6 +4387,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
+//
+// Beam search
+//
+
+struct llama_beam {
+    std::vector<llama_token> tokens;
+    float p;  // Cumulative beam probability (renormalized relative to all beams)
+    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+    // Sort beams by probability. In case of ties, prefer beams at eob.
+    bool operator<(const llama_beam & rhs) const {
+        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+    }
+    // Shift off first n tokens and discard them.
+    void shift_tokens(const size_t n) {
+        if (n) {
+            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+            tokens.resize(tokens.size() - n);
+        }
+    }
+    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+};
+
+// A struct for calculating logit-related info.
+struct llama_logit_info {
+    const float * const logits;
+    const int n_vocab;
+    const float max_l;
+    const float normalizer;
+    struct sum_exp {
+        float max_l;
+        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+    };
+    llama_logit_info(llama_context * ctx)
+      : logits(llama_get_logits(ctx))
+      , n_vocab(llama_n_vocab(ctx))
+      , max_l(*std::max_element(logits, logits + n_vocab))
+      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+      { }
+    llama_token_data get_token_data(const llama_token token_id) const {
+        constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+        return {token_id, logits[token_id], p};
+    }
+    // Return top k token_data by logit.
+    std::vector<llama_token_data> top_k(size_t k) {
+        std::vector<llama_token_data> min_heap; // min-heap by logit
+        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+        min_heap.reserve(k_min);
+        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+            min_heap.push_back(get_token_data(token_id));
+        }
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+        std::make_heap(min_heap.begin(), min_heap.end(), comp);
+        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+            if (min_heap.front().logit < logits[token_id]) {
+                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+                min_heap.back().id = token_id;
+                min_heap.back().logit = logits[token_id];
+                std::push_heap(min_heap.begin(), min_heap.end(), comp);
+            }
+        }
+        return min_heap;
+    }
+    float probability_from_logit(float logit) const {
+        return normalizer * std::exp(logit - max_l);
+    }
+};
+
+struct llama_beam_search_data {
+    llama_context * ctx;
+    size_t n_beams;
+    int n_past;
+    int n_predict;
+    int n_threads;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
+
+    // Re-calculated on each loop iteration
+    size_t common_prefix_length;
+
+    // Used to communicate to/from callback on beams state.
+    std::vector<llama_beam_view> beam_views;
+
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , n_beams(n_beams)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads)
+      , beam_views(n_beams) {
+        beams.reserve(n_beams);
+        next_beams.reserve(n_beams);
+    }
+
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(const size_t beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
+        }
+        beams.resize(1);
+    }
+
+    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+    // The repetative patterns below reflect the 2 stages of heaps:
+    //  * Gather elements until the vector is full, then call std::make_heap() on it.
+    //  * If the heap is full and a new element is found that should be included, pop the
+    //    least element to the back(), replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+        // Min-heaps use a greater-than comparator.
+        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+        if (beam.eob) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+            if (next_beams.size() < n_beams) {
+                next_beams.push_back(std::move(beam));
+                if (next_beams.size() == n_beams) {
+                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            } else if (next_beams.front().p < beam.p) {
+                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                next_beams.back() = std::move(beam);
+                std::push_heap(next_beams.begin(), next_beams.end(), comp);
+            }
+        } else {
+            // beam is not at end-of-sentence, so branch with next top_k tokens.
+            if (!beam.tokens.empty()) {
+                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+            }
+            llama_logit_info logit_info(ctx);
+            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+            size_t i=0;
+            if (next_beams.size() < n_beams) {
+                for (; next_beams.size() < n_beams ; ++i) {
+                    llama_beam next_beam = beam;
+                    next_beam.tokens.push_back(next_tokens[i].id);
+                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    next_beams.push_back(std::move(next_beam));
+                }
+                std::make_heap(next_beams.begin(), next_beams.end(), comp);
+            } else {
+                for (; next_beams.front().p == 0.0f ; ++i) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+            for (; i < n_beams ; ++i) {
+                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                if (next_beams.front().p < next_p) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p = next_p;
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+        }
+    }
+
+    // Find common_prefix_length based on beams.
+    // Requires beams is not empty.
+    size_t find_common_prefix_length() {
+        size_t common_prefix_length = beams[0].tokens.size();
+        for (size_t i = 1 ; i < beams.size() ; ++i) {
+            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+                if (beams[0].tokens[j] != beams[i].tokens[j]) {
+                    common_prefix_length = j;
+                    break;
+                }
+            }
+        }
+        return common_prefix_length;
+    }
+
+    // Construct beams_state to send back to caller via the callback function.
+    // Side effect: set common_prefix_length = find_common_prefix_length();
+    llama_beams_state get_beams_state(const bool last_call) {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beam_views[i] = beams[i].view();
+        }
+        common_prefix_length = find_common_prefix_length();
+        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+    }
+
+    // Loop:
+    //  * while i < n_predict, AND
+    //  * any of the beams have not yet reached end-of-beam (eob), AND
+    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+    //    (since all other beam probabilities can only decrease)
+    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+        beams.push_back({{}, 1.0f, false});  // Start with one empty beam w/ probability = 1.0 and !eob.
+        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+                       !beams[top_beam_index()].eob ; ++i) {
+            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
+            update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
+            if (common_prefix_length) {
+                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                n_past += common_prefix_length;
+            }
+            // Zero-out next_beam probabilities to place them last in following min-heap.
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
+                beam.shift_tokens(common_prefix_length);
+                fill_next_beams_by_top_probabilities(beam);
+            }
+            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+            beams.swap(next_beams);
+            renormalize_beam_probabilities(beams);
+        }
+        collapse_beams(top_beam_index());
+        callback(callback_data, get_beams_state(true));
+    }
+
+    // As beams grow, the cumulative probabilities decrease.
+    // Renormalize them to avoid floating point underflow.
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+    }
+
+    // Assumes beams is non-empty.  Uses llama_beam::operator<() for ordering.
+    size_t top_beam_index() {
+        return std::max_element(beams.begin(), beams.end()) - beams.begin();
+    }
+
+    // Copy (p,eob) for each beam which may have been changed by the callback.
+    void update_beams_from_beam_views() {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beams[i].p = beam_views[i].p;
+            beams[i].eob = beam_views[i].eob;
+        }
+    }
+};
+
+void llama_beam_search(llama_context * ctx,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
+                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+    beam_search_data.loop(callback, callback_data);
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+}
+
 //
 // quantization
 //
@@ -4423,6 +4735,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    llama_model model;
+    llm_load_arch(*ml, model);
+    llm_load_hparams(*ml, model, 0, 0, 0);
+
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4448,6 +4768,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++n_feed_forward_w2;
         }
     }
+    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+    }
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
@@ -4460,9 +4784,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+#ifdef GGML_USE_K_QUANTS
     auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
+#endif
 
     int idx = 0;
 
@@ -4505,18 +4831,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
        quantize &= (tensor->n_dims == 2);
        quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &=
+        quantize &= !params->only_copy;
 
        enum ggml_type new_type;
        void * new_data;
        size_t new_size;
 
-        if (
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
            new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
            // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4524,8 +4845,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
             if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
                 int nx = tensor->ne[0];
-
-
+                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                    new_type = GGML_TYPE_Q8_0;
+                }
+                else if (new_type != GGML_TYPE_Q8_0) {
                     new_type = GGML_TYPE_Q6_K;
                 }
             } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4539,21 +4862,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+                if (model.type == MODEL_70B) {
+                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+                    // nearly negligible increase in model size by quantizing this tensor with more bits:
+                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+                }
                ++i_attention_wv;
            } else if (name.find("ffn_down.weight") != std::string::npos) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                             : GGML_TYPE_Q3_K;
+                }
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+                }
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                    if (model.arch == LLM_ARCH_FALCON) {
+                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                    } else {
+                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                    }
+                }
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+                    new_type = GGML_TYPE_Q5_K;
                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                         use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
                ++i_feed_forward_w2;
            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if
-
-
+                if (model.arch != LLM_ARCH_FALCON) {
+                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                } else {
+                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+                }
+            }
+            else if (name.find("attn_qkv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
            }
            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4568,8 +4919,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
                int nx = tensor->ne[0];
                int ny = tensor->ne[1];
-                if (nx % QK_K != 0
-
+                if (nx % QK_K != 0) {
+                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
                    convert_incompatible_tensor = true;
                }
            }
@@ -4585,7 +4936,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
            }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
            const size_t nelements = ggml_nelements(tensor);
 
            float * f32_data;
@@ -4990,7 +5350,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed                        =*/ LLAMA_DEFAULT_SEED,
        /*.n_ctx                       =*/ 512,
        /*.n_batch                     =*/ 512,
-        /*.
+        /*.n_gpu_layers                =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.rope_freq_base              =*/ 10000.0f,
@@ -4998,7 +5358,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram                    =*/ false,
-        /*.mul_mat_q                   =*/
+        /*.mul_mat_q                   =*/ true,
        /*.f16_kv                      =*/ true,
        /*.logits_all                  =*/ false,
        /*.vocab_only                  =*/ false,
@@ -5007,6 +5367,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding                   =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
 
@@ -5016,6 +5380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.allow_requantize            =*/ false,
        /*.quantize_output_tensor      =*/ true,
+        /*.only_copy                   =*/ false,
     };
 
     return result;
@@ -5198,43 +5563,43 @@ struct llama_context * llama_new_context_with_model(
             }
 #endif
         }
-        }
 
 #ifdef GGML_USE_METAL
-
-
+        if (params.n_gpu_layers > 0) {
+            // this allocates all Metal resources and memory buffers
 
-
-
+            void * data_ptr  = NULL;
+            size_t data_size = 0;
 
-
-
-
-
-
-
-
+            if (params.use_mmap) {
+                data_ptr  = ctx->model.mapping->addr;
+                data_size = ctx->model.mapping->size;
+            } else {
+                data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+                data_size = ggml_get_mem_size  (ctx->model.ctx);
+            }
 
-
+            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
-
-
-
-
-
+            if (!(result)) { \
+                LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+                llama_free(ctx); \
+                return NULL; \
+            }
 
-
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-
-
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-
+        }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
@@ -5297,13 +5662,29 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
-int
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
            model->name.c_str(),
            llama_model_type_name(model->type),
            llama_model_ftype_name(model->ftype).c_str());
 }
 
+uint64_t llama_model_size(const struct llama_model * model) {
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second);
+    }
+    return size;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second);
+    }
+    return nparams;
+}
+
 int llama_model_quantize(
         const char * fname_inp,
        const char * fname_out,
@@ -5552,7 +5933,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
        rng_ss >> ctx->rng;
 
-        GGML_ASSERT(rng_ss.fail()
+        GGML_ASSERT(!rng_ss.fail());
     }
 
     // set logits
@@ -5828,8 +6209,7 @@ int llama_tokenize_with_model(
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5843,12 +6223,12 @@ int llama_tokenize_with_model(
     return res.size();
 }
 
-int
-    return
+int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
 }
 
-// does not write null-terminator to
-int
+// does not write null-terminator to buf
+int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
            std::string result = model->vocab.id_to_token[token].text;
@@ -5936,11 +6316,40 @@ const char * llama_print_system_info(void) {
     s += "WASM_SIMD = "  + std::to_string(ggml_cpu_has_wasm_simd())  + " | ";
     s += "BLAS = "       + std::to_string(ggml_cpu_has_blas())       + " | ";
     s += "SSE3 = "       + std::to_string(ggml_cpu_has_sse3())       + " | ";
+    s += "SSSE3 = "      + std::to_string(ggml_cpu_has_ssse3())      + " | ";
     s += "VSX = "        + std::to_string(ggml_cpu_has_vsx())        + " | ";
 
     return s.c_str();
 }
 
+void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+    fprintf(stream, "\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "# Timings #\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
+            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
+            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+    fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
+            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->n_sample);
+    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
+    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
+    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->t_sample_us);
+    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
+            1.0e6 * ctx->n_eval / ctx->t_eval_us);
+    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
+            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+    fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
+            1.0e6 * ctx->n_sample / ctx->t_sample_us);
+}
+
 // For internal test use
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
@@ -5951,10 +6360,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 }
 
-#if defined(_MSC_VER) && !defined(vsnprintf)
-#define vsnprintf _vsnprintf
-#endif
-
 static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);