llama_cpp 0.4.0 → 0.5.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,9 +1,6 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
 #endif
 
 #include "llama.h"
@@ -62,6 +59,9 @@
 #include <cinttypes>
 #include <climits>
 #include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
@@ -114,13 +114,21 @@ static size_t utf8_len(char src) {
 }
 
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-
-
-
-
-
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
     }
+    s = std::move(result);
 }
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
@@ -320,6 +328,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -407,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 #elif GGML_USE_METAL
 # define llama_host_malloc(n) ggml_metal_host_malloc(n)
 # define llama_host_free(data) ggml_metal_host_free(data)
+#elif GGML_USE_CPU_HBM
+# define llama_host_malloc(n) hbw_malloc(n)
+# define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
 # define llama_host_malloc(n) malloc(n)
 # define llama_host_free(data) free(data)
@@ -563,16 +612,16 @@ struct llama_mmap {
 
         if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (
-            fprintf(stderr, "warning:
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
         if (numa) {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
-            if (
-            fprintf(stderr, "warning:
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
@@ -609,7 +658,9 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
+
             WIN32_MEMORY_RANGE_ENTRY range;
+
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -796,12 +847,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens =
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check =
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -955,10 +1006,10 @@ struct llama_vocab {
     id linefeed_id = 13;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left, " ", "
-        replace_all(token_left, "\n", "
-        replace_all(token_right, " ", "
-        replace_all(token_right, "\n", "
+        replace_all(token_left,  " ",  "\u0120");
+        replace_all(token_left,  "\n", "\u010A");
+        replace_all(token_right, " ",  "\u0120");
+        replace_all(token_right, "\n", "\u010A");
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -1144,11 +1195,13 @@ static bool llama_kv_cache_init(
 
 enum llama_fver {
     GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
 };
 
 static const char * llama_file_version_name(llama_fver version) {
     switch (version) {
-        case GGUF_FILE_VERSION_V1: return "GGUF V1 (
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
     }
 
     return "unknown";
@@ -1439,7 +1492,11 @@ struct llama_model_loader {
         // allocate temp buffer if not using mmap
         if (!use_mmap && cur->data == NULL) {
             GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-
+            #ifdef GGML_USE_CPU_HBM
+            cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+            #else
+            cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+            #endif
         }
 
         load_data_for(cur);
@@ -1593,9 +1650,13 @@ static void llm_load_hparams(
 
     GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
-    if (
-
+    if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+        if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+            throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+        }
     }
+    // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+    // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs
@@ -1635,7 +1696,8 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
         llama_model_loader & ml,
@@ -1737,7 +1799,11 @@ static void llm_load_vocab(
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-
+    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+    } else {
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+    }
 
     // special tokens
     GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -2635,18 +2701,20 @@ static struct ggml_cgraph * llm_build_falcon(
 
         const size_t wsize = ggml_type_size(cur->type);
 
-
+        // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
+        // non-contiguous views is added for the rope operator
+        struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
             ctx0, cur, n_embd_head, n_head, N,
             wsize * n_embd_head,
             wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            0);
+            0));
         offload_func_kq(tmpq);
 
-        struct ggml_tensor * tmpk = ggml_view_3d(
+        struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
             ctx0, cur, n_embd_head, n_head_kv, N,
             wsize * n_embd_head,
             wsize * n_embd_head * (n_head + 2 * n_head_kv),
-            wsize * n_embd_head * n_head);
+            wsize * n_embd_head * n_head));
         offload_func_kq(tmpk);
 
         struct ggml_tensor * tmpv = ggml_view_3d(
@@ -2831,7 +2899,6 @@ static bool llama_eval_internal(
 
     GGML_ASSERT(n_tokens > 0);
     GGML_ASSERT(n_past >= 0);
-    GGML_ASSERT(n_threads > 0);
     // TODO: keep the values of n_batch and n_ctx
     // GGML_ASSERT(n_tokens <= n_batch);
     // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@@ -2842,6 +2909,8 @@ static bool llama_eval_internal(
     ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif
 
+    GGML_ASSERT(n_threads > 0);
+
     const int N = n_tokens;
 
     const auto & model = lctx.model;
@@ -2880,7 +2949,12 @@ static bool llama_eval_internal(
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-
+    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+    //       with the BLAS calls. need a better solution
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+        n_threads = std::min(4, n_threads);
+    }
 
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -2985,33 +3059,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -3026,16 +3077,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     return vocab.token_to_id.at(buf);
 }
 
-static
-
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
-    return result;
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
 }
 
 static void llama_unescape_whitespace(std::string & word) {
@@ -3204,7 +3247,7 @@ private:
 
 struct llm_bigram_bpe {
     struct comparator {
-        bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
             return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
         }
     };
@@ -3219,7 +3262,7 @@ struct llm_bigram_bpe {
 };
 
 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
@@ -3312,9 +3355,15 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-
+                        try {
+                            llama_token token_byte = llama_byte_to_token(vocab, *j);
+                            output.push_back(token_byte);
+                        } catch (const std::out_of_range & err) {
+                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        }
+                    } else {
+                        output.push_back((*token_multibyte).second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -3352,26 +3401,23 @@ private:
     }
 
     // probably not 100% correct
-
-    static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+    static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
         std::vector<std::string> words;
 
         // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
         const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
         const std::regex re(pattern);
-        std::smatch m;
 
-
-
-
-
-
+        auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+        auto words_end = std::sregex_iterator();
+        auto n_words = std::distance(words_begin, words_end);
+        words.reserve(n_words);
+        for (auto it = words_begin; it != words_end; ++it) {
+            words.push_back(it->str());
         }
-
         return words;
-    }
 
-
+    }
 
     const llama_vocab & vocab;
 
@@ -3381,9 +3427,18 @@ private:
     llm_bigram_bpe::queue work_queue;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
     std::vector<llama_vocab::id> output;
 
+    // OG tokenizer behavior:
+    //
+    // tokenizer.encode('', add_bos=True)  returns [1]
+    // tokenizer.encode('', add_bos=False) returns []
+
+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
     if (raw_text.empty()) {
         return output;
     }
@@ -3391,29 +3446,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-
-
-                if (bos) {
-                    output.push_back(vocab.special_bos_id);
-                }
+                // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                raw_text = " " + raw_text;
 
-
-
-
-                } else {
-                    text = raw_text;
-                }
-
-                tokenizer.tokenize(text, output);
+                llm_tokenizer_spm tokenizer(vocab);
+                llama_escape_whitespace(raw_text);
+                tokenizer.tokenize(raw_text, output);
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab
-
-                if (bos && vocab.special_bos_id != -1) {
-                    output.push_back(vocab.special_bos_id);
-                }
-
+                llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
     };
@@ -3595,7 +3637,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.
+        new_stacks.emplace_back(stack);
         return;
     }
 
@@ -3632,7 +3674,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.
+            new_stacks.emplace_back(stack);
             break;
         default:
            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +3839,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
 //
 // sampling
 //
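A minimal caller-side sketch (not part of the diff) for the new llama_grammar_copy API: it snapshots a grammar before sampling advances its stacks, so the saved state can be used later. The `grammar` pointer is assumed to come from llama_grammar_init.

    // hypothetical usage, assuming `grammar` was created with llama_grammar_init
    struct llama_grammar * snapshot = llama_grammar_copy(grammar);
    // ... sample with `grammar`, which mutates its stacks ...
    llama_grammar_free(grammar);   // discard the mutated grammar
    grammar = snapshot;            // continue from the saved state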
@@ -3908,7 +3969,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
 
     // Calculate absolute value of second derivatives
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = abs(second_derivatives[i]);
+        second_derivatives[i] = std::abs(second_derivatives[i]);
     }
 
     // Normalize the second derivatives
@@ -4099,16 +4160,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     std::vector<llama_grammar_candidate> candidates_grammar;
 
     for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id
-        const std::string
+        const llama_token id    = candidates->data[i].id;
+        const std::string piece = llama_token_to_str(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
             }
-        } else if (
+        } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(
+            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -4312,10 +4373,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string
+    const std::string piece = llama_token_to_str(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(
+    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -4326,6 +4387,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
+//
+// Beam search
+//
+
+struct llama_beam {
+    std::vector<llama_token> tokens;
+    float p;  // Cumulative beam probability (renormalized relative to all beams)
+    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
+    // Sort beams by probability. In case of ties, prefer beams at eob.
+    bool operator<(const llama_beam & rhs) const {
+        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
+    }
+    // Shift off first n tokens and discard them.
+    void shift_tokens(const size_t n) {
+        if (n) {
+            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+            tokens.resize(tokens.size() - n);
+        }
+    }
+    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+};
+
+// A struct for calculating logit-related info.
+struct llama_logit_info {
+    const float * const logits;
+    const int n_vocab;
+    const float max_l;
+    const float normalizer;
+    struct sum_exp {
+        float max_l;
+        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+    };
+    llama_logit_info(llama_context * ctx)
+      : logits(llama_get_logits(ctx))
+      , n_vocab(llama_n_vocab(ctx))
+      , max_l(*std::max_element(logits, logits + n_vocab))
+      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+      { }
+    llama_token_data get_token_data(const llama_token token_id) const {
+        constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+        return {token_id, logits[token_id], p};
+    }
+    // Return top k token_data by logit.
+    std::vector<llama_token_data> top_k(size_t k) {
+        std::vector<llama_token_data> min_heap; // min-heap by logit
+        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+        min_heap.reserve(k_min);
+        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+            min_heap.push_back(get_token_data(token_id));
+        }
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+        std::make_heap(min_heap.begin(), min_heap.end(), comp);
+        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+            if (min_heap.front().logit < logits[token_id]) {
+                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+                min_heap.back().id = token_id;
+                min_heap.back().logit = logits[token_id];
+                std::push_heap(min_heap.begin(), min_heap.end(), comp);
+            }
+        }
+        return min_heap;
+    }
+    float probability_from_logit(float logit) const {
+        return normalizer * std::exp(logit - max_l);
+    }
+};
+
+struct llama_beam_search_data {
+    llama_context * ctx;
+    size_t n_beams;
+    int n_past;
+    int n_predict;
+    int n_threads;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
+
+    // Re-calculated on each loop iteration
+    size_t common_prefix_length;
+
+    // Used to communicate to/from callback on beams state.
+    std::vector<llama_beam_view> beam_views;
+
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , n_beams(n_beams)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads)
+      , beam_views(n_beams) {
+        beams.reserve(n_beams);
+        next_beams.reserve(n_beams);
+    }
+
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(const size_t beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
+        }
+        beams.resize(1);
+    }
+
+    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+    // The repetative patterns below reflect the 2 stages of heaps:
+    //  * Gather elements until the vector is full, then call std::make_heap() on it.
+    //  * If the heap is full and a new element is found that should be included, pop the
+    //    least element to the back(), replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+        // Min-heaps use a greater-than comparator.
+        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+        if (beam.eob) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+            if (next_beams.size() < n_beams) {
+                next_beams.push_back(std::move(beam));
+                if (next_beams.size() == n_beams) {
+                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            } else if (next_beams.front().p < beam.p) {
+                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                next_beams.back() = std::move(beam);
+                std::push_heap(next_beams.begin(), next_beams.end(), comp);
+            }
+        } else {
+            // beam is not at end-of-sentence, so branch with next top_k tokens.
+            if (!beam.tokens.empty()) {
+                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
+            }
+            llama_logit_info logit_info(ctx);
+            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
+            size_t i=0;
+            if (next_beams.size() < n_beams) {
+                for (; next_beams.size() < n_beams ; ++i) {
+                    llama_beam next_beam = beam;
+                    next_beam.tokens.push_back(next_tokens[i].id);
+                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    next_beams.push_back(std::move(next_beam));
+                }
+                std::make_heap(next_beams.begin(), next_beams.end(), comp);
+            } else {
+                for (; next_beams.front().p == 0.0f ; ++i) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+            for (; i < n_beams ; ++i) {
+                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
+                if (next_beams.front().p < next_p) {
+                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                    next_beams.back() = beam;
+                    next_beams.back().tokens.push_back(next_tokens[i].id);
+                    next_beams.back().p = next_p;
+                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            }
+        }
+    }
+
+    // Find common_prefix_length based on beams.
+    // Requires beams is not empty.
+    size_t find_common_prefix_length() {
+        size_t common_prefix_length = beams[0].tokens.size();
+        for (size_t i = 1 ; i < beams.size() ; ++i) {
+            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
+                if (beams[0].tokens[j] != beams[i].tokens[j]) {
+                    common_prefix_length = j;
+                    break;
+                }
+            }
+        }
+        return common_prefix_length;
+    }
+
+    // Construct beams_state to send back to caller via the callback function.
+    // Side effect: set common_prefix_length = find_common_prefix_length();
+    llama_beams_state get_beams_state(const bool last_call) {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beam_views[i] = beams[i].view();
+        }
+        common_prefix_length = find_common_prefix_length();
+        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+    }
+
+    // Loop:
+    //  * while i < n_predict, AND
+    //  * any of the beams have not yet reached end-of-beam (eob), AND
+    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
+    //    (since all other beam probabilities can only decrease)
+    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
+        beams.push_back({{}, 1.0f, false});  // Start with one empty beam w/ probability = 1.0 and !eob.
+        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+                       !beams[top_beam_index()].eob ; ++i) {
+            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
+            update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
+            if (common_prefix_length) {
+                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                n_past += common_prefix_length;
+            }
+            // Zero-out next_beam probabilities to place them last in following min-heap.
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
+                beam.shift_tokens(common_prefix_length);
+                fill_next_beams_by_top_probabilities(beam);
+            }
+            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+            beams.swap(next_beams);
+            renormalize_beam_probabilities(beams);
+        }
+        collapse_beams(top_beam_index());
+        callback(callback_data, get_beams_state(true));
+    }
+
+    // As beams grow, the cumulative probabilities decrease.
+    // Renormalize them to avoid floating point underflow.
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+    }
+
+    // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+    size_t top_beam_index() {
+        return std::max_element(beams.begin(), beams.end()) - beams.begin();
+    }
+
+    // Copy (p,eob) for each beam which may have been changed by the callback.
+    void update_beams_from_beam_views() {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beams[i].p = beam_views[i].p;
+            beams[i].eob = beam_views[i].eob;
+        }
+    }
+};
+
+void llama_beam_search(llama_context * ctx,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
+                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
+
+    beam_search_data.loop(callback, callback_data);
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+}
+
 //
 // quantization
 //
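A minimal sketch (not part of the diff) of driving the new llama_beam_search API from application code. The llama_beams_state and llama_beam_view member names are assumptions based on the corresponding llama.h change in this release, and `ctx`, `n_past` and the thread count are placeholders.

    // Tokens shared by every beam (the common prefix) are final; collect them as they are confirmed.
    static void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
        auto & response = *static_cast<std::vector<llama_token> *>(callback_data);
        for (size_t i = 0; i < beams_state.common_prefix_length; ++i) {
            response.push_back(beams_state.beam_views[0].tokens[i]);
        }
    }

    std::vector<llama_token> response;
    llama_beam_search(ctx, beam_search_callback, &response,
                      /*n_beams*/ 4, n_past, /*n_predict*/ 64, /*n_threads*/ 4);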
@@ -4423,6 +4735,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    llama_model model;
+    llm_load_arch(*ml, model);
+    llm_load_hparams(*ml, model, 0, 0, 0);
+
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4448,6 +4768,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++n_feed_forward_w2;
         }
     }
+    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+    }
 
     int i_attention_wv = 0;
     int i_feed_forward_w2 = 0;
@@ -4460,9 +4784,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+#ifdef GGML_USE_K_QUANTS
     auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
+#endif
 
     int idx = 0;
 
@@ -4505,18 +4831,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &=
+        quantize &= !params->only_copy;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
-        if (
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4524,8 +4845,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
             if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
                 int nx = tensor->ne[0];
-
-
+                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                    new_type = GGML_TYPE_Q8_0;
+                }
+                else if (new_type != GGML_TYPE_Q8_0) {
                     new_type = GGML_TYPE_Q6_K;
                 }
             } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -4539,21 +4862,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
                 else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+                if (model.type == MODEL_70B) {
+                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+                    // nearly negligible increase in model size by quantizing this tensor with more bits:
+                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+                }
                 ++i_attention_wv;
             } else if (name.find("ffn_down.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                             : GGML_TYPE_Q3_K;
+                }
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+                }
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                    if (model.arch == LLM_ARCH_FALCON) {
+                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                    } else {
+                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                    }
+                }
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
+                    new_type = GGML_TYPE_Q5_K;
                 }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                         use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
                 ++i_feed_forward_w2;
             } else if (name.find("attn_output.weight") != std::string::npos) {
-                if
-
-
+                if (model.arch != LLM_ARCH_FALCON) {
+                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                } else {
+                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+                }
+            }
+            else if (name.find("attn_qkv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
             }
             else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -4568,8 +4919,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
                 int nx = tensor->ne[0];
                 int ny = tensor->ne[1];
-                if (nx % QK_K != 0
-
+                if (nx % QK_K != 0) {
+                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
                     convert_incompatible_tensor = true;
                 }
             }
@@ -4585,7 +4936,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
@@ -4990,7 +5350,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed                        =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
-        /*.
+        /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.rope_freq_base              =*/ 10000.0f,
@@ -4998,7 +5358,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
-        /*.mul_mat_q                   =*/
+        /*.mul_mat_q                   =*/ true,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
@@ -5007,6 +5367,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding                   =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
 
@@ -5016,6 +5380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
+        /*.only_copy                   =*/ false,
     };
 
     return result;
@@ -5198,43 +5563,43 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif
     }
-    }
 
 #ifdef GGML_USE_METAL
-
-
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
 
-
-
+        void * data_ptr  = NULL;
+        size_t data_size = 0;
 
-
-
-
-
-
-
-
+        if (params.use_mmap) {
+            data_ptr  = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
+        } else {
+            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size  (ctx->model.ctx);
+        }
 
-
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
-
-
-
-
-
+        if (!(result)) { \
+            LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+            llama_free(ctx); \
+            return NULL; \
+        }
 
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-
+    }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
@@ -5297,13 +5662,29 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
-int
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             model->name.c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
 
+uint64_t llama_model_size(const struct llama_model * model) {
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second);
+    }
+    return size;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second);
+    }
+    return nparams;
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
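A short sketch (not from the diff) of the two new model accessors added above; `model` is assumed to be a valid llama_model pointer obtained through the usual loading path.

    const uint64_t n_params = llama_model_n_params(model);
    const uint64_t n_bytes  = llama_model_size(model);
    printf("model: %.2fB params, %.2f GiB of tensor data\n",
           n_params / 1e9, n_bytes / (1024.0 * 1024.0 * 1024.0));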
@@ -5552,7 +5933,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
         rng_ss >> ctx->rng;
 
-        GGML_ASSERT(rng_ss.fail()
+        GGML_ASSERT(!rng_ss.fail());
     }
 
     // set logits
@@ -5828,8 +6209,7 @@ int llama_tokenize_with_model(
          llama_token * tokens,
                  int   n_max_tokens,
                 bool   add_bos) {
-    auto
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -5843,12 +6223,12 @@ int llama_tokenize_with_model(
     return res.size();
 }
 
-int
-    return
+int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
 }
 
-// does not write null-terminator to
-int
+// does not write null-terminator to buf
+int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;
@@ -5936,11 +6316,40 @@ const char * llama_print_system_info(void) {
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
     s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
+    s += "SSSE3 = "     + std::to_string(ggml_cpu_has_ssse3())     + " | ";
     s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
 
     return s.c_str();
 }
 
+void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+    fprintf(stream, "\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "# Timings #\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
+            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
+            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+    fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
+            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->n_sample);
+    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
+    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
+    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->t_sample_us);
+    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
+            1.0e6 * ctx->n_eval / ctx->t_eval_us);
+    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
+            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+    fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
+            1.0e6 * ctx->n_sample / ctx->t_sample_us);
+}
+
 // For internal test use
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
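A small sketch (not part of the diff) showing how the new llama_dump_timing_info_yaml helper might be called; the output file name is arbitrary and `ctx` is assumed to be a valid llama_context pointer.

    FILE * logfile = fopen("timings.yaml", "w");
    if (logfile != NULL) {
        llama_dump_timing_info_yaml(logfile, ctx);
        fclose(logfile);
    }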
@@ -5951,10 +6360,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 }
 
-#if defined(_MSC_VER) && !defined(vsnprintf)
-#define vsnprintf _vsnprintf
-#endif
-
 static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);