llama_cpp 0.5.3 → 0.7.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,8 @@
 #define LLAMA_API_INTERNAL
 #include "llama.h"
 
+#include "unicode.h"
+
 #include "ggml.h"
 
 #include "ggml-alloc.h"
@@ -72,6 +74,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <set>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -92,12 +95,12 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (llama_log_level level, const char* format, ...);
-static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+static void llama_log_internal        (ggml_log_level level, const char* format, ...);
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
-#define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
-#define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
-#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 //
 // helpers
@@ -122,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }
+
+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
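The `is_float_close` helper added above backs the rewritten `llama_hparams::operator!=` further down in this diff, replacing a bitwise `memcmp` so that float fields read back from a GGUF file compare equal within an absolute tolerance. A minimal standalone sketch of the same idea (the `main` driver is illustrative, not from the diff):

```c++
#include <cmath>
#include <cstdio>
#include <stdexcept>

// Same logic as the helper added in this diff: exact match first,
// infinities are never "close" unless exactly equal, otherwise |b - a| <= abs_tol.
static bool is_float_close(float a, float b, float abs_tol) {
    if (abs_tol < 0.0f) {
        throw std::invalid_argument("Tolerance must be non-negative");
    }
    if (a == b) {
        return true;
    }
    if (std::isinf(a) || std::isinf(b)) {
        return false;
    }
    return std::fabs(b - a) <= abs_tol;
}

int main() {
    const float eps = 1e-9f;
    // an eps like 1e-5 re-read from a file may differ in the last bit:
    std::printf("%d\n", is_float_close(1e-5f, 1e-5f + 1e-12f, eps)); // 1
    std::printf("%d\n", is_float_close(1e-5f, 2e-5f, eps));          // 0
    return 0;
}
```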
@@ -162,18 +186,20 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_REFACT,
     LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,     "llama"     },
-    { LLM_ARCH_FALCON,    "falcon"    },
-    { LLM_ARCH_GPT2,      "gpt2"      },
-    { LLM_ARCH_GPTJ,      "gptj"      },
-    { LLM_ARCH_GPTNEOX,   "gptneox"   },
-    { LLM_ARCH_MPT,       "mpt"       },
-    { LLM_ARCH_BAICHUAN,  "baichuan"  },
+    { LLM_ARCH_LLAMA,           "llama"     },
+    { LLM_ARCH_FALCON,          "falcon"    },
+    { LLM_ARCH_GPT2,            "gpt2"      },
+    { LLM_ARCH_GPTJ,            "gptj"      },
+    { LLM_ARCH_GPTNEOX,         "gptneox"   },
+    { LLM_ARCH_MPT,             "mpt"       },
+    { LLM_ARCH_BAICHUAN,        "baichuan"  },
     { LLM_ARCH_STARCODER,       "starcoder" },
+    { LLM_ARCH_REFACT,          "refact"    },
 };
 
 enum llm_kv {
@@ -221,16 +247,16 @@ enum llm_kv {
 };
 
 static std::map<llm_kv, std::string> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"         },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION,  "general.quantization_version" },
-    { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"            },
-    { LLM_KV_GENERAL_NAME,                  "general.name"                 },
-    { LLM_KV_GENERAL_AUTHOR,                "general.author"               },
-    { LLM_KV_GENERAL_URL,                   "general.url"                  },
-    { LLM_KV_GENERAL_DESCRIPTION,           "general.description"          },
-    { LLM_KV_GENERAL_LICENSE,               "general.license"              },
-    { LLM_KV_GENERAL_SOURCE_URL,            "general.
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,        "general.
+    { LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"                  },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION,  "general.quantization_version"          },
+    { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"                     },
+    { LLM_KV_GENERAL_NAME,                  "general.name"                          },
+    { LLM_KV_GENERAL_AUTHOR,                "general.author"                        },
+    { LLM_KV_GENERAL_URL,                   "general.url"                           },
+    { LLM_KV_GENERAL_DESCRIPTION,           "general.description"                   },
+    { LLM_KV_GENERAL_LICENSE,               "general.license"                       },
+    { LLM_KV_GENERAL_SOURCE_URL,            "general.source.url"                    },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,        "general.source.huggingface.repository" },
 
     { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"                     },
     { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"                   },
@@ -394,6 +420,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_REFACT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
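The per-architecture tables above are format strings: `LLM_TN` expands an entry such as `blk.%d.attn_q` plus a suffix into the concrete tensor name looked up in the GGUF file. A hedged illustration of that expansion (`tensor_name` is a stand-in helper, not the real `LLM_TN` struct):

```c++
#include <cstdio>
#include <string>

// Stand-in for LLM_TN: expand "blk.%d.attn_q" + layer index + suffix into the
// final tensor name that is looked up in the GGUF file.
static std::string tensor_name(const char * fmt, int layer, const char * suffix) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), fmt, layer);
    return std::string(buf) + "." + suffix;
}

int main() {
    std::puts(tensor_name("blk.%d.attn_q", 0, "weight").c_str()); // blk.0.attn_q.weight
    return 0;
}
```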
@@ -448,7 +491,7 @@ struct LLM_TN {
 //
 
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
     const std::string skey(key); \
     const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
@@ -460,7 +503,7 @@ struct LLM_TN {
     } else if (req) { \
         throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
     } \
-}
+} while (0)
 
 //
 // ggml helpers
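Wrapping `GGUF_GET_KEY` in `do { ... } while (0)` instead of a bare block is the standard C/C++ idiom for making a multi-statement macro behave like a single statement: it composes safely with `if`/`else` and forces a trailing semicolon at the call site. A toy illustration (the `LOG_TWICE*` macros are hypothetical, for demonstration only):

```c++
#include <cstdio>

// Bare-block version: the stray ';' after the '}' breaks if/else chains.
#define LOG_TWICE_BAD(msg) { std::puts(msg); std::puts(msg); }

// do/while(0) version: behaves like one statement and *requires* the ';'.
#define LOG_TWICE(msg) do { std::puts(msg); std::puts(msg); } while (0)

int main() {
    bool ok = true;
    // With LOG_TWICE_BAD(...) here, the expansion "} ; else" would not compile.
    if (ok) LOG_TWICE("ok"); else LOG_TWICE("not ok");
    return 0;
}
```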
@@ -881,10 +924,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
 
 static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
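`llama_token_to_piece` now takes the model rather than the context — detokenization depends only on the vocabulary stored in the model — which is why the wrapper above calls `llama_get_model(ctx)`. A sketch of the same resize-and-retry pattern against the new signature, assuming the llama.h bundled with this release:

```c++
#include <string>
#include <vector>

#include "llama.h"

// Convert one token id to its text piece. A negative return value is the
// required buffer size, so resize once and retry - the pattern used above.
static std::string token_to_str(const llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
    const int n = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
    if (n < 0) {
        result.resize(-n);
        const int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
        (void) check; // check == -n on success
    } else {
        result.resize(n);
    }
    return std::string(result.data(), result.size());
}
```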
@@ -899,7 +942,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
 
 struct llama_state {
     // We save the log callback globally
-    llama_log_callback log_callback = llama_log_callback_default;
+    ggml_log_callback  log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };
 
@@ -925,9 +968,9 @@ static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;
 
 struct llama_hparams {
+    bool     vocab_only;
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_ctx;       // context size used during inference
     uint32_t n_embd;
     uint32_t n_head;
     uint32_t n_head_kv;
@@ -938,11 +981,28 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
-    float rope_freq_base;
-    float rope_freq_scale;
+    float rope_freq_base_train;
+    float rope_freq_scale_train;
 
     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+        if (this->vocab_only  != other.vocab_only)  return true;
+        if (this->n_vocab     != other.n_vocab)     return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd      != other.n_embd)      return true;
+        if (this->n_head      != other.n_head)      return true;
+        if (this->n_head_kv   != other.n_head_kv)   return true;
+        if (this->n_layer     != other.n_layer)     return true;
+        if (this->n_rot       != other.n_rot)       return true;
+        if (this->n_ff        != other.n_ff)        return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }
 
     uint32_t n_gqa() const {
@@ -956,15 +1016,18 @@ struct llama_hparams {
     uint32_t n_embd_gqa() const {
         return n_embd/n_gqa();
     }
+};
 
-    size_t kv_size() const {
-        size_t result = 2ull;
-        result *= (size_t) n_embd_gqa();
-        result *= (size_t) n_ctx;
-        result *= (size_t) n_layer;
-        result *= sizeof(ggml_fp16_t);
-        return result;
-    }
+struct llama_cparams {
+    uint32_t n_ctx;           // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_threads;       // number of threads to use for generation
+    uint32_t n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    bool mul_mat_q;
 };
 
 struct llama_layer {
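This split separates model hyperparameters (fixed when the GGUF file is produced, e.g. `rope_freq_base_train`) from the new `llama_cparams`, which holds per-context runtime settings such as `n_ctx`, thread counts, and the RoPE overrides. A rough sketch of the relationship, with deliberately trimmed field sets (the fallback rule shown is an assumption for illustration, not taken from the diff):

```c++
#include <cstdint>

// Trimmed-down illustration of the split introduced here (field names from
// the diff, everything else simplified).
struct hparams_t {            // read from the GGUF file, immutable
    uint32_t n_ctx_train;
    float    rope_freq_base_train;
    float    rope_freq_scale_train;
};

struct cparams_t {            // chosen per llama_context at runtime
    uint32_t n_ctx;
    float    rope_freq_base;
    float    rope_freq_scale;
};

// Assumption: runtime RoPE params fall back to the trained values when unset.
static cparams_t make_cparams(const hparams_t & h, uint32_t n_ctx) {
    cparams_t c;
    c.n_ctx           = n_ctx;
    c.rope_freq_base  = h.rope_freq_base_train;
    c.rope_freq_scale = h.rope_freq_scale_train;
    return c;
}
```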
@@ -999,7 +1062,29 @@ struct llama_layer {
     struct ggml_tensor * b3; // ffn_up
 };
 
+struct llama_kv_cell {
+    llama_pos pos   = -1;
+    llama_pos delta = 0;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+};
+
+// ring-buffer of cached KV data
 struct llama_kv_cache {
+    bool has_shift = false;
+
+    uint32_t head = 0;
+    uint32_t size = 0;
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;
 
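Each cache slot is now a `llama_kv_cell` carrying a position plus the set of sequence ids that reference it, which is what lets several decoding sequences share one ring-buffer cache. A small self-contained sketch of the bookkeeping (the cell struct mirrors the diff; the driver is illustrative):

```c++
#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

using llama_pos    = int32_t;
using llama_seq_id = int32_t;

struct llama_kv_cell {
    llama_pos pos   = -1;
    llama_pos delta = 0;
    std::set<llama_seq_id> seq_id;

    bool has_seq_id(const llama_seq_id & id) const {
        return seq_id.find(id) != seq_id.end();
    }
};

int main() {
    std::vector<llama_kv_cell> cells(8);        // a tiny cache
    cells[0].pos = 0; cells[0].seq_id = {0, 1}; // prompt token shared by seq 0 and 1
    cells[1].pos = 1; cells[1].seq_id = {0};    // continuation of seq 0 only

    // Removing seq 1 frees a cell only when no other sequence still uses it:
    for (auto & c : cells) {
        if (c.has_seq_id(1)) {
            c.seq_id.erase(1);
            if (c.seq_id.empty()) c.pos = -1;
        }
    }
    std::printf("cell0 pos=%d, cell1 pos=%d\n", cells[0].pos, cells[1].pos); // 0, 1
    return 0;
}
```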
@@ -1007,8 +1092,6 @@ struct llama_kv_cache {
 
     llama_buffer buf;
 
-    int n; // number of tokens currently in the cache
-
     ~llama_kv_cache() {
         if (ctx) {
             ggml_free(ctx);
@@ -1047,6 +1130,10 @@ struct llama_vocab {
     id special_pad_id = -1;
 
     id linefeed_id       = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id    = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
         replace_all(token_left, " ", "\u0120");
@@ -1122,11 +1209,8 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
-        if (model_owner) {
-            delete &model;
-        }
 #ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
@@ -1137,27 +1221,26 @@ struct llama_context {
         }
     }
 
+    llama_cparams cparams;
+
+    const llama_model & model;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
+
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
 
+    int64_t t_start_us;
+    int64_t t_load_us;
     int64_t t_sample_us = 0;
-    int64_t t_eval_us   = 0;
     int64_t t_p_eval_us = 0;
+    int64_t t_eval_us   = 0;
 
     int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_eval   = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
-    const llama_model & model;
-
-    bool model_owner = false;
-
-    int64_t t_load_us;
-    int64_t t_start_us;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
+    int32_t n_eval   = 0; // number of eval calls
 
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
@@ -1192,16 +1275,23 @@ static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
                          ggml_type   wtype,
-                               int   n_ctx,
+                          uint32_t   n_ctx,
                                int   n_gpu_layers) {
-    const int n_embd  = hparams.n_embd_gqa();
-    const int n_layer = hparams.n_layer;
+    const uint32_t n_embd  = hparams.n_embd_gqa();
+    const uint32_t n_layer = hparams.n_layer;
 
     const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
+    cache.has_shift = false;
+
+    cache.head = 0;
+    cache.size = n_ctx;
+
+    cache.cells.clear();
+    cache.cells.resize(n_ctx);
+
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-    cache.n = 0;
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
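The K/V buffer is still allocated as one block of `2 * n_embd_gqa * n_layer * n_ctx` elements plus slack; only the per-cell metadata is new. A back-of-the-envelope check of that formula with assumed values for a 7B LLaMA-style model and an F16 cache:

```c++
#include <cstdio>

int main() {
    // Assumed typical values for a 7B LLaMA-type model; not read from any file.
    const long long n_embd_gqa = 4096;  // n_embd / n_gqa (no GQA here)
    const long long n_layer    = 32;
    const long long n_ctx      = 4096;
    const long long type_size  = 2;     // ggml_type_size(GGML_TYPE_F16)

    const long long n_mem      = n_layer * n_ctx;            // as in the diff
    const long long n_elements = n_embd_gqa * n_mem;
    const long long bytes      = 2 * n_elements * type_size; // K and V

    std::printf("KV cache ~ %.2f GiB\n", bytes / 1024.0 / 1024.0 / 1024.0); // ~2 GiB
    return 0;
}
```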
@@ -1222,17 +1312,163 @@ static bool llama_kv_cache_init(
 
     (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer + 1) {
+    size_t vram_kv_cache = 0;
+
+    if (n_gpu_layers > (int)n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
+        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.v);
     }
-    if (n_gpu_layers > n_layer + 2) {
+    if (n_gpu_layers > (int)n_layer + 2) {
         ggml_cuda_assign_buffers_no_scratch(cache.k);
+        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.k);
+    }
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 #endif // GGML_USE_CUBLAS
 
     return true;
 }
 
+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+static bool llama_kv_cache_find_slot(
+           struct llama_kv_cache & cache,
+        const struct llama_batch & batch) {
+    const uint32_t n_ctx    = cache.size;
+    const uint32_t n_tokens = batch.n_tokens;
+
+    if (n_tokens > n_ctx) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+        return false;
+    }
+
+    uint32_t n_tested = 0;
+
+    while (true) {
+        if (cache.head + n_tokens > n_ctx) {
+            cache.head = 0;
+            n_tested += n_ctx - cache.head;
+            continue;
+        }
+
+        bool found = true;
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            if (cache.cells[cache.head + i].pos >= 0) {
+                found = false;
+                cache.head += i + 1;
+                n_tested   += i + 1;
+                break;
+            }
+        }
+
+        if (found) {
+            break;
+        }
+
+        if (n_tested >= n_ctx) {
+            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+            return false;
+        }
+    }
+
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        cache.cells[cache.head + i].pos = batch.pos[i];
+        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+    }
+
+    return true;
+}
+
+// find how many cells are currently in use
+static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size - 1; i > 0; --i) {
+        if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+            return i + 1;
+        }
+    }
+
+    return 0;
+}
+
+static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+    if (c0 < 0) c0 = 0;
+    if (c1 < 0) c1 = cache.size;
+
+    for (int32_t i = c0; i < c1; ++i) {
+        cache.cells[i].pos = -1;
+        cache.cells[i].seq_id.clear();
+    }
+}
+
+static void llama_kv_cache_seq_rm(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.erase(seq_id);
+            if (cache.cells[i].seq_id.empty()) {
+                cache.cells[i].pos = -1;
+            }
+        }
+    }
+}
+
+static void llama_kv_cache_seq_cp(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id_src,
+                 llama_seq_id   seq_id_dst,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.insert(seq_id_dst);
+        }
+    }
+}
+
+static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (!cache.cells[i].has_seq_id(seq_id)) {
+            cache.cells[i].pos = -1;
+            cache.cells[i].seq_id.clear();
+        }
+    }
+}
+
+static void llama_kv_cache_seq_shift(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                    llama_pos   delta) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].pos += delta;
+            if (cache.cells[i].pos < 0) {
+                cache.cells[i].pos = -1;
+                cache.cells[i].seq_id.clear();
+            } else {
+                cache.has_shift = true;
+                cache.cells[i].delta = delta;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
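These are the internal workers behind the kv-cache management API: `llama_kv_cache_find_slot` scans the ring for a contiguous run of free cells starting at `cache.head`, and the `seq_*` helpers edit per-sequence cell metadata. A simplified re-implementation of the slot search (types and bookkeeping trimmed, skipped cells counted before the wrap; the real function also records positions and seq ids):

```c++
#include <cstdint>
#include <vector>

struct cell { int32_t pos = -1; };

// Find n_tokens adjacent free cells (pos < 0), wrapping at the end of the
// buffer; returns the start index or -1 once every cell has been tested.
static int find_slot(const std::vector<cell> & cells, int n_tokens) {
    const int n_ctx = (int) cells.size();
    if (n_tokens > n_ctx) return -1;

    int head = 0, n_tested = 0;
    while (true) {
        if (head + n_tokens > n_ctx) {
            n_tested += n_ctx - head; // count the skipped tail, then wrap
            head = 0;
            continue;
        }
        bool found = true;
        for (int i = 0; i < n_tokens; i++) {
            if (cells[head + i].pos >= 0) {
                found = false;
                head     += i + 1;
                n_tested += i + 1;
                break;
            }
        }
        if (found) return head;
        if (n_tested >= n_ctx) return -1;
    }
}
```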
@@ -1554,7 +1790,7 @@ struct llama_model_loader {
                     lmlock->grow_to(size_lock);
                 }
                 break;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
             case GGML_BACKEND_GPU:
             case GGML_BACKEND_GPU_SPLIT:
                 // old code:
@@ -1587,7 +1823,15 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-static std::string llama_model_ftype_name(llama_ftype ftype) {
+static std::string llama_model_arch_name(llm_arch arch) {
+    auto it = LLM_ARCH_NAMES.find(arch);
+    if (it == LLM_ARCH_NAMES.end()) {
+        return "unknown";
+    }
+    return it->second;
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1643,10 +1887,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 
 static void llm_load_hparams(
         llama_model_loader & ml,
-        llama_model & model,
-        int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale) {
+        llama_model & model) {
     struct gguf_context * ctx = ml.ctx_gguf;
 
     const auto kv = LLM_KV(model.arch);
@@ -1657,29 +1898,25 @@ static void llm_load_hparams(
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
     // get hparams kv
-    GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,
-    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32,
+    GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, kv(LLM_KV_TOKENIZER_LIST));
+    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+    GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
     // rope_freq_base (optional)
-    if (rope_freq_base == 0.0f) {
-        rope_freq_base = 10000.0f;
-        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    }
+    hparams.rope_freq_base_train = 10000.0f;
+    GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
 
     // rope_freq_scale (inverse of the kv) is optional
-    if (rope_freq_scale == 0.0f) {
-        float ropescale = 1.0f;
-        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        rope_freq_scale = 1.0f/ropescale;
-    }
+    float ropescale = 1.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    hparams.rope_freq_scale_train = 1.0f/ropescale;
 
     // sanity check for n_rot (optional)
     {
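`rope.scale_linear` in the GGUF metadata stores the linear context-extension factor, and the loader keeps its reciprocal: a model fine-tuned for 4x context carries `ropescale = 4.0`, so `rope_freq_scale_train = 0.25` and positions are compressed by that factor inside RoPE. A worked check of the assignment (values assumed for illustration):

```c++
#include <cmath>
#include <cstdio>

int main() {
    // GGUF stores the linear extension factor; the loader keeps 1/x.
    const float ropescale  = 4.0f;             // "rope.scale_linear" (assumed)
    const float freq_scale = 1.0f / ropescale; // 0.25, as in the diff

    // First RoPE frequency for head dim 128, base 10000 (common defaults):
    const float freq_base = 10000.0f;
    const int   n_dims    = 128;
    const float theta0    = powf(freq_base, -2.0f * 0 / n_dims); // = 1.0
    const int   pos       = 100;
    std::printf("angle = %f\n", pos * freq_scale * theta0);      // 25.0
    return 0;
}
```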
@@ -1742,14 +1979,18 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_REFACT:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_1B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
-    };
+    }
 
     model.ftype = ml.ftype;
-
-    hparams.n_ctx           = n_ctx;
-    hparams.rope_freq_base  = rope_freq_base;
-    hparams.rope_freq_scale = rope_freq_scale;
 }
 
 // TODO: This should probably be in llama.h
@@ -1770,20 +2011,18 @@ static void llm_load_vocab(
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
     }
 
+    const float * scores = nullptr;
     const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer scores in model file\n");
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
     }
 
-    const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
+    const int * toktypes = nullptr;
     const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx == -1) {
-        throw std::runtime_error("cannot find token type list in GGUF file\n");
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
     }
 
-    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
     // determine vocab type
     {
         std::string tokenizer_name;
@@ -1812,6 +2051,7 @@ static void llm_load_vocab(
 
     for (int i = 0; i < n_merges; i++) {
         const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
 
         std::string first;
         std::string second;
@@ -1846,20 +2086,22 @@ static void llm_load_vocab(
 
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
 
         auto & token_data = vocab.id_to_token[i];
         token_data.text  = std::move(word);
-        token_data.score = scores[i];
-        token_data.type  = (llama_token_type) toktypes[i];
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
+    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
     }
 
     // special tokens
@@ -1875,31 +2117,30 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & vocab = model.vocab;
 
     // hparams
-    LLAMA_LOG_INFO("%s: format         = %s\n",     __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch           = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
-    LLAMA_LOG_INFO("%s: vocab type     = %s\n",     __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
-    LLAMA_LOG_INFO("%s: n_vocab        = %u\n",     __func__, hparams.n_vocab);
-    LLAMA_LOG_INFO("%s: n_merges       = %u\n",     __func__, (int) vocab.bpe_ranks.size());
-    LLAMA_LOG_INFO("%s: n_ctx_train    = %u\n",     __func__, hparams.n_ctx_train);
-    LLAMA_LOG_INFO("%s: n_ctx          = %u\n",     __func__, hparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_embd         = %u\n",     __func__, hparams.n_embd);
-    LLAMA_LOG_INFO("%s: n_head         = %u\n",     __func__, hparams.n_head);
-    LLAMA_LOG_INFO("%s: n_head_kv      = %u\n",     __func__, hparams.n_head_kv);
-    LLAMA_LOG_INFO("%s: n_layer        = %u\n",     __func__, hparams.n_layer);
-    LLAMA_LOG_INFO("%s: n_rot          = %u\n",     __func__, hparams.n_rot);
-    LLAMA_LOG_INFO("%s: n_gqa          = %u\n",     __func__, hparams.n_gqa());
-    LLAMA_LOG_INFO("%s: f_norm_eps     = %.1e\n",   __func__, hparams.f_norm_eps);
-    LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n",   __func__, hparams.f_norm_rms_eps);
-    LLAMA_LOG_INFO("%s: n_ff           = %u\n",     __func__, hparams.n_ff);
-    LLAMA_LOG_INFO("%s: freq_base      = %.1f\n",   __func__, hparams.rope_freq_base);
-    LLAMA_LOG_INFO("%s: freq_scale     = %g\n",     __func__, hparams.rope_freq_scale);
-    LLAMA_LOG_INFO("%s: model type     = %s\n",     __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype    = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model params   = %.2f B\n", __func__, ml.n_elements*1e-9);
+    LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
+    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+    LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, hparams.n_vocab);
+    LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (int) vocab.bpe_ranks.size());
+    LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
+    LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
+    LLAMA_LOG_INFO("%s: n_head           = %u\n",     __func__, hparams.n_head);
+    LLAMA_LOG_INFO("%s: n_head_kv        = %u\n",     __func__, hparams.n_head_kv);
+    LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
+    LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    LLAMA_LOG_INFO("%s: n_gqa            = %u\n",     __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
+    LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
+    LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
+    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
+    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
+    LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
     if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size     = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
-        LLAMA_LOG_INFO("%s: model size     = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
 
     // general kv
|
@@ -1917,13 +2158,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
1917
2158
|
static void llm_load_tensors(
|
1918
2159
|
llama_model_loader & ml,
|
1919
2160
|
llama_model & model,
|
1920
|
-
int n_batch,
|
1921
2161
|
int n_gpu_layers,
|
1922
2162
|
int main_gpu,
|
1923
2163
|
const float * tensor_split,
|
1924
|
-
const bool mul_mat_q,
|
1925
|
-
bool low_vram,
|
1926
|
-
ggml_type memory_type,
|
1927
2164
|
bool use_mlock,
|
1928
2165
|
llama_progress_callback progress_callback,
|
1929
2166
|
void * progress_callback_user_data) {
|
@@ -1962,11 +2199,9 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
-
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
-    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1989,6 +2224,7 @@ static void llm_load_tensors(
     const auto tn = LLM_TN(model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_REFACT:
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
@@ -2001,9 +2237,9 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-                    backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
                     backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2067,9 +2303,9 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-                    backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
                     backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2137,9 +2373,9 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-                    backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
                     backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2214,9 +2450,9 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-                    backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
                     backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2281,27 +2517,19 @@ static void llm_load_tensors(
             } break;
         default:
             throw std::runtime_error("unknown architecture");
-    };
+    }
     }
 
     ml.done_getting_tensors();
 
     // print memory requirements
     {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
         // this is the total memory required to run the inference
         size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-
-        const size_t mem_required_state = scale*hparams.kv_size();
-
-        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-            mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-
-        (void) n_batch;
+        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2310,36 +2538,17 @@ static void llm_load_tensors(
         if (n_gpu_layers > (int) hparams.n_layer) {
             LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
         }
-        size_t vram_kv_cache = 0;
 
 #ifdef GGML_USE_CUBLAS
         const int max_backend_supported_layers = hparams.n_layer + 3;
-        const int max_offloadable_layers       = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
-        if (n_gpu_layers > (int) hparams.n_layer + 1) {
-            if (low_vram) {
-                LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
-            }
-        }
-        if (n_gpu_layers > (int) hparams.n_layer + 2) {
-            if (low_vram) {
-                LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
-            }
-        }
+        const int max_offloadable_layers       = hparams.n_layer + 3;
 #elif defined(GGML_USE_CLBLAST)
         const int max_backend_supported_layers = hparams.n_layer + 1;
         const int max_offloadable_layers       = hparams.n_layer + 1;
 #endif // GGML_USE_CUBLAS
 
-        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
-            __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
-            __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2352,7 +2561,7 @@ static void llm_load_tensors(
     }
 
     (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -2374,29 +2583,24 @@ static void llm_load_tensors(
 static bool llama_model_load(
         const std::string & fname,
         llama_model & model,
-        int n_ctx,
-        int n_batch,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
         bool vocab_only,
        llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+        llama_model_loader ml(fname, use_mmap);
+
+        model.hparams.vocab_only = vocab_only;
 
-        llm_load_arch   (*ml, model);
-        llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
-        llm_load_vocab  (*ml, model);
+        llm_load_arch   (ml, model);
+        llm_load_hparams(ml, model);
+        llm_load_vocab  (ml, model);
 
-        llm_load_print_meta(*ml, model);
+        llm_load_print_meta(ml, model);
 
         if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
             throw std::runtime_error("vocab size mismatch");
@@ -2408,8 +2612,8 @@ static bool llama_model_load(
         }
 
         llm_load_tensors(
-            *ml, model, n_batch, n_gpu_layers,
-            main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+            ml, model, n_gpu_layers,
+            main_gpu, tensor_split,
             use_mlock, progress_callback, progress_callback_user_data);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2421,17 +2625,10 @@ static bool llama_model_load(
 
 static struct ggml_cgraph * llm_build_llama(
          llama_context & lctx,
-     const llama_token * tokens,
-           const float * embd,
-                   int   n_tokens,
-                   int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
 
     const auto & kv_self = lctx.kv_self;
 
@@ -2439,7 +2636,7 @@ static struct ggml_cgraph * llm_build_llama(
 
     const int64_t n_embd      = hparams.n_embd;
     const int64_t n_layer     = hparams.n_layer;
-    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_ctx       = cparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
@@ -2447,12 +2644,20 @@ static struct ggml_cgraph * llm_build_llama(
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-    const float freq_base    = hparams.rope_freq_base;
-    const float freq_scale   = hparams.rope_freq_scale;
+    const float freq_base    = cparams.rope_freq_base;
+    const float freq_scale   = cparams.rope_freq_scale;
     const float norm_rms_eps = hparams.f_norm_rms_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    //printf("n_kv = %d\n", n_kv);
+
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
|
|
2470
2675
|
struct ggml_tensor * cur;
|
2471
2676
|
struct ggml_tensor * inpL;
|
2472
2677
|
|
2473
|
-
if (
|
2474
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
2678
|
+
if (batch.token) {
|
2679
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
2475
2680
|
|
2476
2681
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
2477
2682
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2478
|
-
memcpy(inp_tokens->data,
|
2683
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
2479
2684
|
}
|
2480
2685
|
ggml_set_name(inp_tokens, "inp_tokens");
|
2481
2686
|
|
@@ -2485,11 +2690,11 @@ static struct ggml_cgraph * llm_build_llama(
         GGML_ASSERT(false && "not implemented");
 #endif
 
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }
 
@@ -2498,9 +2703,6 @@ static struct ggml_cgraph * llm_build_llama(
 
     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
-    //
-    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-    // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
@@ -2517,12 +2719,75 @@ static struct ggml_cgraph * llm_build_llama(
     }
 #endif // GGML_USE_CUBLAS
 
+    // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    ggml_rope_custom_inplace(ctx0,
+                            ggml_view_3d(ctx0, kv_self.k,
+                                n_embd_head, n_head_kv, n_ctx,
+                                ggml_element_size(kv_self.k)*n_embd_head,
+                                ggml_element_size(kv_self.k)*n_embd_gqa,
+                                ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                            K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
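With the cell-based cache, causal masking is no longer a `ggml_diag_mask_inf` over `n_past`: a dense `[n_kv, n_tokens]` mask is built on the CPU, with `-INFINITY` wherever a cell belongs to a different sequence or to a later position, and added to the scaled scores before the softmax. A toy recomputation of that mask (three cached cells, two new tokens; the loop mirrors the diff):

```c++
#include <cmath>
#include <cstdio>

int main() {
    // Toy cache: 3 cells, all sequence 0, positions 0..2.
    const int n_kv = 3, n_tokens = 2;
    const int cell_pos[n_kv]      = {0, 1, 2};
    const int cell_seq[n_kv]      = {0, 0, 0};
    const int batch_pos[n_tokens] = {1, 2};   // two new tokens of sequence 0
    const int batch_seq[n_tokens] = {0, 0};

    float mask[n_tokens][n_kv];
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            // visible = same sequence and cached position not after the query
            const bool visible = cell_seq[i] == batch_seq[j] && cell_pos[i] <= batch_pos[j];
            mask[j][i] = visible ? 0.0f : -INFINITY;
        }
    }
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) std::printf("%6.1f ", mask[j][i]);
        std::printf("\n"); // row 0: 0 0 -inf ; row 1: 0 0 0
    }
    return 0;
}
```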
@@ -2560,33 +2825,33 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
             // store key and value to memory
             {
-                // compute the transposed [N, n_embd] V matrix
+                // compute the transposed [n_tokens, n_embd] V matrix
 
                 struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 offload_func_v(tmpv);
                 ggml_set_name(tmpv, "tmpv");
 
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
|
|
2601
2866
|
|
2602
2867
|
struct ggml_tensor * K =
|
2603
2868
|
ggml_view_3d(ctx0, kv_self.k,
|
2604
|
-
n_embd_head,
|
2869
|
+
n_embd_head, n_kv, n_head_kv,
|
2605
2870
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
2606
2871
|
ggml_element_size(kv_self.k)*n_embd_head,
|
2607
2872
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -2614,25 +2879,25 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2614
2879
|
ggml_set_name(KQ, "KQ");
|
2615
2880
|
|
2616
2881
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
2617
|
-
// KQ_scaled shape [
|
2618
|
-
struct ggml_tensor * KQ_scaled =
|
2882
|
+
// KQ_scaled shape [n_kv, n_tokens, n_head, 1]
|
2883
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
2619
2884
|
offload_func_kq(KQ_scaled);
|
2620
2885
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
2621
2886
|
|
2622
2887
|
// KQ_masked = mask_past(KQ_scaled)
|
2623
|
-
struct ggml_tensor * KQ_masked =
|
2888
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
2624
2889
|
offload_func_kq(KQ_masked);
|
2625
2890
|
ggml_set_name(KQ_masked, "KQ_masked");
|
2626
2891
|
|
2627
2892
|
// KQ = soft_max(KQ_masked)
|
2628
|
-
struct ggml_tensor * KQ_soft_max =
|
2893
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
2629
2894
|
offload_func_v(KQ_soft_max);
|
2630
2895
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
2631
2896
|
|
2632
2897
|
// split cached V into n_head heads
|
2633
2898
|
struct ggml_tensor * V =
|
2634
2899
|
ggml_view_3d(ctx0, kv_self.v,
|
2635
|
-
|
2900
|
+
n_kv, n_embd_head, n_head_kv,
|
2636
2901
|
ggml_element_size(kv_self.v)*n_ctx,
|
2637
2902
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
2638
2903
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -2647,7 +2912,7 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2647
2912
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
2648
2913
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
2649
2914
|
// is there a better way?
|
2650
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type,
|
2915
|
+
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
|
2651
2916
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
2652
2917
|
#endif
|
2653
2918
|
|
@@ -2656,10 +2921,8 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2656
2921
|
offload_func_v(KQV_merged);
|
2657
2922
|
ggml_set_name(KQV_merged, "KQV_merged");
|
2658
2923
|
|
2659
|
-
// cur = KQV_merged.contiguous().view(n_embd,
|
2660
|
-
cur =
|
2661
|
-
KQV_merged,
|
2662
|
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
2924
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
2925
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
2663
2926
|
offload_func_v(cur);
|
2664
2927
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
2665
2928
|
|
@@ -2750,20 +3013,12 @@ static struct ggml_cgraph * llm_build_llama(
     return gf;
 }
 
-
 static struct ggml_cgraph * llm_build_baichaun(
          llama_context & lctx,
-     const llama_token * tokens,
-           const float * embd,
-                   int   n_tokens,
-                   int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
 
     const auto & kv_self = lctx.kv_self;
 
|
|
2771
3026
|
|
2772
3027
|
const int64_t n_embd = hparams.n_embd;
|
2773
3028
|
const int64_t n_layer = hparams.n_layer;
|
2774
|
-
const int64_t n_ctx =
|
3029
|
+
const int64_t n_ctx = cparams.n_ctx;
|
2775
3030
|
const int64_t n_head = hparams.n_head;
|
2776
3031
|
const int64_t n_head_kv = hparams.n_head_kv;
|
2777
3032
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -2779,12 +3034,18 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2779
3034
|
|
2780
3035
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
2781
3036
|
|
2782
|
-
const float freq_base =
|
2783
|
-
const float freq_scale =
|
3037
|
+
const float freq_base = cparams.rope_freq_base;
|
3038
|
+
const float freq_scale = cparams.rope_freq_scale;
|
2784
3039
|
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
2785
3040
|
|
2786
3041
|
const int n_gpu_layers = model.n_gpu_layers;
|
2787
3042
|
|
3043
|
+
const int32_t n_tokens = batch.n_tokens;
|
3044
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3045
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3046
|
+
|
3047
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
3048
|
+
|
2788
3049
|
auto & buf_compute = lctx.buf_compute;
|
2789
3050
|
|
2790
3051
|
struct ggml_init_params params = {
|
@@ -2802,12 +3063,12 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
-    if (tokens) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
 
@@ -2817,11 +3078,11 @@ static struct ggml_cgraph * llm_build_baichaun(
         GGML_ASSERT(false && "not implemented");
 #endif
 
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }
 
@@ -2830,9 +3091,6 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2830
3091
|
|
2831
3092
|
// offload functions set the tensor output backend to GPU
|
2832
3093
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
2833
|
-
//
|
2834
|
-
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
2835
|
-
// in that case ggml_cuda_assign_buffers has no effect
|
2836
3094
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
2837
3095
|
offload_func_t offload_func_kq = llama_nop;
|
2838
3096
|
offload_func_t offload_func_v = llama_nop;
|
@@ -2849,12 +3107,75 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2849
3107
|
}
|
2850
3108
|
#endif // GGML_USE_CUBLAS
|
2851
3109
|
|
3110
|
+
// KQ_scale
|
2852
3111
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3112
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
2853
3113
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
2854
3114
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2855
3115
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
2856
3116
|
}
|
2857
|
-
|
3117
|
+
|
3118
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3119
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3120
|
+
offload_func_kq(KQ_mask);
|
3121
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3122
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3123
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3124
|
+
float * data = (float *) KQ_mask->data;
|
3125
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3126
|
+
|
3127
|
+
for (int h = 0; h < 1; ++h) {
|
3128
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3129
|
+
const llama_pos pos = batch.pos[j];
|
3130
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3131
|
+
|
3132
|
+
for (int i = 0; i < n_kv; ++i) {
|
3133
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3134
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3135
|
+
}
|
3136
|
+
}
|
3137
|
+
}
|
3138
|
+
}
|
3139
|
+
}
|
3140
|
+
|
3141
|
+
// KQ_pos - contains the positions
|
3142
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3143
|
+
offload_func_kq(KQ_pos);
|
3144
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
3145
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
3146
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3147
|
+
int * data = (int *) KQ_pos->data;
|
3148
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3149
|
+
data[i] = batch.pos[i];
|
3150
|
+
}
|
3151
|
+
}
|
3152
|
+
|
3153
|
+
// shift the entire K-cache if needed
|
3154
|
+
if (do_rope_shift) {
|
3155
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
3156
|
+
offload_func_kq(K_shift);
|
3157
|
+
ggml_set_name(K_shift, "K_shift");
|
3158
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3159
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3160
|
+
int * data = (int *) K_shift->data;
|
3161
|
+
for (int i = 0; i < n_ctx; ++i) {
|
3162
|
+
data[i] = kv_self.cells[i].delta;
|
3163
|
+
}
|
3164
|
+
}
|
3165
|
+
|
3166
|
+
for (int il = 0; il < n_layer; ++il) {
|
3167
|
+
struct ggml_tensor * tmp =
|
3168
|
+
ggml_rope_custom_inplace(ctx0,
|
3169
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3170
|
+
n_embd_head, n_head_kv, n_ctx,
|
3171
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3172
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3173
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3174
|
+
K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
|
3175
|
+
offload_func_kq(tmp);
|
3176
|
+
ggml_build_forward_expand(gf, tmp);
|
3177
|
+
}
|
3178
|
+
}
|
2858
3179
|
|
2859
3180
|
for (int il = 0; il < n_layer; ++il) {
|
2860
3181
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
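The new KQ_mask replaces the old `ggml_diag_mask_inf` cut-off throughout: instead of masking everything past a single `n_past`, each batch token `j` may attend to cache cell `i` only if the cell belongs to the same sequence and holds a position no greater than `pos[j]`. A standalone sketch of the same indexing, with a simplified stand-in for the real `llama_kv_cell` (not part of the diff):

#include <cmath>
#include <cstdio>
#include <set>
#include <vector>

struct kv_cell {
    int pos = -1;
    std::set<int> seq_id;
    bool has_seq_id(int id) const { return seq_id.count(id) > 0; }
};

int main() {
    const int n_kv = 4, n_tokens = 2;
    std::vector<kv_cell> cells(n_kv);
    for (int i = 0; i < n_kv; ++i) { cells[i].pos = i; cells[i].seq_id.insert(0); }

    const int pos[n_tokens]    = {2, 3}; // positions of the new batch tokens
    const int seq_id[n_tokens] = {0, 0};

    // row j holds the additive mask for batch token j over the first n_kv cells
    std::vector<float> mask(n_kv * n_tokens, 0.0f);
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            // mask out cells from other sequences and "future" cells
            if (!cells[i].has_seq_id(seq_id[j]) || cells[i].pos > pos[j]) {
                mask[j*n_kv + i] = -INFINITY;
            }
        }
    }
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            printf("%s ", std::isinf(mask[j*n_kv + i]) ? "-inf" : "0");
        }
        printf("\n"); // prints: 0 0 0 -inf  then  0 0 0 0
    }
}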
@@ -2896,12 +3217,12 @@ static struct ggml_cgraph * llm_build_baichaun(
             struct ggml_tensor * Qcur;
             switch (model.type) {
                 case MODEL_7B:
-                    Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
-                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head,    N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head,    n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
                     break;
                 case MODEL_13B:
-                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
-                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
+                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
                     break;
                 default:
                     GGML_ASSERT(false);
@@ -2915,23 +3236,23 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             // store key and value to memory
             {
-                // compute the transposed [N, n_embd] V matrix
+                // compute the transposed [n_tokens, n_embd] V matrix
 
                 struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 offload_func_v(tmpv);
                 ggml_set_name(tmpv, "tmpv");
 
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
@@ -2946,7 +3267,7 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             struct ggml_tensor * K =
                     ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_past + N, n_head_kv,
+                            n_embd_head, n_kv, n_head_kv,
                             ggml_element_size(kv_self.k)*n_embd_gqa,
                             ggml_element_size(kv_self.k)*n_embd_head,
                             ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2959,8 +3280,8 @@ static struct ggml_cgraph * llm_build_baichaun(
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
@@ -2969,58 +3290,44 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             switch (model.type) {
                 case MODEL_7B:
-                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                    KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
                     break;
                 case MODEL_13B:
-                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+                    // TODO: replace with ggml_add()
+                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
                     ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
-                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+                    KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
                     break;
                 default:
                     GGML_ASSERT(false);
             }
-            // KQ_masked = mask_past(KQ_scaled)
-            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
-            // offload_func_kq(KQ_masked);
-            // ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
             struct ggml_tensor * V =
                     ggml_view_3d(ctx0, kv_self.v,
-                            n_past + N, n_embd_head, n_head_kv,
+                            n_kv, n_embd_head, n_head_kv,
                             ggml_element_size(kv_self.v)*n_ctx,
                             ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                             ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
             offload_func_v(V);
             ggml_set_name(V, "V");
 
-#if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
-#else
-            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
-            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
-            // is there a better way?
-            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
-#endif
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
 
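For the Baichuan 13B path, positions enter the scores through `ggml_alibi` rather than RoPE, with `max_bias` fixed at 8, and the per-sequence KQ_mask is then added on top. A rough standalone sketch of the per-head ALiBi slope (the exact ggml_alibi indexing differs in detail; this assumes `n_head` is a power of two and is not taken from the diff):

#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 8;
    const float max_bias = 8.0f;

    for (int h = 0; h < n_head; ++h) {
        // per-head slope: 2^(-max_bias * (h + 1) / n_head)
        const float m = powf(2.0f, -max_bias * float(h + 1) / float(n_head));
        // roughly, a bias proportional to the key index is added to each score,
        // a linear distance penalty standing in for a rotary embedding
        printf("head %d: slope %g, bias at key 4 = %g\n", h, m, m * 4);
    }
}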
@@ -3111,19 +3418,12 @@ static struct ggml_cgraph * llm_build_baichaun(
     return gf;
 }
 
-static struct ggml_cgraph * llm_build_falcon(
+static struct ggml_cgraph * llm_build_refact(
          llama_context & lctx,
-     const llama_token * tokens,
-           const float * embd,
-                   int   n_tokens,
-                   int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
 
     const auto & kv_self = lctx.kv_self;
 
@@ -3131,20 +3431,22 @@ static struct ggml_cgraph * llm_build_falcon(
 
     const int64_t n_embd      = hparams.n_embd;
     const int64_t n_layer     = hparams.n_layer;
-    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_ctx       = cparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-    const float freq_base  = hparams.rope_freq_base;
-    const float freq_scale = hparams.rope_freq_scale;
-    const float norm_eps   = hparams.f_norm_eps;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    // printf("n_kv = %d\n", n_kv);
+
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
@@ -3162,12 +3464,12 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
-    if (tokens) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
 
@@ -3177,11 +3479,11 @@ static struct ggml_cgraph * llm_build_falcon(
         GGML_ASSERT(false && "not implemented");
 #endif
 
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }
 
@@ -3190,9 +3492,6 @@ static struct ggml_cgraph * llm_build_falcon(
 
     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
-    //
-    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-    // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
@@ -3209,15 +3508,432 @@ static struct ggml_cgraph * llm_build_falcon(
     }
 #endif // GGML_USE_CUBLAS
 
+    // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * attn_norm;
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [n_tokens, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                    ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_kv, n_head_kv,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                    ggml_view_3d(ctx0, kv_self.v,
+                            n_kv, n_embd_head, n_head_kv,
+                            ggml_element_size(kv_self.v)*n_ctx,
+                            ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                            ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+static struct ggml_cgraph * llm_build_falcon(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float freq_base  = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
+    const float norm_eps   = hparams.f_norm_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
+    //        kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    ggml_rope_custom_inplace(ctx0,
+                            ggml_view_3d(ctx0, kv_self.k,
+                                n_embd_head, n_head_kv, n_ctx,
+                                ggml_element_size(kv_self.k)*n_embd_head,
+                                ggml_element_size(kv_self.k)*n_embd_gqa,
+                                ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                            K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * attn_norm;
 
         offload_func_t offload_func = llama_nop;
 
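The new `llm_build_refact` reuses the LLaMA-style gated feed-forward: `w1` produces the gate that goes through SiLU, `w3` produces the value it multiplies, and `w2` projects the product back to the model width. A scalar sketch of the arithmetic with toy numbers standing in for the real matmuls (not part of the diff):

#include <cmath>
#include <cstdio>

int main() {
    // toy 1-dimensional "weights" standing in for the w1/w2/w3 matrices
    const float w1 = 0.5f, w2 = 2.0f, w3 = -1.5f, x = 1.0f;

    const float gate = w1 * x;
    const float silu = gate / (1.0f + expf(-gate)); // SiLU, as in ggml_silu ("silu")
    const float out  = w2 * (silu * (w3 * x));      // "silu_x_result_w3", then "result_w2"

    printf("out = %f\n", out);
    return 0;
}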
@@ -3271,45 +3987,45 @@ static struct ggml_cgraph * llm_build_falcon(
             // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
             // non-contiguous views is added for the rope operator
             struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head, N,
+                ctx0, cur, n_embd_head, n_head, n_tokens,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
                 0));
             offload_func_kq(tmpq);
 
             struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head_kv, N,
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
                 wsize * n_embd_head * n_head));
             offload_func_kq(tmpk);
 
             struct ggml_tensor * tmpv = ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head_kv, N,
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
                 wsize * n_embd_head,
                 wsize * n_embd_head * (n_head + 2 * n_head_kv),
                 wsize * n_embd_head * (n_head + n_head_kv));
             offload_func_v(tmpv);
 
             // using mode = 2 for neox mode
-            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
+            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
-            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
+            struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
             offload_func_kq(Kcur);
 
             {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
                 offload_func_v(Vcur);
                 offload_func_v(Vcur->src[0]->src[0]);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
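All the builders now feed the explicit per-token position tensor `KQ_pos` into `ggml_rope_custom` instead of a single `n_past` offset, which is what lets tokens from different sequences in one batch sit at unrelated positions; Falcon additionally selects neox-style rotation with mode 2. A standalone sketch of rotating one (even, odd) feature pair from an explicit position, simplified relative to ggml's actual rope kernel (toy values, not from the diff):

#include <cmath>
#include <cstdio>

int main() {
    const float freq_base = 10000.0f, freq_scale = 1.0f;
    const int   n_dims = 64;      // n_embd_head
    const int   pos[2] = {7, 42}; // explicit positions, one per token, as in KQ_pos

    for (int t = 0; t < 2; ++t) {
        const int   i0    = 0; // first feature pair
        const float theta = freq_scale * pos[t] * powf(freq_base, -float(i0)/n_dims);
        const float x0 = 1.0f, x1 = 0.0f; // the pair to rotate
        const float r0 = x0*cosf(theta) - x1*sinf(theta);
        const float r1 = x0*sinf(theta) + x1*cosf(theta);
        printf("token %d: (%f, %f)\n", t, r0, r1);
    }
}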
@@ -3322,7 +4038,7 @@ static struct ggml_cgraph * llm_build_falcon(
 
             struct ggml_tensor * K =
                     ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_past + N, n_head_kv,
+                            n_embd_head, n_kv, n_head_kv,
                             ggml_element_size(kv_self.k)*n_embd_gqa,
                             ggml_element_size(kv_self.k)*n_embd_head,
                             ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3333,21 +4049,21 @@ static struct ggml_cgraph * llm_build_falcon(
             offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
-            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
             offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             struct ggml_tensor * V =
                     ggml_view_3d(ctx0, kv_self.v,
-                            n_past + N, n_embd_head, n_head_kv,
+                            n_kv, n_embd_head, n_head_kv,
                             ggml_element_size(kv_self.v)*n_ctx,
                             ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                             ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3362,7 +4078,7 @@ static struct ggml_cgraph * llm_build_falcon(
             offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
-            cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
 
@@ -3420,17 +4136,10 @@ static struct ggml_cgraph * llm_build_falcon(
 
 static struct ggml_cgraph * llm_build_starcoder(
          llama_context & lctx,
-     const llama_token * tokens,
-           const float * embd,
-                   int   n_tokens,
-                   int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
 
     const auto & kv_self = lctx.kv_self;
 
@@ -3438,7 +4147,7 @@ static struct ggml_cgraph * llm_build_starcoder(
 
     const int64_t n_embd      = hparams.n_embd;
     const int64_t n_layer     = hparams.n_layer;
-    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_ctx       = cparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
@@ -3446,7 +4155,11 @@ static struct ggml_cgraph * llm_build_starcoder(
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-    const float norm_eps   = hparams.f_norm_eps;
+    const float norm_eps = hparams.f_norm_eps;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     auto & buf_compute = lctx.buf_compute;
 
@@ -3467,12 +4180,12 @@ static struct ggml_cgraph * llm_build_starcoder(
     struct ggml_tensor * position;
     struct ggml_tensor * inpL;
 
-    if (tokens) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
 
@@ -3482,21 +4195,21 @@ static struct ggml_cgraph * llm_build_starcoder(
         GGML_ASSERT(false && "not implemented");
 #endif
 
-        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, token);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
+            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
         }
     }
 
     {
         // Compute position embeddings.
-        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         ggml_allocr_alloc(lctx.alloc, inp_positions);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            for (int i = 0; i < N; ++i) {
-                ((int32_t *) inp_positions->data)[i] = n_past + i;
+            for (int i = 0; i < n_tokens; ++i) {
+                ((int32_t *) inp_positions->data)[i] = batch.pos[i];
             }
         }
         ggml_set_name(inp_positions, "inp_positions");
@@ -3504,12 +4217,35 @@ static struct ggml_cgraph * llm_build_starcoder(
         position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
     }
 
+    // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
 
     inpL = ggml_add(ctx0, token, position);
     ggml_set_name(inpL, "inpL");
@@ -3525,23 +4261,23 @@ static struct ggml_cgraph * llm_build_starcoder(
             // Self Attention
             cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
 
-            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,     N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
-            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
 
             struct ggml_tensor * Qcur = tmpq;
             struct ggml_tensor * Kcur = tmpk;
 
             {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3551,13 +4287,13 @@ static struct ggml_cgraph * llm_build_starcoder(
                     ggml_permute(ctx0,
                         ggml_cpy(ctx0,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
                         0, 2, 1, 3);
             ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
                     ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_past + N, n_head_kv,
+                            n_embd_head, n_kv, n_head_kv,
                             ggml_element_size(kv_self.k)*n_embd_gqa,
                             ggml_element_size(kv_self.k)*n_embd_head,
                             ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3568,12 +4304,12 @@ static struct ggml_cgraph * llm_build_starcoder(
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
@@ -3583,7 +4319,7 @@ static struct ggml_cgraph * llm_build_starcoder(
             // split cached V into n_head heads
             struct ggml_tensor * V =
                     ggml_view_3d(ctx0, kv_self.v,
-                            n_past + N, n_embd_head, n_head_kv,
+                            n_kv, n_embd_head, n_head_kv,
                             ggml_element_size(kv_self.v)*n_ctx,
                             ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                             ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3596,10 +4332,8 @@ static struct ggml_cgraph * llm_build_starcoder(
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             ggml_set_name(KQV_merged, "KQV_merged");
 
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             ggml_set_name(cur, "KQV_merged_contiguous");
         }
 
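StarCoder computes Q, K and V with a single fused `wqkv` matmul; the `tmpq`/`tmpk`/`tmpv` tensors above are just 2-D views into each token's output row at fixed byte offsets. A sketch of that layout with toy sizes (not part of the diff):

#include <cstdio>

int main() {
    // each token's row from the wqkv matmul holds [Q | K | V] back to back
    const int n_embd = 4096, n_embd_gqa = 4096; // no GQA in this toy example
    const size_t q_off = 0;
    const size_t k_off = sizeof(float) *  n_embd;               // K starts after Q
    const size_t v_off = sizeof(float) * (n_embd + n_embd_gqa); // V starts after K

    printf("row stride = %zu bytes, offsets: Q=%zu K=%zu V=%zu\n",
           sizeof(float) * (n_embd + 2*n_embd_gqa), q_off, k_off, v_off);
}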
@@ -3649,10 +4383,7 @@ static struct ggml_cgraph * llm_build_starcoder(
 
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
-     const llama_token * tokens,
-           const float * embd,
-                   int   n_tokens,
-                   int   n_past) {
+     const llama_batch & batch) {
     const auto & model = lctx.model;
 
     struct ggml_cgraph * result = NULL;
@@ -3660,76 +4391,121 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-                result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
+                result = llm_build_llama(lctx, batch);
             } break;
         case LLM_ARCH_BAICHUAN:
             {
-                result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+                result = llm_build_baichaun(lctx, batch);
             } break;
         case LLM_ARCH_FALCON:
             {
-                result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
+                result = llm_build_falcon(lctx, batch);
             } break;
         case LLM_ARCH_STARCODER:
             {
-                result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
+                result = llm_build_starcoder(lctx, batch);
+            } break;
+        case LLM_ARCH_REFACT:
+            {
+                result = llm_build_refact(lctx, batch);
             } break;
         default:
             GGML_ASSERT(false);
-    };
+    }
 
     return result;
 }
 
-// evaluate the transformer
+// decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
-//   - tokens:    new batch of tokens to process
-//   - embd       embeddings input
-//   - n_tokens   number of tokens
-//   - n_past:    the context size so far
+//   - batch:     batch to evaluate
 //   - n_threads: number of threads to use
 //
-static bool llama_eval_internal(
+// return 0 on success
+// return positive int on warning
+// return negative int on error
+//
+static int llama_decode_internal(
          llama_context & lctx,
-     const llama_token * tokens,
-           const float * embd,
-                   int   n_tokens,
-                   int   n_past,
-                   int   n_threads,
-            const char * cgraph_fname) {
+           llama_batch   batch) {
+    const uint32_t n_tokens = batch.n_tokens;
+
+    if (n_tokens == 0) {
+        LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
+        return -1;
+    }
+
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
 
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+    const auto n_batch = cparams.n_batch;
 
-    GGML_ASSERT(n_tokens > 0);
-    GGML_ASSERT(n_past >= 0);
-
-    // TODO: keep the values of n_batch and n_ctx
-    // GGML_ASSERT(n_past + n_tokens <= n_ctx);
+    GGML_ASSERT(n_tokens <= n_batch);
+
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     const int64_t t_start_us = ggml_time_us();
 
 #ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+    // TODO: needs fix after #3228
+    GGML_ASSERT(false && "not implemented");
+    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif
 
     GGML_ASSERT(n_threads > 0);
 
-    const int N = n_tokens;
-
-    const auto & model   = lctx.model;
-    const auto & hparams = model.hparams;
-
-    const auto & kv_self = lctx.kv_self;
+    auto & kv_self = lctx.kv_self;
 
     GGML_ASSERT(!!kv_self.ctx);
 
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
 
+    // helpers for smoother batch API transistion
+    // after deprecating the llama_eval calls, these will be removed
+    std::vector<llama_pos>    pos;
+    std::vector<llama_seq_id> seq_id;
+
+    if (batch.pos == nullptr) {
+        pos.resize(n_tokens);
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
+        }
+
+        batch.pos = pos.data();
+    }
+
+    if (batch.seq_id == nullptr) {
+        seq_id.resize(n_tokens);
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            seq_id[i] = batch.all_seq_id;
+        }
+
+        batch.seq_id = seq_id.data();
+    }
+
+    // we always start to search for a free slot from the start of the cache
+    // TODO: better strategies can be implemented
+    kv_self.head = 0;
+
+    if (!llama_kv_cache_find_slot(kv_self, batch)) {
+        return 1;
+    }
+
+    // a heuristic, to avoid attending the full cache if it is not yet utilized
+    // after enough generations, the benefit from this heuristic disappears
+    // if we start defragmenting the cache, the benefit from this will be more important
+    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+
+    //printf("kv_self.n = %d\n", kv_self.n);
+
     ggml_allocr_reset(lctx.alloc);
 
-    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+    ggml_cgraph * gf = llama_build_graph(lctx, batch);
 
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 
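`llama_decode_internal` accepts a `llama_batch` whose `pos`/`seq_id` arrays may be null; in that case it synthesizes them from `all_pos_0`/`all_pos_1`/`all_seq_id`, which keeps old `llama_eval`-style callers working. A sketch of that defaulting logic with a trimmed stand-in for `llama_batch` (not the real struct from llama.h):

#include <cstdint>
#include <cstdio>
#include <vector>

struct batch_view {
    uint32_t n_tokens  = 0;
    int32_t *pos       = nullptr; // may be null: derive from all_pos_0/all_pos_1
    int32_t  all_pos_0 = 0;       // position of the first token
    int32_t  all_pos_1 = 0;       // position step between consecutive tokens
};

int main() {
    batch_view batch;
    batch.n_tokens  = 4;
    batch.all_pos_0 = 10; // e.g. continuing a sequence that already has 10 tokens
    batch.all_pos_1 = 1;

    std::vector<int32_t> pos;
    if (batch.pos == nullptr) {
        pos.resize(batch.n_tokens);
        for (uint32_t i = 0; i < batch.n_tokens; i++) {
            pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
        }
        batch.pos = pos.data();
    }
    for (uint32_t i = 0; i < batch.n_tokens; i++) {
        printf("%d ", batch.pos[i]); // prints: 10 11 12 13
    }
    printf("\n");
}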
@@ -3738,6 +4514,7 @@ static bool llama_eval_internal(
         ggml_tensor * node = gf->leafs[i];
         if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
             ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
+            ggml_cuda_copy_to_device(node);
         }
     }
 
@@ -3747,6 +4524,8 @@ static bool llama_eval_internal(
             ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
         }
     }
+
+    ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
 #endif
 
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3756,14 +4535,15 @@ static bool llama_eval_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
 
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON;
+        model.arch == LLM_ARCH_FALCON ||
+        model.arch == LLM_ARCH_REFACT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -3795,12 +4575,9 @@ static bool llama_eval_internal(
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
 
-    // update kv token count
-    lctx.kv_self.n = n_past + N;
-
-    if (cgraph_fname) {
-        ggml_graph_export(gf, cgraph_fname);
-    }
+    // update the kv ring buffer
+    lctx.kv_self.head      += n_tokens;
+    lctx.kv_self.has_shift  = false;
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -3817,13 +4594,20 @@ static bool llama_eval_internal(
     {
         auto & logits_out = lctx.logits;
 
-        if (lctx.logits_all) {
-            logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
+        if (batch.logits) {
+            logits_out.resize(n_vocab * n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                if (batch.logits[i] == 0) {
+                    continue;
+                }
+                memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
+            }
+        } else if (lctx.logits_all) {
+            logits_out.resize(n_vocab * n_tokens);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
         } else {
-            // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
         }
     }
 
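With `batch.logits` set, only flagged tokens get their logits copied out of the `[n_vocab, n_tokens]` graph output; unflagged rows are skipped entirely. A standalone sketch of the same selection with a toy result buffer (not from the diff):

#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const int n_vocab = 3, n_tokens = 4;
    std::vector<float> res(n_vocab * n_tokens);
    for (int i = 0; i < n_vocab*n_tokens; ++i) res[i] = float(i);

    const int8_t logits_flags[n_tokens] = {0, 0, 1, 1}; // like batch.logits
    std::vector<float> logits_out(n_vocab * n_tokens, 0.0f);

    for (int i = 0; i < n_tokens; i++) {
        if (logits_flags[i] == 0) {
            continue; // rows for unflagged tokens stay untouched and are never read
        }
        memcpy(logits_out.data() + n_vocab*i, res.data() + n_vocab*i, sizeof(float)*n_vocab);
    }
    printf("row 2, first logit: %f\n", logits_out[n_vocab*2]); // prints 6.0
}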
@@ -3832,20 +4616,27 @@ static bool llama_eval_internal(
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N-1)), sizeof(float)*n_embd);
+        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
     }
 
     // measure the performance only for the single-token evals
-    if (N == 1) {
+    if (n_tokens == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
         lctx.n_eval++;
     }
-    else if (N > 1) {
+    else if (n_tokens > 1) {
         lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval += N;
+        lctx.n_p_eval += n_tokens;
     }
 
-    return true;
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!lctx.has_evaluated_once) {
+        lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+        lctx.has_evaluated_once = true;
+    }
+
+    return 0;
 }
 
 //
@@ -3872,18 +4663,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

-static
+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
-
-
+    switch (llama_vocab_get_type(vocab)) {
+    case LLAMA_VOCAB_TYPE_SPM: {
+        auto buf = token_data.text.substr(3, 2);
+        return strtol(buf.c_str(), NULL, 16);
+    }
+    case LLAMA_VOCAB_TYPE_BPE: {
+        GGML_ASSERT(false);
+        return unicode_to_bytes_bpe(token_data.text);
+    }
+    default:
+        GGML_ASSERT(false);
+    }
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-
-
-
-
+    switch (llama_vocab_get_type(vocab)) {
+    case LLAMA_VOCAB_TYPE_SPM: {
+        char buf[7];
+        int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+        GGML_ASSERT(0 <= result && result < 7);
+        return vocab.token_to_id.at(buf);
+    }
+    case LLAMA_VOCAB_TYPE_BPE: {
+        return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
+    }
+    default:
+        GGML_ASSERT(false);
+    }
 }

 static void llama_escape_whitespace(std::string & text) {
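The SPM case above relies on byte-fallback tokens being spelled `<0xXX>`: `llama_byte_to_token` formats the byte that way and `llama_token_to_byte` parses characters 3..4 of the token text back as hex. A standalone round-trip sketch of just that encoding (plain C++, no llama.cpp types assumed):

    #include <cassert>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    int main() {
        unsigned char ch = 'A';

        // llama_byte_to_token direction: format the byte as an SPM token name
        char buf[7];
        snprintf(buf, sizeof(buf), "<0x%02X>", ch);   // -> "<0x41>"

        // llama_token_to_byte direction: parse the two hex digits back out
        std::string text(buf);
        std::string hex = text.substr(3, 2);          // -> "41"
        unsigned char back = (unsigned char) strtol(hex.c_str(), NULL, 16);

        assert(back == ch);
        printf("%s -> 0x%02X\n", buf, back);
        return 0;
    }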
@@ -4163,15 +4977,9 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-
-                        llama_token token_byte = llama_byte_to_token(vocab, *j);
-                        output.push_back(token_byte);
-                    } catch (const std::out_of_range & err) {
-                        fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
-                    }
-                    } else {
-                        output.push_back((*token_multibyte).second);
+                        throw std::runtime_error("ERROR: byte not found in vocab");
                     }
+                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -4208,23 +5016,144 @@ private:
         work_queue.push(bigram);
     }

-
-
-    std::vector<std::string>
+    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+        std::vector<std::string> bpe_words;
+        std::vector<std::string> bpe_encoded_words;
+
+        std::string token = "";
+        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+        bool collecting_numeric = false;
+        bool collecting_letter = false;
+        bool collecting_special = false;
+        bool collecting_whitespace_lookahead = false;
+        bool collecting = false;
+
+        std::vector<std::string> text_utf;
+        text_utf.reserve(text.size());
+        bpe_words.reserve(text.size());
+        bpe_encoded_words.reserve(text.size());
+
+        auto cps = codepoints_from_utf8(text);
+        for (size_t i = 0; i < cps.size(); ++i)
+            text_utf.emplace_back(codepoint_to_utf8(cps[i]));
+
+        for (int i = 0; i < (int)text_utf.size(); i++) {
+            const std::string & utf_char = text_utf[i];
+            bool split_condition = false;
+            // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
+            int bytes_remain = text_utf.size() - i;
+            // forward backward lookups
+            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
+            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+
+            // handling contractions
+            if (!split_condition && bytes_remain >= 2) {
+                // 's|'t|'m|'d
+                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char + utf_char_next;
+                    bpe_words.emplace_back(token);
+                    token = "";
+                    i++;
+                    continue;
+                }
+            }
+            if (!split_condition && bytes_remain >= 3) {
+                // 're|'ve|'ll
+                if (utf_char == "\'" && (
+                    (utf_char_next == "r" || utf_char_next_next == "e") ||
+                    (utf_char_next == "v" || utf_char_next_next == "e") ||
+                    (utf_char_next == "l" || utf_char_next_next == "l"))
+                    ) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    // current token + next token can be defined
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char + utf_char_next + utf_char_next_next;
+                    bpe_words.emplace_back(token); // the contraction
+                    token = "";
+                    i += 2;
+                    continue;
+                }
+            }
+
+            if (!split_condition && !collecting) {
+                if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
+                    collecting_letter = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+                    collecting_numeric = true;
+                    collecting = true;
+                }
+                else if (
+                    ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+                    (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
+                    ) {
+                    collecting_special = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
+                    collecting_whitespace_lookahead = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }
+            else if (!split_condition && collecting) {
+                if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
+                    split_condition = true;
+                }
+                else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
+                    split_condition = true;
+                }
+                else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
+                    split_condition = true;
+                }
+                else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }
+
+            if (utf_char_next == "") {
+                split_condition = true; // final
+                token += utf_char;
+            }

-
-
-
+            if (split_condition) {
+                if (token.size()) {
+                    bpe_words.emplace_back(token);
+                }
+                token = utf_char;
+                collecting = false;
+                collecting_letter = false;
+                collecting_numeric = false;
+                collecting_special = false;
+                collecting_whitespace_lookahead = false;
+            }
+            else {
+                token += utf_char;
+            }
+        }

-
-
-
-
-
-
+        for (std::string & word : bpe_words) {
+            std::string encoded_token = "";
+            for (char & c : word) {
+                encoded_token += bytes_to_unicode_bpe(c);
+            }
+            bpe_encoded_words.emplace_back(encoded_token);
         }
-        return words;

+        return bpe_encoded_words;
     }

     const llama_vocab & vocab;
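For intuition, the GPT-2 pattern quoted in the comment above splits a string such as "I've got 2 dogs!" into the pieces "I", "'ve", " got", " 2", " dogs", "!", each of which is then byte-encoded via bytes_to_unicode_bpe; the hand-rolled state machine is intended to approximate that regex without requiring a \p{L}-capable regex engine.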
@@ -4266,7 +5195,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
-    }
+    }

     return output;
 }
@@ -4670,6 +5599,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
 // sampling
 //

+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
+}
+
 void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
     GGML_ASSERT(candidates->size > 0);
@@ -4878,7 +5814,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     }
 }

-void
+void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
     const int64_t t_start_sample_us = ggml_time_us();

     for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4890,6 +5826,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     }
 }

+void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+    llama_sample_temp(ctx, candidates_p, temp);
+}
+
 void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
     if (last_tokens_size == 0 || penalty == 1.0f) {
         return;
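llama_sample_temp is now the primary name, with llama_sample_temperature kept as a thin compatibility wrapper. A minimal sampling sketch against this API (candidates_data is a hypothetical std::vector<llama_token_data> filled from the current logits):

    llama_set_rng_seed(ctx, 42); // deterministic sampling for this sketch

    llama_token_data_array candidates = { candidates_data.data(), candidates_data.size(), false };
    llama_sample_temp(ctx, &candidates, 0.8f);           // new name
    // llama_sample_temperature(ctx, &candidates, 0.8f); // old name still works
    const llama_token tok = llama_sample_token(ctx, &candidates);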
@@ -5013,7 +5953,7 @@ void llama_sample_classifier_free_guidance(

     GGML_ASSERT(ctx);

-    auto n_vocab = llama_n_vocab(ctx);
+    auto n_vocab = llama_n_vocab(llama_get_model(ctx));

     GGML_ASSERT(n_vocab == (int)candidates->size);
     GGML_ASSERT(!candidates->sorted);
@@ -5042,7 +5982,7 @@ void llama_sample_classifier_free_guidance(
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     GGML_ASSERT(ctx);

-    auto N = float(llama_n_vocab(ctx));
+    auto N = float(llama_n_vocab(llama_get_model(ctx)));
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();

@@ -5229,7 +6169,7 @@ struct llama_logit_info {
     };
     llama_logit_info(llama_context * ctx)
       : logits(llama_get_logits(ctx))
-      , n_vocab(llama_n_vocab(ctx))
+      , n_vocab(llama_n_vocab(llama_get_model(ctx)))
       , max_l(*std::max_element(logits, logits + n_vocab))
      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
     { }
@@ -5267,7 +6207,6 @@ struct llama_beam_search_data {
     size_t n_beams;
     int n_past;
     int n_predict;
-    int n_threads;
     std::vector<llama_beam> beams;
     std::vector<llama_beam> next_beams;
@@ -5277,12 +6216,11 @@ struct llama_beam_search_data {
     // Used to communicate to/from callback on beams state.
     std::vector<llama_beam_view> beam_views;

-    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
       : ctx(ctx)
       , n_beams(n_beams)
       , n_past(n_past)
       , n_predict(n_predict)
-      , n_threads(n_threads)
       , beam_views(n_beams) {
         beams.reserve(n_beams);
         next_beams.reserve(n_beams);
@@ -5319,7 +6257,7 @@ struct llama_beam_search_data {
         } else {
             // beam is not at end-of-sentence, so branch with next top_k tokens.
             if (!beam.tokens.empty()) {
-
+                llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
             }
             llama_logit_info logit_info(ctx);
             std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5393,7 +6331,7 @@ struct llama_beam_search_data {
         callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
         update_beams_from_beam_views();                  // Update values (p,eob) that callback may have changed.
         if (common_prefix_length) {
-
+            llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
             n_past += common_prefix_length;
         }
         // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5434,11 +6372,11 @@ struct llama_beam_search_data {

 void llama_beam_search(llama_context * ctx,
                        llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict
+                       size_t n_beams, int n_past, int n_predict) {
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();

-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);

     beam_search_data.loop(callback, callback_data);
@@ -5658,11 +6596,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }

     llama_model model;
-    llm_load_arch(
-    llm_load_hparams(
+    llm_load_arch(ml, model);
+    llm_load_hparams(ml, model);

     if (params->only_copy) {
         ftype = model.ftype;
@@ -5672,7 +6621,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();

     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml
+    gguf_set_kv     (ctx_out, ml.ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);

@@ -5680,8 +6629,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;

-    for (int i = 0; i < ml
-        struct ggml_tensor * meta = ml
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * meta = ml.get_tensor_meta(i);

         const std::string name = ggml_get_name(meta);

@@ -5717,8 +6666,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;

     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml
-        struct ggml_tensor * meta = ml
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * meta = ml.get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
@@ -5731,19 +6680,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);

-    for (int i = 0; i < ml
-        struct ggml_tensor * tensor = ml
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * tensor = ml.get_tensor_meta(i);

         const std::string name = ggml_get_name(tensor);

-        if (
-            read_data.
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-
-        ml->load_data_for(tensor);
+        ml.load_data_for(tensor);

         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, ml
+               ++idx, ml.n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
                ggml_type_name(tensor->type));
@@ -5893,9 +6844,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }

-// TODO: after the GGUF PR, this likely won't work and needs to be updated
 static int llama_apply_lora_from_file_internal(
-    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
 ) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

@@ -5924,7 +6874,7 @@ static int llama_apply_lora_from_file_internal(
     int32_t lora_alpha;
     fin.read((char *) &lora_r, sizeof(lora_r));
     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
-    float scaling = (float)lora_alpha / (float)lora_r;
+    float scaling = scale * (float)lora_alpha / (float)lora_r;

     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);

@@ -6140,9 +7090,10 @@ static int llama_apply_lora_from_file_internal(
             ggml_set_name(r, "r_cpy");
         }

-        struct ggml_cgraph gf =
+        struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        ggml_build_forward_expand(gf, r);

-        ggml_graph_compute_helper(work_buffer,
+        ggml_graph_compute_helper(work_buffer, gf, n_threads);

         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -6171,27 +7122,16 @@ static int llama_apply_lora_from_file_internal(
 //
 // interface implementation
 //
-
-struct
-    struct llama_context_params result = {
-        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
-        /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/ 512,
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 0.0f,
-        /*.rope_freq_scale             =*/ 0.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
-        /*.low_vram                    =*/ false,
-        /*.mul_mat_q                   =*/ true,
-        /*.f16_kv                      =*/ true,
-        /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
-        /*.embedding                   =*/ false,
     };

 #ifdef GGML_USE_METAL
@@ -6201,6 +7141,24 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }

+struct llama_context_params llama_context_default_params() {
+    struct llama_context_params result = {
+        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
+        /*.n_ctx                       =*/ 512,
+        /*.n_batch                     =*/ 512,
+        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
+        /*.mul_mat_q                   =*/ true,
+        /*.f16_kv                      =*/ true,
+        /*.logits_all                  =*/ false,
+        /*.embedding                   =*/ false,
+    };
+
+    return result;
+}
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
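With model and context parameters now split into two structs, a typical caller sets things up in two steps. A minimal sketch under that assumption (the path and numeric values are placeholders):

    // sketch: the 0.7.0-era two-step setup
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 32;                       // hypothetical offload count

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 2048;
    cparams.n_threads       = 8;                     // generation threads
    cparams.n_threads_batch = 8;                     // prompt-processing threads

    llama_context * ctx = llama_new_context_with_model(model, cparams);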
@@ -6256,13 +7214,11 @@ int64_t llama_time_us(void) {

 struct llama_model * llama_load_model_from_file(
                              const char * path_model,
-
+              struct llama_model_params   params) {
     ggml_time_init();

     llama_model * model = new llama_model;

-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -6279,9 +7235,9 @@ struct llama_model * llama_load_model_from_file(
         };
     }

-    if (!llama_model_load(path_model, *model, params.
-                params.main_gpu, params.tensor_split,
-                params.
+    if (!llama_model_load(path_model, *model, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split,
+                params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
         LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
         delete model;
@@ -6305,18 +7261,33 @@ struct llama_context * llama_new_context_with_model(

     llama_context * ctx = new llama_context(*model);

+    const auto & hparams = model->hparams;
+    auto       & cparams = ctx->cparams;
+
+    cparams.n_batch         = params.n_batch;
+    cparams.n_ctx           = params.n_ctx           == 0 ? hparams.n_ctx_train           : params.n_ctx;
+    cparams.rope_freq_base  = params.rope_freq_base  == 0 ? hparams.rope_freq_base_train  : params.rope_freq_base;
+    cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+    cparams.n_threads       = params.n_threads;
+    cparams.n_threads_batch = params.n_threads_batch;
+    cparams.mul_mat_q       = params.mul_mat_q;
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }

+    LLAMA_LOG_INFO("%s: n_ctx      = %u\n",   __func__, cparams.n_ctx);
+    LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, cparams.rope_freq_scale);
+
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

     // reserve memory for context buffers
-    if (!
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type,
+    if (!hparams.vocab_only) {
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -6327,11 +7298,9 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }

-    const auto & hparams = ctx->model.hparams;
-
     // resized during inference
     if (params.logits_all) {
-        ctx->logits.reserve(
+        ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
     } else {
         ctx->logits.reserve(hparams.n_vocab);
     }
@@ -6349,26 +7318,29 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new_measure(tensor_alignment);

         // build worst-case graph
-        int n_tokens = std::min(
-        int n_past =
+        int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
+        int n_past   = cparams.n_ctx - n_tokens;
         llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
-        ggml_cgraph * gf = llama_build_graph(*ctx, &token,
+        ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
+
 #ifdef GGML_USE_METAL
-        if (
+        if (model->n_gpu_layers > 0) {
+            ggml_metal_log_set_callback(llama_log_callback_default, NULL);
+
             ctx->ctx_metal = ggml_metal_init(1);
             if (!ctx->ctx_metal) {
                 LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
                 llama_free(ctx);
                 return NULL;
             }
-            ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
-            ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+            //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
         }
 #endif
         // measure memory requirements for the graph
         size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

-        LLAMA_LOG_INFO("%s: compute buffer total size =
+        LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

         // recreate allocator with exact memory requirements
         ggml_allocr_free(ctx->alloc);
@@ -6377,28 +7349,46 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
 #ifdef GGML_USE_METAL
         if (ctx->ctx_metal) {
-            ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
         }
 #endif
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
+        ggml_cuda_set_scratch_size(alloc_size);
+        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+
+        // calculate total VRAM usage
+        auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+            if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+                size += ggml_nbytes(t);
+            }
+        };
+        size_t model_vram_size = 0;
+        for (const auto & kv : model->tensors_by_name) {
+            add_tensor(kv.second, model_vram_size);
         }
+
+        size_t kv_vram_size = 0;
+        add_tensor(ctx->kv_self.k, kv_vram_size);
+        add_tensor(ctx->kv_self.v, kv_vram_size);
+
+        size_t ctx_vram_size   = alloc_size + kv_vram_size;
+        size_t total_vram_size = model_vram_size + ctx_vram_size;
+
+        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+                total_vram_size / 1024.0 / 1024.0,
+                model_vram_size / 1024.0 / 1024.0,
+                ctx_vram_size   / 1024.0 / 1024.0);
 #endif
     }

 #ifdef GGML_USE_METAL
-    if (
+    if (model->n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers

         void * data_ptr  = NULL;
         size_t data_size = 0;

-        if (
+        if (ctx->model.mapping) {
             data_ptr  = ctx->model.mapping->addr;
             data_size = ctx->model.mapping->size;
         } else {
@@ -6417,11 +7407,8 @@ struct llama_context * llama_new_context_with_model(
             return NULL;                                            \
         }

-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data",
-
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data",  data_ptr, data_size, max_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",    ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
@@ -6433,8 +7420,10 @@ struct llama_context * llama_new_context_with_model(

     if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-
-
+        // TODO: needs fix after #3228
+        GGML_ASSERT(false && "not implemented");
+        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
+        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
         llama_backend_free();
         exit(1);
     }
@@ -6443,63 +7432,41 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }

-static struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params) {
-    struct llama_model * model = llama_load_model_from_file(path_model, params);
-    if (!model) {
-        return nullptr;
-    }
-
-    struct llama_context * ctx = llama_new_context_with_model(model, params);
-    ctx->model_owner = true;
-
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }

-
-    return
+const llama_model * llama_get_model(const struct llama_context * ctx) {
+    return &ctx->model;
 }

 int llama_n_ctx(const struct llama_context * ctx) {
-    return
-}
-
-int llama_n_ctx_train(const struct llama_context * ctx) {
-    return llama_model_n_ctx_train(&ctx->model);
+    return ctx->cparams.n_ctx;
 }

-
-    return
+enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+    return model->vocab.type;
 }

-
-    return ctx->model.vocab.type;
-}
-
-int llama_model_n_vocab(const struct llama_model * model) {
+int llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }

-int
-    return model->hparams.n_ctx;
-}
-
-int llama_model_n_ctx_train(const struct llama_model * model) {
+int llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }

-int
+int llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

+float llama_rope_freq_scale_train(const struct llama_model * model) {
+    return model->hparams.rope_freq_scale_train;
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            model->
+            llama_model_arch_name(model->arch).c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
@@ -6520,6 +7487,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }

+struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+    return ggml_get_tensor(model->ctx, name);
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
@@ -6533,18 +7504,18 @@ int llama_model_quantize(
     }
 }

-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }

-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
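The new scale argument multiplies the adapter's own alpha/r scaling, so callers can blend an adapter in at reduced strength. A hedged usage sketch (the adapter path is a placeholder):

    // sketch: apply a LoRA adapter at 50% strength, no base model override, 4 threads
    int err = llama_model_apply_lora_from_file(model, "adapter.bin", 0.5f, NULL, 4);
    if (err != 0) {
        fprintf(stderr, "failed to apply lora adapter\n");
    }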
@@ -6552,16 +7523,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
 }

 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->kv_self.
+    return ctx->kv_self.head;
 }

-
+void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
+}

-void
-
-
-
-
+void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+}
+
+void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
+}
+
+void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }

 // Returns the *maximum* size of the state
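These wrappers expose the sequence-aware KV cache to the public API. A context-shift sketch built on them (assuming sequence 0 currently holds n_past positions and we want to drop the oldest half):

    // sketch: discard the oldest half of seq 0, slide the rest to the front
    const llama_pos n_drop = n_past / 2;
    llama_kv_cache_seq_rm   (ctx, 0, 0, n_drop);               // remove positions [0, n_drop)
    llama_kv_cache_seq_shift(ctx, 0, n_drop, n_past, -n_drop); // shift [n_drop, n_past) back by n_drop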
@@ -6699,36 +7681,40 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
-        const
-        const int n_embd = hparams.n_embd_gqa();
-        const int n_ctx = hparams.n_ctx;
+        const auto & cparams = ctx->cparams;

-        const
-        const
+        const auto   n_layer = hparams.n_layer;
+        const auto   n_embd  = hparams.n_embd_gqa();
+        const auto   n_ctx   = cparams.n_ctx;

-
-
+        const size_t   kv_buf_size = kv_self.buf.size;
+        const uint32_t kv_head     = kv_self.head;
+        const uint32_t kv_size     = kv_self.size;

-
+        data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
+        data_ctx->write(&kv_head,     sizeof(kv_head));
+        data_ctx->write(&kv_size,     sizeof(kv_size));
+
+        if (kv_buf_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);

             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
             ggml_cgraph gf{};

-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
             kout3d->data = kout3d_data.data();

-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
             std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
             vout3d->data = vout3d_data.data();

             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd,
+                n_embd, kv_head, n_layer,
                 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-
+                kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -6742,6 +7728,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             data_ctx->write(kout3d_data.data(), kout3d_data.size());
             data_ctx->write(vout3d_data.data(), vout3d_data.size());
         }
+
+        for (uint32_t i = 0; i < kv_size; ++i) {
+            const auto & cell = kv_self.cells[i];
+
+            const llama_pos pos         = cell.pos;
+            const size_t    seq_id_size = cell.seq_id.size();
+
+            data_ctx->write(&pos,         sizeof(pos));
+            data_ctx->write(&seq_id_size, sizeof(seq_id_size));
+
+            for (auto seq_id : cell.seq_id) {
+                data_ctx->write(&seq_id, sizeof(seq_id));
+            }
+        }
     }
 }
@@ -6807,38 +7807,42 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
+        const auto & cparams = ctx->cparams;
+
         const int    n_layer = hparams.n_layer;
         const int    n_embd  = hparams.n_embd_gqa();
-        const int    n_ctx   =
+        const int    n_ctx   = cparams.n_ctx;

-        size_t
-
+        size_t   kv_buf_size;
+        uint32_t kv_head;
+        uint32_t kv_size;

-        memcpy(&
-        memcpy(&
+        memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
+        memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
+        memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);

-        if (
-            GGML_ASSERT(kv_self.buf.size ==
+        if (kv_buf_size) {
+            GGML_ASSERT(kv_self.buf.size == kv_buf_size);

             const size_t elt_size = ggml_element_size(kv_self.k);

             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
             ggml_cgraph gf{};

-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             kin3d->data = (void *) inp;
             inp += ggml_nbytes(kin3d);

-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
             vin3d->data = (void *) inp;
             inp += ggml_nbytes(vin3d);

             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd,
+                n_embd, kv_head, n_layer,
                 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-
+                kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -6848,7 +7852,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             ggml_free(cpy_ctx);
         }

-        ctx->kv_self.
+        ctx->kv_self.head = kv_head;
+        ctx->kv_self.size = kv_size;
+
+        ctx->kv_self.cells.resize(kv_size);
+
+        for (uint32_t i = 0; i < kv_size; ++i) {
+            llama_pos pos;
+            size_t    seq_id_size;
+
+            memcpy(&pos,         inp, sizeof(pos));         inp += sizeof(pos);
+            memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
+
+            ctx->kv_self.cells[i].pos = pos;
+
+            llama_seq_id seq_id;
+
+            for (size_t j = 0; j < seq_id_size; ++j) {
+                memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
+                ctx->kv_self.cells[i].seq_id.insert(seq_id);
+            }
+        }
     }

     const size_t nread    = inp - src;
@@ -6943,64 +7967,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi

 int llama_eval(
         struct llama_context * ctx,
-
-
-        int n_past
-
-    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
+                 llama_token * tokens,
+                     int32_t   n_tokens,
+                         int   n_past) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

-
-
-
-        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
-        ctx->has_evaluated_once = true;
+    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }

-    return
+    return ret;
 }

 int llama_eval_embd(
            struct llama_context * ctx,
-
-
-        int n_past
-
-    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
+                           float * embd,
+                         int32_t   n_tokens,
+                             int   n_past) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

-
-
-
-
-
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+
+    const int ret = llama_decode_internal(*ctx, batch);
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }

-    return
+    return ret;
 }

-
-
-
+void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+    ctx->cparams.n_threads       = n_threads;
+    ctx->cparams.n_threads_batch = n_threads_batch;
+}
+
+struct llama_batch llama_batch_get_one(
+             llama_token * tokens,
+                 int32_t   n_tokens,
+               llama_pos   pos_0,
+            llama_seq_id   seq_id) {
+    return {
+        /*n_tokens   =*/ n_tokens,
+        /*tokens     =*/ tokens,
+        /*embd       =*/ nullptr,
+        /*pos        =*/ nullptr,
+        /*seq_id     =*/ nullptr,
+        /*logits     =*/ nullptr,
+        /*all_pos_0  =*/ pos_0,
+        /*all_pos_1  =*/ 1,
+        /*all_seq_id =*/ seq_id,
+    };
+}

-
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
+    llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

-    if (
-
-
+    if (embd) {
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+    } else {
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
     }

-
+    batch.pos    = (llama_pos *)    malloc(sizeof(llama_pos)    * n_tokens);
+    batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
+    batch.logits = (int8_t *)       malloc(sizeof(int8_t)       * n_tokens);
+
+    return batch;
+}
+
+void llama_batch_free(struct llama_batch batch) {
+    if (batch.token)  free(batch.token);
+    if (batch.embd)   free(batch.embd);
+    if (batch.pos)    free(batch.pos);
+    if (batch.seq_id) free(batch.seq_id);
+    if (batch.logits) free(batch.logits);
+}
+
+int llama_decode(
+        struct llama_context * ctx,
+          struct llama_batch   batch) {
+    const int ret = llama_decode_internal(*ctx, batch);
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+    }
+
+    return ret;
 }

 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }

+float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
+}
+
 float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
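Put together, the new batch API replaces the old eval-plus-thread-count flow. A minimal decode sketch (prompt is a hypothetical std::vector<llama_token>; error handling elided):

    // sketch: decode a prompt with the new batch API
    llama_batch batch = llama_batch_get_one(prompt.data(), prompt.size(), 0, 0);
    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "llama_decode failed\n"); // non-zero is a failure or warning
    }
    // with no per-token logits flags set, only the last position's logits are kept
    float * logits = llama_get_logits(ctx);

    // threads are now a context setting rather than an eval argument
    llama_set_n_threads(ctx, 8, 8);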
@@ -7028,18 +8090,24 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
 llama_token llama_token_nl(const struct llama_context * ctx) {
     return ctx->model.vocab.linefeed_id;
 }
+
+llama_token llama_token_prefix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_prefix_id;
+}

-
-
-        const char * text,
-        int   text_len,
-        llama_token * tokens,
-        int   n_max_tokens,
-        bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
+llama_token llama_token_middle(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_middle_id;
 }

-
+llama_token llama_token_suffix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_suffix_id;
+}
+
+llama_token llama_token_eot(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_eot_id;
+}
+
+
+int llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                              int   text_len,
@@ -7060,39 +8128,66 @@ int llama_tokenize_with_model(
     return res.size();
 }

-
-
+static std::string llama_decode_text(const std::string & text) {
+    std::string decoded_text;
+    auto unicode_sequences = codepoints_from_utf8(text);
+    for (auto& unicode_sequence : unicode_sequences) {
+        decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
+    }
+
+    return decoded_text;
 }

 // does not write null-terminator to buf
-int
-    if (0 <= token && token <
-
-
-    if (
+int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+    if (0 <= token && token < llama_n_vocab(model)) {
+        switch (llama_vocab_get_type(model->vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+                if (length < 3) {
+                    return -3;
+                }
+                memcpy(buf, "\xe2\x96\x85", 3);
+                return 3;
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else if (llama_is_byte_token(model->vocab, token)) {
+                if (length < 1) {
+                    return -1;
+                }
+                buf[0] = llama_token_to_byte(model->vocab, token);
+                return 1;
+            } else {
+                GGML_ASSERT(false);
             }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        } else if (llama_is_byte_token(model->vocab, token)) {
-            if (length < 1) {
-                return -1;
+            break;
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                result = llama_decode_text(result);
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else {
+                GGML_ASSERT(false);
             }
-
-
+            break;
+        }
+        default:
+            GGML_ASSERT(false);
         }
     }
     return 0;
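Because llama_token_to_piece returns the negative required length when the buffer is too small, a robust caller resizes and retries. A small helper sketch built on that contract (assumes <string> and <algorithm>):

    // sketch: wrap llama_token_to_piece in a std::string-returning helper
    static std::string token_to_piece(const llama_model * model, llama_token token) {
        std::string piece(8, '\0');
        int n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
        if (n < 0) {
            piece.resize(-n); // negative return encodes the required size
            n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
        }
        piece.resize(std::max(n, 0));
        return piece;
    }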
@@ -7119,14 +8214,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);

     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %
-    LLAMA_LOG_INFO("%s:      sample time = %
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {
@@ -7194,12 +8289,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
     return ctx->model.tensors_by_name;
 }

-void llama_log_set(
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 }

-static void llama_log_internal_v(
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);
     char buffer[128];
@@ -7216,14 +8311,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
     va_end(args_copy);
 }

-static void llama_log_internal(
+static void llama_log_internal(ggml_log_level level, const char * format, ...) {
     va_list args;
     va_start(args, format);
     llama_log_internal_v(level, format, args);
     va_end(args);
 }

-static void llama_log_callback_default(
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
     fputs(text, stderr);
|