llama_cpp 0.5.3 → 0.7.0
This diff shows the changes between publicly available package versions as published to their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +583 -262
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +326 -149
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +167 -89
- data/ext/llama_cpp/src/ggml-metal.metal +130 -40
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +2355 -1166
- data/ext/llama_cpp/src/ggml.h +129 -35
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +1766 -671
- data/ext/llama_cpp/src/llama.h +321 -120
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +6 -10
- data/sig/llama_cpp.rbs +70 -34
- metadata +4 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
The bundled llama.cpp source is reworked around a sequence-aware KV cache and a new llama_batch / llama_cparams API. The main changes:

- Includes: "unicode.h" is now included right after "llama.h", and <set> is added to the standard-library includes.
- Logging: llama_log_internal and llama_log_callback_default are declared with an explicit ggml_log_level parameter, and the LLAMA_LOG_INFO / LLAMA_LOG_WARN / LLAMA_LOG_ERROR macros forward GGML_LOG_LEVEL_INFO / WARN / ERROR together with __VA_ARGS__.
- A new helper is_float_close(float a, float b, float abs_tol) is added: it throws std::invalid_argument for a negative tolerance, returns true on exact equality, returns false if either value is infinite, and otherwise checks std::fabs(b - a) <= abs_tol.
- A new architecture LLM_ARCH_REFACT is added to llm_arch, registered as "refact" in LLM_ARCH_NAMES, and given a full entry in LLM_TENSOR_NAMES (token_embd, output_norm, output, blk.%d.attn_norm, blk.%d.attn_q, blk.%d.attn_k, blk.%d.attn_v, blk.%d.attn_output, blk.%d.ffn_norm, blk.%d.ffn_gate, blk.%d.ffn_down, blk.%d.ffn_up).
- In LLM_KV_NAMES the general source keys become "general.source.url" and "general.source.huggingface.repository", and the table formatting is normalized.
- The GGUF_GET_KEY macro body is wrapped in do { ... } while (0) instead of bare braces.
- llama_token_to_str calls llama_token_to_piece(llama_get_model(ctx), ...) instead of passing the context directly.
- llama_state declares its callback as ggml_log_callback log_callback = llama_log_callback_default.
- llama_hparams gains bool vocab_only, drops the inference-time n_ctx field, and stores rope_freq_base_train / rope_freq_scale_train; operator!= now compares every field, using is_float_close with EPSILON = 1e-9 for f_norm_eps, f_norm_rms_eps and the two RoPE training values.
- A new llama_cparams struct holds the per-context parameters: n_ctx, n_batch, n_threads, n_threads_batch, rope_freq_base, rope_freq_scale and mul_mat_q.
- The KV cache becomes a ring buffer of cells: a new llama_kv_cell carries pos (default -1), delta and a std::set<llama_seq_id> of owning sequences plus has_seq_id(); llama_kv_cache gains has_shift, head, size, a per-graph n and std::vector<llama_kv_cell> cells, and loses the old "number of tokens currently in the cache" counter.
- llama_vocab gains infill special-token ids: special_prefix_id = 32007, special_middle_id = 32009, special_suffix_id = 32008, special_eot_id = 32010.
- llama_context now holds cparams, a const llama_model & model and kv_self up front, copies t_start_us / t_load_us from the model in its constructor, and no longer has a model_owner flag or deletes the model in its destructor.
- llama_kv_cache_init takes an explicit uint32_t n_ctx, resets has_shift and head, sets size = n_ctx and resizes cells; with CUBLAS it logs which of the k/v tensors were offloaded and prints the resulting "VRAM kv self" size.
- New static helpers implement the sequence-aware cache operations: llama_kv_cache_find_slot (finds a contiguous run of n_tokens free cells, wrapping and advancing head), llama_kv_cache_cell_max, llama_kv_cache_tokens_rm, llama_kv_cache_seq_rm, llama_kv_cache_seq_cp, llama_kv_cache_seq_keep and llama_kv_cache_seq_shift (which adds a position delta, frees cells whose position drops below zero, and sets has_shift so the K cache is re-roped on the next graph build).
- The GPU backend cases in the model loader are compiled under #ifdef GGML_USE_CUBLAS, and a new llama_model_arch_name(llm_arch) helper returns the registered name or "unknown".
- llm_load_hparams is reduced to (llama_model_loader & ml, llama_model & model): the required GGUF keys (tokenizer list, context length, embedding length, feed-forward length, head count, block count) are read explicitly, rope_freq_base_train defaults to 10000.0f and comes from LLM_KV_ROPE_FREQ_BASE, rope_freq_scale_train is 1.0f/ropescale with ropescale read from LLM_KV_ROPE_SCALE_LINEAR, and the old n_ctx / rope assignments are removed; a new LLM_ARCH_REFACT case reads the RMS-norm epsilon and maps 32 layers to MODEL_1B.
- llm_load_vocab treats token scores and token types as optional (nullptr with 0.0f / LLAMA_TOKEN_TYPE_NORMAL fallbacks), asserts codepoints_from_utf8(word).size() > 0 for every merge and token, asserts that id_to_token and token_to_id stay the same size, and derives the BPE linefeed token by tokenizing "\u010A".
- llm_load_print_meta prints an aligned block of hyperparameters including the new freq_base_train / freq_scale_train values, and reports the model size in MiB or GiB together with bits per weight.
- llm_load_tensors drops the n_batch, mul_mat_q, low_vram and memory_type parameters, no longer calls ggml_cuda_set_mul_mat_q, lets LLM_ARCH_REFACT share the LLaMA tensor-loading path, and on Windows offloads the norm tensor only when more than n_layer + 2 layers are on the GPU; the memory report shrinks to a single "mem required" line, the low-VRAM KV-cache accounting is removed from this function, and VRAM usage is printed from vram_weights alone.
- llama_model_load is simplified accordingly: it no longer takes n_ctx, n_batch, mul_mat_q, rope_freq_base/scale, low_vram or memory_type, stores vocab_only in the hparams, and calls llm_load_arch, llm_load_hparams, llm_load_vocab and llm_load_print_meta with (ml, model) before llm_load_tensors(ml, model, n_gpu_layers, main_gpu, tensor_split, use_mlock, ...).
- Graph building: llm_build_llama and llm_build_baichaun now take (llama_context & lctx, const llama_batch & batch) instead of raw token/embedding pointers with n_tokens and n_past; n_ctx, rope_freq_base and rope_freq_scale come from lctx.cparams, and the builders derive n_tokens = batch.n_tokens, n_kv, kv_head and do_rope_shift from the allocator measurement state and the cache.
- Inside the builders, the input tensors are sized by n_tokens and filled from batch.token or batch.embd; KQ_scale is named "1/sqrt(n_embd_head)"; a KQ_mask tensor of shape [n_kv, n_tokens, 1] is filled with -INFINITY wherever a cache cell does not belong to the token's sequence or lies after the token's position; a KQ_pos tensor carries the batch positions; and when do_rope_shift is set, a K_shift tensor of per-cell deltas is applied to each layer's K-cache view with ggml_rope_custom_inplace.
- The attention path follows the new indexing: Kcur and Qcur use ggml_rope_custom with KQ_pos, the k/v store views are offset by kv_head, the K and V views span n_kv entries, KQ_scaled / KQ_masked / KQ_soft_max use ggml_scale, ggml_add with KQ_mask and ggml_soft_max, and the merged result is reshaped with ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); llm_build_baichaun receives the same treatment.
for (int i = 0; i < n_ctx; ++i) {
|
3162
|
+
data[i] = kv_self.cells[i].delta;
|
3163
|
+
}
|
3164
|
+
}
|
3165
|
+
|
3166
|
+
for (int il = 0; il < n_layer; ++il) {
|
3167
|
+
struct ggml_tensor * tmp =
|
3168
|
+
ggml_rope_custom_inplace(ctx0,
|
3169
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3170
|
+
n_embd_head, n_head_kv, n_ctx,
|
3171
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3172
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3173
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3174
|
+
K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
|
3175
|
+
offload_func_kq(tmp);
|
3176
|
+
ggml_build_forward_expand(gf, tmp);
|
3177
|
+
}
|
3178
|
+
}
|
2858
3179
|
|
2859
3180
|
for (int il = 0; il < n_layer; ++il) {
|
2860
3181
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
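Note: the n_past-based diagonal masking is gone; instead a dense KQ_mask of shape [n_kv, n_tokens] is filled on the CPU and added to the scaled attention scores. A cache cell is masked out when it belongs to a different sequence or lies after the query token's position. A standalone sketch of the same fill loop, using a tiny hand-rolled cell type and hypothetical sizes (the real cell type lives in llama.cpp):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Minimal stand-in for a KV cache cell.
    struct kv_cell_sketch {
        int pos;
        int seq_id;
        bool has_seq_id(int id) const { return seq_id == id; }
    };

    int main() {
        const int n_kv = 4, n_tokens = 2;                      // hypothetical sizes
        std::vector<kv_cell_sketch> cells = {{0,0},{1,0},{2,0},{0,1}};
        std::vector<int> batch_pos    = {2, 3};                // positions of the new tokens
        std::vector<int> batch_seq_id = {0, 0};                // both belong to sequence 0

        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                if (!cells[i].has_seq_id(batch_seq_id[j]) || cells[i].pos > batch_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;              // this key is not visible
                }
            }
        }
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) printf("%6.0f ", mask[j*n_kv + i]);
            printf("\n");
        }
        return 0;
    }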
@@ -2896,12 +3217,12 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2896
3217
|
struct ggml_tensor * Qcur;
|
2897
3218
|
switch (model.type) {
|
2898
3219
|
case MODEL_7B:
|
2899
|
-
Kcur =
|
2900
|
-
Qcur =
|
3220
|
+
Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
3221
|
+
Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
2901
3222
|
break;
|
2902
3223
|
case MODEL_13B:
|
2903
|
-
Kcur
|
2904
|
-
Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head,
|
3224
|
+
Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
|
3225
|
+
Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
|
2905
3226
|
break;
|
2906
3227
|
default:
|
2907
3228
|
GGML_ASSERT(false);
|
@@ -2915,23 +3236,23 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2915
3236
|
|
2916
3237
|
// store key and value to memory
|
2917
3238
|
{
|
2918
|
-
// compute the transposed [
|
3239
|
+
// compute the transposed [n_tokens, n_embd] V matrix
|
2919
3240
|
|
2920
3241
|
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
2921
3242
|
offload_func_v(tmpv);
|
2922
3243
|
ggml_set_name(tmpv, "tmpv");
|
2923
3244
|
|
2924
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa,
|
3245
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
2925
3246
|
offload_func_v(Vcur);
|
2926
3247
|
ggml_set_name(Vcur, "Vcur");
|
2927
3248
|
|
2928
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
3249
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
2929
3250
|
offload_func_kq(k);
|
2930
3251
|
ggml_set_name(k, "k");
|
2931
3252
|
|
2932
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
3253
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
2933
3254
|
( n_ctx)*ggml_element_size(kv_self.v),
|
2934
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
3255
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
2935
3256
|
offload_func_v(v);
|
2936
3257
|
ggml_set_name(v, "v");
|
2937
3258
|
|
@@ -2946,7 +3267,7 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2946
3267
|
|
2947
3268
|
struct ggml_tensor * K =
|
2948
3269
|
ggml_view_3d(ctx0, kv_self.k,
|
2949
|
-
n_embd_head,
|
3270
|
+
n_embd_head, n_kv, n_head_kv,
|
2950
3271
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
2951
3272
|
ggml_element_size(kv_self.k)*n_embd_head,
|
2952
3273
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -2959,8 +3280,8 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2959
3280
|
ggml_set_name(KQ, "KQ");
|
2960
3281
|
|
2961
3282
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
2962
|
-
// KQ_scaled shape [n_past +
|
2963
|
-
struct ggml_tensor * KQ_scaled =
|
3283
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
3284
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
2964
3285
|
offload_func_kq(KQ_scaled);
|
2965
3286
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
2966
3287
|
|
@@ -2969,58 +3290,44 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2969
3290
|
|
2970
3291
|
switch (model.type) {
|
2971
3292
|
case MODEL_7B:
|
2972
|
-
KQ_masked =
|
3293
|
+
KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
2973
3294
|
break;
|
2974
3295
|
case MODEL_13B:
|
2975
|
-
|
3296
|
+
// TODO: replace with ggml_add()
|
3297
|
+
KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
|
2976
3298
|
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
2977
|
-
KQ_masked =
|
3299
|
+
KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
2978
3300
|
break;
|
2979
3301
|
default:
|
2980
3302
|
GGML_ASSERT(false);
|
2981
3303
|
}
|
2982
|
-
// KQ_masked = mask_past(KQ_scaled)
|
2983
|
-
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
2984
|
-
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
|
2985
|
-
// offload_func_kq(KQ_masked);
|
2986
|
-
// ggml_set_name(KQ_masked, "KQ_masked");
|
2987
3304
|
|
2988
3305
|
// KQ = soft_max(KQ_masked)
|
2989
|
-
struct ggml_tensor * KQ_soft_max =
|
3306
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
2990
3307
|
offload_func_v(KQ_soft_max);
|
2991
3308
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
2992
3309
|
|
2993
3310
|
// split cached V into n_head heads
|
2994
3311
|
struct ggml_tensor * V =
|
2995
3312
|
ggml_view_3d(ctx0, kv_self.v,
|
2996
|
-
|
3313
|
+
n_kv, n_embd_head, n_head_kv,
|
2997
3314
|
ggml_element_size(kv_self.v)*n_ctx,
|
2998
3315
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
2999
3316
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
3000
3317
|
offload_func_v(V);
|
3001
3318
|
ggml_set_name(V, "V");
|
3002
3319
|
|
3003
|
-
#if 1
|
3004
3320
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
3005
3321
|
offload_func_v(KQV);
|
3006
3322
|
ggml_set_name(KQV, "KQV");
|
3007
|
-
#else
|
3008
|
-
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
3009
|
-
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
3010
|
-
// is there a better way?
|
3011
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
3012
|
-
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
3013
|
-
#endif
|
3014
3323
|
|
3015
3324
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
3016
3325
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
3017
3326
|
offload_func_v(KQV_merged);
|
3018
3327
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3019
3328
|
|
3020
|
-
// cur = KQV_merged.contiguous().view(n_embd,
|
3021
|
-
cur =
|
3022
|
-
KQV_merged,
|
3023
|
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
3329
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
3330
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3024
3331
|
offload_func_v(cur);
|
3025
3332
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3026
3333
|
|
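Note: for the 13B Baichuan variant the causal KQ_mask is combined with an ALiBi bias (ggml_alibi with max_bias = 8) before the softmax; the TODO in the hunk suggests folding this into a plain ggml_add later. The sketch below only illustrates the general ALiBi idea on plain arrays: a per-head linear penalty on key distance. The slope schedule follows the ALiBi paper and is an approximation, not the exact formula ggml_alibi uses internally.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Illustrative ALiBi-style bias: score[h][q][k] += m_h * (k - q) for keys at or
    // before the query, with per-head slopes forming a geometric sequence.
    int main() {
        const int n_head = 4, n_tokens = 5;                  // hypothetical sizes
        const float max_bias = 8.0f;                         // matches the '8' passed to ggml_alibi

        std::vector<float> slopes(n_head);
        for (int h = 0; h < n_head; ++h) {
            slopes[h] = powf(2.0f, -max_bias * (h + 1) / n_head);
        }

        // bias[h][q][k]: 0 on the diagonal, increasingly negative for distant keys
        std::vector<float> bias(n_head * n_tokens * n_tokens, 0.0f);
        for (int h = 0; h < n_head; ++h) {
            for (int q = 0; q < n_tokens; ++q) {
                for (int k = 0; k <= q; ++k) {
                    bias[(h*n_tokens + q)*n_tokens + k] = slopes[h] * (k - q);
                }
            }
        }
        printf("head 0 slope = %g, bias for (q=4, k=0) = %g\n", slopes[0], bias[4*n_tokens + 0]);
        return 0;
    }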
@@ -3111,19 +3418,12 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
3111
3418
|
return gf;
|
3112
3419
|
}
|
3113
3420
|
|
3114
|
-
static struct ggml_cgraph *
|
3421
|
+
static struct ggml_cgraph * llm_build_refact(
|
3115
3422
|
llama_context & lctx,
|
3116
|
-
const
|
3117
|
-
const float * embd,
|
3118
|
-
int n_tokens,
|
3119
|
-
int n_past) {
|
3120
|
-
|
3121
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
3122
|
-
|
3123
|
-
const int N = n_tokens;
|
3124
|
-
|
3423
|
+
const llama_batch & batch) {
|
3125
3424
|
const auto & model = lctx.model;
|
3126
3425
|
const auto & hparams = model.hparams;
|
3426
|
+
const auto & cparams = lctx.cparams;
|
3127
3427
|
|
3128
3428
|
const auto & kv_self = lctx.kv_self;
|
3129
3429
|
|
@@ -3131,20 +3431,22 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3131
3431
|
|
3132
3432
|
const int64_t n_embd = hparams.n_embd;
|
3133
3433
|
const int64_t n_layer = hparams.n_layer;
|
3134
|
-
const int64_t n_ctx =
|
3434
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3135
3435
|
const int64_t n_head = hparams.n_head;
|
3136
3436
|
const int64_t n_head_kv = hparams.n_head_kv;
|
3137
3437
|
const int64_t n_embd_head = hparams.n_embd_head();
|
3138
3438
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
3139
3439
|
|
3140
|
-
|
3141
|
-
|
3142
|
-
const float freq_base = hparams.rope_freq_base;
|
3143
|
-
const float freq_scale = hparams.rope_freq_scale;
|
3144
|
-
const float norm_eps = hparams.f_norm_eps;
|
3440
|
+
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
3145
3441
|
|
3146
3442
|
const int n_gpu_layers = model.n_gpu_layers;
|
3147
3443
|
|
3444
|
+
const int32_t n_tokens = batch.n_tokens;
|
3445
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3446
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3447
|
+
|
3448
|
+
// printf("n_kv = %d\n", n_kv);
|
3449
|
+
|
3148
3450
|
auto & buf_compute = lctx.buf_compute;
|
3149
3451
|
|
3150
3452
|
struct ggml_init_params params = {
|
@@ -3162,12 +3464,12 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3162
3464
|
struct ggml_tensor * cur;
|
3163
3465
|
struct ggml_tensor * inpL;
|
3164
3466
|
|
3165
|
-
if (
|
3166
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
3467
|
+
if (batch.token) {
|
3468
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3167
3469
|
|
3168
3470
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3169
3471
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3170
|
-
memcpy(inp_tokens->data,
|
3472
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3171
3473
|
}
|
3172
3474
|
ggml_set_name(inp_tokens, "inp_tokens");
|
3173
3475
|
|
@@ -3177,11 +3479,11 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3177
3479
|
GGML_ASSERT(false && "not implemented");
|
3178
3480
|
#endif
|
3179
3481
|
|
3180
|
-
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
3482
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3181
3483
|
|
3182
3484
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
3183
3485
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3184
|
-
memcpy(inpL->data, embd,
|
3486
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
3185
3487
|
}
|
3186
3488
|
}
|
3187
3489
|
|
@@ -3190,9 +3492,6 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3190
3492
|
|
3191
3493
|
// offload functions set the tensor output backend to GPU
|
3192
3494
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
3193
|
-
//
|
3194
|
-
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
3195
|
-
// in that case ggml_cuda_assign_buffers has no effect
|
3196
3495
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
3197
3496
|
offload_func_t offload_func_kq = llama_nop;
|
3198
3497
|
offload_func_t offload_func_v = llama_nop;
|
@@ -3209,15 +3508,432 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3209
3508
|
}
|
3210
3509
|
#endif // GGML_USE_CUBLAS
|
3211
3510
|
|
3511
|
+
// KQ_scale
|
3212
3512
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3513
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3213
3514
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3214
3515
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3215
|
-
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(
|
3516
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
|
3216
3517
|
}
|
3217
|
-
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3218
3518
|
|
3219
|
-
|
3220
|
-
|
3519
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3520
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3521
|
+
offload_func_kq(KQ_mask);
|
3522
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3523
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3524
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3525
|
+
float * data = (float *) KQ_mask->data;
|
3526
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3527
|
+
|
3528
|
+
for (int h = 0; h < 1; ++h) {
|
3529
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3530
|
+
const llama_pos pos = batch.pos[j];
|
3531
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3532
|
+
|
3533
|
+
for (int i = 0; i < n_kv; ++i) {
|
3534
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3535
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3536
|
+
}
|
3537
|
+
}
|
3538
|
+
}
|
3539
|
+
}
|
3540
|
+
}
|
3541
|
+
|
3542
|
+
for (int il = 0; il < n_layer; ++il) {
|
3543
|
+
ggml_format_name(inpL, "layer_inp_%d", il);
|
3544
|
+
|
3545
|
+
offload_func_t offload_func = llama_nop;
|
3546
|
+
|
3547
|
+
#ifdef GGML_USE_CUBLAS
|
3548
|
+
if (il >= i_gpu_start) {
|
3549
|
+
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
3550
|
+
}
|
3551
|
+
#endif // GGML_USE_CUBLAS
|
3552
|
+
|
3553
|
+
struct ggml_tensor * inpSA = inpL;
|
3554
|
+
|
3555
|
+
// norm
|
3556
|
+
{
|
3557
|
+
cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
|
3558
|
+
offload_func(cur);
|
3559
|
+
ggml_set_name(cur, "rms_norm_0");
|
3560
|
+
|
3561
|
+
// cur = cur*attn_norm(broadcasted)
|
3562
|
+
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
|
3563
|
+
offload_func(cur);
|
3564
|
+
ggml_set_name(cur, "attention_norm_0");
|
3565
|
+
}
|
3566
|
+
|
3567
|
+
// self-attention
|
3568
|
+
{
|
3569
|
+
// compute Q and K
|
3570
|
+
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
3571
|
+
offload_func_kq(tmpk);
|
3572
|
+
ggml_set_name(tmpk, "tmpk");
|
3573
|
+
|
3574
|
+
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
3575
|
+
offload_func_kq(tmpq);
|
3576
|
+
ggml_set_name(tmpq, "tmpq");
|
3577
|
+
|
3578
|
+
struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
|
3579
|
+
offload_func_kq(Kcur);
|
3580
|
+
ggml_set_name(Kcur, "Kcur");
|
3581
|
+
|
3582
|
+
struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
|
3583
|
+
offload_func_kq(Qcur);
|
3584
|
+
ggml_set_name(Qcur, "Qcur");
|
3585
|
+
|
3586
|
+
// store key and value to memory
|
3587
|
+
{
|
3588
|
+
// compute the transposed [n_tokens, n_embd] V matrix
|
3589
|
+
|
3590
|
+
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
3591
|
+
offload_func_v(tmpv);
|
3592
|
+
ggml_set_name(tmpv, "tmpv");
|
3593
|
+
|
3594
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
3595
|
+
offload_func_v(Vcur);
|
3596
|
+
ggml_set_name(Vcur, "Vcur");
|
3597
|
+
|
3598
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3599
|
+
offload_func_kq(k);
|
3600
|
+
ggml_set_name(k, "k");
|
3601
|
+
|
3602
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3603
|
+
( n_ctx)*ggml_element_size(kv_self.v),
|
3604
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3605
|
+
offload_func_v(v);
|
3606
|
+
ggml_set_name(v, "v");
|
3607
|
+
|
3608
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
3609
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
3610
|
+
}
|
3611
|
+
|
3612
|
+
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
3613
|
+
offload_func_kq(Q);
|
3614
|
+
ggml_set_name(Q, "Q");
|
3615
|
+
|
3616
|
+
struct ggml_tensor * K =
|
3617
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3618
|
+
n_embd_head, n_kv, n_head_kv,
|
3619
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3620
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3621
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
3622
|
+
offload_func_kq(K);
|
3623
|
+
ggml_set_name(K, "K");
|
3624
|
+
|
3625
|
+
// K * Q
|
3626
|
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
3627
|
+
offload_func_kq(KQ);
|
3628
|
+
ggml_set_name(KQ, "KQ");
|
3629
|
+
|
3630
|
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
3631
|
+
// KQ_scaled shape [n_kv, n_tokens, n_head, 1]
|
3632
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
3633
|
+
offload_func_kq(KQ_scaled);
|
3634
|
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3635
|
+
|
3636
|
+
// KQ_masked = mask_past(KQ_scaled)
|
3637
|
+
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
|
3638
|
+
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
3639
|
+
|
3640
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
3641
|
+
offload_func_kq(KQ_masked);
|
3642
|
+
ggml_set_name(KQ_masked, "KQ_masked");
|
3643
|
+
|
3644
|
+
// KQ = soft_max(KQ_masked)
|
3645
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
3646
|
+
offload_func_v(KQ_soft_max);
|
3647
|
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
3648
|
+
|
3649
|
+
// split cached V into n_head heads
|
3650
|
+
struct ggml_tensor * V =
|
3651
|
+
ggml_view_3d(ctx0, kv_self.v,
|
3652
|
+
n_kv, n_embd_head, n_head_kv,
|
3653
|
+
ggml_element_size(kv_self.v)*n_ctx,
|
3654
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3655
|
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
3656
|
+
offload_func_v(V);
|
3657
|
+
ggml_set_name(V, "V");
|
3658
|
+
|
3659
|
+
#if 1
|
3660
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
3661
|
+
offload_func_v(KQV);
|
3662
|
+
ggml_set_name(KQV, "KQV");
|
3663
|
+
#else
|
3664
|
+
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
3665
|
+
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
3666
|
+
// is there a better way?
|
3667
|
+
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
|
3668
|
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
3669
|
+
#endif
|
3670
|
+
|
3671
|
+
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
3672
|
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
3673
|
+
offload_func_v(KQV_merged);
|
3674
|
+
ggml_set_name(KQV_merged, "KQV_merged");
|
3675
|
+
|
3676
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
3677
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3678
|
+
offload_func_v(cur);
|
3679
|
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
3680
|
+
|
3681
|
+
// projection (no bias)
|
3682
|
+
cur = ggml_mul_mat(ctx0,
|
3683
|
+
model.layers[il].wo,
|
3684
|
+
cur);
|
3685
|
+
offload_func(cur);
|
3686
|
+
ggml_set_name(cur, "result_wo");
|
3687
|
+
}
|
3688
|
+
|
3689
|
+
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
|
3690
|
+
offload_func(inpFF);
|
3691
|
+
ggml_set_name(inpFF, "inpFF");
|
3692
|
+
|
3693
|
+
// feed-forward network
|
3694
|
+
{
|
3695
|
+
// norm
|
3696
|
+
{
|
3697
|
+
cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
|
3698
|
+
offload_func(cur);
|
3699
|
+
ggml_set_name(cur, "rms_norm_1");
|
3700
|
+
|
3701
|
+
// cur = cur*ffn_norm(broadcasted)
|
3702
|
+
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
3703
|
+
offload_func(cur);
|
3704
|
+
ggml_set_name(cur, "ffn_norm");
|
3705
|
+
}
|
3706
|
+
|
3707
|
+
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
|
3708
|
+
model.layers[il].w3,
|
3709
|
+
cur);
|
3710
|
+
offload_func(tmp);
|
3711
|
+
ggml_set_name(tmp, "result_w3");
|
3712
|
+
|
3713
|
+
cur = ggml_mul_mat(ctx0,
|
3714
|
+
model.layers[il].w1,
|
3715
|
+
cur);
|
3716
|
+
offload_func(cur);
|
3717
|
+
ggml_set_name(cur, "result_w1");
|
3718
|
+
|
3719
|
+
// SILU activation
|
3720
|
+
cur = ggml_silu(ctx0, cur);
|
3721
|
+
offload_func(cur);
|
3722
|
+
ggml_set_name(cur, "silu");
|
3723
|
+
|
3724
|
+
cur = ggml_mul(ctx0, cur, tmp);
|
3725
|
+
offload_func(cur);
|
3726
|
+
ggml_set_name(cur, "silu_x_result_w3");
|
3727
|
+
|
3728
|
+
cur = ggml_mul_mat(ctx0,
|
3729
|
+
model.layers[il].w2,
|
3730
|
+
cur);
|
3731
|
+
offload_func(cur);
|
3732
|
+
ggml_set_name(cur, "result_w2");
|
3733
|
+
}
|
3734
|
+
|
3735
|
+
cur = ggml_add(ctx0, cur, inpFF);
|
3736
|
+
offload_func(cur);
|
3737
|
+
ggml_set_name(cur, "inpFF_+_result_w2");
|
3738
|
+
|
3739
|
+
// input for next layer
|
3740
|
+
inpL = cur;
|
3741
|
+
}
|
3742
|
+
|
3743
|
+
cur = inpL;
|
3744
|
+
|
3745
|
+
// norm
|
3746
|
+
{
|
3747
|
+
cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
|
3748
|
+
offload_func_nr(cur);
|
3749
|
+
ggml_set_name(cur, "rms_norm_2");
|
3750
|
+
|
3751
|
+
// cur = cur*norm(broadcasted)
|
3752
|
+
cur = ggml_mul(ctx0, cur, model.output_norm);
|
3753
|
+
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
3754
|
+
ggml_set_name(cur, "result_norm");
|
3755
|
+
}
|
3756
|
+
|
3757
|
+
// lm_head
|
3758
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
3759
|
+
ggml_set_name(cur, "result_output");
|
3760
|
+
|
3761
|
+
ggml_build_forward_expand(gf, cur);
|
3762
|
+
|
3763
|
+
ggml_free(ctx0);
|
3764
|
+
|
3765
|
+
return gf;
|
3766
|
+
}
|
3767
|
+
|
3768
|
+
static struct ggml_cgraph * llm_build_falcon(
|
3769
|
+
llama_context & lctx,
|
3770
|
+
const llama_batch & batch) {
|
3771
|
+
const auto & model = lctx.model;
|
3772
|
+
const auto & hparams = model.hparams;
|
3773
|
+
const auto & cparams = lctx.cparams;
|
3774
|
+
|
3775
|
+
const auto & kv_self = lctx.kv_self;
|
3776
|
+
|
3777
|
+
GGML_ASSERT(!!kv_self.ctx);
|
3778
|
+
|
3779
|
+
const int64_t n_embd = hparams.n_embd;
|
3780
|
+
const int64_t n_layer = hparams.n_layer;
|
3781
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3782
|
+
const int64_t n_head = hparams.n_head;
|
3783
|
+
const int64_t n_head_kv = hparams.n_head_kv;
|
3784
|
+
const int64_t n_embd_head = hparams.n_embd_head();
|
3785
|
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
3786
|
+
|
3787
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3788
|
+
|
3789
|
+
const float freq_base = cparams.rope_freq_base;
|
3790
|
+
const float freq_scale = cparams.rope_freq_scale;
|
3791
|
+
const float norm_eps = hparams.f_norm_eps;
|
3792
|
+
|
3793
|
+
const int n_gpu_layers = model.n_gpu_layers;
|
3794
|
+
|
3795
|
+
const int32_t n_tokens = batch.n_tokens;
|
3796
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3797
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3798
|
+
|
3799
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
3800
|
+
|
3801
|
+
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
|
3802
|
+
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
|
3803
|
+
|
3804
|
+
auto & buf_compute = lctx.buf_compute;
|
3805
|
+
|
3806
|
+
struct ggml_init_params params = {
|
3807
|
+
/*.mem_size =*/ buf_compute.size,
|
3808
|
+
/*.mem_buffer =*/ buf_compute.data,
|
3809
|
+
/*.no_alloc =*/ false,
|
3810
|
+
};
|
3811
|
+
|
3812
|
+
params.no_alloc = true;
|
3813
|
+
|
3814
|
+
struct ggml_context * ctx0 = ggml_init(params);
|
3815
|
+
|
3816
|
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
3817
|
+
|
3818
|
+
struct ggml_tensor * cur;
|
3819
|
+
struct ggml_tensor * inpL;
|
3820
|
+
|
3821
|
+
if (batch.token) {
|
3822
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3823
|
+
|
3824
|
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3825
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3826
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3827
|
+
}
|
3828
|
+
ggml_set_name(inp_tokens, "inp_tokens");
|
3829
|
+
|
3830
|
+
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
3831
|
+
} else {
|
3832
|
+
#ifdef GGML_USE_MPI
|
3833
|
+
GGML_ASSERT(false && "not implemented");
|
3834
|
+
#endif
|
3835
|
+
|
3836
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3837
|
+
|
3838
|
+
ggml_allocr_alloc(lctx.alloc, inpL);
|
3839
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3840
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
3841
|
+
}
|
3842
|
+
}
|
3843
|
+
|
3844
|
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
3845
|
+
(void) i_gpu_start;
|
3846
|
+
|
3847
|
+
// offload functions set the tensor output backend to GPU
|
3848
|
+
// tensors are GPU-accelerated if any input or the output has been offloaded
|
3849
|
+
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
3850
|
+
offload_func_t offload_func_kq = llama_nop;
|
3851
|
+
offload_func_t offload_func_v = llama_nop;
|
3852
|
+
|
3853
|
+
#ifdef GGML_USE_CUBLAS
|
3854
|
+
if (n_gpu_layers > n_layer) {
|
3855
|
+
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
3856
|
+
}
|
3857
|
+
if (n_gpu_layers > n_layer + 1) {
|
3858
|
+
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
3859
|
+
}
|
3860
|
+
if (n_gpu_layers > n_layer + 2) {
|
3861
|
+
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
3862
|
+
}
|
3863
|
+
#endif // GGML_USE_CUBLAS
|
3864
|
+
|
3865
|
+
// KQ_scale
|
3866
|
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3867
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3868
|
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3869
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3870
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
3871
|
+
}
|
3872
|
+
|
3873
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3874
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3875
|
+
offload_func_kq(KQ_mask);
|
3876
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3877
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3878
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3879
|
+
float * data = (float *) KQ_mask->data;
|
3880
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3881
|
+
|
3882
|
+
for (int h = 0; h < 1; ++h) {
|
3883
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3884
|
+
const llama_pos pos = batch.pos[j];
|
3885
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3886
|
+
|
3887
|
+
for (int i = 0; i < n_kv; ++i) {
|
3888
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3889
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3890
|
+
}
|
3891
|
+
}
|
3892
|
+
}
|
3893
|
+
}
|
3894
|
+
}
|
3895
|
+
|
3896
|
+
// KQ_pos - contains the positions
|
3897
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3898
|
+
offload_func_kq(KQ_pos);
|
3899
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
3900
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
3901
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3902
|
+
int * data = (int *) KQ_pos->data;
|
3903
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3904
|
+
data[i] = batch.pos[i];
|
3905
|
+
}
|
3906
|
+
}
|
3907
|
+
|
3908
|
+
// shift the entire K-cache if needed
|
3909
|
+
if (do_rope_shift) {
|
3910
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
3911
|
+
offload_func_kq(K_shift);
|
3912
|
+
ggml_set_name(K_shift, "K_shift");
|
3913
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3914
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3915
|
+
int * data = (int *) K_shift->data;
|
3916
|
+
for (int i = 0; i < n_ctx; ++i) {
|
3917
|
+
data[i] = kv_self.cells[i].delta;
|
3918
|
+
}
|
3919
|
+
}
|
3920
|
+
|
3921
|
+
for (int il = 0; il < n_layer; ++il) {
|
3922
|
+
struct ggml_tensor * tmp =
|
3923
|
+
ggml_rope_custom_inplace(ctx0,
|
3924
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3925
|
+
n_embd_head, n_head_kv, n_ctx,
|
3926
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3927
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3928
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3929
|
+
K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
|
3930
|
+
offload_func_kq(tmp);
|
3931
|
+
ggml_build_forward_expand(gf, tmp);
|
3932
|
+
}
|
3933
|
+
}
|
3934
|
+
|
3935
|
+
for (int il = 0; il < n_layer; ++il) {
|
3936
|
+
struct ggml_tensor * attn_norm;
|
3221
3937
|
|
3222
3938
|
offload_func_t offload_func = llama_nop;
|
3223
3939
|
|
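Note: the large hunk above (the whole llm_build_refact body plus the start of llm_build_falcon) also brings the K-cache shift into the Falcon graph: when kv_self.has_shift is set, each cell's accumulated position delta is gathered into K_shift and the cached keys are re-rotated with ggml_rope_custom_inplace. A toy sketch of just the bookkeeping side, collecting per-cell deltas after sequence positions have been moved (a hand-rolled cell type stands in for the real one):

    #include <cstdio>
    #include <vector>

    struct shift_cell_sketch {
        int pos;
        int delta;   // how far this cell's position moved since its keys were roped
    };

    int main() {
        std::vector<shift_cell_sketch> cells = {{0,0},{1,0},{2,-2},{3,-2}};  // last two shifted left by 2

        // K_shift holds one delta per context slot; the graph then re-ropes each
        // cached K vector by its delta, and has_shift is cleared after the decode.
        std::vector<int> K_shift(cells.size());
        bool has_shift = false;
        for (size_t i = 0; i < cells.size(); ++i) {
            K_shift[i] = cells[i].delta;
            has_shift  = has_shift || cells[i].delta != 0;
        }
        printf("has_shift = %d, K_shift = [%d %d %d %d]\n", (int) has_shift,
               K_shift[0], K_shift[1], K_shift[2], K_shift[3]);
        return 0;
    }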
@@ -3271,45 +3987,45 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3271
3987
|
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
|
3272
3988
|
// non-contiguous views is added for the rope operator
|
3273
3989
|
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
|
3274
|
-
ctx0, cur, n_embd_head, n_head,
|
3990
|
+
ctx0, cur, n_embd_head, n_head, n_tokens,
|
3275
3991
|
wsize * n_embd_head,
|
3276
3992
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3277
3993
|
0));
|
3278
3994
|
offload_func_kq(tmpq);
|
3279
3995
|
|
3280
3996
|
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
3281
|
-
ctx0, cur, n_embd_head, n_head_kv,
|
3997
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
3282
3998
|
wsize * n_embd_head,
|
3283
3999
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3284
4000
|
wsize * n_embd_head * n_head));
|
3285
4001
|
offload_func_kq(tmpk);
|
3286
4002
|
|
3287
4003
|
struct ggml_tensor * tmpv = ggml_view_3d(
|
3288
|
-
ctx0, cur, n_embd_head, n_head_kv,
|
4004
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
3289
4005
|
wsize * n_embd_head,
|
3290
4006
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3291
4007
|
wsize * n_embd_head * (n_head + n_head_kv));
|
3292
4008
|
offload_func_v(tmpv);
|
3293
4009
|
|
3294
4010
|
// using mode = 2 for neox mode
|
3295
|
-
struct ggml_tensor * Qcur =
|
4011
|
+
struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
3296
4012
|
offload_func_kq(Qcur);
|
3297
|
-
struct ggml_tensor * Kcur =
|
4013
|
+
struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
3298
4014
|
offload_func_kq(Kcur);
|
3299
4015
|
|
3300
4016
|
{
|
3301
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa,
|
4017
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
3302
4018
|
offload_func_v(Vcur);
|
3303
4019
|
offload_func_v(Vcur->src[0]->src[0]);
|
3304
4020
|
ggml_set_name(Vcur, "Vcur");
|
3305
4021
|
|
3306
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
4022
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3307
4023
|
offload_func_kq(k);
|
3308
4024
|
ggml_set_name(k, "k");
|
3309
4025
|
|
3310
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
4026
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3311
4027
|
( n_ctx)*ggml_element_size(kv_self.v),
|
3312
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
4028
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3313
4029
|
offload_func_v(v);
|
3314
4030
|
|
3315
4031
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
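Note: Falcon keeps Q, K and V in one fused wqkv projection, so the builder carves three strided views out of the projection result; the only change in this hunk is the last dimension (N becomes n_tokens). The byte offsets in those ggml_view_3d calls correspond to the element offsets sketched below on plain floats (hypothetical small sizes, no ggml views):

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical small sizes; Falcon fuses Q (n_head heads) with K/V (n_head_kv heads)
        const int n_embd_head = 2, n_head = 3, n_head_kv = 1, n_tokens = 2;
        const int row = n_embd_head * (n_head + 2 * n_head_kv);   // fused row length

        std::vector<float> qkv(row * n_tokens);
        for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

        // element offsets mirroring the byte offsets in the views above
        const int q_off = 0;
        const int k_off = n_embd_head * n_head;
        const int v_off = n_embd_head * (n_head + n_head_kv);

        for (int t = 0; t < n_tokens; ++t) {
            const float * q = &qkv[t*row + q_off];   // n_embd_head * n_head    floats
            const float * k = &qkv[t*row + k_off];   // n_embd_head * n_head_kv floats
            const float * v = &qkv[t*row + v_off];   // n_embd_head * n_head_kv floats
            printf("token %d: q[0]=%g k[0]=%g v[0]=%g\n", t, q[0], k[0], v[0]);
        }
        return 0;
    }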
@@ -3322,7 +4038,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3322
4038
|
|
3323
4039
|
struct ggml_tensor * K =
|
3324
4040
|
ggml_view_3d(ctx0, kv_self.k,
|
3325
|
-
n_embd_head,
|
4041
|
+
n_embd_head, n_kv, n_head_kv,
|
3326
4042
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3327
4043
|
ggml_element_size(kv_self.k)*n_embd_head,
|
3328
4044
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -3333,21 +4049,21 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3333
4049
|
offload_func_kq(KQ);
|
3334
4050
|
ggml_set_name(KQ, "KQ");
|
3335
4051
|
|
3336
|
-
struct ggml_tensor * KQ_scaled =
|
4052
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
3337
4053
|
offload_func_kq(KQ_scaled);
|
3338
4054
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3339
4055
|
|
3340
|
-
struct ggml_tensor * KQ_masked =
|
4056
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
3341
4057
|
offload_func_kq(KQ_masked);
|
3342
4058
|
ggml_set_name(KQ_masked, "KQ_masked");
|
3343
4059
|
|
3344
|
-
struct ggml_tensor * KQ_soft_max =
|
4060
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
3345
4061
|
offload_func_v(KQ_soft_max);
|
3346
4062
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
3347
4063
|
|
3348
4064
|
struct ggml_tensor * V =
|
3349
4065
|
ggml_view_3d(ctx0, kv_self.v,
|
3350
|
-
|
4066
|
+
n_kv, n_embd_head, n_head_kv,
|
3351
4067
|
ggml_element_size(kv_self.v)*n_ctx,
|
3352
4068
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3353
4069
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -3362,7 +4078,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3362
4078
|
offload_func_v(KQV_merged);
|
3363
4079
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3364
4080
|
|
3365
|
-
cur =
|
4081
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3366
4082
|
offload_func_v(cur);
|
3367
4083
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3368
4084
|
|
@@ -3420,17 +4136,10 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3420
4136
|
|
3421
4137
|
static struct ggml_cgraph * llm_build_starcoder(
|
3422
4138
|
llama_context & lctx,
|
3423
|
-
const
|
3424
|
-
const float * embd,
|
3425
|
-
int n_tokens,
|
3426
|
-
int n_past) {
|
3427
|
-
|
3428
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
3429
|
-
|
3430
|
-
const int N = n_tokens;
|
3431
|
-
|
4139
|
+
const llama_batch & batch) {
|
3432
4140
|
const auto & model = lctx.model;
|
3433
4141
|
const auto & hparams = model.hparams;
|
4142
|
+
const auto & cparams = lctx.cparams;
|
3434
4143
|
|
3435
4144
|
const auto & kv_self = lctx.kv_self;
|
3436
4145
|
|
@@ -3438,7 +4147,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3438
4147
|
|
3439
4148
|
const int64_t n_embd = hparams.n_embd;
|
3440
4149
|
const int64_t n_layer = hparams.n_layer;
|
3441
|
-
const int64_t n_ctx =
|
4150
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3442
4151
|
const int64_t n_head = hparams.n_head;
|
3443
4152
|
const int64_t n_head_kv = hparams.n_head_kv;
|
3444
4153
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -3446,7 +4155,11 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3446
4155
|
|
3447
4156
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3448
4157
|
|
3449
|
-
const float norm_eps
|
4158
|
+
const float norm_eps = hparams.f_norm_eps;
|
4159
|
+
|
4160
|
+
const int32_t n_tokens = batch.n_tokens;
|
4161
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
4162
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3450
4163
|
|
3451
4164
|
auto & buf_compute = lctx.buf_compute;
|
3452
4165
|
|
@@ -3467,12 +4180,12 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3467
4180
|
struct ggml_tensor * position;
|
3468
4181
|
struct ggml_tensor * inpL;
|
3469
4182
|
|
3470
|
-
if (
|
3471
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
4183
|
+
if (batch.token) {
|
4184
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3472
4185
|
|
3473
4186
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3474
4187
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3475
|
-
memcpy(inp_tokens->data,
|
4188
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3476
4189
|
}
|
3477
4190
|
ggml_set_name(inp_tokens, "inp_tokens");
|
3478
4191
|
|
@@ -3482,21 +4195,21 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3482
4195
|
GGML_ASSERT(false && "not implemented");
|
3483
4196
|
#endif
|
3484
4197
|
|
3485
|
-
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
4198
|
+
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3486
4199
|
|
3487
4200
|
ggml_allocr_alloc(lctx.alloc, token);
|
3488
4201
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3489
|
-
memcpy(token->data, embd,
|
4202
|
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
3490
4203
|
}
|
3491
4204
|
}
|
3492
4205
|
|
3493
4206
|
{
|
3494
4207
|
// Compute position embeddings.
|
3495
|
-
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
4208
|
+
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3496
4209
|
ggml_allocr_alloc(lctx.alloc, inp_positions);
|
3497
4210
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3498
|
-
for (int i = 0; i <
|
3499
|
-
((int32_t *) inp_positions->data)[i] =
|
4211
|
+
for (int i = 0; i < n_tokens; ++i) {
|
4212
|
+
((int32_t *) inp_positions->data)[i] = batch.pos[i];
|
3500
4213
|
}
|
3501
4214
|
}
|
3502
4215
|
ggml_set_name(inp_positions, "inp_positions");
|
@@ -3504,12 +4217,35 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3504
4217
|
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
3505
4218
|
}
|
3506
4219
|
|
4220
|
+
// KQ_scale
|
3507
4221
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
4222
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3508
4223
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3509
4224
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3510
4225
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
3511
4226
|
}
|
3512
|
-
|
4227
|
+
|
4228
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
4229
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
4230
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
4231
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
4232
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
4233
|
+
float * data = (float *) KQ_mask->data;
|
4234
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
4235
|
+
|
4236
|
+
for (int h = 0; h < 1; ++h) {
|
4237
|
+
for (int j = 0; j < n_tokens; ++j) {
|
4238
|
+
const llama_pos pos = batch.pos[j];
|
4239
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
4240
|
+
|
4241
|
+
for (int i = 0; i < n_kv; ++i) {
|
4242
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
4243
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
4244
|
+
}
|
4245
|
+
}
|
4246
|
+
}
|
4247
|
+
}
|
4248
|
+
}
|
3513
4249
|
|
3514
4250
|
inpL = ggml_add(ctx0, token, position);
|
3515
4251
|
ggml_set_name(inpL, "inpL");
|
@@ -3525,23 +4261,23 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3525
4261
|
// Self Attention
|
3526
4262
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
3527
4263
|
|
3528
|
-
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,
|
3529
|
-
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa,
|
3530
|
-
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa,
|
4264
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
4265
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
4266
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
3531
4267
|
|
3532
4268
|
struct ggml_tensor * Qcur = tmpq;
|
3533
4269
|
struct ggml_tensor * Kcur = tmpk;
|
3534
4270
|
|
3535
4271
|
{
|
3536
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa,
|
4272
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
3537
4273
|
ggml_set_name(Vcur, "Vcur");
|
3538
4274
|
|
3539
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
4275
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3540
4276
|
ggml_set_name(k, "k");
|
3541
4277
|
|
3542
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
4278
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3543
4279
|
( n_ctx)*ggml_element_size(kv_self.v),
|
3544
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
4280
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3545
4281
|
|
3546
4282
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
3547
4283
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
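Note: in all builders (here StarCoder) the K/V rows of the current batch are now written into the cache at kv_head instead of at n_past, and the view offsets encode that directly: layer il starts at il*n_ctx rows and the batch lands kv_head rows into it. The same index arithmetic on plain arrays, with hypothetical sizes:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_ctx = 8, n_layer = 2, n_embd_gqa = 4;     // hypothetical sizes
        const int n_tokens = 2, kv_head = 3, il = 1;

        // flat K cache: n_layer * n_ctx rows of n_embd_gqa values each
        std::vector<float> k_cache(n_layer * n_ctx * n_embd_gqa, 0.0f);
        std::vector<float> k_cur(n_tokens * n_embd_gqa, 1.0f); // keys of the new tokens

        // same arithmetic as the ggml_view_1d offset: start (il*n_ctx + kv_head) rows in
        for (int t = 0; t < n_tokens; ++t) {
            for (int c = 0; c < n_embd_gqa; ++c) {
                k_cache[(il*n_ctx + kv_head + t)*n_embd_gqa + c] = k_cur[t*n_embd_gqa + c];
            }
        }
        printf("first written cell starts at flat index %d\n", (il*n_ctx + kv_head)*n_embd_gqa);
        return 0;
    }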
@@ -3551,13 +4287,13 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3551
4287
|
ggml_permute(ctx0,
|
3552
4288
|
ggml_cpy(ctx0,
|
3553
4289
|
Qcur,
|
3554
|
-
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head,
|
4290
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
3555
4291
|
0, 2, 1, 3);
|
3556
4292
|
ggml_set_name(Q, "Q");
|
3557
4293
|
|
3558
4294
|
struct ggml_tensor * K =
|
3559
4295
|
ggml_view_3d(ctx0, kv_self.k,
|
3560
|
-
n_embd_head,
|
4296
|
+
n_embd_head, n_kv, n_head_kv,
|
3561
4297
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3562
4298
|
ggml_element_size(kv_self.k)*n_embd_head,
|
3563
4299
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -3568,12 +4304,12 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3568
4304
|
ggml_set_name(KQ, "KQ");
|
3569
4305
|
|
3570
4306
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
3571
|
-
// KQ_scaled shape [n_past +
|
4307
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
3572
4308
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
3573
4309
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3574
4310
|
|
3575
4311
|
// KQ_masked = mask_past(KQ_scaled)
|
3576
|
-
struct ggml_tensor * KQ_masked =
|
4312
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
3577
4313
|
ggml_set_name(KQ_masked, "KQ_masked");
|
3578
4314
|
|
3579
4315
|
// KQ = soft_max(KQ_masked)
|
@@ -3583,7 +4319,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3583
4319
|
// split cached V into n_head heads
|
3584
4320
|
struct ggml_tensor * V =
|
3585
4321
|
ggml_view_3d(ctx0, kv_self.v,
|
3586
|
-
|
4322
|
+
n_kv, n_embd_head, n_head_kv,
|
3587
4323
|
ggml_element_size(kv_self.v)*n_ctx,
|
3588
4324
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3589
4325
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -3596,10 +4332,8 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3596
4332
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
3597
4333
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3598
4334
|
|
3599
|
-
// cur = KQV_merged.contiguous().view(n_embd,
|
3600
|
-
cur =
|
3601
|
-
KQV_merged,
|
3602
|
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
4335
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
4336
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3603
4337
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3604
4338
|
}
|
3605
4339
|
|
@@ -3649,10 +4383,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3649
4383
|
|
3650
4384
|
static struct ggml_cgraph * llama_build_graph(
|
3651
4385
|
llama_context & lctx,
|
3652
|
-
const
|
3653
|
-
const float * embd,
|
3654
|
-
int n_tokens,
|
3655
|
-
int n_past) {
|
4386
|
+
const llama_batch & batch) {
|
3656
4387
|
const auto & model = lctx.model;
|
3657
4388
|
|
3658
4389
|
struct ggml_cgraph * result = NULL;
|
@@ -3660,76 +4391,121 @@ static struct ggml_cgraph * llama_build_graph(
|
|
3660
4391
|
switch (model.arch) {
|
3661
4392
|
case LLM_ARCH_LLAMA:
|
3662
4393
|
{
|
3663
|
-
result = llm_build_llama(lctx,
|
4394
|
+
result = llm_build_llama(lctx, batch);
|
3664
4395
|
} break;
|
3665
4396
|
case LLM_ARCH_BAICHUAN:
|
3666
4397
|
{
|
3667
|
-
result = llm_build_baichaun(lctx,
|
4398
|
+
result = llm_build_baichaun(lctx, batch);
|
3668
4399
|
} break;
|
3669
4400
|
case LLM_ARCH_FALCON:
|
3670
4401
|
{
|
3671
|
-
result = llm_build_falcon(lctx,
|
4402
|
+
result = llm_build_falcon(lctx, batch);
|
3672
4403
|
} break;
|
3673
4404
|
case LLM_ARCH_STARCODER:
|
3674
4405
|
{
|
3675
|
-
result = llm_build_starcoder(lctx,
|
4406
|
+
result = llm_build_starcoder(lctx, batch);
|
4407
|
+
} break;
|
4408
|
+
case LLM_ARCH_REFACT:
|
4409
|
+
{
|
4410
|
+
result = llm_build_refact(lctx, batch);
|
3676
4411
|
} break;
|
3677
4412
|
default:
|
3678
4413
|
GGML_ASSERT(false);
|
3679
|
-
}
|
4414
|
+
}
|
3680
4415
|
|
3681
4416
|
return result;
|
3682
4417
|
}
|
3683
4418
|
|
3684
|
-
//
|
4419
|
+
// decode a batch of tokens by evaluating the transformer
|
3685
4420
|
//
|
3686
4421
|
// - lctx: llama context
|
3687
|
-
// -
|
3688
|
-
// - embd embeddings input
|
3689
|
-
// - n_tokens number of tokens
|
3690
|
-
// - n_past: the context size so far
|
4422
|
+
// - batch: batch to evaluate
|
3691
4423
|
// - n_threads: number of threads to use
|
3692
4424
|
//
|
3693
|
-
|
4425
|
+
// return 0 on success
|
4426
|
+
// return positive int on warning
|
4427
|
+
// return negative int on error
|
4428
|
+
//
|
4429
|
+
static int llama_decode_internal(
|
3694
4430
|
llama_context & lctx,
|
3695
|
-
|
3696
|
-
|
3697
|
-
|
3698
|
-
|
3699
|
-
|
3700
|
-
|
4431
|
+
llama_batch batch) {
|
4432
|
+
const uint32_t n_tokens = batch.n_tokens;
|
4433
|
+
|
4434
|
+
if (n_tokens == 0) {
|
4435
|
+
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
|
4436
|
+
return -1;
|
4437
|
+
}
|
4438
|
+
|
4439
|
+
const auto & model = lctx.model;
|
4440
|
+
const auto & hparams = model.hparams;
|
4441
|
+
const auto & cparams = lctx.cparams;
|
3701
4442
|
|
3702
|
-
|
4443
|
+
const auto n_batch = cparams.n_batch;
|
3703
4444
|
|
3704
|
-
GGML_ASSERT(n_tokens
|
3705
|
-
|
3706
|
-
|
3707
|
-
|
3708
|
-
// GGML_ASSERT(n_past + n_tokens <= n_ctx);
|
4445
|
+
GGML_ASSERT(n_tokens <= n_batch);
|
4446
|
+
|
4447
|
+
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
4448
|
+
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
3709
4449
|
|
3710
4450
|
const int64_t t_start_us = ggml_time_us();
|
3711
4451
|
|
3712
4452
|
#ifdef GGML_USE_MPI
|
3713
|
-
|
4453
|
+
// TODO: needs fix after #3228
|
4454
|
+
GGML_ASSERT(false && "not implemented");
|
4455
|
+
//ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
3714
4456
|
#endif
|
3715
4457
|
|
3716
4458
|
GGML_ASSERT(n_threads > 0);
|
3717
4459
|
|
3718
|
-
|
3719
|
-
|
3720
|
-
const auto & model = lctx.model;
|
3721
|
-
const auto & hparams = model.hparams;
|
3722
|
-
|
3723
|
-
const auto & kv_self = lctx.kv_self;
|
4460
|
+
auto & kv_self = lctx.kv_self;
|
3724
4461
|
|
3725
4462
|
GGML_ASSERT(!!kv_self.ctx);
|
3726
4463
|
|
3727
4464
|
const int64_t n_embd = hparams.n_embd;
|
3728
4465
|
const int64_t n_vocab = hparams.n_vocab;
|
3729
4466
|
|
4467
|
+
// helpers for smoother batch API transistion
// helpers for smoother batch API transition
|
4468
|
+
// after deprecating the llama_eval calls, these will be removed
|
4469
|
+
std::vector<llama_pos> pos;
|
4470
|
+
std::vector<llama_seq_id> seq_id;
|
4471
|
+
|
4472
|
+
if (batch.pos == nullptr) {
|
4473
|
+
pos.resize(n_tokens);
|
4474
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
4475
|
+
pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
|
4476
|
+
}
|
4477
|
+
|
4478
|
+
batch.pos = pos.data();
|
4479
|
+
}
|
4480
|
+
|
4481
|
+
if (batch.seq_id == nullptr) {
|
4482
|
+
seq_id.resize(n_tokens);
|
4483
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
4484
|
+
seq_id[i] = batch.all_seq_id;
|
4485
|
+
}
|
4486
|
+
|
4487
|
+
batch.seq_id = seq_id.data();
|
4488
|
+
}
|
4489
|
+
|
4490
|
+
// we always start to search for a free slot from the start of the cache
|
4491
|
+
// TODO: better strategies can be implemented
|
4492
|
+
kv_self.head = 0;
|
4493
|
+
|
4494
|
+
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
4495
|
+
return 1;
|
4496
|
+
}
|
4497
|
+
|
4498
|
+
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
4499
|
+
// after enough generations, the benefit from this heuristic disappears
|
4500
|
+
// if we start defragmenting the cache, the benefit from this will be more important
|
4501
|
+
//kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
|
4502
|
+
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
|
4503
|
+
|
4504
|
+
//printf("kv_self.n = %d\n", kv_self.n);
|
4505
|
+
|
3730
4506
|
ggml_allocr_reset(lctx.alloc);
|
3731
4507
|
|
3732
|
-
ggml_cgraph * gf = llama_build_graph(lctx,
|
4508
|
+
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
3733
4509
|
|
3734
4510
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
3735
4511
|
|
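Note: llama_eval_internal has become llama_decode_internal. It returns an int (0 on success, a positive value for warnings such as "no free KV slot", negative for errors), fills in default positions and sequence ids when the caller leaves batch.pos or batch.seq_id as nullptr, and only then searches the cache for a slot. A condensed sketch of just the defaulting step, with vectors standing in for the batch arrays:

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical single-sequence batch: 4 tokens appended starting at position 10
        const int n_tokens = 4;
        const int all_pos_0 = 10, all_pos_1 = 1, all_seq_id = 0;

        // the caller passed pos == nullptr / seq_id == nullptr, so build the defaults
        std::vector<int> pos(n_tokens), seq_id(n_tokens);
        for (int i = 0; i < n_tokens; ++i) {
            pos[i]    = all_pos_0 + i*all_pos_1;   // consecutive positions
            seq_id[i] = all_seq_id;                // everything in one sequence
        }
        printf("pos = [%d %d %d %d], seq_id = [%d %d %d %d]\n",
               pos[0], pos[1], pos[2], pos[3], seq_id[0], seq_id[1], seq_id[2], seq_id[3]);
        return 0;
    }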
@@ -3738,6 +4514,7 @@ static bool llama_eval_internal(
|
|
3738
4514
|
ggml_tensor * node = gf->leafs[i];
|
3739
4515
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
3740
4516
|
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
4517
|
+
ggml_cuda_copy_to_device(node);
|
3741
4518
|
}
|
3742
4519
|
}
|
3743
4520
|
|
@@ -3747,6 +4524,8 @@ static bool llama_eval_internal(
|
|
3747
4524
|
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
3748
4525
|
}
|
3749
4526
|
}
|
4527
|
+
|
4528
|
+
ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
|
3750
4529
|
#endif
|
3751
4530
|
|
3752
4531
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
@@ -3756,14 +4535,15 @@ static bool llama_eval_internal(
|
|
3756
4535
|
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
3757
4536
|
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
3758
4537
|
// with the BLAS calls. need a better solution
|
3759
|
-
if (
|
4538
|
+
if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
3760
4539
|
n_threads = std::min(4, n_threads);
|
3761
4540
|
}
|
3762
4541
|
|
3763
4542
|
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
3764
4543
|
const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
|
3765
4544
|
model.arch == LLM_ARCH_BAICHUAN ||
|
3766
|
-
model.arch == LLM_ARCH_FALCON
|
4545
|
+
model.arch == LLM_ARCH_FALCON ||
|
4546
|
+
model.arch == LLM_ARCH_REFACT;
|
3767
4547
|
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
3768
4548
|
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
3769
4549
|
n_threads = 1;
|
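Note: thread selection is now per-batch: cparams.n_threads for single-token decoding, cparams.n_threads_batch otherwise, clamped to 4 when a CPU BLAS is handling the large matmuls, and forced to 1 when the whole model is offloaded (REFACT joins the list of fully-offloadable architectures). A compact restatement of that decision as a free function:

    #include <algorithm>
    #include <cstdio>

    // Illustrative restatement of the thread heuristic in llama_decode_internal.
    static int pick_n_threads(int n_tokens, int n_threads, int n_threads_batch,
                              bool has_cpu_blas, bool fully_offloaded_gpu) {
        int n = n_tokens == 1 ? n_threads : n_threads_batch;
        if (n_tokens >= 32 && has_cpu_blas) {
            n = std::min(4, n);          // leave the heavy matmuls to BLAS
        }
        if (fully_offloaded_gpu) {
            n = 1;                       // extra CPU threads only get in the way
        }
        return n;
    }

    int main() {
        printf("prompt of 64 tokens, CPU BLAS: %d threads\n",
               pick_n_threads(64, 8, 16, /*has_cpu_blas=*/true, /*fully_offloaded_gpu=*/false));
        return 0;
    }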
@@ -3795,12 +4575,9 @@ static bool llama_eval_internal(
|
|
3795
4575
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
3796
4576
|
#endif
|
3797
4577
|
|
3798
|
-
// update kv
|
3799
|
-
lctx.kv_self.
|
3800
|
-
|
3801
|
-
if (cgraph_fname) {
|
3802
|
-
ggml_graph_export(gf, cgraph_fname);
|
3803
|
-
}
|
4578
|
+
// update the kv ring buffer
|
4579
|
+
lctx.kv_self.head += n_tokens;
|
4580
|
+
lctx.kv_self.has_shift = false;
|
3804
4581
|
|
3805
4582
|
#ifdef GGML_PERF
|
3806
4583
|
// print timing information per ggml operation (for debugging purposes)
|
@@ -3817,13 +4594,20 @@ static bool llama_eval_internal(
|
|
3817
4594
|
{
|
3818
4595
|
auto & logits_out = lctx.logits;
|
3819
4596
|
|
3820
|
-
if (
|
3821
|
-
logits_out.resize(n_vocab *
|
3822
|
-
|
4597
|
+
if (batch.logits) {
|
4598
|
+
logits_out.resize(n_vocab * n_tokens);
|
4599
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
4600
|
+
if (batch.logits[i] == 0) {
|
4601
|
+
continue;
|
4602
|
+
}
|
4603
|
+
memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
|
4604
|
+
}
|
4605
|
+
} else if (lctx.logits_all) {
|
4606
|
+
logits_out.resize(n_vocab * n_tokens);
|
4607
|
+
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
|
3823
4608
|
} else {
|
3824
|
-
// return result for just the last token
|
3825
4609
|
logits_out.resize(n_vocab);
|
3826
|
-
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(
|
4610
|
+
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
|
3827
4611
|
}
|
3828
4612
|
}
|
3829
4613
|
|
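Note: logits extraction now follows the batch flags: with batch.logits set, only the flagged rows are copied out; with logits_all, every row is copied; otherwise just the last token's row, as before. A sketch of the flagged-copy branch on plain vectors (hypothetical sizes, 'res' standing in for the graph's result tensor):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        const int n_vocab = 4, n_tokens = 3;                        // hypothetical sizes
        std::vector<float>  res(n_vocab * n_tokens, 0.5f);          // stand-in for ggml_get_data(res)
        std::vector<int8_t> want_logits = {0, 0, 1};                // batch.logits: only the last token

        std::vector<float> logits_out(n_vocab * n_tokens, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            if (want_logits[i] == 0) {
                continue;                                           // skip rows nobody asked for
            }
            memcpy(logits_out.data() + n_vocab*i, res.data() + n_vocab*i, sizeof(float)*n_vocab);
        }
        printf("row 2 copied: %g\n", logits_out[2*n_vocab]);
        return 0;
    }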
@@ -3832,20 +4616,27 @@ static bool llama_eval_internal(
         auto & embedding_out = lctx.embedding;

         embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(
+        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
     }

     // measure the performance only for the single-token evals
-    if (
+    if (n_tokens == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
         lctx.n_eval++;
     }
-    else if (
+    else if (n_tokens > 1) {
         lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval +=
+        lctx.n_p_eval += n_tokens;
     }

-
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!lctx.has_evaluated_once) {
+        lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+        lctx.has_evaluated_once = true;
+    }
+
+    return 0;
 }

 //
@@ -3872,18 +4663,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

-static
+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
-
-
+    switch (llama_vocab_get_type(vocab)) {
+    case LLAMA_VOCAB_TYPE_SPM: {
+        auto buf = token_data.text.substr(3, 2);
+        return strtol(buf.c_str(), NULL, 16);
+    }
+    case LLAMA_VOCAB_TYPE_BPE: {
+        GGML_ASSERT(false);
+        return unicode_to_bytes_bpe(token_data.text);
+    }
+    default:
+        GGML_ASSERT(false);
+    }
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-
-
-
-
+    switch (llama_vocab_get_type(vocab)) {
+    case LLAMA_VOCAB_TYPE_SPM: {
+        char buf[7];
+        int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+        GGML_ASSERT(0 <= result && result < 7);
+        return vocab.token_to_id.at(buf);
+    }
+    case LLAMA_VOCAB_TYPE_BPE: {
+        return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
+    }
+    default:
+        GGML_ASSERT(false);
+    }
 }

 static void llama_escape_whitespace(std::string & text) {
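For SPM vocabularies the byte round trip above relies on the literal token text `<0xXX>`. A standalone sketch of that encoding in plain C++ (not diff code, values are illustrative):

```cpp
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
    const unsigned char ch = 0xE2;
    char buf[7];
    std::snprintf(buf, sizeof(buf), "<0x%02X>", ch);                       // byte -> token text, e.g. "<0xE2>"
    const std::string text(buf);
    const long back = std::strtol(text.substr(3, 2).c_str(), nullptr, 16); // token text -> byte
    assert(back == ch);
    std::printf("%s -> 0x%02lX\n", text.c_str(), back);
    return 0;
}
```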
@@ -4163,15 +4977,9 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-
-                        llama_token token_byte = llama_byte_to_token(vocab, *j);
-                        output.push_back(token_byte);
-                    } catch (const std::out_of_range & err) {
-                        fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
-                    }
-                    } else {
-                        output.push_back((*token_multibyte).second);
+                        throw std::runtime_error("ERROR: byte not found in vocab");
                     }
+                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -4208,23 +5016,144 @@ private:
|
|
4208
5016
|
work_queue.push(bigram);
|
4209
5017
|
}
|
4210
5018
|
|
4211
|
-
|
4212
|
-
|
4213
|
-
std::vector<std::string>
|
5019
|
+
std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
|
5020
|
+
std::vector<std::string> bpe_words;
|
5021
|
+
std::vector<std::string> bpe_encoded_words;
|
5022
|
+
|
5023
|
+
std::string token = "";
|
5024
|
+
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
|
5025
|
+
bool collecting_numeric = false;
|
5026
|
+
bool collecting_letter = false;
|
5027
|
+
bool collecting_special = false;
|
5028
|
+
bool collecting_whitespace_lookahead = false;
|
5029
|
+
bool collecting = false;
|
5030
|
+
|
5031
|
+
std::vector<std::string> text_utf;
|
5032
|
+
text_utf.reserve(text.size());
|
5033
|
+
bpe_words.reserve(text.size());
|
5034
|
+
bpe_encoded_words.reserve(text.size());
|
5035
|
+
|
5036
|
+
auto cps = codepoints_from_utf8(text);
|
5037
|
+
for (size_t i = 0; i < cps.size(); ++i)
|
5038
|
+
text_utf.emplace_back(codepoint_to_utf8(cps[i]));
|
5039
|
+
|
5040
|
+
for (int i = 0; i < (int)text_utf.size(); i++) {
|
5041
|
+
const std::string & utf_char = text_utf[i];
|
5042
|
+
bool split_condition = false;
|
5043
|
+
// const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
|
5044
|
+
int bytes_remain = text_utf.size() - i;
|
5045
|
+
// forward backward lookups
|
5046
|
+
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
5047
|
+
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
5048
|
+
|
5049
|
+
// handling contractions
|
5050
|
+
if (!split_condition && bytes_remain >= 2) {
|
5051
|
+
// 's|'t|'m|'d
|
5052
|
+
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
|
5053
|
+
split_condition = true;
|
5054
|
+
}
|
5055
|
+
if (split_condition) {
|
5056
|
+
if (token.size()) {
|
5057
|
+
bpe_words.emplace_back(token); // push previous content as token
|
5058
|
+
}
|
5059
|
+
token = utf_char + utf_char_next;
|
5060
|
+
bpe_words.emplace_back(token);
|
5061
|
+
token = "";
|
5062
|
+
i++;
|
5063
|
+
continue;
|
5064
|
+
}
|
5065
|
+
}
|
5066
|
+
if (!split_condition && bytes_remain >= 3) {
|
5067
|
+
// 're|'ve|'ll
|
5068
|
+
if (utf_char == "\'" && (
|
5069
|
+
(utf_char_next == "r" || utf_char_next_next == "e") ||
|
5070
|
+
(utf_char_next == "v" || utf_char_next_next == "e") ||
|
5071
|
+
(utf_char_next == "l" || utf_char_next_next == "l"))
|
5072
|
+
) {
|
5073
|
+
split_condition = true;
|
5074
|
+
}
|
5075
|
+
if (split_condition) {
|
5076
|
+
// current token + next token can be defined
|
5077
|
+
if (token.size()) {
|
5078
|
+
bpe_words.emplace_back(token); // push previous content as token
|
5079
|
+
}
|
5080
|
+
token = utf_char + utf_char_next + utf_char_next_next;
|
5081
|
+
bpe_words.emplace_back(token); // the contraction
|
5082
|
+
token = "";
|
5083
|
+
i += 2;
|
5084
|
+
continue;
|
5085
|
+
}
|
5086
|
+
}
|
5087
|
+
|
5088
|
+
if (!split_condition && !collecting) {
|
5089
|
+
if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
|
5090
|
+
collecting_letter = true;
|
5091
|
+
collecting = true;
|
5092
|
+
}
|
5093
|
+
else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
5094
|
+
collecting_numeric = true;
|
5095
|
+
collecting = true;
|
5096
|
+
}
|
5097
|
+
else if (
|
5098
|
+
((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
|
5099
|
+
(!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
|
5100
|
+
) {
|
5101
|
+
collecting_special = true;
|
5102
|
+
collecting = true;
|
5103
|
+
}
|
5104
|
+
else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
5105
|
+
collecting_whitespace_lookahead = true;
|
5106
|
+
collecting = true;
|
5107
|
+
}
|
5108
|
+
else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
5109
|
+
split_condition = true;
|
5110
|
+
}
|
5111
|
+
}
|
5112
|
+
else if (!split_condition && collecting) {
|
5113
|
+
if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
|
5114
|
+
split_condition = true;
|
5115
|
+
}
|
5116
|
+
else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
|
5117
|
+
split_condition = true;
|
5118
|
+
}
|
5119
|
+
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
5120
|
+
split_condition = true;
|
5121
|
+
}
|
5122
|
+
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
|
5123
|
+
split_condition = true;
|
5124
|
+
}
|
5125
|
+
}
|
5126
|
+
|
5127
|
+
if (utf_char_next == "") {
|
5128
|
+
split_condition = true; // final
|
5129
|
+
token += utf_char;
|
5130
|
+
}
|
4214
5131
|
|
4215
|
-
|
4216
|
-
|
4217
|
-
|
5132
|
+
if (split_condition) {
|
5133
|
+
if (token.size()) {
|
5134
|
+
bpe_words.emplace_back(token);
|
5135
|
+
}
|
5136
|
+
token = utf_char;
|
5137
|
+
collecting = false;
|
5138
|
+
collecting_letter = false;
|
5139
|
+
collecting_numeric = false;
|
5140
|
+
collecting_special = false;
|
5141
|
+
collecting_whitespace_lookahead = false;
|
5142
|
+
}
|
5143
|
+
else {
|
5144
|
+
token += utf_char;
|
5145
|
+
}
|
5146
|
+
}
|
4218
5147
|
|
4219
|
-
|
4220
|
-
|
4221
|
-
|
4222
|
-
|
4223
|
-
|
4224
|
-
|
5148
|
+
for (std::string & word : bpe_words) {
|
5149
|
+
std::string encoded_token = "";
|
5150
|
+
for (char & c : word) {
|
5151
|
+
encoded_token += bytes_to_unicode_bpe(c);
|
5152
|
+
}
|
5153
|
+
bpe_encoded_words.emplace_back(encoded_token);
|
4225
5154
|
}
|
4226
|
-
return words;
|
4227
5155
|
|
5156
|
+
return bpe_encoded_words;
|
4228
5157
|
}
|
4229
5158
|
|
4230
5159
|
const llama_vocab & vocab;
|
@@ -4266,7 +5195,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
-        }
+    }

     return output;
 }
@@ -4670,6 +5599,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
 // sampling
 //

+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
+}
+
 void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
     GGML_ASSERT(candidates->size > 0);

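Usage of the new seeding call is straightforward; a short sketch, assuming `ctx` is a valid `llama_context *` created elsewhere:

```cpp
// Reseed the sampling RNG for reproducible generations.
llama_set_rng_seed(ctx, 42);

// Passing LLAMA_DEFAULT_SEED lets the function fall back to time(NULL), as above.
llama_set_rng_seed(ctx, LLAMA_DEFAULT_SEED);
```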
@@ -4878,7 +5814,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     }
 }

-void
+void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
     const int64_t t_start_sample_us = ggml_time_us();

     for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4890,6 +5826,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     }
 }

+void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+    llama_sample_temp(ctx, candidates_p, temp);
+}
+
 void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
     if (last_tokens_size == 0 || penalty == 1.0f) {
         return;
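A sketch of the renamed call, assuming `ctx` is a valid context and `cur` is a `std::vector<llama_token_data>` already filled from the current logits:

```cpp
llama_token_data_array candidates = { cur.data(), cur.size(), false };

// New spelling:
llama_sample_temp(ctx, &candidates, 0.8f);

// Old spelling, kept as a thin wrapper for compatibility:
// llama_sample_temperature(ctx, &candidates, 0.8f);
```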
@@ -5013,7 +5953,7 @@ void llama_sample_classifier_free_guidance(

     GGML_ASSERT(ctx);

-    auto n_vocab = llama_n_vocab(ctx);
+    auto n_vocab = llama_n_vocab(llama_get_model(ctx));

     GGML_ASSERT(n_vocab == (int)candidates->size);
     GGML_ASSERT(!candidates->sorted);
@@ -5042,7 +5982,7 @@ void llama_sample_classifier_free_guidance(
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     GGML_ASSERT(ctx);

-    auto N = float(llama_n_vocab(ctx));
+    auto N = float(llama_n_vocab(llama_get_model(ctx)));
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();

@@ -5229,7 +6169,7 @@ struct llama_logit_info {
     };
     llama_logit_info(llama_context * ctx)
       : logits(llama_get_logits(ctx))
-      , n_vocab(llama_n_vocab(ctx))
+      , n_vocab(llama_n_vocab(llama_get_model(ctx)))
       , max_l(*std::max_element(logits, logits + n_vocab))
       , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
       { }
@@ -5267,7 +6207,6 @@ struct llama_beam_search_data {
     size_t n_beams;
     int n_past;
     int n_predict;
-    int n_threads;
     std::vector<llama_beam> beams;
     std::vector<llama_beam> next_beams;

@@ -5277,12 +6216,11 @@ struct llama_beam_search_data {
     // Used to communicate to/from callback on beams state.
     std::vector<llama_beam_view> beam_views;

-    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
       : ctx(ctx)
       , n_beams(n_beams)
       , n_past(n_past)
       , n_predict(n_predict)
-      , n_threads(n_threads)
       , beam_views(n_beams) {
         beams.reserve(n_beams);
         next_beams.reserve(n_beams);
@@ -5319,7 +6257,7 @@ struct llama_beam_search_data {
         } else {
             // beam is not at end-of-sentence, so branch with next top_k tokens.
             if (!beam.tokens.empty()) {
-
+                llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
             }
             llama_logit_info logit_info(ctx);
             std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5393,7 +6331,7 @@ struct llama_beam_search_data {
         callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
         update_beams_from_beam_views();  // Update values (p,eob) that callback may have changed.
         if (common_prefix_length) {
-
+            llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
             n_past += common_prefix_length;
         }
         // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5434,11 +6372,11 @@ struct llama_beam_search_data {

 void llama_beam_search(llama_context * ctx,
                        llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict
+                       size_t n_beams, int n_past, int n_predict) {
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();

-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);

     beam_search_data.loop(callback, callback_data);

@@ -5658,11 +6596,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }

     llama_model model;
-    llm_load_arch(
-    llm_load_hparams(
+    llm_load_arch(ml, model);
+    llm_load_hparams(ml, model);

     if (params->only_copy) {
         ftype = model.ftype;
@@ -5672,7 +6621,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();

     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml
+    gguf_set_kv     (ctx_out, ml.ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);

@@ -5680,8 +6629,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int n_attention_wv = 0;
     int n_feed_forward_w2 = 0;

-    for (int i = 0; i < ml
-        struct ggml_tensor * meta = ml
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * meta = ml.get_tensor_meta(i);

         const std::string name = ggml_get_name(meta);

@@ -5717,8 +6666,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;

     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml
-        struct ggml_tensor * meta = ml
+    for (int i = 0; i < ml.n_tensors; ++i) {
+        struct ggml_tensor * meta = ml.get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }

@@ -5731,19 +6680,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5731
6680
|
// placeholder for the meta data
|
5732
6681
|
::zeros(fout, meta_size);
|
5733
6682
|
|
5734
|
-
for (int i = 0; i < ml
|
5735
|
-
struct ggml_tensor * tensor = ml
|
6683
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
6684
|
+
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
5736
6685
|
|
5737
6686
|
const std::string name = ggml_get_name(tensor);
|
5738
6687
|
|
5739
|
-
if (
|
5740
|
-
read_data.
|
6688
|
+
if (!ml.use_mmap) {
|
6689
|
+
if (read_data.size() < ggml_nbytes(tensor)) {
|
6690
|
+
read_data.resize(ggml_nbytes(tensor));
|
6691
|
+
}
|
6692
|
+
tensor->data = read_data.data();
|
5741
6693
|
}
|
5742
|
-
|
5743
|
-
ml->load_data_for(tensor);
|
6694
|
+
ml.load_data_for(tensor);
|
5744
6695
|
|
5745
6696
|
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
5746
|
-
++idx, ml
|
6697
|
+
++idx, ml.n_tensors,
|
5747
6698
|
ggml_get_name(tensor),
|
5748
6699
|
llama_format_tensor_shape(tensor).c_str(),
|
5749
6700
|
ggml_type_name(tensor->type));
|
@@ -5893,9 +6844,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }

-// TODO: after the GGUF PR, this likely won't work and needs to be updated
 static int llama_apply_lora_from_file_internal(
-    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
 ) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

@@ -5924,7 +6874,7 @@ static int llama_apply_lora_from_file_internal(
     int32_t lora_alpha;
     fin.read((char *) &lora_r, sizeof(lora_r));
     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
-    float scaling = (float)lora_alpha / (float)lora_r;
+    float scaling = scale * (float)lora_alpha / (float)lora_r;

     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);

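For a concrete sense of the new `scale` parameter: the factor applied to each LoRA delta is now `scale * lora_alpha / lora_r` instead of just `lora_alpha / lora_r`. A small worked example (values are illustrative, not taken from the diff):

```cpp
// Illustrative arithmetic only: alpha = 32, r = 16.
const float lora_alpha = 32.0f;
const float lora_r     = 16.0f;

const float old_scaling = lora_alpha / lora_r;          // 2.0  (previous behaviour)
const float new_scaling = 0.5f * lora_alpha / lora_r;   // 1.0  (with a user scale of 0.5)
```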
@@ -6140,9 +7090,10 @@ static int llama_apply_lora_from_file_internal(
|
|
6140
7090
|
ggml_set_name(r, "r_cpy");
|
6141
7091
|
}
|
6142
7092
|
|
6143
|
-
struct ggml_cgraph gf =
|
7093
|
+
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
7094
|
+
ggml_build_forward_expand(gf, r);
|
6144
7095
|
|
6145
|
-
ggml_graph_compute_helper(work_buffer,
|
7096
|
+
ggml_graph_compute_helper(work_buffer, gf, n_threads);
|
6146
7097
|
|
6147
7098
|
// we won't need these tensors again, reset the context to save memory
|
6148
7099
|
ggml_free(lora_ctx);
|
@@ -6171,27 +7122,16 @@ static int llama_apply_lora_from_file_internal(
 //
 // interface implementation
 //
-
-struct
-    struct llama_context_params result = {
-        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
-        /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/ 512,
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 0.0f,
-        /*.rope_freq_scale             =*/ 0.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
-        /*.low_vram                    =*/ false,
-        /*.mul_mat_q                   =*/ true,
-        /*.f16_kv                      =*/ true,
-        /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
-        /*.embedding                   =*/ false,
     };

 #ifdef GGML_USE_METAL
@@ -6201,6 +7141,24 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }

+struct llama_context_params llama_context_default_params() {
+    struct llama_context_params result = {
+        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
+        /*.n_ctx                       =*/ 512,
+        /*.n_batch                     =*/ 512,
+        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
+        /*.mul_mat_q                   =*/ true,
+        /*.f16_kv                      =*/ true,
+        /*.logits_all                  =*/ false,
+        /*.embedding                   =*/ false,
+    };
+
+    return result;
+}
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
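With the parameter struct split in two, model loading and context creation are now configured separately. A sketch of the resulting call sequence; the file name and sizes are placeholders, and `llama_backend_init`/`llama_free_model` are assumed from the public header of this version:

```cpp
llama_backend_init(/*numa=*/false);

llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = 0;                      // model-level options live here now

llama_model * model = llama_load_model_from_file("model.gguf", mparams);

llama_context_params cparams = llama_context_default_params();
cparams.n_ctx     = 2048;                      // context-level options live here
cparams.n_threads = 8;

llama_context * ctx = llama_new_context_with_model(model, cparams);

// ... tokenize, decode, sample ...

llama_free(ctx);
llama_free_model(model);
llama_backend_free();
```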
@@ -6256,13 +7214,11 @@ int64_t llama_time_us(void) {

 struct llama_model * llama_load_model_from_file(
         const char * path_model,
-
+        struct llama_model_params params) {
     ggml_time_init();

     llama_model * model = new llama_model;

-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -6279,9 +7235,9 @@ struct llama_model * llama_load_model_from_file(
         };
     }

-    if (!llama_model_load(path_model, *model, params.
-                params.main_gpu, params.tensor_split,
-                params.
+    if (!llama_model_load(path_model, *model, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split,
+                params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
         LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
         delete model;
@@ -6305,18 +7261,33 @@ struct llama_context * llama_new_context_with_model(
|
|
6305
7261
|
|
6306
7262
|
llama_context * ctx = new llama_context(*model);
|
6307
7263
|
|
7264
|
+
const auto & hparams = model->hparams;
|
7265
|
+
auto & cparams = ctx->cparams;
|
7266
|
+
|
7267
|
+
cparams.n_batch = params.n_batch;
|
7268
|
+
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
7269
|
+
cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
|
7270
|
+
cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
7271
|
+
cparams.n_threads = params.n_threads;
|
7272
|
+
cparams.n_threads_batch = params.n_threads_batch;
|
7273
|
+
cparams.mul_mat_q = params.mul_mat_q;
|
7274
|
+
|
6308
7275
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
6309
7276
|
params.seed = time(NULL);
|
6310
7277
|
}
|
6311
7278
|
|
7279
|
+
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
7280
|
+
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
7281
|
+
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
7282
|
+
|
6312
7283
|
ctx->rng = std::mt19937(params.seed);
|
6313
7284
|
ctx->logits_all = params.logits_all;
|
6314
7285
|
|
6315
7286
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
6316
7287
|
|
6317
7288
|
// reserve memory for context buffers
|
6318
|
-
if (!
|
6319
|
-
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type,
|
7289
|
+
if (!hparams.vocab_only) {
|
7290
|
+
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
|
6320
7291
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
6321
7292
|
llama_free(ctx);
|
6322
7293
|
return nullptr;
|
@@ -6327,11 +7298,9 @@ struct llama_context * llama_new_context_with_model(
|
|
6327
7298
|
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
6328
7299
|
}
|
6329
7300
|
|
6330
|
-
const auto & hparams = ctx->model.hparams;
|
6331
|
-
|
6332
7301
|
// resized during inference
|
6333
7302
|
if (params.logits_all) {
|
6334
|
-
ctx->logits.reserve(
|
7303
|
+
ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
|
6335
7304
|
} else {
|
6336
7305
|
ctx->logits.reserve(hparams.n_vocab);
|
6337
7306
|
}
|
@@ -6349,26 +7318,29 @@ struct llama_context * llama_new_context_with_model(
|
|
6349
7318
|
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
6350
7319
|
|
6351
7320
|
// build worst-case graph
|
6352
|
-
int n_tokens = std::min(
|
6353
|
-
int n_past =
|
7321
|
+
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
7322
|
+
int n_past = cparams.n_ctx - n_tokens;
|
6354
7323
|
llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
6355
|
-
ggml_cgraph * gf = llama_build_graph(*ctx, &token,
|
7324
|
+
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
7325
|
+
|
6356
7326
|
#ifdef GGML_USE_METAL
|
6357
|
-
if (
|
7327
|
+
if (model->n_gpu_layers > 0) {
|
7328
|
+
ggml_metal_log_set_callback(llama_log_callback_default, NULL);
|
7329
|
+
|
6358
7330
|
ctx->ctx_metal = ggml_metal_init(1);
|
6359
7331
|
if (!ctx->ctx_metal) {
|
6360
7332
|
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
6361
7333
|
llama_free(ctx);
|
6362
7334
|
return NULL;
|
6363
7335
|
}
|
6364
|
-
ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
6365
|
-
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
7336
|
+
//ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
7337
|
+
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
6366
7338
|
}
|
6367
7339
|
#endif
|
6368
7340
|
// measure memory requirements for the graph
|
6369
7341
|
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
6370
7342
|
|
6371
|
-
LLAMA_LOG_INFO("%s: compute buffer total size =
|
7343
|
+
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
6372
7344
|
|
6373
7345
|
// recreate allocator with exact memory requirements
|
6374
7346
|
ggml_allocr_free(ctx->alloc);
|
@@ -6377,28 +7349,46 @@ struct llama_context * llama_new_context_with_model(
|
|
6377
7349
|
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
|
6378
7350
|
#ifdef GGML_USE_METAL
|
6379
7351
|
if (ctx->ctx_metal) {
|
6380
|
-
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
7352
|
+
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
6381
7353
|
}
|
6382
7354
|
#endif
|
6383
7355
|
#ifdef GGML_USE_CUBLAS
|
6384
|
-
|
6385
|
-
|
6386
|
-
|
6387
|
-
|
6388
|
-
|
6389
|
-
|
7356
|
+
ggml_cuda_set_scratch_size(alloc_size);
|
7357
|
+
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
7358
|
+
|
7359
|
+
// calculate total VRAM usage
|
7360
|
+
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
|
7361
|
+
if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
|
7362
|
+
size += ggml_nbytes(t);
|
7363
|
+
}
|
7364
|
+
};
|
7365
|
+
size_t model_vram_size = 0;
|
7366
|
+
for (const auto & kv : model->tensors_by_name) {
|
7367
|
+
add_tensor(kv.second, model_vram_size);
|
6390
7368
|
}
|
7369
|
+
|
7370
|
+
size_t kv_vram_size = 0;
|
7371
|
+
add_tensor(ctx->kv_self.k, kv_vram_size);
|
7372
|
+
add_tensor(ctx->kv_self.v, kv_vram_size);
|
7373
|
+
|
7374
|
+
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
7375
|
+
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
7376
|
+
|
7377
|
+
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
|
7378
|
+
total_vram_size / 1024.0 / 1024.0,
|
7379
|
+
model_vram_size / 1024.0 / 1024.0,
|
7380
|
+
ctx_vram_size / 1024.0 / 1024.0);
|
6391
7381
|
#endif
|
6392
7382
|
}
|
6393
7383
|
|
6394
7384
|
#ifdef GGML_USE_METAL
|
6395
|
-
if (
|
7385
|
+
if (model->n_gpu_layers > 0) {
|
6396
7386
|
// this allocates all Metal resources and memory buffers
|
6397
7387
|
|
6398
7388
|
void * data_ptr = NULL;
|
6399
7389
|
size_t data_size = 0;
|
6400
7390
|
|
6401
|
-
if (
|
7391
|
+
if (ctx->model.mapping) {
|
6402
7392
|
data_ptr = ctx->model.mapping->addr;
|
6403
7393
|
data_size = ctx->model.mapping->size;
|
6404
7394
|
} else {
|
@@ -6417,11 +7407,8 @@ struct llama_context * llama_new_context_with_model(
|
|
6417
7407
|
return NULL; \
|
6418
7408
|
}
|
6419
7409
|
|
6420
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data",
|
6421
|
-
|
6422
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
|
6423
|
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
|
6424
|
-
|
7410
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
7411
|
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
|
6425
7412
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
|
6426
7413
|
#undef LLAMA_METAL_CHECK_BUF
|
6427
7414
|
}
|
@@ -6433,8 +7420,10 @@ struct llama_context * llama_new_context_with_model(
|
|
6433
7420
|
|
6434
7421
|
if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
|
6435
7422
|
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
|
6436
|
-
|
6437
|
-
|
7423
|
+
// TODO: needs fix after #3228
|
7424
|
+
GGML_ASSERT(false && "not implemented");
|
7425
|
+
//const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
|
7426
|
+
//while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
|
6438
7427
|
llama_backend_free();
|
6439
7428
|
exit(1);
|
6440
7429
|
}
|
@@ -6443,63 +7432,41 @@ struct llama_context * llama_new_context_with_model(
|
|
6443
7432
|
return ctx;
|
6444
7433
|
}
|
6445
7434
|
|
6446
|
-
static struct llama_context * llama_init_from_file(
|
6447
|
-
const char * path_model,
|
6448
|
-
struct llama_context_params params) {
|
6449
|
-
struct llama_model * model = llama_load_model_from_file(path_model, params);
|
6450
|
-
if (!model) {
|
6451
|
-
return nullptr;
|
6452
|
-
}
|
6453
|
-
|
6454
|
-
struct llama_context * ctx = llama_new_context_with_model(model, params);
|
6455
|
-
ctx->model_owner = true;
|
6456
|
-
|
6457
|
-
return ctx;
|
6458
|
-
}
|
6459
|
-
|
6460
7435
|
void llama_free(struct llama_context * ctx) {
|
6461
7436
|
delete ctx;
|
6462
7437
|
}
|
6463
7438
|
|
6464
|
-
|
6465
|
-
return
|
7439
|
+
const llama_model * llama_get_model(const struct llama_context * ctx) {
|
7440
|
+
return &ctx->model;
|
6466
7441
|
}
|
6467
7442
|
|
6468
7443
|
int llama_n_ctx(const struct llama_context * ctx) {
|
6469
|
-
return
|
6470
|
-
}
|
6471
|
-
|
6472
|
-
int llama_n_ctx_train(const struct llama_context * ctx) {
|
6473
|
-
return llama_model_n_ctx_train(&ctx->model);
|
7444
|
+
return ctx->cparams.n_ctx;
|
6474
7445
|
}
|
6475
7446
|
|
6476
|
-
|
6477
|
-
return
|
7447
|
+
enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
7448
|
+
return model->vocab.type;
|
6478
7449
|
}
|
6479
7450
|
|
6480
|
-
|
6481
|
-
return ctx->model.vocab.type;
|
6482
|
-
}
|
6483
|
-
|
6484
|
-
int llama_model_n_vocab(const struct llama_model * model) {
|
7451
|
+
int llama_n_vocab(const struct llama_model * model) {
|
6485
7452
|
return model->vocab.id_to_token.size();
|
6486
7453
|
}
|
6487
7454
|
|
6488
|
-
int
|
6489
|
-
return model->hparams.n_ctx;
|
6490
|
-
}
|
6491
|
-
|
6492
|
-
int llama_model_n_ctx_train(const struct llama_model * model) {
|
7455
|
+
int llama_n_ctx_train(const struct llama_model * model) {
|
6493
7456
|
return model->hparams.n_ctx_train;
|
6494
7457
|
}
|
6495
7458
|
|
6496
|
-
int
|
7459
|
+
int llama_n_embd(const struct llama_model * model) {
|
6497
7460
|
return model->hparams.n_embd;
|
6498
7461
|
}
|
6499
7462
|
|
7463
|
+
float llama_rope_freq_scale_train(const struct llama_model * model) {
|
7464
|
+
return model->hparams.rope_freq_scale_train;
|
7465
|
+
}
|
7466
|
+
|
6500
7467
|
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
6501
7468
|
return snprintf(buf, buf_size, "%s %s %s",
|
6502
|
-
model->
|
7469
|
+
llama_model_arch_name(model->arch).c_str(),
|
6503
7470
|
llama_model_type_name(model->type),
|
6504
7471
|
llama_model_ftype_name(model->ftype).c_str());
|
6505
7472
|
}
|
@@ -6520,6 +7487,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }

+struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+    return ggml_get_tensor(model->ctx, name);
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
@@ -6533,18 +7504,18 @@ int llama_model_quantize(
     }
 }

-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }

-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
@@ -6552,16 +7523,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
 }

 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->kv_self.
+    return ctx->kv_self.head;
 }

-
+void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
+}

-void
-
-
-
-
+void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+}
+
+void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
+}
+
+void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }

 // Returns the *maximum* size of the state
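A sketch of how the new sequence-level cache calls compose; sequence ids and positions are illustrative and `ctx` is an existing context:

```cpp
// Rewind sequence 0 back to position 32, i.e. forget cells in [32, 2048).
llama_kv_cache_seq_rm(ctx, /*seq_id=*/0, /*p0=*/32, /*p1=*/2048);

// Fork: share the remaining prefix [0, 32) with a second sequence.
llama_kv_cache_seq_cp(ctx, /*seq_id_src=*/0, /*seq_id_dst=*/1, /*p0=*/0, /*p1=*/32);

// Keep only sequence 1 and drop every other sequence from the cache.
llama_kv_cache_seq_keep(ctx, /*seq_id=*/1);
```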
@@ -6699,36 +7681,40 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
6699
7681
|
{
|
6700
7682
|
const auto & kv_self = ctx->kv_self;
|
6701
7683
|
const auto & hparams = ctx->model.hparams;
|
6702
|
-
const
|
6703
|
-
const int n_embd = hparams.n_embd_gqa();
|
6704
|
-
const int n_ctx = hparams.n_ctx;
|
7684
|
+
const auto & cparams = ctx->cparams;
|
6705
7685
|
|
6706
|
-
const
|
6707
|
-
const
|
7686
|
+
const auto n_layer = hparams.n_layer;
|
7687
|
+
const auto n_embd = hparams.n_embd_gqa();
|
7688
|
+
const auto n_ctx = cparams.n_ctx;
|
6708
7689
|
|
6709
|
-
|
6710
|
-
|
7690
|
+
const size_t kv_buf_size = kv_self.buf.size;
|
7691
|
+
const uint32_t kv_head = kv_self.head;
|
7692
|
+
const uint32_t kv_size = kv_self.size;
|
6711
7693
|
|
6712
|
-
|
7694
|
+
data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
|
7695
|
+
data_ctx->write(&kv_head, sizeof(kv_head));
|
7696
|
+
data_ctx->write(&kv_size, sizeof(kv_size));
|
7697
|
+
|
7698
|
+
if (kv_buf_size) {
|
6713
7699
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
6714
7700
|
|
6715
7701
|
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
6716
7702
|
ggml_cgraph gf{};
|
6717
7703
|
|
6718
|
-
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
|
7704
|
+
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
|
6719
7705
|
std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
|
6720
7706
|
kout3d->data = kout3d_data.data();
|
6721
7707
|
|
6722
|
-
ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
|
7708
|
+
ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
|
6723
7709
|
std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
|
6724
7710
|
vout3d->data = vout3d_data.data();
|
6725
7711
|
|
6726
7712
|
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
6727
|
-
n_embd,
|
7713
|
+
n_embd, kv_head, n_layer,
|
6728
7714
|
elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
|
6729
7715
|
|
6730
7716
|
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
|
6731
|
-
|
7717
|
+
kv_head, n_embd, n_layer,
|
6732
7718
|
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
6733
7719
|
|
6734
7720
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
@@ -6742,6 +7728,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
6742
7728
|
data_ctx->write(kout3d_data.data(), kout3d_data.size());
|
6743
7729
|
data_ctx->write(vout3d_data.data(), vout3d_data.size());
|
6744
7730
|
}
|
7731
|
+
|
7732
|
+
for (uint32_t i = 0; i < kv_size; ++i) {
|
7733
|
+
const auto & cell = kv_self.cells[i];
|
7734
|
+
|
7735
|
+
const llama_pos pos = cell.pos;
|
7736
|
+
const size_t seq_id_size = cell.seq_id.size();
|
7737
|
+
|
7738
|
+
data_ctx->write(&pos, sizeof(pos));
|
7739
|
+
data_ctx->write(&seq_id_size, sizeof(seq_id_size));
|
7740
|
+
|
7741
|
+
for (auto seq_id : cell.seq_id) {
|
7742
|
+
data_ctx->write(&seq_id, sizeof(seq_id));
|
7743
|
+
}
|
7744
|
+
}
|
6745
7745
|
}
|
6746
7746
|
}
|
6747
7747
|
|
@@ -6807,38 +7807,42 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
6807
7807
|
{
|
6808
7808
|
const auto & kv_self = ctx->kv_self;
|
6809
7809
|
const auto & hparams = ctx->model.hparams;
|
7810
|
+
const auto & cparams = ctx->cparams;
|
7811
|
+
|
6810
7812
|
const int n_layer = hparams.n_layer;
|
6811
7813
|
const int n_embd = hparams.n_embd_gqa();
|
6812
|
-
const int n_ctx =
|
7814
|
+
const int n_ctx = cparams.n_ctx;
|
6813
7815
|
|
6814
|
-
size_t
|
6815
|
-
|
7816
|
+
size_t kv_buf_size;
|
7817
|
+
uint32_t kv_head;
|
7818
|
+
uint32_t kv_size;
|
6816
7819
|
|
6817
|
-
memcpy(&
|
6818
|
-
memcpy(&
|
7820
|
+
memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
|
7821
|
+
memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
|
7822
|
+
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
6819
7823
|
|
6820
|
-
if (
|
6821
|
-
GGML_ASSERT(kv_self.buf.size ==
|
7824
|
+
if (kv_buf_size) {
|
7825
|
+
GGML_ASSERT(kv_self.buf.size == kv_buf_size);
|
6822
7826
|
|
6823
7827
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
6824
7828
|
|
6825
7829
|
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
6826
7830
|
ggml_cgraph gf{};
|
6827
7831
|
|
6828
|
-
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
|
7832
|
+
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
|
6829
7833
|
kin3d->data = (void *) inp;
|
6830
7834
|
inp += ggml_nbytes(kin3d);
|
6831
7835
|
|
6832
|
-
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
|
7836
|
+
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
|
6833
7837
|
vin3d->data = (void *) inp;
|
6834
7838
|
inp += ggml_nbytes(vin3d);
|
6835
7839
|
|
6836
7840
|
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
6837
|
-
n_embd,
|
7841
|
+
n_embd, kv_head, n_layer,
|
6838
7842
|
elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
|
6839
7843
|
|
6840
7844
|
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
|
6841
|
-
|
7845
|
+
kv_head, n_embd, n_layer,
|
6842
7846
|
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
6843
7847
|
|
6844
7848
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
@@ -6848,7 +7852,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
6848
7852
|
ggml_free(cpy_ctx);
|
6849
7853
|
}
|
6850
7854
|
|
6851
|
-
ctx->kv_self.
|
7855
|
+
ctx->kv_self.head = kv_head;
|
7856
|
+
ctx->kv_self.size = kv_size;
|
7857
|
+
|
7858
|
+
ctx->kv_self.cells.resize(kv_size);
|
7859
|
+
|
7860
|
+
for (uint32_t i = 0; i < kv_size; ++i) {
|
7861
|
+
llama_pos pos;
|
7862
|
+
size_t seq_id_size;
|
7863
|
+
|
7864
|
+
memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
|
7865
|
+
memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
|
7866
|
+
|
7867
|
+
ctx->kv_self.cells[i].pos = pos;
|
7868
|
+
|
7869
|
+
llama_seq_id seq_id;
|
7870
|
+
|
7871
|
+
for (size_t j = 0; j < seq_id_size; ++j) {
|
7872
|
+
memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
|
7873
|
+
ctx->kv_self.cells[i].seq_id.insert(seq_id);
|
7874
|
+
}
|
7875
|
+
}
|
6852
7876
|
}
|
6853
7877
|
|
6854
7878
|
const size_t nread = inp - src;
|
@@ -6943,64 +7967,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
|
6943
7967
|
|
6944
7968
|
int llama_eval(
|
6945
7969
|
struct llama_context * ctx,
|
6946
|
-
|
6947
|
-
|
6948
|
-
int n_past
|
6949
|
-
|
6950
|
-
if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
|
6951
|
-
LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
|
6952
|
-
return 1;
|
6953
|
-
}
|
7970
|
+
llama_token * tokens,
|
7971
|
+
int32_t n_tokens,
|
7972
|
+
int n_past) {
|
7973
|
+
llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
|
6954
7974
|
|
6955
|
-
|
6956
|
-
|
6957
|
-
|
6958
|
-
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
|
6959
|
-
ctx->has_evaluated_once = true;
|
7975
|
+
const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
|
7976
|
+
if (ret < 0) {
|
7977
|
+
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
6960
7978
|
}
|
6961
7979
|
|
6962
|
-
return
|
7980
|
+
return ret;
|
6963
7981
|
}
|
6964
7982
|
|
6965
7983
|
int llama_eval_embd(
|
6966
7984
|
struct llama_context * ctx,
|
6967
|
-
|
6968
|
-
|
6969
|
-
int n_past
|
6970
|
-
|
6971
|
-
if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
|
6972
|
-
LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
|
6973
|
-
return 1;
|
6974
|
-
}
|
7985
|
+
float * embd,
|
7986
|
+
int32_t n_tokens,
|
7987
|
+
int n_past) {
|
7988
|
+
llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
|
6975
7989
|
|
6976
|
-
|
6977
|
-
|
6978
|
-
|
6979
|
-
|
6980
|
-
|
7990
|
+
llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
|
7991
|
+
|
7992
|
+
const int ret = llama_decode_internal(*ctx, batch);
|
7993
|
+
if (ret < 0) {
|
7994
|
+
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
6981
7995
|
}
|
6982
7996
|
|
6983
|
-
return
|
7997
|
+
return ret;
|
6984
7998
|
}
|
6985
7999
|
|
6986
|
-
|
6987
|
-
|
6988
|
-
|
8000
|
+
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
8001
|
+
ctx->cparams.n_threads = n_threads;
|
8002
|
+
ctx->cparams.n_threads_batch = n_threads_batch;
|
8003
|
+
}
|
8004
|
+
|
8005
|
+
struct llama_batch llama_batch_get_one(
|
8006
|
+
llama_token * tokens,
|
8007
|
+
int32_t n_tokens,
|
8008
|
+
llama_pos pos_0,
|
8009
|
+
llama_seq_id seq_id) {
|
8010
|
+
return {
|
8011
|
+
/*n_tokens =*/ n_tokens,
|
8012
|
+
/*tokens =*/ tokens,
|
8013
|
+
/*embd =*/ nullptr,
|
8014
|
+
/*pos =*/ nullptr,
|
8015
|
+
/*seq_id =*/ nullptr,
|
8016
|
+
/*logits =*/ nullptr,
|
8017
|
+
/*all_pos_0 =*/ pos_0,
|
8018
|
+
/*all_pos_1 =*/ 1,
|
8019
|
+
/*all_seq_id =*/ seq_id,
|
8020
|
+
};
|
8021
|
+
}
|
6989
8022
|
|
6990
|
-
|
8023
|
+
struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
|
8024
|
+
llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
|
6991
8025
|
|
6992
|
-
if (
|
6993
|
-
|
6994
|
-
|
8026
|
+
if (embd) {
|
8027
|
+
batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
|
8028
|
+
} else {
|
8029
|
+
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
|
6995
8030
|
}
|
6996
8031
|
|
6997
|
-
|
8032
|
+
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
|
8033
|
+
batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
|
8034
|
+
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
|
8035
|
+
|
8036
|
+
return batch;
|
8037
|
+
}
|
8038
|
+
|
8039
|
+
void llama_batch_free(struct llama_batch batch) {
|
8040
|
+
if (batch.token) free(batch.token);
|
8041
|
+
if (batch.embd) free(batch.embd);
|
8042
|
+
if (batch.pos) free(batch.pos);
|
8043
|
+
if (batch.seq_id) free(batch.seq_id);
|
8044
|
+
if (batch.logits) free(batch.logits);
|
8045
|
+
}
|
8046
|
+
|
8047
|
+
int llama_decode(
|
8048
|
+
struct llama_context * ctx,
|
8049
|
+
struct llama_batch batch) {
|
8050
|
+
const int ret = llama_decode_internal(*ctx, batch);
|
8051
|
+
if (ret < 0) {
|
8052
|
+
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
8053
|
+
}
|
8054
|
+
|
8055
|
+
return ret;
|
6998
8056
|
}
|
6999
8057
|
|
7000
8058
|
float * llama_get_logits(struct llama_context * ctx) {
|
7001
8059
|
return ctx->logits.data();
|
7002
8060
|
}
|
7003
8061
|
|
8062
|
+
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
8063
|
+
return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
|
8064
|
+
}
|
8065
|
+
|
7004
8066
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
7005
8067
|
return ctx->embedding.data();
|
7006
8068
|
}
|
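Taken together with the batch API introduced below, the typical replacement for `llama_eval` is `llama_decode` plus an explicit `llama_batch`. A sketch, assuming `llama.h` is included, `ctx` is a valid context, and the flat per-token `seq_id`/`logits` arrays of this version of the struct; the single BOS token stands in for a real tokenized prompt:

```cpp
#include <vector>

std::vector<llama_token> prompt = { llama_token_bos(ctx) }; // stand-in for a tokenized prompt
const int n = (int) prompt.size();

llama_batch batch = llama_batch_init(n, /*embd=*/0);
batch.n_tokens = n;
for (int i = 0; i < n; ++i) {
    batch.token [i] = prompt[i];
    batch.pos   [i] = i;
    batch.seq_id[i] = 0;   // flat per-token sequence id in this version
    batch.logits[i] = 0;   // no logits for prompt tokens...
}
batch.logits[n - 1] = 1;   // ...except the last one

if (llama_decode(ctx, batch) != 0) {
    // handle decode failure
}

const float * logits = llama_get_logits_ith(ctx, n - 1);
(void) logits;

llama_batch_free(batch);
```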
@@ -7028,18 +8090,24 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
|
|
7028
8090
|
llama_token llama_token_nl(const struct llama_context * ctx) {
|
7029
8091
|
return ctx->model.vocab.linefeed_id;
|
7030
8092
|
}
|
8093
|
+
llama_token llama_token_prefix(const struct llama_context * ctx) {
|
8094
|
+
return ctx->model.vocab.special_prefix_id;
|
8095
|
+
}
|
7031
8096
|
|
7032
|
-
|
7033
|
-
|
7034
|
-
const char * text,
|
7035
|
-
int text_len,
|
7036
|
-
llama_token * tokens,
|
7037
|
-
int n_max_tokens,
|
7038
|
-
bool add_bos) {
|
7039
|
-
return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
|
8097
|
+
llama_token llama_token_middle(const struct llama_context * ctx) {
|
8098
|
+
return ctx->model.vocab.special_middle_id;
|
7040
8099
|
}
|
7041
8100
|
|
7042
|
-
|
8101
|
+
llama_token llama_token_suffix(const struct llama_context * ctx) {
|
8102
|
+
return ctx->model.vocab.special_suffix_id;
|
8103
|
+
}
|
8104
|
+
|
8105
|
+
llama_token llama_token_eot(const struct llama_context * ctx) {
|
8106
|
+
return ctx->model.vocab.special_eot_id;
|
8107
|
+
}
|
8108
|
+
|
8109
|
+
|
8110
|
+
int llama_tokenize(
|
7043
8111
|
const struct llama_model * model,
|
7044
8112
|
const char * text,
|
7045
8113
|
int text_len,
|
@@ -7060,39 +8128,66 @@ int llama_tokenize_with_model(
|
|
7060
8128
|
return res.size();
|
7061
8129
|
}
|
7062
8130
|
|
7063
|
-
|
7064
|
-
|
8131
|
+
static std::string llama_decode_text(const std::string & text) {
|
8132
|
+
std::string decoded_text;
|
8133
|
+
auto unicode_sequences = codepoints_from_utf8(text);
|
8134
|
+
for (auto& unicode_sequence : unicode_sequences) {
|
8135
|
+
decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
|
8136
|
+
}
|
8137
|
+
|
8138
|
+
return decoded_text;
|
7065
8139
|
}
|
7066
8140
|
|
7067
8141
|
// does not write null-terminator to buf
|
7068
|
-
int
|
7069
|
-
if (0 <= token && token <
|
7070
|
-
|
7071
|
-
|
7072
|
-
if (
|
8142
|
+
int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
|
8143
|
+
if (0 <= token && token < llama_n_vocab(model)) {
|
8144
|
+
switch (llama_vocab_get_type(model->vocab)) {
|
8145
|
+
case LLAMA_VOCAB_TYPE_SPM: {
|
8146
|
+
if (llama_is_normal_token(model->vocab, token)) {
|
8147
|
+
std::string result = model->vocab.id_to_token[token].text;
|
7073
8148
|
llama_unescape_whitespace(result);
|
8149
|
+
if (length < (int) result.length()) {
|
8150
|
+
return -result.length();
|
8151
|
+
}
|
8152
|
+
memcpy(buf, result.c_str(), result.length());
|
8153
|
+
return result.length();
|
8154
|
+
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
|
8155
|
+
if (length < 3) {
|
8156
|
+
return -3;
|
8157
|
+
}
|
8158
|
+
memcpy(buf, "\xe2\x96\x85", 3);
|
8159
|
+
return 3;
|
8160
|
+
} else if (llama_is_control_token(model->vocab, token)) {
|
8161
|
+
;
|
8162
|
+
} else if (llama_is_byte_token(model->vocab, token)) {
|
8163
|
+
if (length < 1) {
|
8164
|
+
return -1;
|
8165
|
+
}
|
8166
|
+
buf[0] = llama_token_to_byte(model->vocab, token);
|
8167
|
+
return 1;
|
8168
|
+
} else {
|
8169
|
+
GGML_ASSERT(false);
|
7074
8170
|
}
|
7075
|
-
|
7076
|
-
|
7077
|
-
|
7078
|
-
|
7079
|
-
|
7080
|
-
|
7081
|
-
|
7082
|
-
|
7083
|
-
|
7084
|
-
|
7085
|
-
|
7086
|
-
|
7087
|
-
|
7088
|
-
|
7089
|
-
|
7090
|
-
} else if (llama_is_byte_token(model->vocab, token)) {
|
7091
|
-
if (length < 1) {
|
7092
|
-
return -1;
|
8171
|
+
break;
|
8172
|
+
}
|
8173
|
+
case LLAMA_VOCAB_TYPE_BPE: {
|
8174
|
+
if (llama_is_normal_token(model->vocab, token)) {
|
8175
|
+
std::string result = model->vocab.id_to_token[token].text;
|
8176
|
+
result = llama_decode_text(result);
|
8177
|
+
if (length < (int) result.length()) {
|
8178
|
+
return -result.length();
|
8179
|
+
}
|
8180
|
+
memcpy(buf, result.c_str(), result.length());
|
8181
|
+
return result.length();
|
8182
|
+
} else if (llama_is_control_token(model->vocab, token)) {
|
8183
|
+
;
|
8184
|
+
} else {
|
8185
|
+
GGML_ASSERT(false);
|
7093
8186
|
}
|
7094
|
-
|
7095
|
-
|
8187
|
+
break;
|
8188
|
+
}
|
8189
|
+
default:
|
8190
|
+
GGML_ASSERT(false);
|
7096
8191
|
}
|
7097
8192
|
}
|
7098
8193
|
return 0;
|
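The negative return value used above is the required buffer length, so a caller can size the buffer in two passes. A sketch of a small wrapper; the helper name is hypothetical:

```cpp
#include <string>
#include <vector>

static std::string token_to_piece(const llama_model * model, llama_token id) {
    std::vector<char> buf(8);
    int n = llama_token_to_piece(model, id, buf.data(), (int) buf.size());
    if (n < 0) {                       // buffer too small: -n is the needed length
        buf.resize((size_t) -n);
        n = llama_token_to_piece(model, id, buf.data(), (int) buf.size());
    }
    return std::string(buf.data(), n > 0 ? (size_t) n : 0);
}
```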
@@ -7119,14 +8214,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);

     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s: load time = %
-    LLAMA_LOG_INFO("%s: sample time = %
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s: eval time = %
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {
@@ -7194,12 +8289,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
     return ctx->model.tensors_by_name;
 }

-void llama_log_set(
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 }

-static void llama_log_internal_v(
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);
     char buffer[128];
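Since the callback type is now `ggml_log_callback`, a single custom logger can be shared between ggml and llama. A sketch, assuming `llama.h` is included:

```cpp
#include <cstdio>

// Only forward errors; drop info/warn output.
static void my_logger(ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_ERROR) {
        std::fputs(text, stderr);
    }
}

// Somewhere during start-up:
// llama_log_set(my_logger, nullptr);
```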
@@ -7216,14 +8311,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
     va_end(args_copy);
 }

-static void llama_log_internal(
+static void llama_log_internal(ggml_log_level level, const char * format, ...) {
     va_list args;
     va_start(args, format);
     llama_log_internal_v(level, format, args);
     va_end(args);
 }

-static void llama_log_callback_default(
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
     fputs(text, stderr);