llama_cpp 0.5.3 → 0.6.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -72,6 +72,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <set>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -92,12 +93,12 @@
 //

 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal (
-static void llama_log_callback_default(
+static void llama_log_internal (ggml_log_level level, const char* format, ...);
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

-#define LLAMA_LOG_INFO(...) llama_log_internal(
-#define LLAMA_LOG_WARN(...) llama_log_internal(
-#define LLAMA_LOG_ERROR(...) llama_log_internal(
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

 //
 // helpers
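Note: the hunk above changes the logging declarations so that every LLAMA_LOG_* macro forwards an explicit ggml_log_level to llama_log_internal. A minimal, self-contained sketch of that pattern (hypothetical names, not the gem's API) would be:

    #include <cstdarg>
    #include <cstdio>

    enum log_level { LOG_LEVEL_INFO, LOG_LEVEL_WARN, LOG_LEVEL_ERROR };

    // variadic sink: the level is pinned by the macro, the format args come from the caller
    static void log_internal(log_level level, const char * fmt, ...) {
        static const char * tag[] = { "I", "W", "E" };
        va_list args;
        va_start(args, fmt);
        std::fprintf(stderr, "[%s] ", tag[level]);
        std::vfprintf(stderr, fmt, args);
        va_end(args);
    }

    #define LOG_INFO(...)  log_internal(LOG_LEVEL_INFO , __VA_ARGS__)
    #define LOG_WARN(...)  log_internal(LOG_LEVEL_WARN , __VA_ARGS__)
    #define LOG_ERROR(...) log_internal(LOG_LEVEL_ERROR, __VA_ARGS__)

    int main() {
        LOG_INFO("loaded %d tensors\n", 291);    // -> "[I] loaded 291 tensors"
        LOG_ERROR("missing key: %s\n", "name");  // -> "[E] missing key: name"
    }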
@@ -166,13 +167,13 @@ enum llm_arch {
 };

 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama"
-    { LLM_ARCH_FALCON, "falcon"
-    { LLM_ARCH_GPT2, "gpt2"
-    { LLM_ARCH_GPTJ, "gptj"
-    { LLM_ARCH_GPTNEOX, "gptneox"
-    { LLM_ARCH_MPT, "mpt"
-    { LLM_ARCH_BAICHUAN, "baichuan"
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
     { LLM_ARCH_STARCODER, "starcoder" },
 };

@@ -221,16 +222,16 @@ enum llm_kv {
 };

 static std::map<llm_kv, std::string> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture"
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"
-    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment"
-    { LLM_KV_GENERAL_NAME, "general.name"
-    { LLM_KV_GENERAL_AUTHOR, "general.author"
-    { LLM_KV_GENERAL_URL, "general.url"
-    { LLM_KV_GENERAL_DESCRIPTION, "general.description"
-    { LLM_KV_GENERAL_LICENSE, "general.license"
-    { LLM_KV_GENERAL_SOURCE_URL, "general.
-    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.
+    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+    { LLM_KV_GENERAL_NAME, "general.name" },
+    { LLM_KV_GENERAL_AUTHOR, "general.author" },
+    { LLM_KV_GENERAL_URL, "general.url" },
+    { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+    { LLM_KV_GENERAL_LICENSE, "general.license" },
+    { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -448,7 +449,7 @@ struct LLM_TN {
 //

 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
     const std::string skey(key); \
     const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
@@ -460,7 +461,7 @@ struct LLM_TN {
     } else if (req) { \
         throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
     } \
-}
+} while (0)

 //
 // ggml helpers
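Note: the only change in the two hunks above is wrapping the multi-statement body of GGUF_GET_KEY in do { ... } while (0). This is the usual C/C++ idiom for making a macro expand to a single statement; a small illustration (not code from the gem) of why the plain-braces version is fragile:

    #include <cstdio>

    #define REPORT_BRACES(msg)   { std::puts("report:"); std::puts(msg); }
    #define REPORT_DO_WHILE(msg) do { std::puts("report:"); std::puts(msg); } while (0)

    int main() {
        bool verbose = true;

        // With REPORT_BRACES the semicolon after the macro closes the if-statement,
        // so attaching an else branch would be a compile error:
        //     if (verbose) REPORT_BRACES("hi"); else std::puts("quiet");   // does not compile

        // The do/while(0) form behaves like one statement, so this is fine:
        if (verbose) REPORT_DO_WHILE("hi"); else std::puts("quiet");
        return 0;
    }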
@@ -881,10 +882,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default

 static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -899,7 +900,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to

 struct llama_state {
     // We save the log callback globally
-
+    ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };

@@ -925,9 +926,9 @@ static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;

 struct llama_hparams {
+    bool vocab_only;
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_ctx; // context size used during inference
     uint32_t n_embd;
     uint32_t n_head;
     uint32_t n_head_kv;
@@ -938,8 +939,8 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;

-    float
-    float
+    float rope_freq_base_train;
+    float rope_freq_scale_train;

     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -956,15 +957,18 @@ struct llama_hparams {
     uint32_t n_embd_gqa() const {
         return n_embd/n_gqa();
     }
+};

-
-
-
-
-
-
-
-
+struct llama_cparams {
+    uint32_t n_ctx; // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_threads; // number of threads to use for generation
+    uint32_t n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    bool mul_mat_q;
 };

 struct llama_layer {
@@ -999,7 +1003,29 @@ struct llama_layer {
     struct ggml_tensor * b3; // ffn_up
 };

+struct llama_kv_cell {
+    llama_pos pos = -1;
+    llama_pos delta = 0;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+};
+
+// ring-buffer of cached KV data
 struct llama_kv_cache {
+    bool has_shift = false;
+
+    uint32_t head = 0;
+    uint32_t size = 0;
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;

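Note: the new llama_kv_cell above lets a single cache cell belong to several sequences at once via its seq_id set, with pos = -1 marking a free cell. A self-contained sketch of how such cells answer "which sequences use this slot" (the kv_cell type here is a trimmed stand-in, not the gem's struct):

    #include <cstdint>
    #include <cstdio>
    #include <set>
    #include <vector>

    using llama_pos    = int32_t;
    using llama_seq_id = int32_t;

    struct kv_cell {
        llama_pos pos = -1;                // -1 == free
        std::set<llama_seq_id> seq_id;     // sequences referencing this cell

        bool has_seq_id(llama_seq_id id) const { return seq_id.count(id) > 0; }
    };

    int main() {
        std::vector<kv_cell> cells(8);     // a tiny 8-cell cache

        // sequence 0 fills cells 0..2; sequence 1 shares cell 0 (e.g. a shared prompt token)
        for (int i = 0; i < 3; ++i) { cells[i].pos = i; cells[i].seq_id.insert(0); }
        cells[0].seq_id.insert(1);

        std::printf("cell 0 used by seq 1: %d\n", (int) cells[0].has_seq_id(1)); // 1
        std::printf("cell 2 used by seq 1: %d\n", (int) cells[2].has_seq_id(1)); // 0
        std::printf("cell 3 free: %d\n", (int) (cells[3].pos == -1));            // 1
    }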
@@ -1007,8 +1033,6 @@ struct llama_kv_cache {

     llama_buffer buf;

-    int n; // number of tokens currently in the cache
-
     ~llama_kv_cache() {
         if (ctx) {
             ggml_free(ctx);
@@ -1122,11 +1146,8 @@ struct llama_model {
 };

 struct llama_context {
-    llama_context(const llama_model & model) : model(model),
+    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
-        if (model_owner) {
-            delete &model;
-        }
 #ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
@@ -1137,27 +1158,26 @@ struct llama_context {
         }
     }

+    llama_cparams cparams;
+
+    const llama_model & model;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
+
     std::mt19937 rng;

     bool has_evaluated_once = false;

+    int64_t t_start_us;
+    int64_t t_load_us;
     int64_t t_sample_us = 0;
-    int64_t t_eval_us = 0;
     int64_t t_p_eval_us = 0;
+    int64_t t_eval_us = 0;

     int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_eval = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
-    const llama_model & model;
-
-    bool model_owner = false;
-
-    int64_t t_load_us;
-    int64_t t_start_us;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
+    int32_t n_eval = 0; // number of eval calls

     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
@@ -1192,16 +1212,23 @@ static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
         struct llama_kv_cache & cache,
         ggml_type wtype,
-
+        uint32_t n_ctx,
         int n_gpu_layers) {
-    const
-    const
+    const uint32_t n_embd = hparams.n_embd_gqa();
+    const uint32_t n_layer = hparams.n_layer;

     const int64_t n_mem = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;

+    cache.has_shift = false;
+
+    cache.head = 0;
+    cache.size = n_ctx;
+
+    cache.cells.clear();
+    cache.cells.resize(n_ctx);
+
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-    cache.n = 0;

     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -1222,17 +1249,154 @@ static bool llama_kv_cache_init(

     (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
-
+    size_t vram_kv_cache = 0;
+
+    if (n_gpu_layers > (int)n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
+        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.v);
     }
-    if (n_gpu_layers > n_layer + 2) {
+    if (n_gpu_layers > (int)n_layer + 2) {
         ggml_cuda_assign_buffers_no_scratch(cache.k);
+        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.k);
+    }
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 #endif // GGML_USE_CUBLAS

     return true;
 }

+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+static bool llama_kv_cache_find_slot(
+        struct llama_kv_cache & cache,
+        const struct llama_batch & batch) {
+    const uint32_t n_ctx = cache.size;
+    const uint32_t n_tokens = batch.n_tokens;
+
+    if (n_tokens > n_ctx) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+        return false;
+    }
+
+    uint32_t n_tested = 0;
+
+    while (true) {
+        if (cache.head + n_tokens > n_ctx) {
+            cache.head = 0;
+            n_tested += n_ctx - cache.head;
+            continue;
+        }
+
+        bool found = true;
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            if (cache.cells[cache.head + i].pos >= 0) {
+                found = false;
+                cache.head += i + 1;
+                n_tested += i + 1;
+                break;
+            }
+        }
+
+        if (found) {
+            break;
+        }
+
+        if (n_tested >= n_ctx) {
+            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+            return false;
+        }
+    }
+
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        cache.cells[cache.head + i].pos = batch.pos[i];
+        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+    }
+
+    return true;
+}
+
+// find how many cells are currently in use
+static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size - 1; i > 0; --i) {
+        if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+            return i + 1;
+        }
+    }
+
+    return 0;
+}
+
+static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+    if (c0 < 0) c0 = 0;
+    if (c1 < 0) c1 = cache.size;
+
+    for (int32_t i = c0; i < c1; ++i) {
+        cache.cells[i].pos = -1;
+        cache.cells[i].seq_id.clear();
+    }
+}
+
+static void llama_kv_cache_seq_rm(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.erase(seq_id);
+            if (cache.cells[i].seq_id.empty()) {
+                cache.cells[i].pos = -1;
+            }
+        }
+    }
+}
+
+static void llama_kv_cache_seq_cp(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id_src,
+        llama_seq_id seq_id_dst,
+        llama_pos p0,
+        llama_pos p1) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.insert(seq_id_dst);
+        }
+    }
+}
+
+static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (!cache.cells[i].has_seq_id(seq_id)) {
+            cache.cells[i].pos = -1;
+            cache.cells[i].seq_id.clear();
+        }
+    }
+}
+
+static void llama_kv_cache_seq_shift(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        llama_pos delta) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].pos += delta;
+            if (cache.cells[i].pos < 0) {
+                cache.cells[i].pos = -1;
+                cache.cells[i].seq_id.clear();
+            } else {
+                cache.has_shift = true;
+                cache.cells[i].delta = delta;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
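Note: the block above adds the cell-based cache helpers (llama_kv_cache_find_slot, ..._seq_rm, ..._seq_cp, ..._seq_keep, ..._seq_shift). The heart of llama_kv_cache_find_slot is a wrap-around scan for a run of free cells starting at head; a simplified, standalone sketch of that search (hypothetical cell/find_slot names, not the gem's API):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct cell { int32_t pos = -1; };     // pos < 0 means the cell is free

    // scan the ring buffer from 'head' for n_tokens consecutive free cells,
    // wrapping to index 0 when there is not enough room before the end
    static bool find_slot(std::vector<cell> & cells, uint32_t & head, uint32_t n_tokens) {
        const uint32_t n_ctx = (uint32_t) cells.size();
        if (n_tokens > n_ctx) return false;

        uint32_t n_tested = 0;
        while (true) {
            if (head + n_tokens > n_ctx) {
                n_tested += n_ctx - head;
                head = 0;
                continue;
            }
            bool found = true;
            for (uint32_t i = 0; i < n_tokens; i++) {
                if (cells[head + i].pos >= 0) {    // occupied: resume the scan after it
                    found = false;
                    head     += i + 1;
                    n_tested += i + 1;
                    break;
                }
            }
            if (found) return true;                // head now points at the free run
            if (n_tested >= n_ctx) return false;   // whole cache scanned, no room
        }
    }

    int main() {
        std::vector<cell> cells(8);
        uint32_t head = 0;
        cells[0].pos = 0; cells[1].pos = 1;        // first two cells already occupied
        if (find_slot(cells, head, 3)) {
            std::printf("found a 3-cell slot at head=%u\n", head);  // head=2
        }
    }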
@@ -1554,7 +1718,7 @@ struct llama_model_loader {
                 lmlock->grow_to(size_lock);
             }
             break;
-#
+#ifdef GGML_USE_CUBLAS
         case GGML_BACKEND_GPU:
         case GGML_BACKEND_GPU_SPLIT:
             // old code:
@@ -1587,7 +1751,15 @@ struct llama_model_loader {
 // load LLaMA models
 //

-static std::string
+static std::string llama_model_arch_name(llm_arch arch) {
+    auto it = LLM_ARCH_NAMES.find(arch);
+    if (it == LLM_ARCH_NAMES.end()) {
+        return "unknown";
+    }
+    return it->second;
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1643,10 +1815,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {

 static void llm_load_hparams(
         llama_model_loader & ml,
-        llama_model & model
-        int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale) {
+        llama_model & model) {
     struct gguf_context * ctx = ml.ctx_gguf;

     const auto kv = LLM_KV(model.arch);
@@ -1657,29 +1826,25 @@ static void llm_load_hparams(
|
|
1657
1826
|
GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
|
1658
1827
|
|
1659
1828
|
// get hparams kv
|
1660
|
-
GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY,
|
1661
|
-
GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32,
|
1662
|
-
GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32,
|
1663
|
-
GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32,
|
1664
|
-
GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32,
|
1665
|
-
GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32,
|
1829
|
+
GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
|
1830
|
+
GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
|
1831
|
+
GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
|
1832
|
+
GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
|
1833
|
+
GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
|
1834
|
+
GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
|
1666
1835
|
|
1667
1836
|
// n_head_kv is optional, default to n_head
|
1668
1837
|
hparams.n_head_kv = hparams.n_head;
|
1669
1838
|
GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
|
1670
1839
|
|
1671
1840
|
// rope_freq_base (optional)
|
1672
|
-
|
1673
|
-
|
1674
|
-
GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
|
1675
|
-
}
|
1841
|
+
hparams.rope_freq_base_train = 10000.0f;
|
1842
|
+
GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
|
1676
1843
|
|
1677
1844
|
// rope_freq_scale (inverse of the kv) is optional
|
1678
|
-
|
1679
|
-
|
1680
|
-
|
1681
|
-
rope_freq_scale = 1.0f/ropescale;
|
1682
|
-
}
|
1845
|
+
float ropescale = 1.0f;
|
1846
|
+
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
|
1847
|
+
hparams.rope_freq_scale_train = 1.0f/ropescale;
|
1683
1848
|
|
1684
1849
|
// sanity check for n_rot (optional)
|
1685
1850
|
{
|
@@ -1743,13 +1908,9 @@ static void llm_load_hparams(
|
|
1743
1908
|
}
|
1744
1909
|
} break;
|
1745
1910
|
default: (void)0;
|
1746
|
-
}
|
1911
|
+
}
|
1747
1912
|
|
1748
1913
|
model.ftype = ml.ftype;
|
1749
|
-
|
1750
|
-
hparams.n_ctx = n_ctx;
|
1751
|
-
hparams.rope_freq_base = rope_freq_base;
|
1752
|
-
hparams.rope_freq_scale = rope_freq_scale;
|
1753
1914
|
}
|
1754
1915
|
|
1755
1916
|
// TODO: This should probably be in llama.h
|
@@ -1770,20 +1931,18 @@ static void llm_load_vocab(
|
|
1770
1931
|
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
1771
1932
|
}
|
1772
1933
|
|
1934
|
+
const float * scores = nullptr;
|
1773
1935
|
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
1774
|
-
if (score_idx
|
1775
|
-
|
1936
|
+
if (score_idx != -1) {
|
1937
|
+
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
1776
1938
|
}
|
1777
1939
|
|
1778
|
-
const
|
1779
|
-
|
1940
|
+
const int * toktypes = nullptr;
|
1780
1941
|
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
1781
|
-
if (toktype_idx
|
1782
|
-
|
1942
|
+
if (toktype_idx != -1) {
|
1943
|
+
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
1783
1944
|
}
|
1784
1945
|
|
1785
|
-
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
1786
|
-
|
1787
1946
|
// determine vocab type
|
1788
1947
|
{
|
1789
1948
|
std::string tokenizer_name;
|
@@ -1851,8 +2010,8 @@ static void llm_load_vocab(
|
|
1851
2010
|
|
1852
2011
|
auto & token_data = vocab.id_to_token[i];
|
1853
2012
|
token_data.text = std::move(word);
|
1854
|
-
token_data.score = scores[i];
|
1855
|
-
token_data.type = (llama_token_type) toktypes[i];
|
2013
|
+
token_data.score = scores ? scores[i] : 0.0f;
|
2014
|
+
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
|
1856
2015
|
}
|
1857
2016
|
|
1858
2017
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
@@ -1875,31 +2034,30 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
1875
2034
|
const auto & vocab = model.vocab;
|
1876
2035
|
|
1877
2036
|
// hparams
|
1878
|
-
LLAMA_LOG_INFO("%s: format
|
1879
|
-
LLAMA_LOG_INFO("%s: arch
|
1880
|
-
LLAMA_LOG_INFO("%s: vocab type
|
1881
|
-
LLAMA_LOG_INFO("%s: n_vocab
|
1882
|
-
LLAMA_LOG_INFO("%s: n_merges
|
1883
|
-
LLAMA_LOG_INFO("%s: n_ctx_train
|
1884
|
-
LLAMA_LOG_INFO("%s:
|
1885
|
-
LLAMA_LOG_INFO("%s:
|
1886
|
-
LLAMA_LOG_INFO("%s:
|
1887
|
-
LLAMA_LOG_INFO("%s:
|
1888
|
-
LLAMA_LOG_INFO("%s:
|
1889
|
-
LLAMA_LOG_INFO("%s:
|
1890
|
-
LLAMA_LOG_INFO("%s:
|
1891
|
-
LLAMA_LOG_INFO("%s:
|
1892
|
-
LLAMA_LOG_INFO("%s:
|
1893
|
-
LLAMA_LOG_INFO("%s:
|
1894
|
-
LLAMA_LOG_INFO("%s:
|
1895
|
-
LLAMA_LOG_INFO("%s:
|
1896
|
-
LLAMA_LOG_INFO("%s: model
|
1897
|
-
LLAMA_LOG_INFO("%s: model
|
1898
|
-
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
|
2037
|
+
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
2038
|
+
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
2039
|
+
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
|
2040
|
+
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
2041
|
+
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
2042
|
+
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
2043
|
+
LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
|
2044
|
+
LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
|
2045
|
+
LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
|
2046
|
+
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
|
2047
|
+
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
|
2048
|
+
LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
2049
|
+
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
|
2050
|
+
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
2051
|
+
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
2052
|
+
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
2053
|
+
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
2054
|
+
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
|
2055
|
+
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
|
2056
|
+
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
|
1899
2057
|
if (ml.n_bytes < GB) {
|
1900
|
-
LLAMA_LOG_INFO("%s: model size
|
2058
|
+
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
1901
2059
|
} else {
|
1902
|
-
LLAMA_LOG_INFO("%s: model size
|
2060
|
+
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
1903
2061
|
}
|
1904
2062
|
|
1905
2063
|
// general kv
|
@@ -1917,13 +2075,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
1917
2075
|
static void llm_load_tensors(
|
1918
2076
|
llama_model_loader & ml,
|
1919
2077
|
llama_model & model,
|
1920
|
-
int n_batch,
|
1921
2078
|
int n_gpu_layers,
|
1922
2079
|
int main_gpu,
|
1923
2080
|
const float * tensor_split,
|
1924
|
-
const bool mul_mat_q,
|
1925
|
-
bool low_vram,
|
1926
|
-
ggml_type memory_type,
|
1927
2081
|
bool use_mlock,
|
1928
2082
|
llama_progress_callback progress_callback,
|
1929
2083
|
void * progress_callback_user_data) {
|
@@ -1962,11 +2116,9 @@ static void llm_load_tensors(
|
|
1962
2116
|
}
|
1963
2117
|
|
1964
2118
|
(void) main_gpu;
|
1965
|
-
|
1966
|
-
#if defined(GGML_USE_CUBLAS)
|
2119
|
+
#ifdef GGML_USE_CUBLAS
|
1967
2120
|
LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
|
1968
2121
|
ggml_cuda_set_main_device(main_gpu);
|
1969
|
-
ggml_cuda_set_mul_mat_q(mul_mat_q);
|
1970
2122
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
1971
2123
|
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
|
1972
2124
|
#elif defined(GGML_USE_CLBLAST)
|
@@ -2001,9 +2153,9 @@ static void llm_load_tensors(
|
|
2001
2153
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2002
2154
|
// on Windows however this is detrimental unless everything is on the GPU
|
2003
2155
|
#ifndef _WIN32
|
2004
|
-
backend_norm =
|
2156
|
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2005
2157
|
#else
|
2006
|
-
backend_norm =
|
2158
|
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2007
2159
|
#endif // _WIN32
|
2008
2160
|
|
2009
2161
|
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
@@ -2067,9 +2219,9 @@ static void llm_load_tensors(
|
|
2067
2219
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2068
2220
|
// on Windows however this is detrimental unless everything is on the GPU
|
2069
2221
|
#ifndef _WIN32
|
2070
|
-
backend_norm =
|
2222
|
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2071
2223
|
#else
|
2072
|
-
backend_norm =
|
2224
|
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2073
2225
|
#endif // _WIN32
|
2074
2226
|
|
2075
2227
|
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
@@ -2137,9 +2289,9 @@ static void llm_load_tensors(
|
|
2137
2289
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2138
2290
|
// on Windows however this is detrimental unless everything is on the GPU
|
2139
2291
|
#ifndef _WIN32
|
2140
|
-
backend_norm =
|
2292
|
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2141
2293
|
#else
|
2142
|
-
backend_norm =
|
2294
|
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2143
2295
|
#endif // _WIN32
|
2144
2296
|
|
2145
2297
|
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
@@ -2214,9 +2366,9 @@ static void llm_load_tensors(
|
|
2214
2366
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2215
2367
|
// on Windows however this is detrimental unless everything is on the GPU
|
2216
2368
|
#ifndef _WIN32
|
2217
|
-
backend_norm =
|
2369
|
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2218
2370
|
#else
|
2219
|
-
backend_norm =
|
2371
|
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2220
2372
|
#endif // _WIN32
|
2221
2373
|
|
2222
2374
|
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
@@ -2281,27 +2433,19 @@ static void llm_load_tensors(
|
|
2281
2433
|
} break;
|
2282
2434
|
default:
|
2283
2435
|
throw std::runtime_error("unknown architecture");
|
2284
|
-
}
|
2436
|
+
}
|
2285
2437
|
}
|
2286
2438
|
|
2287
2439
|
ml.done_getting_tensors();
|
2288
2440
|
|
2289
2441
|
// print memory requirements
|
2290
2442
|
{
|
2291
|
-
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
2292
|
-
|
2293
2443
|
// this is the total memory required to run the inference
|
2294
2444
|
size_t mem_required =
|
2295
2445
|
ctx_size +
|
2296
2446
|
mmapped_size - vram_weights; // weights in VRAM not in memory
|
2297
2447
|
|
2298
|
-
|
2299
|
-
const size_t mem_required_state = scale*hparams.kv_size();
|
2300
|
-
|
2301
|
-
LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
2302
|
-
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
2303
|
-
|
2304
|
-
(void) n_batch;
|
2448
|
+
LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
|
2305
2449
|
|
2306
2450
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
2307
2451
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
@@ -2310,36 +2454,17 @@ static void llm_load_tensors(
|
|
2310
2454
|
if (n_gpu_layers > (int) hparams.n_layer) {
|
2311
2455
|
LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
|
2312
2456
|
}
|
2313
|
-
size_t vram_kv_cache = 0;
|
2314
2457
|
|
2315
2458
|
#ifdef GGML_USE_CUBLAS
|
2316
2459
|
const int max_backend_supported_layers = hparams.n_layer + 3;
|
2317
|
-
const int max_offloadable_layers =
|
2318
|
-
if (n_gpu_layers > (int) hparams.n_layer + 1) {
|
2319
|
-
if (low_vram) {
|
2320
|
-
LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
2321
|
-
} else {
|
2322
|
-
LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
|
2323
|
-
vram_kv_cache += hparams.kv_size() / 2;
|
2324
|
-
}
|
2325
|
-
}
|
2326
|
-
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
2327
|
-
if (low_vram) {
|
2328
|
-
LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
2329
|
-
} else {
|
2330
|
-
LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
|
2331
|
-
vram_kv_cache += hparams.kv_size() / 2;
|
2332
|
-
}
|
2333
|
-
}
|
2460
|
+
const int max_offloadable_layers = hparams.n_layer + 3;
|
2334
2461
|
#elif defined(GGML_USE_CLBLAST)
|
2335
2462
|
const int max_backend_supported_layers = hparams.n_layer + 1;
|
2336
2463
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
2337
2464
|
#endif // GGML_USE_CUBLAS
|
2338
2465
|
|
2339
|
-
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
|
2340
|
-
|
2341
|
-
LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
|
2342
|
-
__func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
|
2466
|
+
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
2467
|
+
LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
2343
2468
|
#else
|
2344
2469
|
(void) n_gpu_layers;
|
2345
2470
|
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
@@ -2352,7 +2477,7 @@ static void llm_load_tensors(
|
|
2352
2477
|
}
|
2353
2478
|
|
2354
2479
|
(void) tensor_split;
|
2355
|
-
#
|
2480
|
+
#ifdef GGML_USE_CUBLAS
|
2356
2481
|
{
|
2357
2482
|
ggml_cuda_set_tensor_split(tensor_split);
|
2358
2483
|
}
|
@@ -2374,29 +2499,24 @@ static void llm_load_tensors(
|
|
2374
2499
|
static bool llama_model_load(
|
2375
2500
|
const std::string & fname,
|
2376
2501
|
llama_model & model,
|
2377
|
-
int n_ctx,
|
2378
|
-
int n_batch,
|
2379
2502
|
int n_gpu_layers,
|
2380
2503
|
int main_gpu,
|
2381
2504
|
const float * tensor_split,
|
2382
|
-
const bool mul_mat_q,
|
2383
|
-
float rope_freq_base,
|
2384
|
-
float rope_freq_scale,
|
2385
|
-
bool low_vram,
|
2386
|
-
ggml_type memory_type,
|
2387
2505
|
bool use_mmap,
|
2388
2506
|
bool use_mlock,
|
2389
2507
|
bool vocab_only,
|
2390
2508
|
llama_progress_callback progress_callback,
|
2391
2509
|
void *progress_callback_user_data) {
|
2392
2510
|
try {
|
2393
|
-
|
2511
|
+
llama_model_loader ml(fname, use_mmap);
|
2512
|
+
|
2513
|
+
model.hparams.vocab_only = vocab_only;
|
2394
2514
|
|
2395
|
-
llm_load_arch (
|
2396
|
-
llm_load_hparams(
|
2397
|
-
llm_load_vocab (
|
2515
|
+
llm_load_arch (ml, model);
|
2516
|
+
llm_load_hparams(ml, model);
|
2517
|
+
llm_load_vocab (ml, model);
|
2398
2518
|
|
2399
|
-
llm_load_print_meta(
|
2519
|
+
llm_load_print_meta(ml, model);
|
2400
2520
|
|
2401
2521
|
if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
2402
2522
|
throw std::runtime_error("vocab size mismatch");
|
@@ -2408,8 +2528,8 @@ static bool llama_model_load(
|
|
2408
2528
|
}
|
2409
2529
|
|
2410
2530
|
llm_load_tensors(
|
2411
|
-
|
2412
|
-
main_gpu, tensor_split,
|
2531
|
+
ml, model, n_gpu_layers,
|
2532
|
+
main_gpu, tensor_split,
|
2413
2533
|
use_mlock, progress_callback, progress_callback_user_data);
|
2414
2534
|
} catch (const std::exception & err) {
|
2415
2535
|
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
@@ -2421,17 +2541,10 @@ static bool llama_model_load(
|
|
2421
2541
|
|
2422
2542
|
static struct ggml_cgraph * llm_build_llama(
|
2423
2543
|
llama_context & lctx,
|
2424
|
-
const
|
2425
|
-
const float * embd,
|
2426
|
-
int n_tokens,
|
2427
|
-
int n_past) {
|
2428
|
-
|
2429
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
2430
|
-
|
2431
|
-
const int N = n_tokens;
|
2432
|
-
|
2544
|
+
const llama_batch & batch) {
|
2433
2545
|
const auto & model = lctx.model;
|
2434
2546
|
const auto & hparams = model.hparams;
|
2547
|
+
const auto & cparams = lctx.cparams;
|
2435
2548
|
|
2436
2549
|
const auto & kv_self = lctx.kv_self;
|
2437
2550
|
|
@@ -2439,7 +2552,7 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2439
2552
|
|
2440
2553
|
const int64_t n_embd = hparams.n_embd;
|
2441
2554
|
const int64_t n_layer = hparams.n_layer;
|
2442
|
-
const int64_t n_ctx =
|
2555
|
+
const int64_t n_ctx = cparams.n_ctx;
|
2443
2556
|
const int64_t n_head = hparams.n_head;
|
2444
2557
|
const int64_t n_head_kv = hparams.n_head_kv;
|
2445
2558
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -2447,12 +2560,20 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2447
2560
|
|
2448
2561
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
2449
2562
|
|
2450
|
-
const float freq_base =
|
2451
|
-
const float freq_scale =
|
2563
|
+
const float freq_base = cparams.rope_freq_base;
|
2564
|
+
const float freq_scale = cparams.rope_freq_scale;
|
2452
2565
|
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
2453
2566
|
|
2454
2567
|
const int n_gpu_layers = model.n_gpu_layers;
|
2455
2568
|
|
2569
|
+
const int32_t n_tokens = batch.n_tokens;
|
2570
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
2571
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
2572
|
+
|
2573
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
2574
|
+
|
2575
|
+
//printf("n_kv = %d\n", n_kv);
|
2576
|
+
|
2456
2577
|
auto & buf_compute = lctx.buf_compute;
|
2457
2578
|
|
2458
2579
|
struct ggml_init_params params = {
|
@@ -2470,12 +2591,12 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2470
2591
|
struct ggml_tensor * cur;
|
2471
2592
|
struct ggml_tensor * inpL;
|
2472
2593
|
|
2473
|
-
if (
|
2474
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
2594
|
+
if (batch.token) {
|
2595
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
2475
2596
|
|
2476
2597
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
2477
2598
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2478
|
-
memcpy(inp_tokens->data,
|
2599
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
2479
2600
|
}
|
2480
2601
|
ggml_set_name(inp_tokens, "inp_tokens");
|
2481
2602
|
|
@@ -2485,11 +2606,11 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2485
2606
|
GGML_ASSERT(false && "not implemented");
|
2486
2607
|
#endif
|
2487
2608
|
|
2488
|
-
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
2609
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
2489
2610
|
|
2490
2611
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
2491
2612
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2492
|
-
memcpy(inpL->data, embd,
|
2613
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
2493
2614
|
}
|
2494
2615
|
}
|
2495
2616
|
|
@@ -2498,9 +2619,6 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2498
2619
|
|
2499
2620
|
// offload functions set the tensor output backend to GPU
|
2500
2621
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
2501
|
-
//
|
2502
|
-
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
2503
|
-
// in that case ggml_cuda_assign_buffers has no effect
|
2504
2622
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
2505
2623
|
offload_func_t offload_func_kq = llama_nop;
|
2506
2624
|
offload_func_t offload_func_v = llama_nop;
|
@@ -2517,12 +2635,75 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2517
2635
|
}
|
2518
2636
|
#endif // GGML_USE_CUBLAS
|
2519
2637
|
|
2638
|
+
// KQ_scale
|
2520
2639
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
2640
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
2521
2641
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
2522
2642
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2523
|
-
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(
|
2643
|
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
|
2644
|
+
}
|
2645
|
+
|
2646
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
2647
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
2648
|
+
offload_func_kq(KQ_mask);
|
2649
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
2650
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
2651
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2652
|
+
float * data = (float *) KQ_mask->data;
|
2653
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
2654
|
+
|
2655
|
+
for (int h = 0; h < 1; ++h) {
|
2656
|
+
for (int j = 0; j < n_tokens; ++j) {
|
2657
|
+
const llama_pos pos = batch.pos[j];
|
2658
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
2659
|
+
|
2660
|
+
for (int i = 0; i < n_kv; ++i) {
|
2661
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
2662
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
2663
|
+
}
|
2664
|
+
}
|
2665
|
+
}
|
2666
|
+
}
|
2667
|
+
}
|
2668
|
+
|
2669
|
+
// KQ_pos - contains the positions
|
2670
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
2671
|
+
offload_func_kq(KQ_pos);
|
2672
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
2673
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
2674
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2675
|
+
int * data = (int *) KQ_pos->data;
|
2676
|
+
for (int i = 0; i < n_tokens; ++i) {
|
2677
|
+
data[i] = batch.pos[i];
|
2678
|
+
}
|
2679
|
+
}
|
2680
|
+
|
2681
|
+
// shift the entire K-cache if needed
|
2682
|
+
if (do_rope_shift) {
|
2683
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
2684
|
+
offload_func_kq(K_shift);
|
2685
|
+
ggml_set_name(K_shift, "K_shift");
|
2686
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
2687
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2688
|
+
int * data = (int *) K_shift->data;
|
2689
|
+
for (int i = 0; i < n_ctx; ++i) {
|
2690
|
+
data[i] = kv_self.cells[i].delta;
|
2691
|
+
}
|
2692
|
+
}
|
2693
|
+
|
2694
|
+
for (int il = 0; il < n_layer; ++il) {
|
2695
|
+
struct ggml_tensor * tmp =
|
2696
|
+
ggml_rope_custom_inplace(ctx0,
|
2697
|
+
ggml_view_3d(ctx0, kv_self.k,
|
2698
|
+
n_embd_head, n_head_kv, n_ctx,
|
2699
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
2700
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
2701
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
2702
|
+
K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
|
2703
|
+
offload_func_kq(tmp);
|
2704
|
+
ggml_build_forward_expand(gf, tmp);
|
2705
|
+
}
|
2524
2706
|
}
|
2525
|
-
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
2526
2707
|
|
2527
2708
|
for (int il = 0; il < n_layer; ++il) {
|
2528
2709
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
@@ -2560,33 +2741,33 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2560
2741
|
offload_func_kq(tmpq);
|
2561
2742
|
ggml_set_name(tmpq, "tmpq");
|
2562
2743
|
|
2563
|
-
struct ggml_tensor * Kcur =
|
2744
|
+
struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
2564
2745
|
offload_func_kq(Kcur);
|
2565
2746
|
ggml_set_name(Kcur, "Kcur");
|
2566
2747
|
|
2567
|
-
struct ggml_tensor * Qcur =
|
2748
|
+
struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
2568
2749
|
offload_func_kq(Qcur);
|
2569
2750
|
ggml_set_name(Qcur, "Qcur");
|
2570
2751
|
|
2571
2752
|
// store key and value to memory
|
2572
2753
|
{
|
2573
|
-
// compute the transposed [
|
2754
|
+
// compute the transposed [n_tokens, n_embd] V matrix
|
2574
2755
|
|
2575
2756
|
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
2576
2757
|
offload_func_v(tmpv);
|
2577
2758
|
ggml_set_name(tmpv, "tmpv");
|
2578
2759
|
|
2579
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa,
|
2760
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
2580
2761
|
offload_func_v(Vcur);
|
2581
2762
|
ggml_set_name(Vcur, "Vcur");
|
2582
2763
|
|
2583
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
2764
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
2584
2765
|
offload_func_kq(k);
|
2585
2766
|
ggml_set_name(k, "k");
|
2586
2767
|
|
2587
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
2768
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
2588
2769
|
( n_ctx)*ggml_element_size(kv_self.v),
|
2589
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
2770
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
2590
2771
|
offload_func_v(v);
|
2591
2772
|
ggml_set_name(v, "v");
|
2592
2773
|
|
@@ -2601,7 +2782,7 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2601
2782
|
|
2602
2783
|
struct ggml_tensor * K =
|
2603
2784
|
ggml_view_3d(ctx0, kv_self.k,
|
2604
|
-
n_embd_head,
|
2785
|
+
n_embd_head, n_kv, n_head_kv,
|
2605
2786
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
2606
2787
|
ggml_element_size(kv_self.k)*n_embd_head,
|
2607
2788
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -2614,25 +2795,25 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2614
2795
|
ggml_set_name(KQ, "KQ");
|
2615
2796
|
|
2616
2797
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
2617
|
-
// KQ_scaled shape [
|
2618
|
-
struct ggml_tensor * KQ_scaled =
|
2798
|
+
// KQ_scaled shape [n_kv, n_tokens, n_head, 1]
|
2799
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
2619
2800
|
offload_func_kq(KQ_scaled);
|
2620
2801
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
2621
2802
|
|
2622
2803
|
// KQ_masked = mask_past(KQ_scaled)
|
2623
|
-
struct ggml_tensor * KQ_masked =
|
2804
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
2624
2805
|
offload_func_kq(KQ_masked);
|
2625
2806
|
ggml_set_name(KQ_masked, "KQ_masked");
|
2626
2807
|
|
2627
2808
|
// KQ = soft_max(KQ_masked)
|
2628
|
-
struct ggml_tensor * KQ_soft_max =
|
2809
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
2629
2810
|
offload_func_v(KQ_soft_max);
|
2630
2811
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
2631
2812
|
|
2632
2813
|
// split cached V into n_head heads
|
2633
2814
|
struct ggml_tensor * V =
|
2634
2815
|
ggml_view_3d(ctx0, kv_self.v,
|
2635
|
-
|
2816
|
+
n_kv, n_embd_head, n_head_kv,
|
2636
2817
|
ggml_element_size(kv_self.v)*n_ctx,
|
2637
2818
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
2638
2819
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -2647,7 +2828,7 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2647
2828
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
2648
2829
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
2649
2830
|
// is there a better way?
|
2650
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type,
|
2831
|
+
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
|
2651
2832
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
2652
2833
|
#endif
|
2653
2834
|
|
@@ -2656,10 +2837,8 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2656
2837
|
offload_func_v(KQV_merged);
|
2657
2838
|
ggml_set_name(KQV_merged, "KQV_merged");
|
2658
2839
|
|
2659
|
-
// cur = KQV_merged.contiguous().view(n_embd,
|
2660
|
-
cur =
|
2661
|
-
KQV_merged,
|
2662
|
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
2840
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
2841
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
2663
2842
|
offload_func_v(cur);
|
2664
2843
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
2665
2844
|
|
@@ -2750,20 +2929,12 @@ static struct ggml_cgraph * llm_build_llama(
|
|
2750
2929
|
return gf;
|
2751
2930
|
}
|
2752
2931
|
|
2753
|
-
|
2754
2932
|
static struct ggml_cgraph * llm_build_baichaun(
|
2755
2933
|
llama_context & lctx,
|
2756
|
-
const
|
2757
|
-
const float * embd,
|
2758
|
-
int n_tokens,
|
2759
|
-
int n_past) {
|
2760
|
-
|
2761
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
2762
|
-
|
2763
|
-
const int N = n_tokens;
|
2764
|
-
|
2934
|
+
const llama_batch & batch) {
|
2765
2935
|
const auto & model = lctx.model;
|
2766
2936
|
const auto & hparams = model.hparams;
|
2937
|
+
const auto & cparams = lctx.cparams;
|
2767
2938
|
|
2768
2939
|
const auto & kv_self = lctx.kv_self;
|
2769
2940
|
|
@@ -2771,7 +2942,7 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2771
2942
|
|
2772
2943
|
const int64_t n_embd = hparams.n_embd;
|
2773
2944
|
const int64_t n_layer = hparams.n_layer;
|
2774
|
-
const int64_t n_ctx =
|
2945
|
+
const int64_t n_ctx = cparams.n_ctx;
|
2775
2946
|
const int64_t n_head = hparams.n_head;
|
2776
2947
|
const int64_t n_head_kv = hparams.n_head_kv;
|
2777
2948
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -2779,12 +2950,18 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2779
2950
|
|
2780
2951
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
2781
2952
|
|
2782
|
-
const float freq_base =
|
2783
|
-
const float freq_scale =
|
2953
|
+
const float freq_base = cparams.rope_freq_base;
|
2954
|
+
const float freq_scale = cparams.rope_freq_scale;
|
2784
2955
|
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
2785
2956
|
|
2786
2957
|
const int n_gpu_layers = model.n_gpu_layers;
|
2787
2958
|
|
2959
|
+
const int32_t n_tokens = batch.n_tokens;
|
2960
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
2961
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
2962
|
+
|
2963
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
2964
|
+
|
2788
2965
|
auto & buf_compute = lctx.buf_compute;
|
2789
2966
|
|
2790
2967
|
struct ggml_init_params params = {
|
@@ -2802,12 +2979,12 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2802
2979
|
struct ggml_tensor * cur;
|
2803
2980
|
struct ggml_tensor * inpL;
|
2804
2981
|
|
2805
|
-
if (
|
2806
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
2982
|
+
if (batch.token) {
|
2983
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
2807
2984
|
|
2808
2985
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
2809
2986
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2810
|
-
memcpy(inp_tokens->data,
|
2987
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
2811
2988
|
}
|
2812
2989
|
ggml_set_name(inp_tokens, "inp_tokens");
|
2813
2990
|
|
@@ -2817,11 +2994,11 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2817
2994
|
GGML_ASSERT(false && "not implemented");
|
2818
2995
|
#endif
|
2819
2996
|
|
2820
|
-
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
2997
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
2821
2998
|
|
2822
2999
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
2823
3000
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2824
|
-
memcpy(inpL->data, embd,
|
3001
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
2825
3002
|
}
|
2826
3003
|
}
|
2827
3004
|
|
@@ -2830,9 +3007,6 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2830
3007
|
|
2831
3008
|
// offload functions set the tensor output backend to GPU
|
2832
3009
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
2833
|
-
//
|
2834
|
-
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
2835
|
-
// in that case ggml_cuda_assign_buffers has no effect
|
2836
3010
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
2837
3011
|
offload_func_t offload_func_kq = llama_nop;
|
2838
3012
|
offload_func_t offload_func_v = llama_nop;
|
@@ -2849,12 +3023,75 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2849
3023
|
}
|
2850
3024
|
#endif // GGML_USE_CUBLAS
|
2851
3025
|
|
3026
|
+
// KQ_scale
|
2852
3027
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3028
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
2853
3029
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
2854
3030
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
2855
3031
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
2856
3032
|
}
|
2857
|
-
|
3033
|
+
|
3034
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3035
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3036
|
+
offload_func_kq(KQ_mask);
|
3037
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3038
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3039
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3040
|
+
float * data = (float *) KQ_mask->data;
|
3041
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3042
|
+
|
3043
|
+
for (int h = 0; h < 1; ++h) {
|
3044
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3045
|
+
const llama_pos pos = batch.pos[j];
|
3046
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3047
|
+
|
3048
|
+
for (int i = 0; i < n_kv; ++i) {
|
3049
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3050
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3051
|
+
}
|
3052
|
+
}
|
3053
|
+
}
|
3054
|
+
}
|
3055
|
+
}
|
3056
|
+
|
3057
|
+
// KQ_pos - contains the positions
|
3058
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3059
|
+
offload_func_kq(KQ_pos);
|
3060
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
3061
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
3062
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3063
|
+
int * data = (int *) KQ_pos->data;
|
3064
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3065
|
+
data[i] = batch.pos[i];
|
3066
|
+
}
|
3067
|
+
}
|
3068
|
+
|
3069
|
+
// shift the entire K-cache if needed
|
3070
|
+
if (do_rope_shift) {
|
3071
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
3072
|
+
offload_func_kq(K_shift);
|
3073
|
+
ggml_set_name(K_shift, "K_shift");
|
3074
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3075
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3076
|
+
int * data = (int *) K_shift->data;
|
3077
|
+
for (int i = 0; i < n_ctx; ++i) {
|
3078
|
+
data[i] = kv_self.cells[i].delta;
|
3079
|
+
}
|
3080
|
+
}
|
3081
|
+
|
3082
|
+
for (int il = 0; il < n_layer; ++il) {
|
3083
|
+
struct ggml_tensor * tmp =
|
3084
|
+
ggml_rope_custom_inplace(ctx0,
|
3085
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3086
|
+
n_embd_head, n_head_kv, n_ctx,
|
3087
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3088
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3089
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3090
|
+
K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
|
3091
|
+
offload_func_kq(tmp);
|
3092
|
+
ggml_build_forward_expand(gf, tmp);
|
3093
|
+
}
|
3094
|
+
}
|
2858
3095
|
|
2859
3096
|
for (int il = 0; il < n_layer; ++il) {
|
2860
3097
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
@@ -2896,12 +3133,12 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2896
3133
|
struct ggml_tensor * Qcur;
|
2897
3134
|
switch (model.type) {
|
2898
3135
|
case MODEL_7B:
|
2899
|
-
Kcur =
|
2900
|
-
Qcur =
|
3136
|
+
Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
3137
|
+
Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
2901
3138
|
break;
|
2902
3139
|
case MODEL_13B:
|
2903
|
-
Kcur
|
2904
|
-
Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head,
|
3140
|
+
Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
|
3141
|
+
Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
|
2905
3142
|
break;
|
2906
3143
|
default:
|
2907
3144
|
GGML_ASSERT(false);
|
@@ -2915,23 +3152,23 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2915
3152
|
|
2916
3153
|
// store key and value to memory
|
2917
3154
|
{
|
2918
|
-
// compute the transposed [
|
3155
|
+
// compute the transposed [n_tokens, n_embd] V matrix
|
2919
3156
|
|
2920
3157
|
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
2921
3158
|
offload_func_v(tmpv);
|
2922
3159
|
ggml_set_name(tmpv, "tmpv");
|
2923
3160
|
|
2924
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa,
|
3161
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
2925
3162
|
offload_func_v(Vcur);
|
2926
3163
|
ggml_set_name(Vcur, "Vcur");
|
2927
3164
|
|
2928
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
3165
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
2929
3166
|
offload_func_kq(k);
|
2930
3167
|
ggml_set_name(k, "k");
|
2931
3168
|
|
2932
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
3169
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
2933
3170
|
( n_ctx)*ggml_element_size(kv_self.v),
|
2934
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
3171
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
2935
3172
|
offload_func_v(v);
|
2936
3173
|
ggml_set_name(v, "v");
|
2937
3174
|
|
@@ -2946,7 +3183,7 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2946
3183
|
|
2947
3184
|
struct ggml_tensor * K =
|
2948
3185
|
ggml_view_3d(ctx0, kv_self.k,
|
2949
|
-
n_embd_head,
|
3186
|
+
n_embd_head, n_kv, n_head_kv,
|
2950
3187
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
2951
3188
|
ggml_element_size(kv_self.k)*n_embd_head,
|
2952
3189
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -2959,8 +3196,8 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
2959
3196
|
ggml_set_name(KQ, "KQ");
|
2960
3197
|
|
2961
3198
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
2962
|
-
// KQ_scaled shape [n_past +
|
2963
|
-
struct ggml_tensor * KQ_scaled =
|
3199
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
3200
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
2964
3201
|
offload_func_kq(KQ_scaled);
|
2965
3202
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
2966
3203
|
|
@@ -2969,58 +3206,44 @@ static struct ggml_cgraph * llm_build_baichaun(

             switch (model.type) {
                 case MODEL_7B:
-                    KQ_masked =
+                    KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
                     break;
                 case MODEL_13B:
-
+                    // TODO: replace with ggml_add()
+                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
                     ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
-                    KQ_masked =
+                    KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
                     break;
                 default:
                     GGML_ASSERT(false);
             }
-            // KQ_masked = mask_past(KQ_scaled)
-            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
-            // offload_func_kq(KQ_masked);
-            // ggml_set_name(KQ_masked, "KQ_masked");

             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max =
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");

             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-
+                        n_kv, n_embd_head, n_head_kv,
                         ggml_element_size(kv_self.v)*n_ctx,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
             offload_func_v(V);
             ggml_set_name(V, "V");

-#if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
-#else
-            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
-            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
-            // is there a better way?
-            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
-#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");

-            // cur = KQV_merged.contiguous().view(n_embd,
-            cur =
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");

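The masking path changes accordingly: instead of ggml_diag_mask_inf over a contiguous past, a precomputed KQ_mask is added to the scaled scores (for MODEL_13B after the ALiBi bias). The mask itself is built on the CPU from the cache cells, as the falcon and starcoder hunks later in this diff show; here is a standalone sketch of that construction with simplified stand-in types:

    // Minimal sketch of what the KQ_mask tensor carries: 0.0f where a query token at
    // position pos[j] in sequence seq_id[j] may attend to cache cell i, -INFINITY otherwise.
    // The cell layout mirrors the loops in this diff; the types are simplified stand-ins.
    #include <cmath>
    #include <vector>

    struct cell { int pos; int seq_id; };

    std::vector<float> build_kq_mask(const std::vector<cell> & cells,     // n_kv cache cells
                                     const std::vector<int>  & pos,       // per-token positions
                                     const std::vector<int>  & seq_id) {  // per-token sequence ids
        const size_t n_kv = cells.size(), n_tokens = pos.size();
        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (size_t j = 0; j < n_tokens; ++j) {
            for (size_t i = 0; i < n_kv; ++i) {
                if (cells[i].seq_id != seq_id[j] || cells[i].pos > pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }
        return mask;   // added to KQ_scaled before soft_max, as in the hunk above
    }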
@@ -3113,17 +3336,10 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
3113
3336
|
|
3114
3337
|
static struct ggml_cgraph * llm_build_falcon(
|
3115
3338
|
llama_context & lctx,
|
3116
|
-
const
|
3117
|
-
const float * embd,
|
3118
|
-
int n_tokens,
|
3119
|
-
int n_past) {
|
3120
|
-
|
3121
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
3122
|
-
|
3123
|
-
const int N = n_tokens;
|
3124
|
-
|
3339
|
+
const llama_batch & batch) {
|
3125
3340
|
const auto & model = lctx.model;
|
3126
3341
|
const auto & hparams = model.hparams;
|
3342
|
+
const auto & cparams = lctx.cparams;
|
3127
3343
|
|
3128
3344
|
const auto & kv_self = lctx.kv_self;
|
3129
3345
|
|
@@ -3131,7 +3347,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3131
3347
|
|
3132
3348
|
const int64_t n_embd = hparams.n_embd;
|
3133
3349
|
const int64_t n_layer = hparams.n_layer;
|
3134
|
-
const int64_t n_ctx =
|
3350
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3135
3351
|
const int64_t n_head = hparams.n_head;
|
3136
3352
|
const int64_t n_head_kv = hparams.n_head_kv;
|
3137
3353
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -3139,12 +3355,21 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3139
3355
|
|
3140
3356
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3141
3357
|
|
3142
|
-
const float freq_base =
|
3143
|
-
const float freq_scale =
|
3358
|
+
const float freq_base = cparams.rope_freq_base;
|
3359
|
+
const float freq_scale = cparams.rope_freq_scale;
|
3144
3360
|
const float norm_eps = hparams.f_norm_eps;
|
3145
3361
|
|
3146
3362
|
const int n_gpu_layers = model.n_gpu_layers;
|
3147
3363
|
|
3364
|
+
const int32_t n_tokens = batch.n_tokens;
|
3365
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3366
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3367
|
+
|
3368
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
3369
|
+
|
3370
|
+
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
|
3371
|
+
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
|
3372
|
+
|
3148
3373
|
auto & buf_compute = lctx.buf_compute;
|
3149
3374
|
|
3150
3375
|
struct ggml_init_params params = {
|
@@ -3162,12 +3387,12 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3162
3387
|
struct ggml_tensor * cur;
|
3163
3388
|
struct ggml_tensor * inpL;
|
3164
3389
|
|
3165
|
-
if (
|
3166
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
3390
|
+
if (batch.token) {
|
3391
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3167
3392
|
|
3168
3393
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3169
3394
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3170
|
-
memcpy(inp_tokens->data,
|
3395
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3171
3396
|
}
|
3172
3397
|
ggml_set_name(inp_tokens, "inp_tokens");
|
3173
3398
|
|
@@ -3177,11 +3402,11 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3177
3402
|
GGML_ASSERT(false && "not implemented");
|
3178
3403
|
#endif
|
3179
3404
|
|
3180
|
-
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
3405
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3181
3406
|
|
3182
3407
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
3183
3408
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3184
|
-
memcpy(inpL->data, embd,
|
3409
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
3185
3410
|
}
|
3186
3411
|
}
|
3187
3412
|
|
@@ -3190,9 +3415,6 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3190
3415
|
|
3191
3416
|
// offload functions set the tensor output backend to GPU
|
3192
3417
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
3193
|
-
//
|
3194
|
-
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
3195
|
-
// in that case ggml_cuda_assign_buffers has no effect
|
3196
3418
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
3197
3419
|
offload_func_t offload_func_kq = llama_nop;
|
3198
3420
|
offload_func_t offload_func_v = llama_nop;
|
@@ -3209,12 +3431,75 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3209
3431
|
}
|
3210
3432
|
#endif // GGML_USE_CUBLAS
|
3211
3433
|
|
3434
|
+
// KQ_scale
|
3212
3435
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3436
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3213
3437
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3214
3438
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3215
3439
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
3216
3440
|
}
|
3217
|
-
|
3441
|
+
|
3442
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3443
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3444
|
+
offload_func_kq(KQ_mask);
|
3445
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3446
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3447
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3448
|
+
float * data = (float *) KQ_mask->data;
|
3449
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3450
|
+
|
3451
|
+
for (int h = 0; h < 1; ++h) {
|
3452
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3453
|
+
const llama_pos pos = batch.pos[j];
|
3454
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3455
|
+
|
3456
|
+
for (int i = 0; i < n_kv; ++i) {
|
3457
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3458
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3459
|
+
}
|
3460
|
+
}
|
3461
|
+
}
|
3462
|
+
}
|
3463
|
+
}
|
3464
|
+
|
3465
|
+
// KQ_pos - contains the positions
|
3466
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3467
|
+
offload_func_kq(KQ_pos);
|
3468
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
3469
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
3470
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3471
|
+
int * data = (int *) KQ_pos->data;
|
3472
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3473
|
+
data[i] = batch.pos[i];
|
3474
|
+
}
|
3475
|
+
}
|
3476
|
+
|
3477
|
+
// shift the entire K-cache if needed
|
3478
|
+
if (do_rope_shift) {
|
3479
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
3480
|
+
offload_func_kq(K_shift);
|
3481
|
+
ggml_set_name(K_shift, "K_shift");
|
3482
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3483
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3484
|
+
int * data = (int *) K_shift->data;
|
3485
|
+
for (int i = 0; i < n_ctx; ++i) {
|
3486
|
+
data[i] = kv_self.cells[i].delta;
|
3487
|
+
}
|
3488
|
+
}
|
3489
|
+
|
3490
|
+
for (int il = 0; il < n_layer; ++il) {
|
3491
|
+
struct ggml_tensor * tmp =
|
3492
|
+
ggml_rope_custom_inplace(ctx0,
|
3493
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3494
|
+
n_embd_head, n_head_kv, n_ctx,
|
3495
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3496
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3497
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3498
|
+
K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
|
3499
|
+
offload_func_kq(tmp);
|
3500
|
+
ggml_build_forward_expand(gf, tmp);
|
3501
|
+
}
|
3502
|
+
}
|
3218
3503
|
|
3219
3504
|
for (int il = 0; il < n_layer; ++il) {
|
3220
3505
|
struct ggml_tensor * attn_norm;
|
@@ -3271,45 +3556,45 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3271
3556
|
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
|
3272
3557
|
// non-contiguous views is added for the rope operator
|
3273
3558
|
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
|
3274
|
-
ctx0, cur, n_embd_head, n_head,
|
3559
|
+
ctx0, cur, n_embd_head, n_head, n_tokens,
|
3275
3560
|
wsize * n_embd_head,
|
3276
3561
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3277
3562
|
0));
|
3278
3563
|
offload_func_kq(tmpq);
|
3279
3564
|
|
3280
3565
|
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
3281
|
-
ctx0, cur, n_embd_head, n_head_kv,
|
3566
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
3282
3567
|
wsize * n_embd_head,
|
3283
3568
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3284
3569
|
wsize * n_embd_head * n_head));
|
3285
3570
|
offload_func_kq(tmpk);
|
3286
3571
|
|
3287
3572
|
struct ggml_tensor * tmpv = ggml_view_3d(
|
3288
|
-
ctx0, cur, n_embd_head, n_head_kv,
|
3573
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
3289
3574
|
wsize * n_embd_head,
|
3290
3575
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3291
3576
|
wsize * n_embd_head * (n_head + n_head_kv));
|
3292
3577
|
offload_func_v(tmpv);
|
3293
3578
|
|
3294
3579
|
// using mode = 2 for neox mode
|
3295
|
-
struct ggml_tensor * Qcur =
|
3580
|
+
struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
3296
3581
|
offload_func_kq(Qcur);
|
3297
|
-
struct ggml_tensor * Kcur =
|
3582
|
+
struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
3298
3583
|
offload_func_kq(Kcur);
|
3299
3584
|
|
3300
3585
|
{
|
3301
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa,
|
3586
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
3302
3587
|
offload_func_v(Vcur);
|
3303
3588
|
offload_func_v(Vcur->src[0]->src[0]);
|
3304
3589
|
ggml_set_name(Vcur, "Vcur");
|
3305
3590
|
|
3306
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
3591
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3307
3592
|
offload_func_kq(k);
|
3308
3593
|
ggml_set_name(k, "k");
|
3309
3594
|
|
3310
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
3595
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3311
3596
|
( n_ctx)*ggml_element_size(kv_self.v),
|
3312
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
3597
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3313
3598
|
offload_func_v(v);
|
3314
3599
|
|
3315
3600
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
@@ -3322,7 +3607,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3322
3607
|
|
3323
3608
|
struct ggml_tensor * K =
|
3324
3609
|
ggml_view_3d(ctx0, kv_self.k,
|
3325
|
-
n_embd_head,
|
3610
|
+
n_embd_head, n_kv, n_head_kv,
|
3326
3611
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3327
3612
|
ggml_element_size(kv_self.k)*n_embd_head,
|
3328
3613
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -3333,21 +3618,21 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3333
3618
|
offload_func_kq(KQ);
|
3334
3619
|
ggml_set_name(KQ, "KQ");
|
3335
3620
|
|
3336
|
-
struct ggml_tensor * KQ_scaled =
|
3621
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
3337
3622
|
offload_func_kq(KQ_scaled);
|
3338
3623
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3339
3624
|
|
3340
|
-
struct ggml_tensor * KQ_masked =
|
3625
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
3341
3626
|
offload_func_kq(KQ_masked);
|
3342
3627
|
ggml_set_name(KQ_masked, "KQ_masked");
|
3343
3628
|
|
3344
|
-
struct ggml_tensor * KQ_soft_max =
|
3629
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
3345
3630
|
offload_func_v(KQ_soft_max);
|
3346
3631
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
3347
3632
|
|
3348
3633
|
struct ggml_tensor * V =
|
3349
3634
|
ggml_view_3d(ctx0, kv_self.v,
|
3350
|
-
|
3635
|
+
n_kv, n_embd_head, n_head_kv,
|
3351
3636
|
ggml_element_size(kv_self.v)*n_ctx,
|
3352
3637
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3353
3638
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -3362,7 +3647,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3362
3647
|
offload_func_v(KQV_merged);
|
3363
3648
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3364
3649
|
|
3365
|
-
cur =
|
3650
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3366
3651
|
offload_func_v(cur);
|
3367
3652
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3368
3653
|
|
@@ -3420,17 +3705,10 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3420
3705
|
|
3421
3706
|
static struct ggml_cgraph * llm_build_starcoder(
|
3422
3707
|
llama_context & lctx,
|
3423
|
-
const
|
3424
|
-
const float * embd,
|
3425
|
-
int n_tokens,
|
3426
|
-
int n_past) {
|
3427
|
-
|
3428
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
3429
|
-
|
3430
|
-
const int N = n_tokens;
|
3431
|
-
|
3708
|
+
const llama_batch & batch) {
|
3432
3709
|
const auto & model = lctx.model;
|
3433
3710
|
const auto & hparams = model.hparams;
|
3711
|
+
const auto & cparams = lctx.cparams;
|
3434
3712
|
|
3435
3713
|
const auto & kv_self = lctx.kv_self;
|
3436
3714
|
|
@@ -3438,7 +3716,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3438
3716
|
|
3439
3717
|
const int64_t n_embd = hparams.n_embd;
|
3440
3718
|
const int64_t n_layer = hparams.n_layer;
|
3441
|
-
const int64_t n_ctx =
|
3719
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3442
3720
|
const int64_t n_head = hparams.n_head;
|
3443
3721
|
const int64_t n_head_kv = hparams.n_head_kv;
|
3444
3722
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -3446,7 +3724,11 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3446
3724
|
|
3447
3725
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3448
3726
|
|
3449
|
-
const float norm_eps
|
3727
|
+
const float norm_eps = hparams.f_norm_eps;
|
3728
|
+
|
3729
|
+
const int32_t n_tokens = batch.n_tokens;
|
3730
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3731
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3450
3732
|
|
3451
3733
|
auto & buf_compute = lctx.buf_compute;
|
3452
3734
|
|
@@ -3467,12 +3749,12 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3467
3749
|
struct ggml_tensor * position;
|
3468
3750
|
struct ggml_tensor * inpL;
|
3469
3751
|
|
3470
|
-
if (
|
3471
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
3752
|
+
if (batch.token) {
|
3753
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3472
3754
|
|
3473
3755
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3474
3756
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3475
|
-
memcpy(inp_tokens->data,
|
3757
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3476
3758
|
}
|
3477
3759
|
ggml_set_name(inp_tokens, "inp_tokens");
|
3478
3760
|
|
@@ -3482,21 +3764,21 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3482
3764
|
GGML_ASSERT(false && "not implemented");
|
3483
3765
|
#endif
|
3484
3766
|
|
3485
|
-
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
3767
|
+
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3486
3768
|
|
3487
3769
|
ggml_allocr_alloc(lctx.alloc, token);
|
3488
3770
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3489
|
-
memcpy(token->data, embd,
|
3771
|
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
3490
3772
|
}
|
3491
3773
|
}
|
3492
3774
|
|
3493
3775
|
{
|
3494
3776
|
// Compute position embeddings.
|
3495
|
-
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
3777
|
+
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3496
3778
|
ggml_allocr_alloc(lctx.alloc, inp_positions);
|
3497
3779
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3498
|
-
for (int i = 0; i <
|
3499
|
-
((int32_t *) inp_positions->data)[i] =
|
3780
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3781
|
+
((int32_t *) inp_positions->data)[i] = batch.pos[i];
|
3500
3782
|
}
|
3501
3783
|
}
|
3502
3784
|
ggml_set_name(inp_positions, "inp_positions");
|
@@ -3504,12 +3786,35 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3504
3786
|
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
3505
3787
|
}
|
3506
3788
|
|
3789
|
+
// KQ_scale
|
3507
3790
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3791
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3508
3792
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3509
3793
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3510
3794
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
3511
3795
|
}
|
3512
|
-
|
3796
|
+
|
3797
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3798
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3799
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3800
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3801
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3802
|
+
float * data = (float *) KQ_mask->data;
|
3803
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3804
|
+
|
3805
|
+
for (int h = 0; h < 1; ++h) {
|
3806
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3807
|
+
const llama_pos pos = batch.pos[j];
|
3808
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3809
|
+
|
3810
|
+
for (int i = 0; i < n_kv; ++i) {
|
3811
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3812
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3813
|
+
}
|
3814
|
+
}
|
3815
|
+
}
|
3816
|
+
}
|
3817
|
+
}
|
3513
3818
|
|
3514
3819
|
inpL = ggml_add(ctx0, token, position);
|
3515
3820
|
ggml_set_name(inpL, "inpL");
|
@@ -3525,23 +3830,23 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3525
3830
|
// Self Attention
|
3526
3831
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
3527
3832
|
|
3528
|
-
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,
|
3529
|
-
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa,
|
3530
|
-
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa,
|
3833
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
3834
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
3835
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
3531
3836
|
|
3532
3837
|
struct ggml_tensor * Qcur = tmpq;
|
3533
3838
|
struct ggml_tensor * Kcur = tmpk;
|
3534
3839
|
|
3535
3840
|
{
|
3536
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa,
|
3841
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
3537
3842
|
ggml_set_name(Vcur, "Vcur");
|
3538
3843
|
|
3539
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
3844
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3540
3845
|
ggml_set_name(k, "k");
|
3541
3846
|
|
3542
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
3847
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3543
3848
|
( n_ctx)*ggml_element_size(kv_self.v),
|
3544
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
3849
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3545
3850
|
|
3546
3851
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
3547
3852
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
@@ -3551,13 +3856,13 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3551
3856
|
ggml_permute(ctx0,
|
3552
3857
|
ggml_cpy(ctx0,
|
3553
3858
|
Qcur,
|
3554
|
-
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head,
|
3859
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
3555
3860
|
0, 2, 1, 3);
|
3556
3861
|
ggml_set_name(Q, "Q");
|
3557
3862
|
|
3558
3863
|
struct ggml_tensor * K =
|
3559
3864
|
ggml_view_3d(ctx0, kv_self.k,
|
3560
|
-
n_embd_head,
|
3865
|
+
n_embd_head, n_kv, n_head_kv,
|
3561
3866
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3562
3867
|
ggml_element_size(kv_self.k)*n_embd_head,
|
3563
3868
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -3568,12 +3873,12 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3568
3873
|
ggml_set_name(KQ, "KQ");
|
3569
3874
|
|
3570
3875
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
3571
|
-
// KQ_scaled shape [n_past +
|
3876
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
3572
3877
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
3573
3878
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3574
3879
|
|
3575
3880
|
// KQ_masked = mask_past(KQ_scaled)
|
3576
|
-
struct ggml_tensor * KQ_masked =
|
3881
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
3577
3882
|
ggml_set_name(KQ_masked, "KQ_masked");
|
3578
3883
|
|
3579
3884
|
// KQ = soft_max(KQ_masked)
|
@@ -3583,7 +3888,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3583
3888
|
// split cached V into n_head heads
|
3584
3889
|
struct ggml_tensor * V =
|
3585
3890
|
ggml_view_3d(ctx0, kv_self.v,
|
3586
|
-
|
3891
|
+
n_kv, n_embd_head, n_head_kv,
|
3587
3892
|
ggml_element_size(kv_self.v)*n_ctx,
|
3588
3893
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3589
3894
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -3596,10 +3901,8 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3596
3901
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
3597
3902
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3598
3903
|
|
3599
|
-
// cur = KQV_merged.contiguous().view(n_embd,
|
3600
|
-
cur =
|
3601
|
-
KQV_merged,
|
3602
|
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
3904
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
3905
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3603
3906
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3604
3907
|
}
|
3605
3908
|
|
@@ -3649,10 +3952,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3649
3952
|
|
3650
3953
|
static struct ggml_cgraph * llama_build_graph(
|
3651
3954
|
llama_context & lctx,
|
3652
|
-
const
|
3653
|
-
const float * embd,
|
3654
|
-
int n_tokens,
|
3655
|
-
int n_past) {
|
3955
|
+
const llama_batch & batch) {
|
3656
3956
|
const auto & model = lctx.model;
|
3657
3957
|
|
3658
3958
|
struct ggml_cgraph * result = NULL;
|
@@ -3660,76 +3960,117 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-                result = llm_build_llama(lctx,
+                result = llm_build_llama(lctx, batch);
             } break;
         case LLM_ARCH_BAICHUAN:
             {
-                result = llm_build_baichaun(lctx,
+                result = llm_build_baichaun(lctx, batch);
             } break;
         case LLM_ARCH_FALCON:
             {
-                result = llm_build_falcon(lctx,
+                result = llm_build_falcon(lctx, batch);
             } break;
         case LLM_ARCH_STARCODER:
             {
-                result = llm_build_starcoder(lctx,
+                result = llm_build_starcoder(lctx, batch);
             } break;
         default:
             GGML_ASSERT(false);
-    }
+    }

     return result;
 }

-//
+// decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
-//   -
-//   - embd       embeddings input
-//   - n_tokens   number of tokens
-//   - n_past:    the context size so far
+//   - batch:     batch to evaluate
 //   - n_threads: number of threads to use
 //
-
+//   return 0 on success
+//   return positive int on warning
+//   return negative int on error
+//
+static int llama_decode_internal(
         llama_context & lctx,
-
-
-        int n_tokens,
-        int n_past,
-        int n_threads,
-        const char * cgraph_fname) {
+        llama_batch batch) {
+    const uint32_t n_tokens = batch.n_tokens;

-
+    if (n_tokens == 0) {
+        LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
+        return -1;
+    }

-
-
-
-
-
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto n_batch = cparams.n_batch;
+
+    GGML_ASSERT(n_tokens <= n_batch);
+
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

     const int64_t t_start_us = ggml_time_us();

 #ifdef GGML_USE_MPI
-
+    // TODO: needs fix after #3228
+    GGML_ASSERT(false && "not implemented");
+    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif

     GGML_ASSERT(n_threads > 0);

-
-
-    const auto & model = lctx.model;
-    const auto & hparams = model.hparams;
-
-    const auto & kv_self = lctx.kv_self;
+    auto & kv_self = lctx.kv_self;

     GGML_ASSERT(!!kv_self.ctx);

     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;

+    // helpers for smoother batch API transistion
+    // after deprecating the llama_eval calls, these will be removed
+    std::vector<llama_pos>    pos;
+    std::vector<llama_seq_id> seq_id;
+
+    if (batch.pos == nullptr) {
+        pos.resize(n_tokens);
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
+        }
+
+        batch.pos = pos.data();
+    }
+
+    if (batch.seq_id == nullptr) {
+        seq_id.resize(n_tokens);
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            seq_id[i] = batch.all_seq_id;
+        }
+
+        batch.seq_id = seq_id.data();
+    }
+
+    // we always start to search for a free slot from the start of the cache
+    // TODO: better strategies can be implemented
+    kv_self.head = 0;
+
+    if (!llama_kv_cache_find_slot(kv_self, batch)) {
+        return 1;
+    }
+
+    // a heuristic, to avoid attending the full cache if it is not yet utilized
+    // after enough generations, the benefit from this heuristic disappears
+    // if we start defragmenting the cache, the benefit from this will be more important
+    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+
+    //printf("kv_self.n = %d\n", kv_self.n);
+
     ggml_allocr_reset(lctx.alloc);

-    ggml_cgraph * gf = llama_build_graph(lctx,
+    ggml_cgraph * gf = llama_build_graph(lctx, batch);

     ggml_allocr_alloc_graph(lctx.alloc, gf);

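llama_eval_internal thus becomes llama_decode_internal and is driven by a llama_batch. A hedged sketch of the resulting calling convention from user code (error handling trimmed; llama_decode and llama_batch_get_one are the same helpers that appear in the beam-search hunks further down in this diff):

    // Sketch under the new batch API: evaluate a whole prompt in one call instead of
    // passing (tokens, n_tokens, n_past, n_threads) separately.
    #include <vector>
    #include "llama.h"

    int eval_prompt(llama_context * ctx, std::vector<llama_token> & prompt) {
        int n_past = 0;
        // one batch carrying all prompt tokens, positions n_past.., sequence id 0
        const int ret = llama_decode(ctx, llama_batch_get_one(
            prompt.data(), (int32_t) prompt.size(), n_past, 0));
        // per the comment block above: 0 = success, >0 = warning (e.g. no KV slot), <0 = error
        if (ret == 0) {
            n_past += (int) prompt.size();
        }
        return ret;
    }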
@@ -3738,6 +4079,7 @@ static bool llama_eval_internal(
             ggml_tensor * node = gf->leafs[i];
             if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
                 ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
+                ggml_cuda_copy_to_device(node);
             }
         }

@@ -3747,6 +4089,8 @@ static bool llama_eval_internal(
                 ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
             }
         }
+
+    ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
 #endif

     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

@@ -3756,7 +4100,7 @@ static bool llama_eval_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (
+    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }

@@ -3795,12 +4139,9 @@ static bool llama_eval_internal(
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif

-    // update kv
-    lctx.kv_self.
-
-    if (cgraph_fname) {
-        ggml_graph_export(gf, cgraph_fname);
-    }
+    // update the kv ring buffer
+    lctx.kv_self.head += n_tokens;
+    lctx.kv_self.has_shift = false;

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -3817,13 +4158,20 @@ static bool llama_eval_internal(
     {
         auto & logits_out = lctx.logits;

-        if (
-            logits_out.resize(n_vocab *
-
+        if (batch.logits) {
+            logits_out.resize(n_vocab * n_tokens);
+            for (uint32_t i = 0; i < n_tokens; i++) {
+                if (batch.logits[i] == 0) {
+                    continue;
+                }
+                memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
+            }
+        } else if (lctx.logits_all) {
+            logits_out.resize(n_vocab * n_tokens);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
         } else {
-            // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
         }
     }

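Which rows are copied is now controlled per token: if batch.logits is set, only tokens whose flag is non-zero get their logits written out; otherwise the previous all-tokens or last-token behaviour applies. A small illustrative helper (the flag element type follows llama.h; the flag storage is assumed to outlive the decode call):

    // Sketch: request logits only for the final token of a batch.
    #include <cstdint>
    #include <vector>
    #include "llama.h"

    void request_last_logits_only(llama_batch & batch, std::vector<int8_t> & flags) {
        if (batch.n_tokens == 0) return;
        flags.assign(batch.n_tokens, 0);   // 0 = skip this row
        flags.back() = 1;                  // 1 = copy logits for this token
        batch.logits = flags.data();       // must stay alive until llama_decode() returns
    }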
@@ -3832,20 +4180,27 @@ static bool llama_eval_internal(
         auto & embedding_out = lctx.embedding;

         embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(
+        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
     }

     // measure the performance only for the single-token evals
-    if (
+    if (n_tokens == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
         lctx.n_eval++;
     }
-    else if (
+    else if (n_tokens > 1) {
         lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval +=
+        lctx.n_p_eval += n_tokens;
     }

-
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!lctx.has_evaluated_once) {
+        lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+        lctx.has_evaluated_once = true;
+    }
+
+    return 0;
 }

 //
@@ -4266,7 +4621,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 llm_tokenizer_bpe tokenizer(vocab);
                 tokenizer.tokenize(raw_text, output);
             } break;
-    }
+    }

     return output;
 }
@@ -4670,6 +5025,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
 // sampling
 //

+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
+}
+
 void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
     GGML_ASSERT(candidates->size > 0);

@@ -4878,7 +5240,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         }
     }

-void
+void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
     const int64_t t_start_sample_us = ggml_time_us();

     for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4890,6 +5252,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     }
 }

+void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+    llama_sample_temp(ctx, candidates_p, temp);
+}
+
 void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
     if (last_tokens_size == 0 || penalty == 1.0f) {
         return;
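So the temperature sampler is renamed to llama_sample_temp, with llama_sample_temperature kept as the thin wrapper shown above, and llama_set_rng_seed now lives next to the sampling code. A hedged usage sketch (ctx and candidates are assumed to be an existing context and token-data array):

    // Sketch of the renamed entry points; both spellings behave identically.
    #include "llama.h"

    void apply_temperature(llama_context * ctx, llama_token_data_array * candidates) {
        llama_set_rng_seed(ctx, 42);               // optional: make sampling reproducible
        llama_sample_temp(ctx, candidates, 0.8f);  // preferred name after this change
        // llama_sample_temperature(ctx, candidates, 0.8f);  // deprecated wrapper, same effect
    }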
@@ -5013,7 +5379,7 @@ void llama_sample_classifier_free_guidance(

     GGML_ASSERT(ctx);

-    auto n_vocab = llama_n_vocab(ctx);
+    auto n_vocab = llama_n_vocab(llama_get_model(ctx));

     GGML_ASSERT(n_vocab == (int)candidates->size);
     GGML_ASSERT(!candidates->sorted);
@@ -5042,7 +5408,7 @@ void llama_sample_classifier_free_guidance(
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     GGML_ASSERT(ctx);

-    auto N = float(llama_n_vocab(ctx));
+    auto N = float(llama_n_vocab(llama_get_model(ctx)));
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();

@@ -5229,7 +5595,7 @@ struct llama_logit_info {
     };
     llama_logit_info(llama_context * ctx)
       : logits(llama_get_logits(ctx))
-      , n_vocab(llama_n_vocab(ctx))
+      , n_vocab(llama_n_vocab(llama_get_model(ctx)))
       , max_l(*std::max_element(logits, logits + n_vocab))
       , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
       { }
@@ -5267,7 +5633,6 @@ struct llama_beam_search_data {
     size_t n_beams;
     int n_past;
     int n_predict;
-    int n_threads;
     std::vector<llama_beam> beams;
     std::vector<llama_beam> next_beams;

@@ -5277,12 +5642,11 @@ struct llama_beam_search_data {
     // Used to communicate to/from callback on beams state.
     std::vector<llama_beam_view> beam_views;

-    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
       : ctx(ctx)
       , n_beams(n_beams)
       , n_past(n_past)
       , n_predict(n_predict)
-      , n_threads(n_threads)
       , beam_views(n_beams) {
         beams.reserve(n_beams);
         next_beams.reserve(n_beams);
@@ -5319,7 +5683,7 @@ struct llama_beam_search_data {
         } else {
             // beam is not at end-of-sentence, so branch with next top_k tokens.
             if (!beam.tokens.empty()) {
-
+                llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
             }
             llama_logit_info logit_info(ctx);
             std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5393,7 +5757,7 @@ struct llama_beam_search_data {
         callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
         update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
         if (common_prefix_length) {
-
+            llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
             n_past += common_prefix_length;
         }
         // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5434,11 +5798,11 @@ struct llama_beam_search_data {

 void llama_beam_search(llama_context * ctx,
                        llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict
+                       size_t n_beams, int n_past, int n_predict) {
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();

-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);

     beam_search_data.loop(callback, callback_data);

@@ -5658,11 +6022,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5658
6022
|
nthread = std::thread::hardware_concurrency();
|
5659
6023
|
}
|
5660
6024
|
|
5661
|
-
|
6025
|
+
llama_model_loader ml(fname_inp, /*use_mmap*/ false);
|
5662
6026
|
|
5663
6027
|
llama_model model;
|
5664
|
-
llm_load_arch(
|
5665
|
-
llm_load_hparams(
|
6028
|
+
llm_load_arch(ml, model);
|
6029
|
+
llm_load_hparams(ml, model);
|
5666
6030
|
|
5667
6031
|
if (params->only_copy) {
|
5668
6032
|
ftype = model.ftype;
|
@@ -5672,7 +6036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5672
6036
|
struct gguf_context * ctx_out = gguf_init_empty();
|
5673
6037
|
|
5674
6038
|
// copy the KV pairs from the input file
|
5675
|
-
gguf_set_kv (ctx_out, ml
|
6039
|
+
gguf_set_kv (ctx_out, ml.ctx_gguf);
|
5676
6040
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
5677
6041
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
5678
6042
|
|
@@ -5680,8 +6044,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5680
6044
|
int n_attention_wv = 0;
|
5681
6045
|
int n_feed_forward_w2 = 0;
|
5682
6046
|
|
5683
|
-
for (int i = 0; i < ml
|
5684
|
-
struct ggml_tensor * meta = ml
|
6047
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
6048
|
+
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
5685
6049
|
|
5686
6050
|
const std::string name = ggml_get_name(meta);
|
5687
6051
|
|
@@ -5717,8 +6081,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5717
6081
|
std::vector<no_init<float>> f32_conv_buf;
|
5718
6082
|
|
5719
6083
|
// populate the original tensors so we get an initial meta data
|
5720
|
-
for (int i = 0; i < ml
|
5721
|
-
struct ggml_tensor * meta = ml
|
6084
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
6085
|
+
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
5722
6086
|
gguf_add_tensor(ctx_out, meta);
|
5723
6087
|
}
|
5724
6088
|
|
@@ -5731,8 +6095,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5731
6095
|
// placeholder for the meta data
|
5732
6096
|
::zeros(fout, meta_size);
|
5733
6097
|
|
5734
|
-
for (int i = 0; i < ml
|
5735
|
-
struct ggml_tensor * tensor = ml
|
6098
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
6099
|
+
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
5736
6100
|
|
5737
6101
|
const std::string name = ggml_get_name(tensor);
|
5738
6102
|
|
@@ -5740,10 +6104,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5740
6104
|
read_data.resize(ggml_nbytes(tensor));
|
5741
6105
|
}
|
5742
6106
|
tensor->data = read_data.data();
|
5743
|
-
ml
|
6107
|
+
ml.load_data_for(tensor);
|
5744
6108
|
|
5745
6109
|
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
5746
|
-
++idx, ml
|
6110
|
+
++idx, ml.n_tensors,
|
5747
6111
|
ggml_get_name(tensor),
|
5748
6112
|
llama_format_tensor_shape(tensor).c_str(),
|
5749
6113
|
ggml_type_name(tensor->type));
|
@@ -5893,9 +6257,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5893
6257
|
}
|
5894
6258
|
}
|
5895
6259
|
|
5896
|
-
// TODO: after the GGUF PR, this likely won't work and needs to be updated
|
5897
6260
|
static int llama_apply_lora_from_file_internal(
|
5898
|
-
const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
|
6261
|
+
const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
|
5899
6262
|
) {
|
5900
6263
|
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
5901
6264
|
|
@@ -5924,7 +6287,7 @@ static int llama_apply_lora_from_file_internal(
|
|
5924
6287
|
int32_t lora_alpha;
|
5925
6288
|
fin.read((char *) &lora_r, sizeof(lora_r));
|
5926
6289
|
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
5927
|
-
float scaling = (float)lora_alpha / (float)lora_r;
|
6290
|
+
float scaling = scale * (float)lora_alpha / (float)lora_r;
|
5928
6291
|
|
5929
6292
|
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
5930
6293
|
|
@@ -6140,9 +6503,10 @@ static int llama_apply_lora_from_file_internal(
|
|
6140
6503
|
ggml_set_name(r, "r_cpy");
|
6141
6504
|
}
|
6142
6505
|
|
6143
|
-
struct ggml_cgraph gf =
|
6506
|
+
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
6507
|
+
ggml_build_forward_expand(gf, r);
|
6144
6508
|
|
6145
|
-
ggml_graph_compute_helper(work_buffer,
|
6509
|
+
ggml_graph_compute_helper(work_buffer, gf, n_threads);
|
6146
6510
|
|
6147
6511
|
// we won't need these tensors again, reset the context to save memory
|
6148
6512
|
ggml_free(lora_ctx);
|
@@ -6171,27 +6535,16 @@ static int llama_apply_lora_from_file_internal(
 //
 // interface implementation
 //
-
-struct
-    struct llama_context_params result = {
-        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
-        /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/ 512,
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 0.0f,
-        /*.rope_freq_scale             =*/ 0.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
-        /*.low_vram                    =*/ false,
-        /*.mul_mat_q                   =*/ true,
-        /*.f16_kv                      =*/ true,
-        /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
-        /*.embedding                   =*/ false,
     };

 #ifdef GGML_USE_METAL
@@ -6201,6 +6554,24 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }

+struct llama_context_params llama_context_default_params() {
+    struct llama_context_params result = {
+        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
+        /*.n_ctx                       =*/ 512,
+        /*.n_batch                     =*/ 512,
+        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
+        /*.mul_mat_q                   =*/ true,
+        /*.f16_kv                      =*/ true,
+        /*.logits_all                  =*/ false,
+        /*.embedding                   =*/ false,
+    };
+
+    return result;
+}
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
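Context-level defaults therefore move into their own struct, separate from the model-level options kept above. A hedged sketch of the resulting two-step initialization (field values chosen only for illustration):

    // Sketch of the split parameter flow introduced by this change:
    // model options in llama_model_params, runtime options in llama_context_params.
    #include "llama.h"

    llama_context * open_context(const char * path_model) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0;                       // model-level option

        llama_model * model = llama_load_model_from_file(path_model, mparams);
        if (!model) return nullptr;

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx     = 2048;                       // context-level options
        cparams.n_threads = 4;

        return llama_new_context_with_model(model, cparams);
    }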
@@ -6256,13 +6627,11 @@ int64_t llama_time_us(void) {
|
|
6256
6627
|
|
6257
6628
|
struct llama_model * llama_load_model_from_file(
|
6258
6629
|
const char * path_model,
|
6259
|
-
|
6630
|
+
struct llama_model_params params) {
|
6260
6631
|
ggml_time_init();
|
6261
6632
|
|
6262
6633
|
llama_model * model = new llama_model;
|
6263
6634
|
|
6264
|
-
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
6265
|
-
|
6266
6635
|
unsigned cur_percentage = 0;
|
6267
6636
|
if (params.progress_callback == NULL) {
|
6268
6637
|
params.progress_callback_user_data = &cur_percentage;
|
@@ -6279,9 +6648,9 @@ struct llama_model * llama_load_model_from_file(
|
|
6279
6648
|
};
|
6280
6649
|
}
|
6281
6650
|
|
6282
|
-
if (!llama_model_load(path_model, *model, params.
|
6283
|
-
params.main_gpu, params.tensor_split,
|
6284
|
-
params.
|
6651
|
+
if (!llama_model_load(path_model, *model, params.n_gpu_layers,
|
6652
|
+
params.main_gpu, params.tensor_split,
|
6653
|
+
params.use_mmap, params.use_mlock, params.vocab_only,
|
6285
6654
|
params.progress_callback, params.progress_callback_user_data)) {
|
6286
6655
|
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
6287
6656
|
delete model;
|
@@ -6305,18 +6674,33 @@ struct llama_context * llama_new_context_with_model(
|
|
6305
6674
|
|
6306
6675
|
llama_context * ctx = new llama_context(*model);
|
6307
6676
|
|
6677
|
+
const auto & hparams = model->hparams;
|
6678
|
+
auto & cparams = ctx->cparams;
|
6679
|
+
|
6680
|
+
cparams.n_batch = params.n_batch;
|
6681
|
+
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
6682
|
+
cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
|
6683
|
+
cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
6684
|
+
cparams.n_threads = params.n_threads;
|
6685
|
+
cparams.n_threads_batch = params.n_threads_batch;
|
6686
|
+
cparams.mul_mat_q = params.mul_mat_q;
|
6687
|
+
|
6308
6688
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
6309
6689
|
params.seed = time(NULL);
|
6310
6690
|
}
|
6311
6691
|
|
6692
|
+
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
6693
|
+
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
6694
|
+
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
6695
|
+
|
6312
6696
|
ctx->rng = std::mt19937(params.seed);
|
6313
6697
|
ctx->logits_all = params.logits_all;
|
6314
6698
|
|
6315
6699
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
6316
6700
|
|
6317
6701
|
// reserve memory for context buffers
|
6318
|
-
if (!
|
6319
|
-
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type,
|
6702
|
+
if (!hparams.vocab_only) {
|
6703
|
+
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
|
6320
6704
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
6321
6705
|
llama_free(ctx);
|
6322
6706
|
return nullptr;
|
@@ -6327,11 +6711,9 @@ struct llama_context * llama_new_context_with_model(
|
|
6327
6711
|
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
6328
6712
|
}
|
6329
6713
|
|
6330
|
-
const auto & hparams = ctx->model.hparams;
|
6331
|
-
|
6332
6714
|
// resized during inference
|
6333
6715
|
if (params.logits_all) {
|
6334
|
-
ctx->logits.reserve(
|
6716
|
+
ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
|
6335
6717
|
} else {
|
6336
6718
|
ctx->logits.reserve(hparams.n_vocab);
|
6337
6719
|
}
|
@@ -6349,26 +6731,28 @@ struct llama_context * llama_new_context_with_model(
|
|
6349
6731
|
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
6350
6732
|
|
6351
6733
|
// build worst-case graph
|
6352
|
-
int n_tokens = std::min(
|
6353
|
-
int n_past =
|
6734
|
+
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
+ int n_past = cparams.n_ctx - n_tokens;
  llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- ggml_cgraph * gf = llama_build_graph(*ctx, &token,
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
+
  #ifdef GGML_USE_METAL
- if (
+ if (model->n_gpu_layers > 0) {
  ctx->ctx_metal = ggml_metal_init(1);
  if (!ctx->ctx_metal) {
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
  llama_free(ctx);
  return NULL;
  }
-
-
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
+ //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
  }
  #endif
  // measure memory requirements for the graph
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

- LLAMA_LOG_INFO("%s: compute buffer total size =
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

  // recreate allocator with exact memory requirements
  ggml_allocr_free(ctx->alloc);
@@ -6377,28 +6761,46 @@ struct llama_context * llama_new_context_with_model(
  ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
  #ifdef GGML_USE_METAL
  if (ctx->ctx_metal) {
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
  }
  #endif
  #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
+ ggml_cuda_set_scratch_size(alloc_size);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+
+ // calculate total VRAM usage
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+ size += ggml_nbytes(t);
+ }
+ };
+ size_t model_vram_size = 0;
+ for (const auto & kv : model->tensors_by_name) {
+ add_tensor(kv.second, model_vram_size);
  }
+
+ size_t kv_vram_size = 0;
+ add_tensor(ctx->kv_self.k, kv_vram_size);
+ add_tensor(ctx->kv_self.v, kv_vram_size);
+
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
+ size_t total_vram_size = model_vram_size + ctx_vram_size;
+
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+ total_vram_size / 1024.0 / 1024.0,
+ model_vram_size / 1024.0 / 1024.0,
+ ctx_vram_size / 1024.0 / 1024.0);
  #endif
  }

  #ifdef GGML_USE_METAL
- if (
+ if (model->n_gpu_layers > 0) {
  // this allocates all Metal resources and memory buffers

  void * data_ptr = NULL;
  size_t data_size = 0;

- if (
+ if (ctx->model.mapping) {
  data_ptr = ctx->model.mapping->addr;
  data_size = ctx->model.mapping->size;
  } else {
@@ -6417,11 +6819,8 @@ struct llama_context * llama_new_context_with_model(
  return NULL; \
  }

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data",
-
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
  #undef LLAMA_METAL_CHECK_BUF
  }
@@ -6433,8 +6832,10 @@ struct llama_context * llama_new_context_with_model(

  if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
  // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-
-
+ // TODO: needs fix after #3228
+ GGML_ASSERT(false && "not implemented");
+ //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
+ //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
  llama_backend_free();
  exit(1);
  }
@@ -6443,63 +6844,37 @@ struct llama_context * llama_new_context_with_model(
  return ctx;
  }

- static struct llama_context * llama_init_from_file(
- const char * path_model,
- struct llama_context_params params) {
- struct llama_model * model = llama_load_model_from_file(path_model, params);
- if (!model) {
- return nullptr;
- }
-
- struct llama_context * ctx = llama_new_context_with_model(model, params);
- ctx->model_owner = true;
-
- return ctx;
- }
-
  void llama_free(struct llama_context * ctx) {
  delete ctx;
  }

-
- return
+ const llama_model * llama_get_model(const struct llama_context * ctx) {
+ return &ctx->model;
  }

  int llama_n_ctx(const struct llama_context * ctx) {
- return
+ return ctx->cparams.n_ctx;
  }

-
- return
+ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+ return model->vocab.type;
  }

- int
- return llama_model_n_embd(&ctx->model);
- }
-
- enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
- return ctx->model.vocab.type;
- }
-
- int llama_model_n_vocab(const struct llama_model * model) {
+ int llama_n_vocab(const struct llama_model * model) {
  return model->vocab.id_to_token.size();
  }

- int
- return model->hparams.n_ctx;
- }
-
- int llama_model_n_ctx_train(const struct llama_model * model) {
+ int llama_n_ctx_train(const struct llama_model * model) {
  return model->hparams.n_ctx_train;
  }

- int
+ int llama_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }

  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
- model->
+ llama_model_arch_name(model->arch).c_str(),
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }
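The context-level getters are replaced by model-level ones above (llama_get_model, llama_n_vocab, llama_n_embd, llama_n_ctx_train), while llama_n_ctx now reads the context parameters. A minimal sketch of querying them, assuming a model/context pair created through the usual llama_load_model_from_file / llama_new_context_with_model flow:

```cpp
// Sketch only: assumes "llama.h" from this release and an already-created context.
#include <cstdio>
#include "llama.h"

static void print_model_info(const struct llama_context * ctx) {
    const struct llama_model * model = llama_get_model(ctx); // new accessor in this version

    char desc[128];
    llama_model_desc(model, desc, sizeof(desc)); // e.g. "llama 7B mostly Q4_0"

    std::printf("model: %s\n", desc);
    std::printf("n_vocab = %d, n_embd = %d, n_ctx_train = %d, n_ctx = %d\n",
                llama_n_vocab(model), llama_n_embd(model),
                llama_n_ctx_train(model), llama_n_ctx(ctx));
}
```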
@@ -6520,6 +6895,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
  return nparams;
  }

+ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+ return ggml_get_tensor(model->ctx, name);
+ }
+
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
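llama_get_model_tensor above simply forwards to ggml_get_tensor on the model's ggml context. A small sketch of looking up a tensor by name; the name used here ("token_embd.weight") is only a typical GGUF tensor name for the llama architecture, not something guaranteed by this diff:

```cpp
// Sketch only: an unknown name yields nullptr, since ggml_get_tensor returns NULL on a miss.
#include <cstdio>
#include "ggml.h"
#include "llama.h"

static void inspect_tensor(struct llama_model * model, const char * name /* e.g. "token_embd.weight" */) {
    struct ggml_tensor * t = llama_get_model_tensor(model, name);
    if (t == nullptr) {
        std::printf("tensor '%s' not found\n", name);
        return;
    }
    std::printf("%s: %lld elements, %zu bytes\n",
                name, (long long) ggml_nelements(t), ggml_nbytes(t));
}
```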
@@ -6533,18 +6912,18 @@ int llama_model_quantize(
  }
  }

- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
  }
  }

- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
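Both LoRA entry points gain a float `scale` argument between path_lora and path_base_model. A hedged sketch of the updated call; the adapter path, scale value and thread count are placeholders:

```cpp
// Sketch only: "adapter.bin" is a hypothetical LoRA file; pass nullptr for
// path_base_model to apply the adapter directly, as before.
#include <cstdio>
#include "llama.h"

static bool apply_lora(const struct llama_model * model) {
    const int rc = llama_model_apply_lora_from_file(
        model,
        "adapter.bin", // hypothetical LoRA adapter
        0.75f,         // new in 0.6.0: blend factor for the adapter
        nullptr,       // optional higher-precision base model
        4);            // n_threads
    if (rc != 0) {
        std::fprintf(stderr, "failed to apply LoRA adapter\n");
        return false;
    }
    return true;
}
```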
@@ -6552,16 +6931,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  }
  }

  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->kv_self.
+ return ctx->kv_self.head;
  }

-
+ void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
+ }

- void
-
-
-
-
+ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+ }
+
+ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
+ }
+
+ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
+ llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
+ }
+
+ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+ llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
  }
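The block above adds per-sequence KV-cache management (seq_rm, seq_cp, seq_keep, seq_shift) alongside the older token-count and tokens_rm calls. A sketch of how these might be combined; the sequence ids and position ranges are illustrative only and assume the cache already holds sequence 0:

```cpp
// Sketch only: all ids and bounds below are made-up values.
#include "llama.h"

static void manage_kv_cache(struct llama_context * ctx) {
    // fork sequence 0 into sequence 1, sharing the first 32 positions
    llama_kv_cache_seq_cp(ctx, /*seq_id_src=*/0, /*seq_id_dst=*/1, /*p0=*/0, /*p1=*/32);

    // drop positions [32, 64) of sequence 1, e.g. to regenerate that span
    llama_kv_cache_seq_rm(ctx, /*seq_id=*/1, /*p0=*/32, /*p1=*/64);

    // shift positions [64, 128) of sequence 0 back by 16 cells
    llama_kv_cache_seq_shift(ctx, /*seq_id=*/0, /*p0=*/64, /*p1=*/128, /*delta=*/-16);

    // finally, keep only sequence 0 and evict everything else
    llama_kv_cache_seq_keep(ctx, /*seq_id=*/0);
}
```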

  // Returns the *maximum* size of the state
@@ -6649,6 +7039,16 @@ struct llama_data_file_context : llama_data_context {
  *
  */
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+ // TODO: does not support multi-sequence states
+ {
+ const auto & kv_self = ctx->kv_self;
+ for (uint32_t i = 0; i < kv_self.head; ++i) {
+ GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
+ GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
+ GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
+ }
+ }
+
  // copy rng
  {
  std::stringstream rng_ss;
@@ -6699,12 +7099,14 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  {
  const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
+ const auto & cparams = ctx->cparams;
+
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd_gqa();
- const int n_ctx =
+ const int n_ctx = cparams.n_ctx;

  const size_t kv_size = kv_self.buf.size;
- const int kv_ntok =
+ const int kv_ntok = kv_self.head;

  data_ctx->write(&kv_size, sizeof(kv_size));
  data_ctx->write(&kv_ntok, sizeof(kv_ntok));
@@ -6807,9 +7209,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  {
  const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
+ const auto & cparams = ctx->cparams;
+
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd_gqa();
- const int n_ctx =
+ const int n_ctx = cparams.n_ctx;

  size_t kv_size;
  int kv_ntok;
@@ -6848,7 +7252,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  ggml_free(cpy_ctx);
  }

- ctx->kv_self.
+ ctx->kv_self.head = kv_ntok;
+ ctx->kv_self.size = kv_size;
  }

  const size_t nread = inp - src;
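The state (de)serialization paths above now track kv_self.head/size and assert single-sequence caches. A sketch of a snapshot/restore round trip, assuming llama_get_state_size and llama_copy_state_data keep their long-standing signatures (only llama_set_state_data is visible in these hunks):

```cpp
// Sketch only: round-trips the context state through a heap buffer; note the
// new assertion above rejects multi-sequence KV-cache states.
#include <cstdint>
#include <vector>
#include "llama.h"

static std::vector<uint8_t> snapshot_state(struct llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx)); // *maximum* size
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written);
    return buf;
}

static void restore_state(struct llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}
```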
@@ -6943,64 +7348,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi

  int llama_eval(
  struct llama_context * ctx,
-
-
- int n_past
-
- if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
- return 1;
- }
+ llama_token * tokens,
+ int32_t n_tokens,
+ int n_past) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

-
-
-
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
- ctx->has_evaluated_once = true;
+ const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }

- return
+ return ret;
  }

  int llama_eval_embd(
  struct llama_context * ctx,
-
-
- int n_past
-
- if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
- return 1;
- }
+ float * embd,
+ int32_t n_tokens,
+ int n_past) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

-
-
-
-
-
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+
+ const int ret = llama_decode_internal(*ctx, batch);
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }

- return
+ return ret;
  }

-
-
-
+ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+ ctx->cparams.n_threads = n_threads;
+ ctx->cparams.n_threads_batch = n_threads_batch;
+ }
+
+ struct llama_batch llama_batch_get_one(
+ llama_token * tokens,
+ int32_t n_tokens,
+ llama_pos pos_0,
+ llama_seq_id seq_id) {
+ return {
+ /*n_tokens =*/ n_tokens,
+ /*tokens =*/ tokens,
+ /*embd =*/ nullptr,
+ /*pos =*/ nullptr,
+ /*seq_id =*/ nullptr,
+ /*logits =*/ nullptr,
+ /*all_pos_0 =*/ pos_0,
+ /*all_pos_1 =*/ 1,
+ /*all_seq_id =*/ seq_id,
+ };
+ }

-
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
+ llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

- if (
-
-
+ if (embd) {
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+ } else {
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
  }

-
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
+ batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
+
+ return batch;
+ }
+
+ void llama_batch_free(struct llama_batch batch) {
+ if (batch.token) free(batch.token);
+ if (batch.embd) free(batch.embd);
+ if (batch.pos) free(batch.pos);
+ if (batch.seq_id) free(batch.seq_id);
+ if (batch.logits) free(batch.logits);
+ }
+
+ int llama_decode(
+ struct llama_context * ctx,
+ struct llama_batch batch) {
+ const int ret = llama_decode_internal(*ctx, batch);
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+ }
+
+ return ret;
  }
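llama_eval and llama_eval_embd are reduced above to thin wrappers over the new llama_batch / llama_decode path. A sketch of feeding an already-tokenized prompt through the new entry points; tokenization and detailed error handling are elided:

```cpp
// Sketch only: `prompt` comes from llama_tokenize elsewhere; the whole prompt
// is submitted as sequence 0 starting at position 0.
#include <cstdio>
#include <vector>
#include "llama.h"

static bool feed_prompt(struct llama_context * ctx, std::vector<llama_token> & prompt) {
    llama_batch batch = llama_batch_get_one(prompt.data(), (int32_t) prompt.size(),
                                            /*pos_0=*/0, /*seq_id=*/0);

    // llama_decode is the entry point that llama_eval now wraps
    if (llama_decode(ctx, batch) < 0) {
        std::fprintf(stderr, "llama_decode failed\n");
        return false;
    }
    return true;
}
```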

  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }

+ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+ return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
+ }
+
  float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }
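llama_get_logits_ith exposes one n_vocab-sized row of the logits buffer per position. A sketch of greedy (argmax) selection over such a row; obtaining the row itself is left to llama_get_logits or llama_get_logits_ith, depending on how the batch was built:

```cpp
// Sketch only: `logits` must point at llama_n_vocab(model) values for a
// position whose logits were actually computed.
#include "llama.h"

static llama_token greedy_token(const struct llama_model * model, const float * logits) {
    const int n_vocab = llama_n_vocab(model);

    llama_token best = 0;
    for (llama_token id = 1; id < n_vocab; ++id) {
        if (logits[id] > logits[best]) {
            best = id;
        }
    }
    return best;
}
```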
@@ -7030,16 +7473,6 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
  }

  int llama_tokenize(
- struct llama_context * ctx,
- const char * text,
- int text_len,
- llama_token * tokens,
- int n_max_tokens,
- bool add_bos) {
- return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
- }
-
- int llama_tokenize_with_model(
  const struct llama_model * model,
  const char * text,
  int text_len,
@@ -7060,13 +7493,9 @@ int llama_tokenize_with_model(
  return res.size();
  }

- int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
- return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
- }
-
  // does not write null-terminator to buf
- int
- if (0 <= token && token <
+ int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+ if (0 <= token && token < llama_n_vocab(model)) {
  if (llama_is_normal_token(model->vocab, token)) {
  std::string result = model->vocab.id_to_token[token].text;
  if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
@@ -7086,7 +7515,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke
  buf[2] = '\x85';
  return 3;
  } else if (llama_is_control_token(model->vocab, token)) {
-
+ // do nothing
  } else if (llama_is_byte_token(model->vocab, token)) {
  if (length < 1) {
  return -1;
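llama_tokenize and llama_token_to_piece now take a llama_model instead of a llama_context, absorbing the old *_with_model variants. A round-trip sketch; the tail of the llama_tokenize signature (token buffer, capacity, add_bos) is assumed to match the old llama_tokenize_with_model, since it is not shown in this hunk:

```cpp
// Sketch only: buffer sizes are rough upper bounds, not guarantees.
#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

static void roundtrip(const struct llama_model * model, const std::string & text) {
    std::vector<llama_token> tokens(text.size() + 1); // one token per byte plus BOS, at most
    const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                 tokens.data(), (int) tokens.size(), /*add_bos=*/true);
    if (n < 0) {
        std::fprintf(stderr, "token buffer too small\n");
        return;
    }
    tokens.resize(n);

    for (llama_token id : tokens) {
        char piece[16];
        const int len = llama_token_to_piece(model, id, piece, (int) sizeof(piece));
        if (len >= 0) {
            std::printf("%d -> '%.*s'\n", id, len, piece); // piece is not null-terminated
        }
    }
}
```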
@@ -7194,12 +7623,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
  return ctx->model.tensors_by_name;
  }

- void llama_log_set(
+ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
  g_state.log_callback_user_data = user_data;
  }

- static void llama_log_internal_v(
+ static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
  va_list args_copy;
  va_copy(args_copy, args);
  char buffer[128];
@@ -7216,14 +7645,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
  va_end(args_copy);
  }

- static void llama_log_internal(
+ static void llama_log_internal(ggml_log_level level, const char * format, ...) {
  va_list args;
  va_start(args, format);
  llama_log_internal_v(level, format, args);
  va_end(args);
  }

- static void llama_log_callback_default(
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
  (void) level;
  (void) user_data;
  fputs(text, stderr);
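The logging plumbing switches from llama_log_level to ggml's ggml_log_level, and llama_log_set now takes a ggml_log_callback. A sketch of installing a filter that only forwards error messages, matching the callback shape of llama_log_callback_default above:

```cpp
// Sketch only: keeps GGML_LOG_LEVEL_ERROR messages and drops everything else.
#include <cstdio>
#include "llama.h"

static void quiet_logger(ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_ERROR) {
        std::fputs(text, stderr);
    }
}

int main() {
    llama_log_set(quiet_logger, nullptr);
    // ... load the model and run as usual; non-error log output is now suppressed
    return 0;
}
```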