llama_cpp 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +8 -2
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +209 -82
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +163 -84
- data/ext/llama_cpp/src/ggml-metal.metal +121 -38
- data/ext/llama_cpp/src/ggml.c +1596 -842
- data/ext/llama_cpp/src/ggml.h +116 -35
- data/ext/llama_cpp/src/llama.cpp +1015 -586
- data/ext/llama_cpp/src/llama.h +304 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -72,6 +72,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <set>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -92,12 +93,12 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (
-static void llama_log_callback_default(
+static void llama_log_internal        (ggml_log_level level, const char* format, ...);
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
-#define LLAMA_LOG_INFO(...)  llama_log_internal(
-#define LLAMA_LOG_WARN(...)  llama_log_internal(
-#define LLAMA_LOG_ERROR(...) llama_log_internal(
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 //
 // helpers
@@ -166,13 +167,13 @@ enum llm_arch {
 };
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,     "llama"
-    { LLM_ARCH_FALCON,    "falcon"
-    { LLM_ARCH_GPT2,      "gpt2"
-    { LLM_ARCH_GPTJ,      "gptj"
-    { LLM_ARCH_GPTNEOX,   "gptneox"
-    { LLM_ARCH_MPT,       "mpt"
-    { LLM_ARCH_BAICHUAN,  "baichuan"
+    { LLM_ARCH_LLAMA,     "llama"     },
+    { LLM_ARCH_FALCON,    "falcon"    },
+    { LLM_ARCH_GPT2,      "gpt2"      },
+    { LLM_ARCH_GPTJ,      "gptj"      },
+    { LLM_ARCH_GPTNEOX,   "gptneox"   },
+    { LLM_ARCH_MPT,       "mpt"       },
+    { LLM_ARCH_BAICHUAN,  "baichuan"  },
     { LLM_ARCH_STARCODER, "starcoder" },
 };
 
@@ -221,16 +222,16 @@ enum llm_kv {
 };
 
 static std::map<llm_kv, std::string> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"
-    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"
-    { LLM_KV_GENERAL_NAME,                 "general.name"
-    { LLM_KV_GENERAL_AUTHOR,               "general.author"
-    { LLM_KV_GENERAL_URL,                  "general.url"
-    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"
-    { LLM_KV_GENERAL_LICENSE,              "general.license"
-    { LLM_KV_GENERAL_SOURCE_URL,           "general.
-    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.
+    { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"                  },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"          },
+    { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"                     },
+    { LLM_KV_GENERAL_NAME,                 "general.name"                          },
+    { LLM_KV_GENERAL_AUTHOR,               "general.author"                        },
+    { LLM_KV_GENERAL_URL,                  "general.url"                           },
+    { LLM_KV_GENERAL_DESCRIPTION,          "general.description"                   },
+    { LLM_KV_GENERAL_LICENSE,              "general.license"                       },
+    { LLM_KV_GENERAL_SOURCE_URL,           "general.source.url"                    },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source.huggingface.repository" },
 
     { LLM_KV_CONTEXT_LENGTH,               "%s.context_length"        },
     { LLM_KV_EMBEDDING_LENGTH,             "%s.embedding_length"      },
@@ -448,7 +449,7 @@ struct LLM_TN {
 //
 
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
     const std::string skey(key); \
     const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
@@ -460,7 +461,7 @@ struct LLM_TN {
     } else if (req) { \
         throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
     } \
-}
+} while (0)
 
 //
 // ggml helpers
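The change from a bare block to `do { ... } while (0)` is the standard trick for making a multi-statement macro behave like a single statement, so that a trailing semicolon does not break `if`/`else` chains. A minimal, self-contained sketch of the failure mode (the `REQUIRE_KEY` macros below are hypothetical illustrations, not part of llama.cpp):

#include <cstdio>

// Bare-block form: the ';' the caller writes after the block terminates the
// if-statement, leaving a following 'else' with nothing to attach to.
#define REQUIRE_KEY_BROKEN(found) { if (!(found)) printf("missing key\n"); }

// do { ... } while (0) swallows exactly one ';' and remains a single statement.
#define REQUIRE_KEY(found) do { if (!(found)) printf("missing key\n"); } while (0)

int main() {
    bool have_key = false;
    if (have_key)
        REQUIRE_KEY(have_key);   // expands to one statement; the else still binds
    else
        printf("fallback path\n");
    // swapping in REQUIRE_KEY_BROKEN above would not compile
    return 0;
}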
@@ -881,10 +882,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
 
 static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
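The hunk above also documents the 0.6.0 calling convention for `llama_token_to_piece`: it now takes the model (obtained via `llama_get_model`) rather than the context, and a negative return value reports the buffer size that would have been needed. A hedged usage sketch of that two-pass pattern, assuming an already-created `llama_context * ctx` and `llama.h` on the include path:

#include <string>
#include <vector>
#include "llama.h"

// Convert a single token to text, growing the buffer when the first call
// signals (as a negative value) that 8 bytes were not enough.
static std::string token_to_piece(const llama_context * ctx, llama_token token) {
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(llama_get_model(ctx), token, buf.data(), buf.size());
    if (n < 0) {
        buf.resize(-n);
        n = llama_token_to_piece(llama_get_model(ctx), token, buf.data(), buf.size());
    }
    return std::string(buf.data(), n);
}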
@@ -899,7 +900,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
 
 struct llama_state {
     // We save the log callback globally
-
+    ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };
 
@@ -925,9 +926,9 @@ static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;
 
 struct llama_hparams {
+    bool     vocab_only;
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_ctx;       // context size used during inference
     uint32_t n_embd;
     uint32_t n_head;
     uint32_t n_head_kv;
@@ -938,8 +939,8 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
-    float
-    float
+    float rope_freq_base_train;
+    float rope_freq_scale_train;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -956,15 +957,18 @@ struct llama_hparams {
     uint32_t n_embd_gqa() const {
         return n_embd/n_gqa();
     }
+};
 
-
-
-
-
-
-
-
+struct llama_cparams {
+    uint32_t n_ctx;           // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_threads;       // number of threads to use for generation
+    uint32_t n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    bool mul_mat_q;
 };
 
 struct llama_layer {
@@ -999,7 +1003,29 @@ struct llama_layer {
     struct ggml_tensor * b3; // ffn_up
 };
 
+struct llama_kv_cell {
+    llama_pos pos   = -1;
+    llama_pos delta = 0;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+};
+
+// ring-buffer of cached KV data
 struct llama_kv_cache {
+    bool has_shift = false;
+
+    uint32_t head = 0;
+    uint32_t size = 0;
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;
 
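With this change the cache is no longer a single contiguous run of `n` tokens: each cell carries its own position and a set of sequence ids, so several independent sequences can share one KV buffer. A small standalone sketch of what cell/sequence membership means, using plain structs that mirror (but are not) the ones added above:

#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

using llama_pos    = int32_t;
using llama_seq_id = int32_t;

struct kv_cell {
    llama_pos pos   = -1;          // -1 means the cell is free
    llama_pos delta = 0;
    std::set<llama_seq_id> seq_id; // sequences that reference this cell
    bool has_seq_id(llama_seq_id id) const { return seq_id.count(id) > 0; }
};

int main() {
    std::vector<kv_cell> cells(8);

    // sequence 0 occupies positions 0..2; sequence 1 shares the same prefix
    for (int i = 0; i < 3; ++i) {
        cells[i].pos    = i;
        cells[i].seq_id = {0, 1};
    }

    // dropping sequence 1 from the shared cells leaves them alive for sequence 0
    for (auto & c : cells) {
        c.seq_id.erase(1);
        if (c.seq_id.empty()) c.pos = -1; // only truly unreferenced cells are freed
    }

    int used = 0;
    for (const auto & c : cells) used += (c.pos >= 0);
    printf("cells still in use: %d\n", used); // prints 3
    return 0;
}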
@@ -1007,8 +1033,6 @@ struct llama_kv_cache {
 
     llama_buffer buf;
 
-    int n; // number of tokens currently in the cache
-
     ~llama_kv_cache() {
         if (ctx) {
             ggml_free(ctx);
@@ -1122,11 +1146,8 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model) : model(model),
+    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
-        if (model_owner) {
-            delete &model;
-        }
 #ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
@@ -1137,27 +1158,26 @@ struct llama_context {
         }
     }
 
+    llama_cparams cparams;
+
+    const llama_model & model;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
+
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
 
+    int64_t t_start_us;
+    int64_t t_load_us;
     int64_t t_sample_us = 0;
-    int64_t t_eval_us   = 0;
     int64_t t_p_eval_us = 0;
+    int64_t t_eval_us   = 0;
 
     int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_eval   = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
-    const llama_model & model;
-
-    bool model_owner = false;
-
-    int64_t t_load_us;
-    int64_t t_start_us;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
+    int32_t n_eval   = 0; // number of eval calls
 
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
@@ -1192,16 +1212,23 @@ static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
                          ggml_type   wtype,
-
+                          uint32_t   n_ctx,
                                int   n_gpu_layers) {
-    const
-    const
+    const uint32_t n_embd  = hparams.n_embd_gqa();
+    const uint32_t n_layer = hparams.n_layer;
 
     const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
+    cache.has_shift = false;
+
+    cache.head = 0;
+    cache.size = n_ctx;
+
+    cache.cells.clear();
+    cache.cells.resize(n_ctx);
+
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-    cache.n = 0;
 
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size;
@@ -1222,17 +1249,154 @@ static bool llama_kv_cache_init(
 
     (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
-
+    size_t vram_kv_cache = 0;
+
+    if (n_gpu_layers > (int)n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
+        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.v);
     }
-    if (n_gpu_layers > n_layer + 2) {
+    if (n_gpu_layers > (int)n_layer + 2) {
         ggml_cuda_assign_buffers_no_scratch(cache.k);
+        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.k);
+    }
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 #endif // GGML_USE_CUBLAS
 
     return true;
 }
 
+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+static bool llama_kv_cache_find_slot(
+           struct llama_kv_cache & cache,
+        const struct llama_batch & batch) {
+    const uint32_t n_ctx    = cache.size;
+    const uint32_t n_tokens = batch.n_tokens;
+
+    if (n_tokens > n_ctx) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+        return false;
+    }
+
+    uint32_t n_tested = 0;
+
+    while (true) {
+        if (cache.head + n_tokens > n_ctx) {
+            cache.head = 0;
+            n_tested   += n_ctx - cache.head;
+            continue;
+        }
+
+        bool found = true;
+        for (uint32_t i = 0; i < n_tokens; i++) {
+            if (cache.cells[cache.head + i].pos >= 0) {
+                found = false;
+                cache.head += i + 1;
+                n_tested   += i + 1;
+                break;
+            }
+        }
+
+        if (found) {
+            break;
+        }
+
+        if (n_tested >= n_ctx) {
+            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+            return false;
+        }
+    }
+
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        cache.cells[cache.head + i].pos = batch.pos[i];
+        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+    }
+
+    return true;
+}
+
+// find how many cells are currently in use
+static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size - 1; i > 0; --i) {
+        if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+            return i + 1;
+        }
+    }
+
+    return 0;
+}
+
+static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+    if (c0 < 0) c0 = 0;
+    if (c1 < 0) c1 = cache.size;
+
+    for (int32_t i = c0; i < c1; ++i) {
+        cache.cells[i].pos = -1;
+        cache.cells[i].seq_id.clear();
+    }
+}
+
+static void llama_kv_cache_seq_rm(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.erase(seq_id);
+            if (cache.cells[i].seq_id.empty()) {
+                cache.cells[i].pos = -1;
+            }
+        }
+    }
+}
+
+static void llama_kv_cache_seq_cp(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id_src,
+                 llama_seq_id   seq_id_dst,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].seq_id.insert(seq_id_dst);
+        }
+    }
+}
+
+static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (!cache.cells[i].has_seq_id(seq_id)) {
+            cache.cells[i].pos = -1;
+            cache.cells[i].seq_id.clear();
+        }
+    }
+}
+
+static void llama_kv_cache_seq_shift(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                    llama_pos   delta) {
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.cells[i].pos += delta;
+            if (cache.cells[i].pos < 0) {
+                cache.cells[i].pos = -1;
+                cache.cells[i].seq_id.clear();
+            } else {
+                cache.has_shift = true;
+                cache.cells[i].delta = delta;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
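The new helpers amount to first-fit slot allocation plus per-sequence edits over the ring of cells. A compressed, standalone sketch of the slot-search idea only (a hypothetical `find_slot` over a plain vector, not the llama.cpp function itself):

#include <cstdint>
#include <cstdio>
#include <vector>

struct cell { int32_t pos = -1; };   // pos < 0 means the cell is free

// first-fit search for n_tokens consecutive free cells, starting at head and
// wrapping around; returns the slot start index, or -1 if no room exists
static int find_slot(std::vector<cell> & cells, uint32_t & head, uint32_t n_tokens) {
    const uint32_t n_ctx = cells.size();
    if (n_tokens > n_ctx) return -1;
    uint32_t n_tested = 0;
    while (true) {
        if (head + n_tokens > n_ctx) { n_tested += n_ctx - head; head = 0; continue; }
        bool found = true;
        for (uint32_t i = 0; i < n_tokens; i++) {
            if (cells[head + i].pos >= 0) { found = false; head += i + 1; n_tested += i + 1; break; }
        }
        if (found) return (int) head;
        if (n_tested >= n_ctx) return -1;   // scanned the whole ring without success
    }
}

int main() {
    std::vector<cell> cells(8);
    uint32_t head = 0;
    cells[0].pos = 0; cells[1].pos = 1;     // first two cells are occupied
    const int slot = find_slot(cells, head, 3);
    printf("slot = %d\n", slot);            // finds room at index 2
    return 0;
}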
@@ -1554,7 +1718,7 @@ struct llama_model_loader {
                     lmlock->grow_to(size_lock);
                 }
                 break;
-#
+#ifdef GGML_USE_CUBLAS
             case GGML_BACKEND_GPU:
             case GGML_BACKEND_GPU_SPLIT:
                 // old code:
@@ -1587,7 +1751,15 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-static std::string
+static std::string llama_model_arch_name(llm_arch arch) {
+    auto it = LLM_ARCH_NAMES.find(arch);
+    if (it == LLM_ARCH_NAMES.end()) {
+        return "unknown";
+    }
+    return it->second;
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1657,29 +1826,25 @@ static void llm_load_hparams(
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
     // get hparams kv
-    GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,
-    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32,
-    GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32,
+    GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, kv(LLM_KV_TOKENIZER_LIST));
+    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+    GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+    GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
     // rope_freq_base (optional)
-
-
-        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    }
+    hparams.rope_freq_base_train = 10000.0f;
+    GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
 
     // rope_freq_scale (inverse of the kv) is optional
-
-
-
-        rope_freq_scale = 1.0f/ropescale;
-    }
+    float ropescale = 1.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    hparams.rope_freq_scale_train = 1.0f/ropescale;
 
     // sanity check for n_rot (optional)
     {
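Note the conversion in the last added lines: the GGUF metadata stores the linear RoPE scaling factor, while the loader keeps its reciprocal as `rope_freq_scale_train`. A one-line illustration with an example value (the 4.0 is illustrative, not taken from any particular model):

#include <cstdio>

int main() {
    // a model fine-tuned with 4x linear RoPE scaling would store a scale of 4.0
    const float ropescale = 4.0f;                          // value read from the GGUF key (example)
    const float rope_freq_scale_train = 1.0f/ropescale;    // what the loader keeps: 0.25
    printf("freq_scale_train = %g\n", rope_freq_scale_train);
    return 0;
}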
@@ -1743,13 +1908,9 @@ static void llm_load_hparams(
             }
         } break;
         default: (void)0;
-    }
+    }
 
     model.ftype = ml.ftype;
-
-    hparams.n_ctx           = n_ctx;
-    hparams.rope_freq_base  = rope_freq_base;
-    hparams.rope_freq_scale = rope_freq_scale;
 }
 
 // TODO: This should probably be in llama.h
@@ -1770,20 +1931,18 @@ static void llm_load_vocab(
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
     }
 
+    const float * scores = nullptr;
     const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx
-
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
     }
 
-    const
-
+    const int * toktypes = nullptr;
     const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx
-
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
     }
 
-    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
     // determine vocab type
     {
         std::string tokenizer_name;
@@ -1851,8 +2010,8 @@ static void llm_load_vocab(
 
         auto & token_data = vocab.id_to_token[i];
         token_data.text  = std::move(word);
-        token_data.score = scores[i];
-        token_data.type  = (llama_token_type) toktypes[i];
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -1875,31 +2034,30 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & vocab   = model.vocab;
 
     // hparams
-    LLAMA_LOG_INFO("%s: format
-    LLAMA_LOG_INFO("%s: arch
-    LLAMA_LOG_INFO("%s: vocab type
-    LLAMA_LOG_INFO("%s: n_vocab
-    LLAMA_LOG_INFO("%s: n_merges
-    LLAMA_LOG_INFO("%s: n_ctx_train
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s:
-    LLAMA_LOG_INFO("%s: model
-    LLAMA_LOG_INFO("%s: model
-    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    LLAMA_LOG_INFO("%s: format           = %s\n", __func__, llama_file_version_name(ml.fver));
+    LLAMA_LOG_INFO("%s: arch             = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+    LLAMA_LOG_INFO("%s: vocab type       = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: n_vocab          = %u\n", __func__, hparams.n_vocab);
+    LLAMA_LOG_INFO("%s: n_merges         = %u\n", __func__, (int) vocab.bpe_ranks.size());
+    LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n", __func__, hparams.n_ctx_train);
+    LLAMA_LOG_INFO("%s: n_embd           = %u\n", __func__, hparams.n_embd);
+    LLAMA_LOG_INFO("%s: n_head           = %u\n", __func__, hparams.n_head);
+    LLAMA_LOG_INFO("%s: n_head_kv        = %u\n", __func__, hparams.n_head_kv);
+    LLAMA_LOG_INFO("%s: n_layer          = %u\n", __func__, hparams.n_layer);
+    LLAMA_LOG_INFO("%s: n_rot            = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    LLAMA_LOG_INFO("%s: n_gqa            = %u\n", __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n", __func__, hparams.f_norm_eps);
+    LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n", __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: n_ff             = %u\n", __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
+    LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+    LLAMA_LOG_INFO("%s: model type       = %s\n", __func__, llama_model_type_name(model.type));
+    LLAMA_LOG_INFO("%s: model ftype      = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
+    LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
     if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size
+        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
-        LLAMA_LOG_INFO("%s: model size
+        LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
 
     // general kv
@@ -1917,13 +2075,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 static void llm_load_tensors(
     llama_model_loader & ml,
     llama_model & model,
-    int n_batch,
     int n_gpu_layers,
     int main_gpu,
     const float * tensor_split,
-    const bool mul_mat_q,
-    bool low_vram,
-    ggml_type memory_type,
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
@@ -1962,11 +2116,9 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
-
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
-    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -2001,9 +2153,9 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-            backend_norm =
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2067,9 +2219,9 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = LLAMA_BACKEND_OFFLOAD;
#else
-            backend_norm =
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2137,9 +2289,9 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-            backend_norm =
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2214,9 +2366,9 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm =
+            backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-            backend_norm =
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
             backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2281,27 +2433,19 @@ static void llm_load_tensors(
                 } break;
             default:
                 throw std::runtime_error("unknown architecture");
-        }
+        }
     }
 
     ml.done_getting_tensors();
 
     // print memory requirements
     {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
         // this is the total memory required to run the inference
         size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-
-        const size_t mem_required_state = scale*hparams.kv_size();
-
-        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-            mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-
-        (void) n_batch;
+        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2310,36 +2454,17 @@ static void llm_load_tensors(
         if (n_gpu_layers > (int) hparams.n_layer) {
             LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
         }
-        size_t vram_kv_cache = 0;
 
 #ifdef GGML_USE_CUBLAS
         const int max_backend_supported_layers = hparams.n_layer + 3;
-        const int max_offloadable_layers =
-        if (n_gpu_layers > (int) hparams.n_layer + 1) {
-            if (low_vram) {
-                LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
-            }
-        }
-        if (n_gpu_layers > (int) hparams.n_layer + 2) {
-            if (low_vram) {
-                LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
-            }
-        }
+        const int max_offloadable_layers       = hparams.n_layer + 3;
 #elif defined(GGML_USE_CLBLAST)
         const int max_backend_supported_layers = hparams.n_layer + 1;
        const int max_offloadable_layers       = hparams.n_layer + 1;
 #endif // GGML_USE_CUBLAS
 
-        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
-
-        LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
-            __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2352,7 +2477,7 @@ static void llm_load_tensors(
     }
 
     (void) tensor_split;
-#
+#ifdef GGML_USE_CUBLAS
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -2374,29 +2499,24 @@ static void llm_load_tensors(
 static bool llama_model_load(
         const std::string & fname,
         llama_model & model,
-        int n_ctx,
-        int n_batch,
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
        bool vocab_only,
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-
+        llama_model_loader ml(fname, use_mmap);
+
+        model.hparams.vocab_only = vocab_only;
 
-        llm_load_arch   (
-        llm_load_hparams(
-        llm_load_vocab  (
+        llm_load_arch   (ml, model);
+        llm_load_hparams(ml, model);
+        llm_load_vocab  (ml, model);
 
-        llm_load_print_meta(
+        llm_load_print_meta(ml, model);
 
        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
            throw std::runtime_error("vocab size mismatch");
@@ -2408,8 +2528,8 @@ static bool llama_model_load(
        }
 
        llm_load_tensors(
-
-            main_gpu, tensor_split,
+            ml, model, n_gpu_layers,
+            main_gpu, tensor_split,
            use_mlock, progress_callback, progress_callback_user_data);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2421,17 +2541,10 @@ static bool llama_model_load(
 
 static struct ggml_cgraph * llm_build_llama(
          llama_context & lctx,
-    const
-    const float * embd,
-    int   n_tokens,
-    int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
 
     const auto & kv_self = lctx.kv_self;
 
@@ -2439,7 +2552,7 @@ static struct ggml_cgraph * llm_build_llama(
 
     const int64_t n_embd      = hparams.n_embd;
     const int64_t n_layer     = hparams.n_layer;
-    const int64_t n_ctx       =
+    const int64_t n_ctx       = cparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
@@ -2447,12 +2560,20 @@ static struct ggml_cgraph * llm_build_llama(
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-    const float freq_base  =
-    const float freq_scale =
+    const float freq_base    = cparams.rope_freq_base;
+    const float freq_scale   = cparams.rope_freq_scale;
     const float norm_rms_eps = hparams.f_norm_rms_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx           : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    //printf("n_kv = %d\n", n_kv);
+
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
@@ -2470,12 +2591,12 @@ static struct ggml_cgraph * llm_build_llama(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
-    if (
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data,
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
 
@@ -2485,11 +2606,11 @@ static struct ggml_cgraph * llm_build_llama(
         GGML_ASSERT(false && "not implemented");
 #endif
 
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd,
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }
 
@@ -2498,9 +2619,6 @@ static struct ggml_cgraph * llm_build_llama(
 
     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
-    //
-    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-    // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
@@ -2517,12 +2635,75 @@ static struct ggml_cgraph * llm_build_llama(
     }
 #endif // GGML_USE_CUBLAS
 
+    // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    ggml_rope_custom_inplace(ctx0,
+                            ggml_view_3d(ctx0, kv_self.k,
+                                n_embd_head, n_head_kv, n_ctx,
+                                ggml_element_size(kv_self.k)*n_embd_head,
+                                ggml_element_size(kv_self.k)*n_embd_gqa,
+                                ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                            K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
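With the batch-based graph, causal masking is no longer derived from a single n_past value; the KQ_mask is filled per (token, cache cell) pair from positions and sequence membership. A standalone sketch of the same masking rule over plain arrays (illustrative only, not the llama.cpp tensors, and with a single sequence id per cell for brevity):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct cell { int pos; int seq; };   // simplified: one sequence id per cell

int main() {
    // cache holds two interleaved sequences, each with positions 0..1
    std::vector<cell> cells = { {0,0}, {1,0}, {0,1}, {1,1} };
    // one new token of sequence 0 at position 2
    std::vector<int> pos = { 2 };
    const int seq_id = 0;

    const int n_kv = (int) cells.size(), n_tokens = (int) pos.size();
    std::vector<float> mask(n_kv * n_tokens, 0.0f);

    // a cell is visible to a token only if it belongs to the same sequence and
    // is not in the token's future; everything else gets -inf, which the graph
    // adds to KQ before the softmax (the same rule as the KQ_mask fill above)
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            if (cells[i].seq != seq_id || cells[i].pos > pos[j]) {
                mask[j*n_kv + i] = -INFINITY;
            }
        }
    }

    const int masked = (int) std::count_if(mask.begin(), mask.end(),
                                           [](float v) { return std::isinf(v); });
    printf("masked entries: %d of %d\n", masked, n_kv*n_tokens);  // 2 of 4
    return 0;
}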
@@ -2560,33 +2741,33 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur =
+            struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur =
+            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
             // store key and value to memory
             {
-                // compute the transposed [
+                // compute the transposed [n_tokens, n_embd] V matrix
 
                 struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 offload_func_v(tmpv);
                 ggml_set_name(tmpv, "tmpv");
 
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa,
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
@@ -2601,7 +2782,7 @@ static struct ggml_cgraph * llm_build_llama(
 
             struct ggml_tensor * K =
                     ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head,
+                            n_embd_head, n_kv, n_head_kv,
                            ggml_element_size(kv_self.k)*n_embd_gqa,
                            ggml_element_size(kv_self.k)*n_embd_head,
                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2614,25 +2795,25 @@ static struct ggml_cgraph * llm_build_llama(
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [
-            struct ggml_tensor * KQ_scaled =
+            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked =
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
             offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max =
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
             struct ggml_tensor * V =
                     ggml_view_3d(ctx0, kv_self.v,
-
+                            n_kv, n_embd_head, n_head_kv,
                            ggml_element_size(kv_self.v)*n_ctx,
                            ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                            ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -2647,7 +2828,7 @@ static struct ggml_cgraph * llm_build_llama(
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
             // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
             // is there a better way?
-            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type,
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
 #endif
 
@@ -2656,10 +2837,8 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
-            // cur = KQV_merged.contiguous().view(n_embd,
-            cur =
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
 
@@ -2750,20 +2929,12 @@ static struct ggml_cgraph * llm_build_llama(
     return gf;
 }
 
-
 static struct ggml_cgraph * llm_build_baichaun(
          llama_context & lctx,
-    const
-    const float * embd,
-    int   n_tokens,
-    int   n_past) {
-
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-    const int N = n_tokens;
-
+     const llama_batch & batch) {
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
 
     const auto & kv_self = lctx.kv_self;
 
@@ -2771,7 +2942,7 @@ static struct ggml_cgraph * llm_build_baichaun(
 
     const int64_t n_embd      = hparams.n_embd;
     const int64_t n_layer     = hparams.n_layer;
-    const int64_t n_ctx       =
+    const int64_t n_ctx       = cparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
@@ -2779,12 +2950,18 @@ static struct ggml_cgraph * llm_build_baichaun(
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-    const float freq_base  =
-    const float freq_scale =
+    const float freq_base    = cparams.rope_freq_base;
+    const float freq_scale   = cparams.rope_freq_scale;
     const float norm_rms_eps = hparams.f_norm_rms_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx           : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
@@ -2802,12 +2979,12 @@ static struct ggml_cgraph * llm_build_baichaun(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
-    if (
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inp_tokens);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data,
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
 
@@ -2817,11 +2994,11 @@ static struct ggml_cgraph * llm_build_baichaun(
         GGML_ASSERT(false && "not implemented");
 #endif
 
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
 
         ggml_allocr_alloc(lctx.alloc, inpL);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inpL->data, embd,
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
     }
 
@@ -2830,9 +3007,6 @@ static struct ggml_cgraph * llm_build_baichaun(
 
     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
-    //
-    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-    // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
@@ -2849,12 +3023,75 @@ static struct ggml_cgraph * llm_build_baichaun(
     }
 #endif // GGML_USE_CUBLAS
 
+    // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
-
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                    ggml_rope_custom_inplace(ctx0,
+                            ggml_view_3d(ctx0, kv_self.k,
+                                n_embd_head, n_head_kv, n_ctx,
+                                ggml_element_size(kv_self.k)*n_embd_head,
+                                ggml_element_size(kv_self.k)*n_embd_gqa,
+                                ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                            K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
+        }
+    }
 
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
@@ -2896,12 +3133,12 @@ static struct ggml_cgraph * llm_build_baichaun(
             struct ggml_tensor * Qcur;
             switch (model.type) {
                 case MODEL_7B:
-                    Kcur =
-                    Qcur =
+                    Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
                     break;
                 case MODEL_13B:
-                    Kcur
-                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head,
+                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
+                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
                     break;
                 default:
                     GGML_ASSERT(false);
@@ -2915,23 +3152,23 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             // store key and value to memory
             {
-                // compute the transposed [
+                // compute the transposed [n_tokens, n_embd] V matrix
 
                 struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 offload_func_v(tmpv);
                 ggml_set_name(tmpv, "tmpv");
 
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa,
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
@@ -2946,7 +3183,7 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             struct ggml_tensor * K =
                     ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head,
+                            n_embd_head, n_kv, n_head_kv,
                            ggml_element_size(kv_self.k)*n_embd_gqa,
                            ggml_element_size(kv_self.k)*n_embd_head,
                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2959,8 +3196,8 @@ static struct ggml_cgraph * llm_build_baichaun(
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_past +
-            struct ggml_tensor * KQ_scaled =
+            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
@@ -2969,58 +3206,44 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             switch (model.type) {
                 case MODEL_7B:
-                    KQ_masked =
+                    KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
                     break;
                 case MODEL_13B:
-
+                    // TODO: replace with ggml_add()
+                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
                     ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
-                    KQ_masked =
+                    KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
                     break;
                 default:
                     GGML_ASSERT(false);
             }
-            // KQ_masked = mask_past(KQ_scaled)
-            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
-            // offload_func_kq(KQ_masked);
-            // ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max =
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
             struct ggml_tensor * V =
                     ggml_view_3d(ctx0, kv_self.v,
-
+                            n_kv, n_embd_head, n_head_kv,
                            ggml_element_size(kv_self.v)*n_ctx,
                            ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                            ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
             offload_func_v(V);
             ggml_set_name(V, "V");
 
-#if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
|
-
#else
|
3008
|
-
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
3009
|
-
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
3010
|
-
// is there a better way?
|
3011
|
-
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
3012
|
-
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
3013
|
-
#endif
|
3014
3239
|
|
3015
3240
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
3016
3241
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
3017
3242
|
offload_func_v(KQV_merged);
|
3018
3243
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3019
3244
|
|
3020
|
-
// cur = KQV_merged.contiguous().view(n_embd,
|
3021
|
-
cur =
|
3022
|
-
KQV_merged,
|
3023
|
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
3245
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
3246
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3024
3247
|
offload_func_v(cur);
|
3025
3248
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3026
3249
|
|
@@ -3113,17 +3336,10 @@ static struct ggml_cgraph * llm_build_baichaun(
|
|
3113
3336
|
|
3114
3337
|
static struct ggml_cgraph * llm_build_falcon(
|
3115
3338
|
llama_context & lctx,
|
3116
|
-
const
|
3117
|
-
const float * embd,
|
3118
|
-
int n_tokens,
|
3119
|
-
int n_past) {
|
3120
|
-
|
3121
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
3122
|
-
|
3123
|
-
const int N = n_tokens;
|
3124
|
-
|
3339
|
+
const llama_batch & batch) {
|
3125
3340
|
const auto & model = lctx.model;
|
3126
3341
|
const auto & hparams = model.hparams;
|
3342
|
+
const auto & cparams = lctx.cparams;
|
3127
3343
|
|
3128
3344
|
const auto & kv_self = lctx.kv_self;
|
3129
3345
|
|
@@ -3131,7 +3347,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3131
3347
|
|
3132
3348
|
const int64_t n_embd = hparams.n_embd;
|
3133
3349
|
const int64_t n_layer = hparams.n_layer;
|
3134
|
-
const int64_t n_ctx =
|
3350
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3135
3351
|
const int64_t n_head = hparams.n_head;
|
3136
3352
|
const int64_t n_head_kv = hparams.n_head_kv;
|
3137
3353
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -3139,12 +3355,21 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3139
3355
|
|
3140
3356
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3141
3357
|
|
3142
|
-
const float freq_base =
|
3143
|
-
const float freq_scale =
|
3358
|
+
const float freq_base = cparams.rope_freq_base;
|
3359
|
+
const float freq_scale = cparams.rope_freq_scale;
|
3144
3360
|
const float norm_eps = hparams.f_norm_eps;
|
3145
3361
|
|
3146
3362
|
const int n_gpu_layers = model.n_gpu_layers;
|
3147
3363
|
|
3364
|
+
const int32_t n_tokens = batch.n_tokens;
|
3365
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3366
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3367
|
+
|
3368
|
+
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
3369
|
+
|
3370
|
+
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
|
3371
|
+
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
|
3372
|
+
|
3148
3373
|
auto & buf_compute = lctx.buf_compute;
|
3149
3374
|
|
3150
3375
|
struct ggml_init_params params = {
|
@@ -3162,12 +3387,12 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3162
3387
|
struct ggml_tensor * cur;
|
3163
3388
|
struct ggml_tensor * inpL;
|
3164
3389
|
|
3165
|
-
if (
|
3166
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
3390
|
+
if (batch.token) {
|
3391
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3167
3392
|
|
3168
3393
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3169
3394
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3170
|
-
memcpy(inp_tokens->data,
|
3395
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3171
3396
|
}
|
3172
3397
|
ggml_set_name(inp_tokens, "inp_tokens");
|
3173
3398
|
|
@@ -3177,11 +3402,11 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3177
3402
|
GGML_ASSERT(false && "not implemented");
|
3178
3403
|
#endif
|
3179
3404
|
|
3180
|
-
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
3405
|
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3181
3406
|
|
3182
3407
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
3183
3408
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3184
|
-
memcpy(inpL->data, embd,
|
3409
|
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
3185
3410
|
}
|
3186
3411
|
}
|
3187
3412
|
|
@@ -3190,9 +3415,6 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3190
3415
|
|
3191
3416
|
// offload functions set the tensor output backend to GPU
|
3192
3417
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
3193
|
-
//
|
3194
|
-
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
3195
|
-
// in that case ggml_cuda_assign_buffers has no effect
|
3196
3418
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
3197
3419
|
offload_func_t offload_func_kq = llama_nop;
|
3198
3420
|
offload_func_t offload_func_v = llama_nop;
|
@@ -3209,12 +3431,75 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3209
3431
|
}
|
3210
3432
|
#endif // GGML_USE_CUBLAS
|
3211
3433
|
|
3434
|
+
// KQ_scale
|
3212
3435
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3436
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3213
3437
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3214
3438
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3215
3439
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
3216
3440
|
}
|
3217
|
-
|
3441
|
+
|
3442
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3443
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3444
|
+
offload_func_kq(KQ_mask);
|
3445
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3446
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3447
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3448
|
+
float * data = (float *) KQ_mask->data;
|
3449
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3450
|
+
|
3451
|
+
for (int h = 0; h < 1; ++h) {
|
3452
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3453
|
+
const llama_pos pos = batch.pos[j];
|
3454
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3455
|
+
|
3456
|
+
for (int i = 0; i < n_kv; ++i) {
|
3457
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3458
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3459
|
+
}
|
3460
|
+
}
|
3461
|
+
}
|
3462
|
+
}
|
3463
|
+
}
|
3464
|
+
|
3465
|
+
// KQ_pos - contains the positions
|
3466
|
+
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3467
|
+
offload_func_kq(KQ_pos);
|
3468
|
+
ggml_set_name(KQ_pos, "KQ_pos");
|
3469
|
+
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
3470
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3471
|
+
int * data = (int *) KQ_pos->data;
|
3472
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3473
|
+
data[i] = batch.pos[i];
|
3474
|
+
}
|
3475
|
+
}
|
3476
|
+
|
3477
|
+
// shift the entire K-cache if needed
|
3478
|
+
if (do_rope_shift) {
|
3479
|
+
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
3480
|
+
offload_func_kq(K_shift);
|
3481
|
+
ggml_set_name(K_shift, "K_shift");
|
3482
|
+
ggml_allocr_alloc(lctx.alloc, K_shift);
|
3483
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3484
|
+
int * data = (int *) K_shift->data;
|
3485
|
+
for (int i = 0; i < n_ctx; ++i) {
|
3486
|
+
data[i] = kv_self.cells[i].delta;
|
3487
|
+
}
|
3488
|
+
}
|
3489
|
+
|
3490
|
+
for (int il = 0; il < n_layer; ++il) {
|
3491
|
+
struct ggml_tensor * tmp =
|
3492
|
+
ggml_rope_custom_inplace(ctx0,
|
3493
|
+
ggml_view_3d(ctx0, kv_self.k,
|
3494
|
+
n_embd_head, n_head_kv, n_ctx,
|
3495
|
+
ggml_element_size(kv_self.k)*n_embd_head,
|
3496
|
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3497
|
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
|
3498
|
+
K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
|
3499
|
+
offload_func_kq(tmp);
|
3500
|
+
ggml_build_forward_expand(gf, tmp);
|
3501
|
+
}
|
3502
|
+
}
|
3218
3503
|
|
3219
3504
|
for (int il = 0; il < n_layer; ++il) {
|
3220
3505
|
struct ggml_tensor * attn_norm;
|
@@ -3271,45 +3556,45 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3271
3556
|
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
|
3272
3557
|
// non-contiguous views is added for the rope operator
|
3273
3558
|
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
|
3274
|
-
ctx0, cur, n_embd_head, n_head,
|
3559
|
+
ctx0, cur, n_embd_head, n_head, n_tokens,
|
3275
3560
|
wsize * n_embd_head,
|
3276
3561
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3277
3562
|
0));
|
3278
3563
|
offload_func_kq(tmpq);
|
3279
3564
|
|
3280
3565
|
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
3281
|
-
ctx0, cur, n_embd_head, n_head_kv,
|
3566
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
3282
3567
|
wsize * n_embd_head,
|
3283
3568
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3284
3569
|
wsize * n_embd_head * n_head));
|
3285
3570
|
offload_func_kq(tmpk);
|
3286
3571
|
|
3287
3572
|
struct ggml_tensor * tmpv = ggml_view_3d(
|
3288
|
-
ctx0, cur, n_embd_head, n_head_kv,
|
3573
|
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
3289
3574
|
wsize * n_embd_head,
|
3290
3575
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
3291
3576
|
wsize * n_embd_head * (n_head + n_head_kv));
|
3292
3577
|
offload_func_v(tmpv);
|
3293
3578
|
|
3294
3579
|
// using mode = 2 for neox mode
|
3295
|
-
struct ggml_tensor * Qcur =
|
3580
|
+
struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
3296
3581
|
offload_func_kq(Qcur);
|
3297
|
-
struct ggml_tensor * Kcur =
|
3582
|
+
struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
3298
3583
|
offload_func_kq(Kcur);
|
3299
3584
|
|
3300
3585
|
{
|
3301
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa,
|
3586
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
3302
3587
|
offload_func_v(Vcur);
|
3303
3588
|
offload_func_v(Vcur->src[0]->src[0]);
|
3304
3589
|
ggml_set_name(Vcur, "Vcur");
|
3305
3590
|
|
3306
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
3591
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3307
3592
|
offload_func_kq(k);
|
3308
3593
|
ggml_set_name(k, "k");
|
3309
3594
|
|
3310
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
3595
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3311
3596
|
( n_ctx)*ggml_element_size(kv_self.v),
|
3312
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
3597
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3313
3598
|
offload_func_v(v);
|
3314
3599
|
|
3315
3600
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
@@ -3322,7 +3607,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3322
3607
|
|
3323
3608
|
struct ggml_tensor * K =
|
3324
3609
|
ggml_view_3d(ctx0, kv_self.k,
|
3325
|
-
n_embd_head,
|
3610
|
+
n_embd_head, n_kv, n_head_kv,
|
3326
3611
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3327
3612
|
ggml_element_size(kv_self.k)*n_embd_head,
|
3328
3613
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -3333,21 +3618,21 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3333
3618
|
offload_func_kq(KQ);
|
3334
3619
|
ggml_set_name(KQ, "KQ");
|
3335
3620
|
|
3336
|
-
struct ggml_tensor * KQ_scaled =
|
3621
|
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
3337
3622
|
offload_func_kq(KQ_scaled);
|
3338
3623
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3339
3624
|
|
3340
|
-
struct ggml_tensor * KQ_masked =
|
3625
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
3341
3626
|
offload_func_kq(KQ_masked);
|
3342
3627
|
ggml_set_name(KQ_masked, "KQ_masked");
|
3343
3628
|
|
3344
|
-
struct ggml_tensor * KQ_soft_max =
|
3629
|
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
3345
3630
|
offload_func_v(KQ_soft_max);
|
3346
3631
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
3347
3632
|
|
3348
3633
|
struct ggml_tensor * V =
|
3349
3634
|
ggml_view_3d(ctx0, kv_self.v,
|
3350
|
-
|
3635
|
+
n_kv, n_embd_head, n_head_kv,
|
3351
3636
|
ggml_element_size(kv_self.v)*n_ctx,
|
3352
3637
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3353
3638
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -3362,7 +3647,7 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3362
3647
|
offload_func_v(KQV_merged);
|
3363
3648
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3364
3649
|
|
3365
|
-
cur =
|
3650
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3366
3651
|
offload_func_v(cur);
|
3367
3652
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3368
3653
|
|
@@ -3420,17 +3705,10 @@ static struct ggml_cgraph * llm_build_falcon(
|
|
3420
3705
|
|
3421
3706
|
static struct ggml_cgraph * llm_build_starcoder(
|
3422
3707
|
llama_context & lctx,
|
3423
|
-
const
|
3424
|
-
const float * embd,
|
3425
|
-
int n_tokens,
|
3426
|
-
int n_past) {
|
3427
|
-
|
3428
|
-
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
3429
|
-
|
3430
|
-
const int N = n_tokens;
|
3431
|
-
|
3708
|
+
const llama_batch & batch) {
|
3432
3709
|
const auto & model = lctx.model;
|
3433
3710
|
const auto & hparams = model.hparams;
|
3711
|
+
const auto & cparams = lctx.cparams;
|
3434
3712
|
|
3435
3713
|
const auto & kv_self = lctx.kv_self;
|
3436
3714
|
|
@@ -3438,7 +3716,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3438
3716
|
|
3439
3717
|
const int64_t n_embd = hparams.n_embd;
|
3440
3718
|
const int64_t n_layer = hparams.n_layer;
|
3441
|
-
const int64_t n_ctx =
|
3719
|
+
const int64_t n_ctx = cparams.n_ctx;
|
3442
3720
|
const int64_t n_head = hparams.n_head;
|
3443
3721
|
const int64_t n_head_kv = hparams.n_head_kv;
|
3444
3722
|
const int64_t n_embd_head = hparams.n_embd_head();
|
@@ -3446,7 +3724,11 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3446
3724
|
|
3447
3725
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
3448
3726
|
|
3449
|
-
const float norm_eps
|
3727
|
+
const float norm_eps = hparams.f_norm_eps;
|
3728
|
+
|
3729
|
+
const int32_t n_tokens = batch.n_tokens;
|
3730
|
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
3731
|
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
3450
3732
|
|
3451
3733
|
auto & buf_compute = lctx.buf_compute;
|
3452
3734
|
|
@@ -3467,12 +3749,12 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3467
3749
|
struct ggml_tensor * position;
|
3468
3750
|
struct ggml_tensor * inpL;
|
3469
3751
|
|
3470
|
-
if (
|
3471
|
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
3752
|
+
if (batch.token) {
|
3753
|
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3472
3754
|
|
3473
3755
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
3474
3756
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3475
|
-
memcpy(inp_tokens->data,
|
3757
|
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
3476
3758
|
}
|
3477
3759
|
ggml_set_name(inp_tokens, "inp_tokens");
|
3478
3760
|
|
@@ -3482,21 +3764,21 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3482
3764
|
GGML_ASSERT(false && "not implemented");
|
3483
3765
|
#endif
|
3484
3766
|
|
3485
|
-
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd,
|
3767
|
+
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
3486
3768
|
|
3487
3769
|
ggml_allocr_alloc(lctx.alloc, token);
|
3488
3770
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3489
|
-
memcpy(token->data, embd,
|
3771
|
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
3490
3772
|
}
|
3491
3773
|
}
|
3492
3774
|
|
3493
3775
|
{
|
3494
3776
|
// Compute position embeddings.
|
3495
|
-
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
|
3777
|
+
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
3496
3778
|
ggml_allocr_alloc(lctx.alloc, inp_positions);
|
3497
3779
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3498
|
-
for (int i = 0; i <
|
3499
|
-
((int32_t *) inp_positions->data)[i] =
|
3780
|
+
for (int i = 0; i < n_tokens; ++i) {
|
3781
|
+
((int32_t *) inp_positions->data)[i] = batch.pos[i];
|
3500
3782
|
}
|
3501
3783
|
}
|
3502
3784
|
ggml_set_name(inp_positions, "inp_positions");
|
@@ -3504,12 +3786,35 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3504
3786
|
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
3505
3787
|
}
|
3506
3788
|
|
3789
|
+
// KQ_scale
|
3507
3790
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
3791
|
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
3508
3792
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
3509
3793
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3510
3794
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
3511
3795
|
}
|
3512
|
-
|
3796
|
+
|
3797
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
3798
|
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
3799
|
+
ggml_set_name(KQ_mask, "KQ_mask");
|
3800
|
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
3801
|
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
3802
|
+
float * data = (float *) KQ_mask->data;
|
3803
|
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
3804
|
+
|
3805
|
+
for (int h = 0; h < 1; ++h) {
|
3806
|
+
for (int j = 0; j < n_tokens; ++j) {
|
3807
|
+
const llama_pos pos = batch.pos[j];
|
3808
|
+
const llama_seq_id seq_id = batch.seq_id[j];
|
3809
|
+
|
3810
|
+
for (int i = 0; i < n_kv; ++i) {
|
3811
|
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
3812
|
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
3813
|
+
}
|
3814
|
+
}
|
3815
|
+
}
|
3816
|
+
}
|
3817
|
+
}
|
3513
3818
|
|
3514
3819
|
inpL = ggml_add(ctx0, token, position);
|
3515
3820
|
ggml_set_name(inpL, "inpL");
|
@@ -3525,23 +3830,23 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3525
3830
|
// Self Attention
|
3526
3831
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
3527
3832
|
|
3528
|
-
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,
|
3529
|
-
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa,
|
3530
|
-
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa,
|
3833
|
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
3834
|
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
3835
|
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
3531
3836
|
|
3532
3837
|
struct ggml_tensor * Qcur = tmpq;
|
3533
3838
|
struct ggml_tensor * Kcur = tmpk;
|
3534
3839
|
|
3535
3840
|
{
|
3536
|
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa,
|
3841
|
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
3537
3842
|
ggml_set_name(Vcur, "Vcur");
|
3538
3843
|
|
3539
|
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
|
3844
|
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
3540
3845
|
ggml_set_name(k, "k");
|
3541
3846
|
|
3542
|
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
|
3847
|
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
3543
3848
|
( n_ctx)*ggml_element_size(kv_self.v),
|
3544
|
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa +
|
3849
|
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
3545
3850
|
|
3546
3851
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
3547
3852
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
@@ -3551,13 +3856,13 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3551
3856
|
ggml_permute(ctx0,
|
3552
3857
|
ggml_cpy(ctx0,
|
3553
3858
|
Qcur,
|
3554
|
-
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head,
|
3859
|
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
3555
3860
|
0, 2, 1, 3);
|
3556
3861
|
ggml_set_name(Q, "Q");
|
3557
3862
|
|
3558
3863
|
struct ggml_tensor * K =
|
3559
3864
|
ggml_view_3d(ctx0, kv_self.k,
|
3560
|
-
n_embd_head,
|
3865
|
+
n_embd_head, n_kv, n_head_kv,
|
3561
3866
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
3562
3867
|
ggml_element_size(kv_self.k)*n_embd_head,
|
3563
3868
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
@@ -3568,12 +3873,12 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3568
3873
|
ggml_set_name(KQ, "KQ");
|
3569
3874
|
|
3570
3875
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
3571
|
-
// KQ_scaled shape [n_past +
|
3876
|
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
3572
3877
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
3573
3878
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
3574
3879
|
|
3575
3880
|
// KQ_masked = mask_past(KQ_scaled)
|
3576
|
-
struct ggml_tensor * KQ_masked =
|
3881
|
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
3577
3882
|
ggml_set_name(KQ_masked, "KQ_masked");
|
3578
3883
|
|
3579
3884
|
// KQ = soft_max(KQ_masked)
|
@@ -3583,7 +3888,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3583
3888
|
// split cached V into n_head heads
|
3584
3889
|
struct ggml_tensor * V =
|
3585
3890
|
ggml_view_3d(ctx0, kv_self.v,
|
3586
|
-
|
3891
|
+
n_kv, n_embd_head, n_head_kv,
|
3587
3892
|
ggml_element_size(kv_self.v)*n_ctx,
|
3588
3893
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
3589
3894
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
@@ -3596,10 +3901,8 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3596
3901
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
3597
3902
|
ggml_set_name(KQV_merged, "KQV_merged");
|
3598
3903
|
|
3599
|
-
// cur = KQV_merged.contiguous().view(n_embd,
|
3600
|
-
cur =
|
3601
|
-
KQV_merged,
|
3602
|
-
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
3904
|
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
3905
|
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
3603
3906
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
3604
3907
|
}
|
3605
3908
|
|
@@ -3649,10 +3952,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|
3649
3952
|
|
3650
3953
|
static struct ggml_cgraph * llama_build_graph(
|
3651
3954
|
llama_context & lctx,
|
3652
|
-
const
|
3653
|
-
const float * embd,
|
3654
|
-
int n_tokens,
|
3655
|
-
int n_past) {
|
3955
|
+
const llama_batch & batch) {
|
3656
3956
|
const auto & model = lctx.model;
|
3657
3957
|
|
3658
3958
|
struct ggml_cgraph * result = NULL;
|
@@ -3660,76 +3960,117 @@ static struct ggml_cgraph * llama_build_graph(
|
|
3660
3960
|
switch (model.arch) {
|
3661
3961
|
case LLM_ARCH_LLAMA:
|
3662
3962
|
{
|
3663
|
-
result = llm_build_llama(lctx,
|
3963
|
+
result = llm_build_llama(lctx, batch);
|
3664
3964
|
} break;
|
3665
3965
|
case LLM_ARCH_BAICHUAN:
|
3666
3966
|
{
|
3667
|
-
result = llm_build_baichaun(lctx,
|
3967
|
+
result = llm_build_baichaun(lctx, batch);
|
3668
3968
|
} break;
|
3669
3969
|
case LLM_ARCH_FALCON:
|
3670
3970
|
{
|
3671
|
-
result = llm_build_falcon(lctx,
|
3971
|
+
result = llm_build_falcon(lctx, batch);
|
3672
3972
|
} break;
|
3673
3973
|
case LLM_ARCH_STARCODER:
|
3674
3974
|
{
|
3675
|
-
result = llm_build_starcoder(lctx,
|
3975
|
+
result = llm_build_starcoder(lctx, batch);
|
3676
3976
|
} break;
|
3677
3977
|
default:
|
3678
3978
|
GGML_ASSERT(false);
|
3679
|
-
}
|
3979
|
+
}
|
3680
3980
|
|
3681
3981
|
return result;
|
3682
3982
|
}
|
3683
3983
|
|
3684
|
-
//
|
3984
|
+
// decode a batch of tokens by evaluating the transformer
|
3685
3985
|
//
|
3686
3986
|
// - lctx: llama context
|
3687
|
-
// -
|
3688
|
-
// - embd embeddings input
|
3689
|
-
// - n_tokens number of tokens
|
3690
|
-
// - n_past: the context size so far
|
3987
|
+
// - batch: batch to evaluate
|
3691
3988
|
// - n_threads: number of threads to use
|
3692
3989
|
//
|
3693
|
-
|
3990
|
+
// return 0 on success
|
3991
|
+
// return positive int on warning
|
3992
|
+
// return negative int on error
|
3993
|
+
//
|
3994
|
+
static int llama_decode_internal(
|
3694
3995
|
llama_context & lctx,
|
3695
|
-
|
3696
|
-
|
3697
|
-
int n_tokens,
|
3698
|
-
int n_past,
|
3699
|
-
int n_threads,
|
3700
|
-
const char * cgraph_fname) {
|
3996
|
+
llama_batch batch) {
|
3997
|
+
const uint32_t n_tokens = batch.n_tokens;
|
3701
3998
|
|
3702
|
-
|
3999
|
+
if (n_tokens == 0) {
|
4000
|
+
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
|
4001
|
+
return -1;
|
4002
|
+
}
|
3703
4003
|
|
3704
|
-
|
3705
|
-
|
3706
|
-
|
3707
|
-
|
3708
|
-
|
4004
|
+
const auto & model = lctx.model;
|
4005
|
+
const auto & hparams = model.hparams;
|
4006
|
+
const auto & cparams = lctx.cparams;
|
4007
|
+
|
4008
|
+
const auto n_batch = cparams.n_batch;
|
4009
|
+
|
4010
|
+
GGML_ASSERT(n_tokens <= n_batch);
|
4011
|
+
|
4012
|
+
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
4013
|
+
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
3709
4014
|
|
3710
4015
|
const int64_t t_start_us = ggml_time_us();
|
3711
4016
|
|
3712
4017
|
#ifdef GGML_USE_MPI
|
3713
|
-
|
4018
|
+
// TODO: needs fix after #3228
|
4019
|
+
GGML_ASSERT(false && "not implemented");
|
4020
|
+
//ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
3714
4021
|
#endif
|
3715
4022
|
|
3716
4023
|
GGML_ASSERT(n_threads > 0);
|
3717
4024
|
|
3718
|
-
|
3719
|
-
|
3720
|
-
const auto & model = lctx.model;
|
3721
|
-
const auto & hparams = model.hparams;
|
3722
|
-
|
3723
|
-
const auto & kv_self = lctx.kv_self;
|
4025
|
+
auto & kv_self = lctx.kv_self;
|
3724
4026
|
|
3725
4027
|
GGML_ASSERT(!!kv_self.ctx);
|
3726
4028
|
|
3727
4029
|
const int64_t n_embd = hparams.n_embd;
|
3728
4030
|
const int64_t n_vocab = hparams.n_vocab;
|
3729
4031
|
|
4032
|
+
// helpers for smoother batch API transition
|
4033
|
+
// after deprecating the llama_eval calls, these will be removed
|
4034
|
+
std::vector<llama_pos> pos;
|
4035
|
+
std::vector<llama_seq_id> seq_id;
|
4036
|
+
|
4037
|
+
if (batch.pos == nullptr) {
|
4038
|
+
pos.resize(n_tokens);
|
4039
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
4040
|
+
pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
|
4041
|
+
}
|
4042
|
+
|
4043
|
+
batch.pos = pos.data();
|
4044
|
+
}
|
4045
|
+
|
4046
|
+
if (batch.seq_id == nullptr) {
|
4047
|
+
seq_id.resize(n_tokens);
|
4048
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
4049
|
+
seq_id[i] = batch.all_seq_id;
|
4050
|
+
}
|
4051
|
+
|
4052
|
+
batch.seq_id = seq_id.data();
|
4053
|
+
}
|
4054
|
+
|
4055
|
+
// we always start to search for a free slot from the start of the cache
|
4056
|
+
// TODO: better strategies can be implemented
|
4057
|
+
kv_self.head = 0;
|
4058
|
+
|
4059
|
+
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
4060
|
+
return 1;
|
4061
|
+
}
|
4062
|
+
|
4063
|
+
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
4064
|
+
// after enough generations, the benefit from this heuristic disappears
|
4065
|
+
// if we start defragmenting the cache, the benefit from this will be more important
|
4066
|
+
//kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
|
4067
|
+
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
|
4068
|
+
|
4069
|
+
//printf("kv_self.n = %d\n", kv_self.n);
|
4070
|
+
|
3730
4071
|
ggml_allocr_reset(lctx.alloc);
|
3731
4072
|
|
3732
|
-
ggml_cgraph * gf = llama_build_graph(lctx,
|
4073
|
+
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
3733
4074
|
|
3734
4075
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
3735
4076
|
|
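The hunk above is the core of the new batch API: the old `llama_eval_internal(tokens, embd, n_tokens, n_past, ...)` entry point becomes `llama_decode_internal(lctx, batch)`, which fills in default positions and sequence ids when the caller leaves `batch.pos`/`batch.seq_id` null, searches the KV cache for a free slot starting from `kv_self.head = 0`, and clamps `kv_self.n` so attention does not scan unused cache cells. A minimal caller-side sketch of the matching public calls, using the same `llama_batch_get_one`/`llama_decode` pair this diff itself uses in the beam-search code; the helper name, the `tokens` vector, and the error handling are illustrative only:

```cpp
#include <vector>
#include "llama.h"

// sketch only: feed a tokenized prompt through the new batch-based decode path;
// `ctx` is an already created llama_context, `tokens` a llama_tokenize result
static int eval_prompt(llama_context * ctx, std::vector<llama_token> & tokens, int & n_past) {
    // 0 = success, positive = warning (e.g. no free KV-cache slot), negative = error
    const int ret = llama_decode(ctx, llama_batch_get_one(tokens.data(), (int) tokens.size(), n_past, 0));
    if (ret == 0) {
        n_past += (int) tokens.size();
    }
    return ret;
}
```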
@@ -3738,6 +4079,7 @@ static bool llama_eval_internal(
|
|
3738
4079
|
ggml_tensor * node = gf->leafs[i];
|
3739
4080
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
3740
4081
|
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
4082
|
+
ggml_cuda_copy_to_device(node);
|
3741
4083
|
}
|
3742
4084
|
}
|
3743
4085
|
|
@@ -3747,6 +4089,8 @@ static bool llama_eval_internal(
|
|
3747
4089
|
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
3748
4090
|
}
|
3749
4091
|
}
|
4092
|
+
|
4093
|
+
ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
|
3750
4094
|
#endif
|
3751
4095
|
|
3752
4096
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
@@ -3756,7 +4100,7 @@ static bool llama_eval_internal(
|
|
3756
4100
|
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
3757
4101
|
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
3758
4102
|
// with the BLAS calls. need a better solution
|
3759
|
-
if (
|
4103
|
+
if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
3760
4104
|
n_threads = std::min(4, n_threads);
|
3761
4105
|
}
|
3762
4106
|
|
@@ -3795,12 +4139,9 @@ static bool llama_eval_internal(
|
|
3795
4139
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
3796
4140
|
#endif
|
3797
4141
|
|
3798
|
-
// update kv
|
3799
|
-
lctx.kv_self.
|
3800
|
-
|
3801
|
-
if (cgraph_fname) {
|
3802
|
-
ggml_graph_export(gf, cgraph_fname);
|
3803
|
-
}
|
4142
|
+
// update the kv ring buffer
|
4143
|
+
lctx.kv_self.head += n_tokens;
|
4144
|
+
lctx.kv_self.has_shift = false;
|
3804
4145
|
|
3805
4146
|
#ifdef GGML_PERF
|
3806
4147
|
// print timing information per ggml operation (for debugging purposes)
|
@@ -3817,13 +4158,20 @@ static bool llama_eval_internal(
|
|
3817
4158
|
{
|
3818
4159
|
auto & logits_out = lctx.logits;
|
3819
4160
|
|
3820
|
-
if (
|
3821
|
-
logits_out.resize(n_vocab *
|
3822
|
-
|
4161
|
+
if (batch.logits) {
|
4162
|
+
logits_out.resize(n_vocab * n_tokens);
|
4163
|
+
for (uint32_t i = 0; i < n_tokens; i++) {
|
4164
|
+
if (batch.logits[i] == 0) {
|
4165
|
+
continue;
|
4166
|
+
}
|
4167
|
+
memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
|
4168
|
+
}
|
4169
|
+
} else if (lctx.logits_all) {
|
4170
|
+
logits_out.resize(n_vocab * n_tokens);
|
4171
|
+
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
|
3823
4172
|
} else {
|
3824
|
-
// return result for just the last token
|
3825
4173
|
logits_out.resize(n_vocab);
|
3826
|
-
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(
|
4174
|
+
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
|
3827
4175
|
}
|
3828
4176
|
}
|
3829
4177
|
|
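With the batch API, the caller decides which tokens produce logits: when `batch.logits` is provided, `lctx.logits` is sized for `n_vocab * n_tokens` and only the rows whose flag is non-zero are copied out of the graph result; `logits_all` keeps the old every-token behaviour, and the fallback copies just the last row. A hedged reader-side sketch, assuming `llama_get_logits(ctx)` from the public header and a `batch` whose `logits` array was set by the caller:

```cpp
// sketch: rows are laid out in batch order, n_vocab floats per token
const float * logits = llama_get_logits(ctx);
for (int i = 0; i < n_tokens; ++i) {
    if (!batch.logits[i]) {
        continue;  // this row was never written by llama_decode
    }
    const float * row = logits + (size_t) i * n_vocab;
    // ... hand `row` to the sampling code ...
}
```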
@@ -3832,20 +4180,27 @@ static bool llama_eval_internal(
|
|
3832
4180
|
auto & embedding_out = lctx.embedding;
|
3833
4181
|
|
3834
4182
|
embedding_out.resize(n_embd);
|
3835
|
-
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(
|
4183
|
+
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
|
3836
4184
|
}
|
3837
4185
|
|
3838
4186
|
// measure the performance only for the single-token evals
|
3839
|
-
if (
|
4187
|
+
if (n_tokens == 1) {
|
3840
4188
|
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
3841
4189
|
lctx.n_eval++;
|
3842
4190
|
}
|
3843
|
-
else if (
|
4191
|
+
else if (n_tokens > 1) {
|
3844
4192
|
lctx.t_p_eval_us += ggml_time_us() - t_start_us;
|
3845
|
-
lctx.n_p_eval +=
|
4193
|
+
lctx.n_p_eval += n_tokens;
|
3846
4194
|
}
|
3847
4195
|
|
3848
|
-
|
4196
|
+
// get a more accurate load time, upon first eval
|
4197
|
+
// TODO: fix this
|
4198
|
+
if (!lctx.has_evaluated_once) {
|
4199
|
+
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
|
4200
|
+
lctx.has_evaluated_once = true;
|
4201
|
+
}
|
4202
|
+
|
4203
|
+
return 0;
|
3849
4204
|
}
|
3850
4205
|
|
3851
4206
|
//
|
@@ -4266,7 +4621,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
4266
4621
|
llm_tokenizer_bpe tokenizer(vocab);
|
4267
4622
|
tokenizer.tokenize(raw_text, output);
|
4268
4623
|
} break;
|
4269
|
-
}
|
4624
|
+
}
|
4270
4625
|
|
4271
4626
|
return output;
|
4272
4627
|
}
|
@@ -4670,6 +5025,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
|
|
4670
5025
|
// sampling
|
4671
5026
|
//
|
4672
5027
|
|
5028
|
+
void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
|
5029
|
+
if (seed == LLAMA_DEFAULT_SEED) {
|
5030
|
+
seed = time(NULL);
|
5031
|
+
}
|
5032
|
+
ctx->rng.seed(seed);
|
5033
|
+
}
|
5034
|
+
|
4673
5035
|
void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
|
4674
5036
|
GGML_ASSERT(candidates->size > 0);
|
4675
5037
|
|
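`llama_set_rng_seed`, added above, reseeds the context's `std::mt19937` used by the samplers, substituting `time(NULL)` when `LLAMA_DEFAULT_SEED` is passed. Usage is a one-liner:

```cpp
llama_set_rng_seed(ctx, 1234);                // reproducible sampling
llama_set_rng_seed(ctx, LLAMA_DEFAULT_SEED);  // reseed from the current time instead
```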
@@ -4878,7 +5240,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
|
4878
5240
|
}
|
4879
5241
|
}
|
4880
5242
|
|
4881
|
-
void
|
5243
|
+
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
4882
5244
|
const int64_t t_start_sample_us = ggml_time_us();
|
4883
5245
|
|
4884
5246
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
@@ -4890,6 +5252,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
|
|
4890
5252
|
}
|
4891
5253
|
}
|
4892
5254
|
|
5255
|
+
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
5256
|
+
llama_sample_temp(ctx, candidates_p, temp);
|
5257
|
+
}
|
5258
|
+
|
4893
5259
|
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
|
4894
5260
|
if (last_tokens_size == 0 || penalty == 1.0f) {
|
4895
5261
|
return;
|
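Temperature sampling is renamed to `llama_sample_temp`, with `llama_sample_temperature` kept as a thin compatibility wrapper. The operation itself is unchanged: every candidate logit is divided by `temp` before the softmax. A short sketch of the renamed call; the `candidates` array and the follow-up `llama_sample_token` call are assumed from the existing sampling API, not from this hunk:

```cpp
llama_sample_temp(ctx, &candidates, 0.8f);                    // logit /= 0.8f for each candidate
const llama_token id = llama_sample_token(ctx, &candidates);  // then sample as before
```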
@@ -5013,7 +5379,7 @@ void llama_sample_classifier_free_guidance(
|
|
5013
5379
|
|
5014
5380
|
GGML_ASSERT(ctx);
|
5015
5381
|
|
5016
|
-
auto n_vocab = llama_n_vocab(ctx);
|
5382
|
+
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
5017
5383
|
|
5018
5384
|
GGML_ASSERT(n_vocab == (int)candidates->size);
|
5019
5385
|
GGML_ASSERT(!candidates->sorted);
|
@@ -5042,7 +5408,7 @@ void llama_sample_classifier_free_guidance(
|
|
5042
5408
|
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
|
5043
5409
|
GGML_ASSERT(ctx);
|
5044
5410
|
|
5045
|
-
auto N = float(llama_n_vocab(ctx));
|
5411
|
+
auto N = float(llama_n_vocab(llama_get_model(ctx)));
|
5046
5412
|
int64_t t_start_sample_us;
|
5047
5413
|
t_start_sample_us = ggml_time_us();
|
5048
5414
|
|
@@ -5229,7 +5595,7 @@ struct llama_logit_info {
|
|
5229
5595
|
};
|
5230
5596
|
llama_logit_info(llama_context * ctx)
|
5231
5597
|
: logits(llama_get_logits(ctx))
|
5232
|
-
, n_vocab(llama_n_vocab(ctx))
|
5598
|
+
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
5233
5599
|
, max_l(*std::max_element(logits, logits + n_vocab))
|
5234
5600
|
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
5235
5601
|
{ }
|
@@ -5267,7 +5633,6 @@ struct llama_beam_search_data {
|
|
5267
5633
|
size_t n_beams;
|
5268
5634
|
int n_past;
|
5269
5635
|
int n_predict;
|
5270
|
-
int n_threads;
|
5271
5636
|
std::vector<llama_beam> beams;
|
5272
5637
|
std::vector<llama_beam> next_beams;
|
5273
5638
|
|
@@ -5277,12 +5642,11 @@ struct llama_beam_search_data {
|
|
5277
5642
|
// Used to communicate to/from callback on beams state.
|
5278
5643
|
std::vector<llama_beam_view> beam_views;
|
5279
5644
|
|
5280
|
-
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict
|
5645
|
+
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
|
5281
5646
|
: ctx(ctx)
|
5282
5647
|
, n_beams(n_beams)
|
5283
5648
|
, n_past(n_past)
|
5284
5649
|
, n_predict(n_predict)
|
5285
|
-
, n_threads(n_threads)
|
5286
5650
|
, beam_views(n_beams) {
|
5287
5651
|
beams.reserve(n_beams);
|
5288
5652
|
next_beams.reserve(n_beams);
|
@@ -5319,7 +5683,7 @@ struct llama_beam_search_data {
|
|
5319
5683
|
} else {
|
5320
5684
|
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
5321
5685
|
if (!beam.tokens.empty()) {
|
5322
|
-
|
5686
|
+
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
|
5323
5687
|
}
|
5324
5688
|
llama_logit_info logit_info(ctx);
|
5325
5689
|
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
@@ -5393,7 +5757,7 @@ struct llama_beam_search_data {
|
|
5393
5757
|
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
5394
5758
|
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
5395
5759
|
if (common_prefix_length) {
|
5396
|
-
|
5760
|
+
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
|
5397
5761
|
n_past += common_prefix_length;
|
5398
5762
|
}
|
5399
5763
|
// Zero-out next_beam probabilities to place them last in following min-heap.
|
@@ -5434,11 +5798,11 @@ struct llama_beam_search_data {
|
|
5434
5798
|
|
5435
5799
|
void llama_beam_search(llama_context * ctx,
|
5436
5800
|
llama_beam_search_callback_fn_t callback, void * callback_data,
|
5437
|
-
size_t n_beams, int n_past, int n_predict
|
5801
|
+
size_t n_beams, int n_past, int n_predict) {
|
5438
5802
|
assert(ctx);
|
5439
5803
|
const int64_t t_start_sample_us = ggml_time_us();
|
5440
5804
|
|
5441
|
-
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict
|
5805
|
+
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
|
5442
5806
|
|
5443
5807
|
beam_search_data.loop(callback, callback_data);
|
5444
5808
|
|
@@ -5658,11 +6022,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5658
6022
|
nthread = std::thread::hardware_concurrency();
|
5659
6023
|
}
|
5660
6024
|
|
5661
|
-
|
6025
|
+
llama_model_loader ml(fname_inp, /*use_mmap*/ false);
|
5662
6026
|
|
5663
6027
|
llama_model model;
|
5664
|
-
llm_load_arch(
|
5665
|
-
llm_load_hparams(
|
6028
|
+
llm_load_arch(ml, model);
|
6029
|
+
llm_load_hparams(ml, model);
|
5666
6030
|
|
5667
6031
|
if (params->only_copy) {
|
5668
6032
|
ftype = model.ftype;
|
@@ -5672,7 +6036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5672
6036
|
struct gguf_context * ctx_out = gguf_init_empty();
|
5673
6037
|
|
5674
6038
|
// copy the KV pairs from the input file
|
5675
|
-
gguf_set_kv (ctx_out, ml
|
6039
|
+
gguf_set_kv (ctx_out, ml.ctx_gguf);
|
5676
6040
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
5677
6041
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
5678
6042
|
|
@@ -5680,8 +6044,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5680
6044
|
int n_attention_wv = 0;
|
5681
6045
|
int n_feed_forward_w2 = 0;
|
5682
6046
|
|
5683
|
-
for (int i = 0; i < ml
|
5684
|
-
struct ggml_tensor * meta = ml
|
6047
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
6048
|
+
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
5685
6049
|
|
5686
6050
|
const std::string name = ggml_get_name(meta);
|
5687
6051
|
|
@@ -5717,8 +6081,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5717
6081
|
std::vector<no_init<float>> f32_conv_buf;
|
5718
6082
|
|
5719
6083
|
// populate the original tensors so we get an initial meta data
|
5720
|
-
for (int i = 0; i < ml
|
5721
|
-
struct ggml_tensor * meta = ml
|
6084
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
6085
|
+
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
5722
6086
|
gguf_add_tensor(ctx_out, meta);
|
5723
6087
|
}
|
5724
6088
|
|
@@ -5731,8 +6095,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5731
6095
|
// placeholder for the meta data
|
5732
6096
|
::zeros(fout, meta_size);
|
5733
6097
|
|
5734
|
-
for (int i = 0; i < ml
|
5735
|
-
struct ggml_tensor * tensor = ml
|
6098
|
+
for (int i = 0; i < ml.n_tensors; ++i) {
|
6099
|
+
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
5736
6100
|
|
5737
6101
|
const std::string name = ggml_get_name(tensor);
|
5738
6102
|
|
@@ -5740,10 +6104,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5740
6104
|
read_data.resize(ggml_nbytes(tensor));
|
5741
6105
|
}
|
5742
6106
|
tensor->data = read_data.data();
|
5743
|
-
ml
|
6107
|
+
ml.load_data_for(tensor);
|
5744
6108
|
|
5745
6109
|
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
5746
|
-
++idx, ml
|
6110
|
+
++idx, ml.n_tensors,
|
5747
6111
|
ggml_get_name(tensor),
|
5748
6112
|
llama_format_tensor_shape(tensor).c_str(),
|
5749
6113
|
ggml_type_name(tensor->type));
|
@@ -5893,9 +6257,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
5893
6257
|
}
|
5894
6258
|
}
|
5895
6259
|
|
5896
|
-
// TODO: after the GGUF PR, this likely won't work and needs to be updated
|
5897
6260
|
static int llama_apply_lora_from_file_internal(
|
5898
|
-
const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
|
6261
|
+
const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
|
5899
6262
|
) {
|
5900
6263
|
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
5901
6264
|
|
@@ -5924,7 +6287,7 @@ static int llama_apply_lora_from_file_internal(
|
|
5924
6287
|
int32_t lora_alpha;
|
5925
6288
|
fin.read((char *) &lora_r, sizeof(lora_r));
|
5926
6289
|
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
5927
|
-
float scaling = (float)lora_alpha / (float)lora_r;
|
6290
|
+
float scaling = scale * (float)lora_alpha / (float)lora_r;
|
5928
6291
|
|
5929
6292
|
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
5930
6293
|
|
@@ -6140,9 +6503,10 @@ static int llama_apply_lora_from_file_internal(
|
|
6140
6503
|
ggml_set_name(r, "r_cpy");
|
6141
6504
|
}
|
6142
6505
|
|
6143
|
-
struct ggml_cgraph gf =
|
6506
|
+
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
6507
|
+
ggml_build_forward_expand(gf, r);
|
6144
6508
|
|
6145
|
-
ggml_graph_compute_helper(work_buffer,
|
6509
|
+
ggml_graph_compute_helper(work_buffer, gf, n_threads);
|
6146
6510
|
|
6147
6511
|
// we won't need these tensors again, reset the context to save memory
|
6148
6512
|
ggml_free(lora_ctx);
|
@@ -6171,27 +6535,16 @@ static int llama_apply_lora_from_file_internal(
|
|
6171
6535
|
//
|
6172
6536
|
// interface implementation
|
6173
6537
|
//
|
6174
|
-
|
6175
|
-
struct
|
6176
|
-
struct llama_context_params result = {
|
6177
|
-
/*.seed =*/ LLAMA_DEFAULT_SEED,
|
6178
|
-
/*.n_ctx =*/ 512,
|
6179
|
-
/*.n_batch =*/ 512,
|
6538
|
+
struct llama_model_params llama_model_default_params() {
|
6539
|
+
struct llama_model_params result = {
|
6180
6540
|
/*.n_gpu_layers =*/ 0,
|
6181
6541
|
/*.main_gpu =*/ 0,
|
6182
6542
|
/*.tensor_split =*/ nullptr,
|
6183
|
-
/*.rope_freq_base =*/ 0.0f,
|
6184
|
-
/*.rope_freq_scale =*/ 0.0f,
|
6185
6543
|
/*.progress_callback =*/ nullptr,
|
6186
6544
|
/*.progress_callback_user_data =*/ nullptr,
|
6187
|
-
/*.low_vram =*/ false,
|
6188
|
-
/*.mul_mat_q =*/ true,
|
6189
|
-
/*.f16_kv =*/ true,
|
6190
|
-
/*.logits_all =*/ false,
|
6191
6545
|
/*.vocab_only =*/ false,
|
6192
6546
|
/*.use_mmap =*/ true,
|
6193
6547
|
/*.use_mlock =*/ false,
|
6194
|
-
/*.embedding =*/ false,
|
6195
6548
|
};
|
6196
6549
|
|
6197
6550
|
#ifdef GGML_USE_METAL
|
@@ -6201,6 +6554,24 @@ struct llama_context_params llama_context_default_params() {
|
|
6201
6554
|
return result;
|
6202
6555
|
}
|
6203
6556
|
|
6557
|
+
struct llama_context_params llama_context_default_params() {
|
6558
|
+
struct llama_context_params result = {
|
6559
|
+
/*.seed =*/ LLAMA_DEFAULT_SEED,
|
6560
|
+
/*.n_ctx =*/ 512,
|
6561
|
+
/*.n_batch =*/ 512,
|
6562
|
+
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
6563
|
+
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
6564
|
+
/*.rope_freq_base =*/ 0.0f,
|
6565
|
+
/*.rope_freq_scale =*/ 0.0f,
|
6566
|
+
/*.mul_mat_q =*/ true,
|
6567
|
+
/*.f16_kv =*/ true,
|
6568
|
+
/*.logits_all =*/ false,
|
6569
|
+
/*.embedding =*/ false,
|
6570
|
+
};
|
6571
|
+
|
6572
|
+
return result;
|
6573
|
+
}
|
6574
|
+
|
6204
6575
|
struct llama_model_quantize_params llama_model_quantize_default_params() {
|
6205
6576
|
struct llama_model_quantize_params result = {
|
6206
6577
|
/*.nthread =*/ 0,
|
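The two hunks above split the old monolithic `llama_context_params` into `llama_model_params` (GPU layers, tensor split, mmap/mlock, vocab_only) and a per-context `llama_context_params` (context/batch size, thread counts, rope overrides, `mul_mat_q`, `f16_kv`). A minimal usage sketch of the split API; the file name and field values are placeholders, while the struct and function names are the ones introduced in this diff:

```cpp
// load the weights once ...
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = 0;                   // placeholder value
llama_model * model = llama_load_model_from_file("model.gguf", mparams);

// ... then create one or more contexts on top of them
llama_context_params cparams = llama_context_default_params();
cparams.n_ctx     = 2048;                   // 0 would fall back to the model's n_ctx_train
cparams.n_threads = 4;
llama_context * ctx = llama_new_context_with_model(model, cparams);
```

As the `llama_new_context_with_model` hunks further down show, leaving `n_ctx`, `rope_freq_base`, or `rope_freq_scale` at 0 makes the context inherit the model's training-time values.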
@@ -6256,13 +6627,11 @@ int64_t llama_time_us(void) {
|
|
6256
6627
|
|
6257
6628
|
struct llama_model * llama_load_model_from_file(
|
6258
6629
|
const char * path_model,
|
6259
|
-
|
6630
|
+
struct llama_model_params params) {
|
6260
6631
|
ggml_time_init();
|
6261
6632
|
|
6262
6633
|
llama_model * model = new llama_model;
|
6263
6634
|
|
6264
|
-
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
6265
|
-
|
6266
6635
|
unsigned cur_percentage = 0;
|
6267
6636
|
if (params.progress_callback == NULL) {
|
6268
6637
|
params.progress_callback_user_data = &cur_percentage;
|
@@ -6279,9 +6648,9 @@ struct llama_model * llama_load_model_from_file(
|
|
6279
6648
|
};
|
6280
6649
|
}
|
6281
6650
|
|
6282
|
-
if (!llama_model_load(path_model, *model, params.
|
6283
|
-
params.main_gpu, params.tensor_split,
|
6284
|
-
params.
|
6651
|
+
if (!llama_model_load(path_model, *model, params.n_gpu_layers,
|
6652
|
+
params.main_gpu, params.tensor_split,
|
6653
|
+
params.use_mmap, params.use_mlock, params.vocab_only,
|
6285
6654
|
params.progress_callback, params.progress_callback_user_data)) {
|
6286
6655
|
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
6287
6656
|
delete model;
|
@@ -6305,18 +6674,33 @@ struct llama_context * llama_new_context_with_model(
|
|
6305
6674
|
|
6306
6675
|
llama_context * ctx = new llama_context(*model);
|
6307
6676
|
|
6677
|
+
const auto & hparams = model->hparams;
|
6678
|
+
auto & cparams = ctx->cparams;
|
6679
|
+
|
6680
|
+
cparams.n_batch = params.n_batch;
|
6681
|
+
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
6682
|
+
cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
|
6683
|
+
cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
6684
|
+
cparams.n_threads = params.n_threads;
|
6685
|
+
cparams.n_threads_batch = params.n_threads_batch;
|
6686
|
+
cparams.mul_mat_q = params.mul_mat_q;
|
6687
|
+
|
6308
6688
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
6309
6689
|
params.seed = time(NULL);
|
6310
6690
|
}
|
6311
6691
|
|
6692
|
+
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
6693
|
+
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
6694
|
+
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
6695
|
+
|
6312
6696
|
ctx->rng = std::mt19937(params.seed);
|
6313
6697
|
ctx->logits_all = params.logits_all;
|
6314
6698
|
|
6315
6699
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
6316
6700
|
|
6317
6701
|
// reserve memory for context buffers
|
6318
|
-
if (!
|
6319
|
-
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type,
|
6702
|
+
if (!hparams.vocab_only) {
|
6703
|
+
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
|
6320
6704
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
6321
6705
|
llama_free(ctx);
|
6322
6706
|
return nullptr;
|
@@ -6327,11 +6711,9 @@ struct llama_context * llama_new_context_with_model(
|
|
6327
6711
|
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
6328
6712
|
}
|
6329
6713
|
|
6330
|
-
const auto & hparams = ctx->model.hparams;
|
6331
|
-
|
6332
6714
|
// resized during inference
|
6333
6715
|
if (params.logits_all) {
|
6334
|
-
ctx->logits.reserve(
|
6716
|
+
ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
|
6335
6717
|
} else {
|
6336
6718
|
ctx->logits.reserve(hparams.n_vocab);
|
6337
6719
|
}
|
@@ -6349,26 +6731,28 @@ struct llama_context * llama_new_context_with_model(
|
|
6349
6731
|
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
6350
6732
|
|
6351
6733
|
// build worst-case graph
|
6352
|
-
int n_tokens = std::min(
|
6353
|
-
int n_past =
|
6734
|
+
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
6735
|
+
int n_past = cparams.n_ctx - n_tokens;
|
6354
6736
|
llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
6355
|
-
ggml_cgraph * gf = llama_build_graph(*ctx, &token,
|
6737
|
+
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
6738
|
+
|
6356
6739
|
#ifdef GGML_USE_METAL
|
6357
|
-
if (
|
6740
|
+
if (model->n_gpu_layers > 0) {
|
6358
6741
|
ctx->ctx_metal = ggml_metal_init(1);
|
6359
6742
|
if (!ctx->ctx_metal) {
|
6360
6743
|
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
6361
6744
|
llama_free(ctx);
|
6362
6745
|
return NULL;
|
6363
6746
|
}
|
6364
|
-
|
6365
|
-
|
6747
|
+
ggml_metal_log_set_callback(llama_log_callback_default, NULL);
|
6748
|
+
//ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
6749
|
+
//ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
6366
6750
|
}
|
6367
6751
|
#endif
|
6368
6752
|
// measure memory requirements for the graph
|
6369
6753
|
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
6370
6754
|
|
6371
|
-
LLAMA_LOG_INFO("%s: compute buffer total size =
|
6755
|
+
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
6372
6756
|
|
6373
6757
|
// recreate allocator with exact memory requirements
|
6374
6758
|
ggml_allocr_free(ctx->alloc);
|
@@ -6377,28 +6761,46 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
 #ifdef GGML_USE_METAL
         if (ctx->ctx_metal) {
-            ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
         }
 #endif
 #ifdef GGML_USE_CUBLAS
-
-
-
-
-
-
+        ggml_cuda_set_scratch_size(alloc_size);
+        LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+
+        // calculate total VRAM usage
+        auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+            if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+                size += ggml_nbytes(t);
+            }
+        };
+        size_t model_vram_size = 0;
+        for (const auto & kv : model->tensors_by_name) {
+            add_tensor(kv.second, model_vram_size);
         }
+
+        size_t kv_vram_size = 0;
+        add_tensor(ctx->kv_self.k, kv_vram_size);
+        add_tensor(ctx->kv_self.v, kv_vram_size);
+
+        size_t ctx_vram_size = alloc_size + kv_vram_size;
+        size_t total_vram_size = model_vram_size + ctx_vram_size;
+
+        LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+                total_vram_size / 1024.0 / 1024.0,
+                model_vram_size / 1024.0 / 1024.0,
+                ctx_vram_size / 1024.0 / 1024.0);
 #endif
     }

 #ifdef GGML_USE_METAL
-    if (
+    if (model->n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers

         void * data_ptr = NULL;
         size_t data_size = 0;

-        if (
+        if (ctx->model.mapping) {
             data_ptr = ctx->model.mapping->addr;
             data_size = ctx->model.mapping->size;
         } else {
@@ -6417,11 +6819,8 @@ struct llama_context * llama_new_context_with_model(
                 return NULL; \
             }

-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data",
-
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
@@ -6433,8 +6832,10 @@ struct llama_context * llama_new_context_with_model(

     if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-
-
+        // TODO: needs fix after #3228
+        GGML_ASSERT(false && "not implemented");
+        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
+        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
         llama_backend_free();
         exit(1);
     }
@@ -6443,63 +6844,37 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }

-static struct llama_context * llama_init_from_file(
-        const char * path_model,
-        struct llama_context_params params) {
-    struct llama_model * model = llama_load_model_from_file(path_model, params);
-    if (!model) {
-        return nullptr;
-    }
-
-    struct llama_context * ctx = llama_new_context_with_model(model, params);
-    ctx->model_owner = true;
-
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }

-
-    return
+const llama_model * llama_get_model(const struct llama_context * ctx) {
+    return &ctx->model;
 }

 int llama_n_ctx(const struct llama_context * ctx) {
-    return
+    return ctx->cparams.n_ctx;
 }

-
-    return
+enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+    return model->vocab.type;
 }

-int
-    return llama_model_n_embd(&ctx->model);
-}
-
-enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
-    return ctx->model.vocab.type;
-}
-
-int llama_model_n_vocab(const struct llama_model * model) {
+int llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }

-int
-    return model->hparams.n_ctx;
-}
-
-int llama_model_n_ctx_train(const struct llama_model * model) {
+int llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }

-int
+int llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            model->
+            llama_model_arch_name(model->arch).c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }
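Note: the context-based metadata getters are replaced here by model-based ones (llama_n_vocab, llama_n_ctx_train, llama_n_embd, llama_vocab_type) plus the new llama_get_model accessor, while llama_n_ctx now reads the per-context cparams value. A minimal sketch of the new call pattern (illustrative, not part of the diff; the context is assumed to have been created elsewhere with this version's C API):

    #include "llama.h"
    #include <cstdio>

    static void print_model_info(struct llama_context * ctx) {
        const struct llama_model * model = llama_get_model(ctx);      // new accessor
        std::printf("n_vocab     = %d\n", llama_n_vocab(model));       // was llama_model_n_vocab
        std::printf("n_ctx_train = %d\n", llama_n_ctx_train(model));
        std::printf("n_embd      = %d\n", llama_n_embd(model));
        std::printf("n_ctx       = %d\n", llama_n_ctx(ctx));           // runtime value from cparams
    }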
@@ -6520,6 +6895,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }

+struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+    return ggml_get_tensor(model->ctx, name);
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
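Note: llama_get_model_tensor is a new accessor that looks a tensor up by name in the model's ggml context. A hedged usage sketch; the tensor name below is hypothetical and depends on the GGUF naming of the loaded model:

    #include "llama.h"
    #include "ggml.h"

    // model is assumed to come from llama_load_model_from_file
    struct ggml_tensor * t = llama_get_model_tensor(model, "token_embd.weight"); // hypothetical name
    if (t != NULL) {
        // e.g. inspect t->type or ggml_nbytes(t)
    }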
@@ -6533,18 +6912,18 @@ int llama_model_quantize(
         }
     }

-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }

-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
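Note: both LoRA entry points gain a float scale argument that is forwarded to llama_apply_lora_from_file_internal. A hedged call sketch with a made-up adapter path; passing 1.0f presumably reproduces the previous fixed-strength behaviour:

    // model is assumed to be a loaded llama_model; the adapter path is illustrative
    const int err = llama_model_apply_lora_from_file(model, "lora-adapter.bin", /*scale=*/0.75f, /*path_base_model=*/NULL, /*n_threads=*/4);
    if (err != 0) {
        // the wrapper returns 1 on failure and reports through LLAMA_LOG_ERROR
    }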
@@ -6552,16 +6931,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }

 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->kv_self.
+    return ctx->kv_self.head;
 }

-
+void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
+}

-void
-
-
-
-
+void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+}
+
+void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
+}
+
+void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }

 // Returns the *maximum* size of the state
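Note: these are new thin public wrappers over the internal KV cache helpers, giving callers per-sequence control over cached positions. A sketch of the calls, assuming a valid llama_context; the exact boundary semantics of p0/p1 are defined by the internal llama_kv_cache_seq_* helpers earlier in this file:

    #include "llama.h"

    // ctx is assumed to already hold cached data for sequences 0 and 1
    static void prune_cache_example(struct llama_context * ctx) {
        llama_kv_cache_seq_cp   (ctx, /*seq_id_src=*/0, /*seq_id_dst=*/1, /*p0=*/0, /*p1=*/32); // share a prefix with sequence 1
        llama_kv_cache_seq_rm   (ctx, /*seq_id=*/0, /*p0=*/32, /*p1=*/64);                      // drop a slice of sequence 0
        llama_kv_cache_seq_shift(ctx, /*seq_id=*/1, /*p0=*/32, /*p1=*/64, /*delta=*/-16);       // move those positions back by 16
        llama_kv_cache_seq_keep (ctx, /*seq_id=*/1);                                            // discard everything but sequence 1
    }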
@@ -6649,6 +7039,16 @@ struct llama_data_file_context : llama_data_context {
  *
  */
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+    // TODO: does not support multi-sequence states
+    {
+        const auto & kv_self = ctx->kv_self;
+        for (uint32_t i = 0; i < kv_self.head; ++i) {
+            GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
+            GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
+            GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
+        }
+    }
+
     // copy rng
     {
         std::stringstream rng_ss;
@@ -6699,12 +7099,14 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
+        const auto & cparams = ctx->cparams;
+
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd_gqa();
-        const int n_ctx =
+        const int n_ctx = cparams.n_ctx;

         const size_t kv_size = kv_self.buf.size;
-        const int kv_ntok =
+        const int kv_ntok = kv_self.head;

         data_ctx->write(&kv_size, sizeof(kv_size));
         data_ctx->write(&kv_ntok, sizeof(kv_ntok));
@@ -6807,9 +7209,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
+        const auto & cparams = ctx->cparams;
+
         const int n_layer = hparams.n_layer;
         const int n_embd = hparams.n_embd_gqa();
-        const int n_ctx =
+        const int n_ctx = cparams.n_ctx;

         size_t kv_size;
         int kv_ntok;
@@ -6848,7 +7252,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_free(cpy_ctx);
     }

-    ctx->kv_self.
+    ctx->kv_self.head = kv_ntok;
+    ctx->kv_self.size = kv_size;
     }

     const size_t nread = inp - src;
@@ -6943,64 +7348,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi

 int llama_eval(
         struct llama_context * ctx,
-
-
-        int n_past
-
-    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
+        llama_token * tokens,
+        int32_t n_tokens,
+        int n_past) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

-
-
-
-    ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
-    ctx->has_evaluated_once = true;
+    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }

-    return
+    return ret;
 }

 int llama_eval_embd(
         struct llama_context * ctx,
-
-
-        int n_past
-
-    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
-        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
+        float * embd,
+        int32_t n_tokens,
+        int n_past) {
+    llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

-
-
-
-
-
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+
+    const int ret = llama_decode_internal(*ctx, batch);
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }

-    return
+    return ret;
 }

-
-
-
+void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+    ctx->cparams.n_threads = n_threads;
+    ctx->cparams.n_threads_batch = n_threads_batch;
+}
+
+struct llama_batch llama_batch_get_one(
+        llama_token * tokens,
+        int32_t n_tokens,
+        llama_pos pos_0,
+        llama_seq_id seq_id) {
+    return {
+        /*n_tokens =*/ n_tokens,
+        /*tokens =*/ tokens,
+        /*embd =*/ nullptr,
+        /*pos =*/ nullptr,
+        /*seq_id =*/ nullptr,
+        /*logits =*/ nullptr,
+        /*all_pos_0 =*/ pos_0,
+        /*all_pos_1 =*/ 1,
+        /*all_seq_id =*/ seq_id,
+    };
+}

-
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
+    llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

-    if (
-
-
+    if (embd) {
+        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+    } else {
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
     }

-
+    batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
+    batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
+    batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
+
+    return batch;
+}
+
+void llama_batch_free(struct llama_batch batch) {
+    if (batch.token) free(batch.token);
+    if (batch.embd) free(batch.embd);
+    if (batch.pos) free(batch.pos);
+    if (batch.seq_id) free(batch.seq_id);
+    if (batch.logits) free(batch.logits);
+}
+
+int llama_decode(
+        struct llama_context * ctx,
+        struct llama_batch batch) {
+    const int ret = llama_decode_internal(*ctx, batch);
+    if (ret < 0) {
+        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+    }
+
+    return ret;
 }

 float * llama_get_logits(struct llama_context * ctx) {
     return ctx->logits.data();
 }

+float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
+}
+
 float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
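Note: llama_eval and llama_eval_embd are now shims over llama_decode_internal, and the batch API (llama_batch_get_one, llama_batch_init, llama_batch_free, llama_decode) together with llama_set_n_threads and llama_get_logits_ith becomes the primary entry point. A minimal decode sketch, not taken from the diff; model/context creation, tokenization and sampling are assumed to happen elsewhere, and the thread counts are arbitrary:

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    static int decode_prompt(struct llama_context * ctx, std::vector<llama_token> & prompt) {
        llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/8);

        const int ret = llama_decode(ctx, llama_batch_get_one(prompt.data(), (int32_t) prompt.size(), 0, 0));
        if (ret != 0) {
            std::fprintf(stderr, "llama_decode returned %d\n", ret); // negative values are errors
        }
        return ret;
    }

    // logits after the last decoded token:
    //     float * logits = llama_get_logits(ctx);

llama_get_logits_ith(ctx, i) additionally exposes per-token logit rows for batches that request logits at more than the final position (for example, batches built with llama_batch_init).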
@@ -7030,16 +7473,6 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 }

 int llama_tokenize(
-        struct llama_context * ctx,
-        const char * text,
-        int text_len,
-        llama_token * tokens,
-        int n_max_tokens,
-        bool add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
-}
-
-int llama_tokenize_with_model(
         const struct llama_model * model,
         const char * text,
         int text_len,
@@ -7060,13 +7493,9 @@ int llama_tokenize_with_model(
     return res.size();
 }

-int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
-}
-
 // does not write null-terminator to buf
-int
-    if (0 <= token && token <
+int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+    if (0 <= token && token < llama_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].text;
             if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
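Note: tokenization helpers now take a llama_model instead of a llama_context; the *_with_model variants are folded into llama_tokenize and llama_token_to_piece. A hedged round-trip sketch; the trailing parameters of llama_tokenize (token buffer, capacity, add_bos) are assumed to match the removed context-based overload shown above:

    #include "llama.h"
    #include <cstdio>
    #include <string>
    #include <vector>

    static void dump_tokens(const struct llama_model * model, const std::string & text) {
        std::vector<llama_token> tokens(text.size() + 8);
        const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                     tokens.data(), (int) tokens.size(), /*add_bos=*/true);
        for (int i = 0; i < n; ++i) { // a negative n would mean the buffer was too small
            char piece[64];
            const int len = llama_token_to_piece(model, tokens[i], piece, (int) sizeof(piece));
            if (len > 0) {
                std::fwrite(piece, 1, (size_t) len, stdout); // no null terminator is written
            }
        }
    }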
@@ -7086,7 +7515,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke
             buf[2] = '\x85';
             return 3;
         } else if (llama_is_control_token(model->vocab, token)) {
-
+            // do nothing
         } else if (llama_is_byte_token(model->vocab, token)) {
             if (length < 1) {
                 return -1;
@@ -7194,12 +7623,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
     return ctx->model.tensors_by_name;
 }

-void llama_log_set(
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 }

-static void llama_log_internal_v(
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
     va_list args_copy;
     va_copy(args_copy, args);
     char buffer[128];
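Note: the logging plumbing now uses ggml's log types (ggml_log_callback / ggml_log_level) instead of llama-specific ones. A sketch of installing a custom callback; the GGML_LOG_LEVEL_* constants are assumed from this release's ggml.h:

    #include "llama.h"
    #include <cstdio>

    static void my_log_callback(ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_WARN) {
            std::fputs(text, stderr); // forward only warnings and errors
        }
    }

    // during startup:
    //     llama_log_set(my_log_callback, NULL);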
@@ -7216,14 +7645,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
     va_end(args_copy);
 }

-static void llama_log_internal(
+static void llama_log_internal(ggml_log_level level, const char * format, ...) {
     va_list args;
     va_start(args, format);
     llama_log_internal_v(level, format, args);
     va_end(args);
 }

-static void llama_log_callback_default(
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
     fputs(text, stderr);