llama_cpp 0.5.3 → 0.6.0

@@ -72,6 +72,7 @@
  #include <sstream>
  #include <thread>
  #include <unordered_map>
+ #include <set>

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -92,12 +93,12 @@
  //

  LLAMA_ATTRIBUTE_FORMAT(2, 3)
- static void llama_log_internal (llama_log_level level, const char* format, ...);
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+ static void llama_log_internal (ggml_log_level level, const char* format, ...);
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

- #define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
- #define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
- #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+ #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+ #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+ #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

  //
  // helpers
@@ -166,13 +167,13 @@ enum llm_arch {
  };

  static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
  { LLM_ARCH_STARCODER, "starcoder" },
  };

@@ -221,16 +222,16 @@ enum llm_kv {
  };

  static std::map<llm_kv, std::string> LLM_KV_NAMES = {
- { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
- { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
- { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
- { LLM_KV_GENERAL_NAME, "general.name" },
- { LLM_KV_GENERAL_AUTHOR, "general.author" },
- { LLM_KV_GENERAL_URL, "general.url" },
- { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
- { LLM_KV_GENERAL_LICENSE, "general.license" },
- { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" },
- { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" },
+ { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+ { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+ { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+ { LLM_KV_GENERAL_NAME, "general.name" },
+ { LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_URL, "general.url" },
+ { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+ { LLM_KV_GENERAL_LICENSE, "general.license" },
+ { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
+ { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -448,7 +449,7 @@ struct LLM_TN {
  //

  #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
- { \
+ do { \
  const std::string skey(key); \
  const int kid = gguf_find_key(ctx, skey.c_str()); \
  if (kid >= 0) { \
@@ -460,7 +461,7 @@ struct LLM_TN {
  } else if (req) { \
  throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
  } \
- }
+ } while (0)

  //
  // ggml helpers
@@ -881,10 +882,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default

  static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
  GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
@@ -899,7 +900,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to

  struct llama_state {
  // We save the log callback globally
- llama_log_callback log_callback = llama_log_callback_default;
+ ggml_log_callback log_callback = llama_log_callback_default;
  void * log_callback_user_data = nullptr;
  };

@@ -925,9 +926,9 @@ static const size_t MB = kB*kB;
  static const size_t GB = kB*kB*kB;

  struct llama_hparams {
+ bool vocab_only;
  uint32_t n_vocab;
  uint32_t n_ctx_train; // context size the model was trained on
- uint32_t n_ctx; // context size used during inference
  uint32_t n_embd;
  uint32_t n_head;
  uint32_t n_head_kv;
@@ -938,8 +939,8 @@ struct llama_hparams {
  float f_norm_eps;
  float f_norm_rms_eps;

- float rope_freq_base;
- float rope_freq_scale;
+ float rope_freq_base_train;
+ float rope_freq_scale_train;

  bool operator!=(const llama_hparams & other) const {
  return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -956,15 +957,18 @@ struct llama_hparams {
  uint32_t n_embd_gqa() const {
  return n_embd/n_gqa();
  }
+ };

- size_t kv_size() const {
- size_t result = 2ull;
- result *= (size_t) n_embd_gqa();
- result *= (size_t) n_ctx;
- result *= (size_t) n_layer;
- result *= sizeof(ggml_fp16_t);
- return result;
- }
+ struct llama_cparams {
+ uint32_t n_ctx; // context size used during inference
+ uint32_t n_batch;
+ uint32_t n_threads; // number of threads to use for generation
+ uint32_t n_threads_batch; // number of threads to use for batch processing
+
+ float rope_freq_base;
+ float rope_freq_scale;
+
+ bool mul_mat_q;
  };

  struct llama_layer {
@@ -999,7 +1003,29 @@ struct llama_layer {
  struct ggml_tensor * b3; // ffn_up
  };

+ struct llama_kv_cell {
+ llama_pos pos = -1;
+ llama_pos delta = 0;
+
+ std::set<llama_seq_id> seq_id;
+
+ bool has_seq_id(const llama_seq_id & id) const {
+ return seq_id.find(id) != seq_id.end();
+ }
+ };
+
+ // ring-buffer of cached KV data
  struct llama_kv_cache {
+ bool has_shift = false;
+
+ uint32_t head = 0;
+ uint32_t size = 0;
+
+ // computed before each graph build
+ uint32_t n = 0;
+
+ std::vector<llama_kv_cell> cells;
+
  struct ggml_tensor * k = NULL;
  struct ggml_tensor * v = NULL;

@@ -1007,8 +1033,6 @@ struct llama_kv_cache {

  llama_buffer buf;

- int n; // number of tokens currently in the cache
-
  ~llama_kv_cache() {
  if (ctx) {
  ggml_free(ctx);
@@ -1122,11 +1146,8 @@ struct llama_model {
  };

  struct llama_context {
- llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+ llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
  ~llama_context() {
- if (model_owner) {
- delete &model;
- }
  #ifdef GGML_USE_METAL
  if (ctx_metal) {
  ggml_metal_free(ctx_metal);
@@ -1137,27 +1158,26 @@ struct llama_context {
  }
  }

+ llama_cparams cparams;
+
+ const llama_model & model;
+
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;
+
  std::mt19937 rng;

  bool has_evaluated_once = false;

+ int64_t t_start_us;
+ int64_t t_load_us;
  int64_t t_sample_us = 0;
- int64_t t_eval_us = 0;
  int64_t t_p_eval_us = 0;
+ int64_t t_eval_us = 0;

  int32_t n_sample = 0; // number of tokens sampled
- int32_t n_eval = 0; // number of eval calls
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
- const llama_model & model;
-
- bool model_owner = false;
-
- int64_t t_load_us;
- int64_t t_start_us;
-
- // key + value cache for the self attention
- struct llama_kv_cache kv_self;
+ int32_t n_eval = 0; // number of eval calls

  // decode output (2-dimensional array: [n_tokens][n_vocab])
  std::vector<float> logits;
@@ -1192,16 +1212,23 @@ static bool llama_kv_cache_init(
  const struct llama_hparams & hparams,
  struct llama_kv_cache & cache,
  ggml_type wtype,
- int n_ctx,
+ uint32_t n_ctx,
  int n_gpu_layers) {
- const int n_embd = hparams.n_embd_gqa();
- const int n_layer = hparams.n_layer;
+ const uint32_t n_embd = hparams.n_embd_gqa();
+ const uint32_t n_layer = hparams.n_layer;

  const int64_t n_mem = n_layer*n_ctx;
  const int64_t n_elements = n_embd*n_mem;

+ cache.has_shift = false;
+
+ cache.head = 0;
+ cache.size = n_ctx;
+
+ cache.cells.clear();
+ cache.cells.resize(n_ctx);
+
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
- cache.n = 0;

  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
@@ -1222,17 +1249,154 @@ static bool llama_kv_cache_init(

  (void) n_gpu_layers;
  #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer + 1) {
+ size_t vram_kv_cache = 0;
+
+ if (n_gpu_layers > (int)n_layer + 1) {
  ggml_cuda_assign_buffers_no_scratch(cache.v);
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.v);
  }
- if (n_gpu_layers > n_layer + 2) {
+ if (n_gpu_layers > (int)n_layer + 2) {
  ggml_cuda_assign_buffers_no_scratch(cache.k);
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.k);
+ }
+ if (vram_kv_cache > 0) {
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
  }
  #endif // GGML_USE_CUBLAS

  return true;
  }

+ // find an empty slot of size "n_tokens" in the cache
+ // updates the cache head
+ static bool llama_kv_cache_find_slot(
+ struct llama_kv_cache & cache,
+ const struct llama_batch & batch) {
+ const uint32_t n_ctx = cache.size;
+ const uint32_t n_tokens = batch.n_tokens;
+
+ if (n_tokens > n_ctx) {
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+ return false;
+ }
+
+ uint32_t n_tested = 0;
+
+ while (true) {
+ if (cache.head + n_tokens > n_ctx) {
+ cache.head = 0;
+ n_tested += n_ctx - cache.head;
+ continue;
+ }
+
+ bool found = true;
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ if (cache.cells[cache.head + i].pos >= 0) {
+ found = false;
+ cache.head += i + 1;
+ n_tested += i + 1;
+ break;
+ }
+ }
+
+ if (found) {
+ break;
+ }
+
+ if (n_tested >= n_ctx) {
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+ return false;
+ }
+ }
+
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ cache.cells[cache.head + i].pos = batch.pos[i];
+ cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+ }
+
+ return true;
+ }
+
+ // find how many cells are currently in use
+ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+ for (uint32_t i = cache.size - 1; i > 0; --i) {
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+ return i + 1;
+ }
+ }
+
+ return 0;
+ }
+
+ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+ if (c0 < 0) c0 = 0;
+ if (c1 < 0) c1 = cache.size;
+
+ for (int32_t i = c0; i < c1; ++i) {
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
+ }
+ }
+
+ static void llama_kv_cache_seq_rm(
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1) {
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ cache.cells[i].seq_id.erase(seq_id);
+ if (cache.cells[i].seq_id.empty()) {
+ cache.cells[i].pos = -1;
+ }
+ }
+ }
+ }
+
+ static void llama_kv_cache_seq_cp(
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1) {
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ cache.cells[i].seq_id.insert(seq_id_dst);
+ }
+ }
+ }
+
+ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (!cache.cells[i].has_seq_id(seq_id)) {
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
+ }
+ }
+ }
+
+ static void llama_kv_cache_seq_shift(
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta) {
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ cache.cells[i].pos += delta;
+ if (cache.cells[i].pos < 0) {
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
+ } else {
+ cache.has_shift = true;
+ cache.cells[i].delta = delta;
+ }
+ }
+ }
+ }
+
  //
  // model loading and saving
  //
@@ -1554,7 +1718,7 @@ struct llama_model_loader {
1554
1718
  lmlock->grow_to(size_lock);
1555
1719
  }
1556
1720
  break;
1557
- #if defined(GGML_USE_CUBLAS)
1721
+ #ifdef GGML_USE_CUBLAS
1558
1722
  case GGML_BACKEND_GPU:
1559
1723
  case GGML_BACKEND_GPU_SPLIT:
1560
1724
  // old code:
@@ -1587,7 +1751,15 @@ struct llama_model_loader {
1587
1751
  // load LLaMA models
1588
1752
  //
1589
1753
 
1590
- static std::string llama_model_ftype_name(enum llama_ftype ftype) {
1754
+ static std::string llama_model_arch_name(llm_arch arch) {
1755
+ auto it = LLM_ARCH_NAMES.find(arch);
1756
+ if (it == LLM_ARCH_NAMES.end()) {
1757
+ return "unknown";
1758
+ }
1759
+ return it->second;
1760
+ }
1761
+
1762
+ static std::string llama_model_ftype_name(llama_ftype ftype) {
1591
1763
  if (ftype & LLAMA_FTYPE_GUESSED) {
1592
1764
  return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
1593
1765
  }
@@ -1643,10 +1815,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
1643
1815
 
1644
1816
  static void llm_load_hparams(
1645
1817
  llama_model_loader & ml,
1646
- llama_model & model,
1647
- int n_ctx,
1648
- float rope_freq_base,
1649
- float rope_freq_scale) {
1818
+ llama_model & model) {
1650
1819
  struct gguf_context * ctx = ml.ctx_gguf;
1651
1820
 
1652
1821
  const auto kv = LLM_KV(model.arch);
@@ -1657,29 +1826,25 @@ static void llm_load_hparams(
1657
1826
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
1658
1827
 
1659
1828
  // get hparams kv
1660
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
1661
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
1662
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
1663
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
1664
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
1665
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
1829
+ GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
1830
+ GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
1831
+ GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
1832
+ GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
1833
+ GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
1834
+ GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
1666
1835
 
1667
1836
  // n_head_kv is optional, default to n_head
1668
1837
  hparams.n_head_kv = hparams.n_head;
1669
1838
  GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
1670
1839
 
1671
1840
  // rope_freq_base (optional)
1672
- if (rope_freq_base == 0.0f) {
1673
- rope_freq_base = 10000.0f;
1674
- GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1675
- }
1841
+ hparams.rope_freq_base_train = 10000.0f;
1842
+ GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1676
1843
 
1677
1844
  // rope_freq_scale (inverse of the kv) is optional
1678
- if (rope_freq_scale == 0.0f) {
1679
- float ropescale = 1.0f;
1680
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1681
- rope_freq_scale = 1.0f/ropescale;
1682
- }
1845
+ float ropescale = 1.0f;
1846
+ GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1847
+ hparams.rope_freq_scale_train = 1.0f/ropescale;
1683
1848
 
1684
1849
  // sanity check for n_rot (optional)
1685
1850
  {
@@ -1743,13 +1908,9 @@ static void llm_load_hparams(
1743
1908
  }
1744
1909
  } break;
1745
1910
  default: (void)0;
1746
- };
1911
+ }
1747
1912
 
1748
1913
  model.ftype = ml.ftype;
1749
-
1750
- hparams.n_ctx = n_ctx;
1751
- hparams.rope_freq_base = rope_freq_base;
1752
- hparams.rope_freq_scale = rope_freq_scale;
1753
1914
  }
1754
1915
 
1755
1916
  // TODO: This should probably be in llama.h
@@ -1770,20 +1931,18 @@ static void llm_load_vocab(
1770
1931
  throw std::runtime_error("cannot find tokenizer vocab in model file\n");
1771
1932
  }
1772
1933
 
1934
+ const float * scores = nullptr;
1773
1935
  const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
1774
- if (score_idx == -1) {
1775
- throw std::runtime_error("cannot find tokenizer scores in model file\n");
1936
+ if (score_idx != -1) {
1937
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1776
1938
  }
1777
1939
 
1778
- const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1779
-
1940
+ const int * toktypes = nullptr;
1780
1941
  const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
1781
- if (toktype_idx == -1) {
1782
- throw std::runtime_error("cannot find token type list in GGUF file\n");
1942
+ if (toktype_idx != -1) {
1943
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1783
1944
  }
1784
1945
 
1785
- const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1786
-
1787
1946
  // determine vocab type
1788
1947
  {
1789
1948
  std::string tokenizer_name;
@@ -1851,8 +2010,8 @@ static void llm_load_vocab(
1851
2010
 
1852
2011
  auto & token_data = vocab.id_to_token[i];
1853
2012
  token_data.text = std::move(word);
1854
- token_data.score = scores[i];
1855
- token_data.type = (llama_token_type) toktypes[i];
2013
+ token_data.score = scores ? scores[i] : 0.0f;
2014
+ token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
1856
2015
  }
1857
2016
 
1858
2017
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -1875,31 +2034,30 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1875
2034
  const auto & vocab = model.vocab;
1876
2035
 
1877
2036
  // hparams
1878
- LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
1879
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
1880
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
1881
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1882
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
1883
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
1884
- LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1885
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
1886
- LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
1887
- LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1888
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
1889
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1890
- LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1891
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
1892
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
1893
- LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
1894
- LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1895
- LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1896
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
1897
- LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
1898
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
2037
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
2038
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
2039
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
2040
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
2041
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
2042
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
2043
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
2044
+ LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
2045
+ LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
2046
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
2047
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
2048
+ LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2049
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2050
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2051
+ LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2052
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2053
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
2054
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
2055
+ LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
2056
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
1899
2057
  if (ml.n_bytes < GB) {
1900
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2058
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1901
2059
  } else {
1902
- LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2060
+ LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1903
2061
  }
1904
2062
 
1905
2063
  // general kv
@@ -1917,13 +2075,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1917
2075
  static void llm_load_tensors(
1918
2076
  llama_model_loader & ml,
1919
2077
  llama_model & model,
1920
- int n_batch,
1921
2078
  int n_gpu_layers,
1922
2079
  int main_gpu,
1923
2080
  const float * tensor_split,
1924
- const bool mul_mat_q,
1925
- bool low_vram,
1926
- ggml_type memory_type,
1927
2081
  bool use_mlock,
1928
2082
  llama_progress_callback progress_callback,
1929
2083
  void * progress_callback_user_data) {
@@ -1962,11 +2116,9 @@ static void llm_load_tensors(
1962
2116
  }
1963
2117
 
1964
2118
  (void) main_gpu;
1965
- (void) mul_mat_q;
1966
- #if defined(GGML_USE_CUBLAS)
2119
+ #ifdef GGML_USE_CUBLAS
1967
2120
  LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
1968
2121
  ggml_cuda_set_main_device(main_gpu);
1969
- ggml_cuda_set_mul_mat_q(mul_mat_q);
1970
2122
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1971
2123
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1972
2124
  #elif defined(GGML_USE_CLBLAST)
@@ -2001,9 +2153,9 @@ static void llm_load_tensors(
2001
2153
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2002
2154
  // on Windows however this is detrimental unless everything is on the GPU
2003
2155
  #ifndef _WIN32
2004
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2156
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2005
2157
  #else
2006
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2158
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2007
2159
  #endif // _WIN32
2008
2160
 
2009
2161
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2067,9 +2219,9 @@ static void llm_load_tensors(
2067
2219
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2068
2220
  // on Windows however this is detrimental unless everything is on the GPU
2069
2221
  #ifndef _WIN32
2070
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2222
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2071
2223
  #else
2072
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2224
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2073
2225
  #endif // _WIN32
2074
2226
 
2075
2227
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2137,9 +2289,9 @@ static void llm_load_tensors(
2137
2289
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2138
2290
  // on Windows however this is detrimental unless everything is on the GPU
2139
2291
  #ifndef _WIN32
2140
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2292
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2141
2293
  #else
2142
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2294
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2143
2295
  #endif // _WIN32
2144
2296
 
2145
2297
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2214,9 +2366,9 @@ static void llm_load_tensors(
2214
2366
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2215
2367
  // on Windows however this is detrimental unless everything is on the GPU
2216
2368
  #ifndef _WIN32
2217
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2369
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2218
2370
  #else
2219
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2371
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2220
2372
  #endif // _WIN32
2221
2373
 
2222
2374
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2281,27 +2433,19 @@ static void llm_load_tensors(
2281
2433
  } break;
2282
2434
  default:
2283
2435
  throw std::runtime_error("unknown architecture");
2284
- };
2436
+ }
2285
2437
  }
2286
2438
 
2287
2439
  ml.done_getting_tensors();
2288
2440
 
2289
2441
  // print memory requirements
2290
2442
  {
2291
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
2292
-
2293
2443
  // this is the total memory required to run the inference
2294
2444
  size_t mem_required =
2295
2445
  ctx_size +
2296
2446
  mmapped_size - vram_weights; // weights in VRAM not in memory
2297
2447
 
2298
- // this is the memory required by one llama_state
2299
- const size_t mem_required_state = scale*hparams.kv_size();
2300
-
2301
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
2302
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
2303
-
2304
- (void) n_batch;
2448
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2305
2449
 
2306
2450
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2307
2451
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2310,36 +2454,17 @@ static void llm_load_tensors(
2310
2454
  if (n_gpu_layers > (int) hparams.n_layer) {
2311
2455
  LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2312
2456
  }
2313
- size_t vram_kv_cache = 0;
2314
2457
 
2315
2458
  #ifdef GGML_USE_CUBLAS
2316
2459
  const int max_backend_supported_layers = hparams.n_layer + 3;
2317
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
2318
- if (n_gpu_layers > (int) hparams.n_layer + 1) {
2319
- if (low_vram) {
2320
- LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
2321
- } else {
2322
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
2323
- vram_kv_cache += hparams.kv_size() / 2;
2324
- }
2325
- }
2326
- if (n_gpu_layers > (int) hparams.n_layer + 2) {
2327
- if (low_vram) {
2328
- LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
2329
- } else {
2330
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
2331
- vram_kv_cache += hparams.kv_size() / 2;
2332
- }
2333
- }
2460
+ const int max_offloadable_layers = hparams.n_layer + 3;
2334
2461
  #elif defined(GGML_USE_CLBLAST)
2335
2462
  const int max_backend_supported_layers = hparams.n_layer + 1;
2336
2463
  const int max_offloadable_layers = hparams.n_layer + 1;
2337
2464
  #endif // GGML_USE_CUBLAS
2338
2465
 
2339
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
2340
- __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2341
- LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
2342
- __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
2466
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2467
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2343
2468
  #else
2344
2469
  (void) n_gpu_layers;
2345
2470
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2352,7 +2477,7 @@ static void llm_load_tensors(
2352
2477
  }
2353
2478
 
2354
2479
  (void) tensor_split;
2355
- #if defined(GGML_USE_CUBLAS)
2480
+ #ifdef GGML_USE_CUBLAS
2356
2481
  {
2357
2482
  ggml_cuda_set_tensor_split(tensor_split);
2358
2483
  }
@@ -2374,29 +2499,24 @@ static void llm_load_tensors(
2374
2499
  static bool llama_model_load(
2375
2500
  const std::string & fname,
2376
2501
  llama_model & model,
2377
- int n_ctx,
2378
- int n_batch,
2379
2502
  int n_gpu_layers,
2380
2503
  int main_gpu,
2381
2504
  const float * tensor_split,
2382
- const bool mul_mat_q,
2383
- float rope_freq_base,
2384
- float rope_freq_scale,
2385
- bool low_vram,
2386
- ggml_type memory_type,
2387
2505
  bool use_mmap,
2388
2506
  bool use_mlock,
2389
2507
  bool vocab_only,
2390
2508
  llama_progress_callback progress_callback,
2391
2509
  void *progress_callback_user_data) {
2392
2510
  try {
2393
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
2511
+ llama_model_loader ml(fname, use_mmap);
2512
+
2513
+ model.hparams.vocab_only = vocab_only;
2394
2514
 
2395
- llm_load_arch (*ml, model);
2396
- llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
2397
- llm_load_vocab (*ml, model);
2515
+ llm_load_arch (ml, model);
2516
+ llm_load_hparams(ml, model);
2517
+ llm_load_vocab (ml, model);
2398
2518
 
2399
- llm_load_print_meta(*ml, model);
2519
+ llm_load_print_meta(ml, model);
2400
2520
 
2401
2521
  if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2402
2522
  throw std::runtime_error("vocab size mismatch");
@@ -2408,8 +2528,8 @@ static bool llama_model_load(
2408
2528
  }
2409
2529
 
2410
2530
  llm_load_tensors(
2411
- *ml, model, n_batch, n_gpu_layers,
2412
- main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
2531
+ ml, model, n_gpu_layers,
2532
+ main_gpu, tensor_split,
2413
2533
  use_mlock, progress_callback, progress_callback_user_data);
2414
2534
  } catch (const std::exception & err) {
2415
2535
  LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2421,17 +2541,10 @@ static bool llama_model_load(
2421
2541
 
2422
2542
  static struct ggml_cgraph * llm_build_llama(
2423
2543
  llama_context & lctx,
2424
- const llama_token * tokens,
2425
- const float * embd,
2426
- int n_tokens,
2427
- int n_past) {
2428
-
2429
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2430
-
2431
- const int N = n_tokens;
2432
-
2544
+ const llama_batch & batch) {
2433
2545
  const auto & model = lctx.model;
2434
2546
  const auto & hparams = model.hparams;
2547
+ const auto & cparams = lctx.cparams;
2435
2548
 
2436
2549
  const auto & kv_self = lctx.kv_self;
2437
2550
 
@@ -2439,7 +2552,7 @@ static struct ggml_cgraph * llm_build_llama(
2439
2552
 
2440
2553
  const int64_t n_embd = hparams.n_embd;
2441
2554
  const int64_t n_layer = hparams.n_layer;
2442
- const int64_t n_ctx = hparams.n_ctx;
2555
+ const int64_t n_ctx = cparams.n_ctx;
2443
2556
  const int64_t n_head = hparams.n_head;
2444
2557
  const int64_t n_head_kv = hparams.n_head_kv;
2445
2558
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2447,12 +2560,20 @@ static struct ggml_cgraph * llm_build_llama(
2447
2560
 
2448
2561
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2449
2562
 
2450
- const float freq_base = hparams.rope_freq_base;
2451
- const float freq_scale = hparams.rope_freq_scale;
2563
+ const float freq_base = cparams.rope_freq_base;
2564
+ const float freq_scale = cparams.rope_freq_scale;
2452
2565
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2453
2566
 
2454
2567
  const int n_gpu_layers = model.n_gpu_layers;
2455
2568
 
2569
+ const int32_t n_tokens = batch.n_tokens;
2570
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
2571
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
2572
+
2573
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
2574
+
2575
+ //printf("n_kv = %d\n", n_kv);
2576
+
2456
2577
  auto & buf_compute = lctx.buf_compute;
2457
2578
 
2458
2579
  struct ggml_init_params params = {
@@ -2470,12 +2591,12 @@ static struct ggml_cgraph * llm_build_llama(
2470
2591
  struct ggml_tensor * cur;
2471
2592
  struct ggml_tensor * inpL;
2472
2593
 
2473
- if (tokens) {
2474
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2594
+ if (batch.token) {
2595
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2475
2596
 
2476
2597
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2477
2598
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2478
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2599
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2479
2600
  }
2480
2601
  ggml_set_name(inp_tokens, "inp_tokens");
2481
2602
 
@@ -2485,11 +2606,11 @@ static struct ggml_cgraph * llm_build_llama(
2485
2606
  GGML_ASSERT(false && "not implemented");
2486
2607
  #endif
2487
2608
 
2488
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2609
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2489
2610
 
2490
2611
  ggml_allocr_alloc(lctx.alloc, inpL);
2491
2612
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2492
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
2613
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2493
2614
  }
2494
2615
  }
2495
2616
 
@@ -2498,9 +2619,6 @@ static struct ggml_cgraph * llm_build_llama(
2498
2619
 
2499
2620
  // offload functions set the tensor output backend to GPU
2500
2621
  // tensors are GPU-accelerated if any input or the output has been offloaded
2501
- //
2502
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2503
- // in that case ggml_cuda_assign_buffers has no effect
2504
2622
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2505
2623
  offload_func_t offload_func_kq = llama_nop;
2506
2624
  offload_func_t offload_func_v = llama_nop;
@@ -2517,12 +2635,75 @@ static struct ggml_cgraph * llm_build_llama(
2517
2635
  }
2518
2636
  #endif // GGML_USE_CUBLAS
2519
2637
 
2638
+ // KQ_scale
2520
2639
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2640
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2521
2641
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2522
2642
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2523
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2643
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
2644
+ }
2645
+
2646
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2647
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2648
+ offload_func_kq(KQ_mask);
2649
+ ggml_set_name(KQ_mask, "KQ_mask");
2650
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
2651
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2652
+ float * data = (float *) KQ_mask->data;
2653
+ memset(data, 0, ggml_nbytes(KQ_mask));
2654
+
2655
+ for (int h = 0; h < 1; ++h) {
2656
+ for (int j = 0; j < n_tokens; ++j) {
2657
+ const llama_pos pos = batch.pos[j];
2658
+ const llama_seq_id seq_id = batch.seq_id[j];
2659
+
2660
+ for (int i = 0; i < n_kv; ++i) {
2661
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
2662
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
2663
+ }
2664
+ }
2665
+ }
2666
+ }
2667
+ }
2668
+
2669
+ // KQ_pos - contains the positions
2670
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2671
+ offload_func_kq(KQ_pos);
2672
+ ggml_set_name(KQ_pos, "KQ_pos");
2673
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
2674
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2675
+ int * data = (int *) KQ_pos->data;
2676
+ for (int i = 0; i < n_tokens; ++i) {
2677
+ data[i] = batch.pos[i];
2678
+ }
2679
+ }
2680
+
2681
+ // shift the entire K-cache if needed
2682
+ if (do_rope_shift) {
2683
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
2684
+ offload_func_kq(K_shift);
2685
+ ggml_set_name(K_shift, "K_shift");
2686
+ ggml_allocr_alloc(lctx.alloc, K_shift);
2687
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2688
+ int * data = (int *) K_shift->data;
2689
+ for (int i = 0; i < n_ctx; ++i) {
2690
+ data[i] = kv_self.cells[i].delta;
2691
+ }
2692
+ }
2693
+
2694
+ for (int il = 0; il < n_layer; ++il) {
2695
+ struct ggml_tensor * tmp =
2696
+ ggml_rope_custom_inplace(ctx0,
2697
+ ggml_view_3d(ctx0, kv_self.k,
2698
+ n_embd_head, n_head_kv, n_ctx,
2699
+ ggml_element_size(kv_self.k)*n_embd_head,
2700
+ ggml_element_size(kv_self.k)*n_embd_gqa,
2701
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
2702
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
2703
+ offload_func_kq(tmp);
2704
+ ggml_build_forward_expand(gf, tmp);
2705
+ }
2524
2706
  }
2525
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2526
2707
 
2527
2708
  for (int il = 0; il < n_layer; ++il) {
2528
2709
  ggml_format_name(inpL, "layer_inp_%d", il);
@@ -2560,33 +2741,33 @@ static struct ggml_cgraph * llm_build_llama(
2560
2741
  offload_func_kq(tmpq);
2561
2742
  ggml_set_name(tmpq, "tmpq");
2562
2743
 
2563
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2744
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2564
2745
  offload_func_kq(Kcur);
2565
2746
  ggml_set_name(Kcur, "Kcur");
2566
2747
 
2567
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2748
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2568
2749
  offload_func_kq(Qcur);
2569
2750
  ggml_set_name(Qcur, "Qcur");
2570
2751
 
2571
2752
  // store key and value to memory
2572
2753
  {
2573
- // compute the transposed [N, n_embd] V matrix
2754
+ // compute the transposed [n_tokens, n_embd] V matrix
2574
2755
 
2575
2756
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2576
2757
  offload_func_v(tmpv);
2577
2758
  ggml_set_name(tmpv, "tmpv");
2578
2759
 
2579
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2760
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2580
2761
  offload_func_v(Vcur);
2581
2762
  ggml_set_name(Vcur, "Vcur");
2582
2763
 
2583
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2764
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2584
2765
  offload_func_kq(k);
2585
2766
  ggml_set_name(k, "k");
2586
2767
 
2587
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2768
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2588
2769
  ( n_ctx)*ggml_element_size(kv_self.v),
2589
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2770
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2590
2771
  offload_func_v(v);
2591
2772
  ggml_set_name(v, "v");
2592
2773
 
@@ -2601,7 +2782,7 @@ static struct ggml_cgraph * llm_build_llama(
2601
2782
 
2602
2783
  struct ggml_tensor * K =
2603
2784
  ggml_view_3d(ctx0, kv_self.k,
2604
- n_embd_head, n_past + N, n_head_kv,
2785
+ n_embd_head, n_kv, n_head_kv,
2605
2786
  ggml_element_size(kv_self.k)*n_embd_gqa,
2606
2787
  ggml_element_size(kv_self.k)*n_embd_head,
2607
2788
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2614,25 +2795,25 @@ static struct ggml_cgraph * llm_build_llama(
2614
2795
  ggml_set_name(KQ, "KQ");
2615
2796
 
2616
2797
  // KQ_scaled = KQ / sqrt(n_embd_head)
2617
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2618
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2798
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
2799
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2619
2800
  offload_func_kq(KQ_scaled);
2620
2801
  ggml_set_name(KQ_scaled, "KQ_scaled");
2621
2802
 
2622
2803
  // KQ_masked = mask_past(KQ_scaled)
2623
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2804
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2624
2805
  offload_func_kq(KQ_masked);
2625
2806
  ggml_set_name(KQ_masked, "KQ_masked");
2626
2807
 
2627
2808
  // KQ = soft_max(KQ_masked)
2628
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2809
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2629
2810
  offload_func_v(KQ_soft_max);
2630
2811
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2631
2812
 
2632
2813
  // split cached V into n_head heads
2633
2814
  struct ggml_tensor * V =
2634
2815
  ggml_view_3d(ctx0, kv_self.v,
2635
- n_past + N, n_embd_head, n_head_kv,
2816
+ n_kv, n_embd_head, n_head_kv,
2636
2817
  ggml_element_size(kv_self.v)*n_ctx,
2637
2818
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2638
2819
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -2647,7 +2828,7 @@ static struct ggml_cgraph * llm_build_llama(
2647
2828
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2648
2829
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2649
2830
  // is there a better way?
2650
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2831
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
2651
2832
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2652
2833
  #endif
2653
2834
 
@@ -2656,10 +2837,8 @@ static struct ggml_cgraph * llm_build_llama(
2656
2837
  offload_func_v(KQV_merged);
2657
2838
  ggml_set_name(KQV_merged, "KQV_merged");
2658
2839
 
2659
- // cur = KQV_merged.contiguous().view(n_embd, N)
2660
- cur = ggml_cpy(ctx0,
2661
- KQV_merged,
2662
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2840
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
2841
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
2663
2842
  offload_func_v(cur);
2664
2843
  ggml_set_name(cur, "KQV_merged_contiguous");
2665
2844
 
@@ -2750,20 +2929,12 @@ static struct ggml_cgraph * llm_build_llama(
2750
2929
  return gf;
2751
2930
  }
2752
2931
 
2753
-
2754
2932
  static struct ggml_cgraph * llm_build_baichaun(
2755
2933
  llama_context & lctx,
2756
- const llama_token * tokens,
2757
- const float * embd,
2758
- int n_tokens,
2759
- int n_past) {
2760
-
2761
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2762
-
2763
- const int N = n_tokens;
2764
-
2934
+ const llama_batch & batch) {
2765
2935
  const auto & model = lctx.model;
2766
2936
  const auto & hparams = model.hparams;
2937
+ const auto & cparams = lctx.cparams;
2767
2938
 
2768
2939
  const auto & kv_self = lctx.kv_self;
2769
2940
 
@@ -2771,7 +2942,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2771
2942
 
2772
2943
  const int64_t n_embd = hparams.n_embd;
2773
2944
  const int64_t n_layer = hparams.n_layer;
2774
- const int64_t n_ctx = hparams.n_ctx;
2945
+ const int64_t n_ctx = cparams.n_ctx;
2775
2946
  const int64_t n_head = hparams.n_head;
2776
2947
  const int64_t n_head_kv = hparams.n_head_kv;
2777
2948
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2779,12 +2950,18 @@ static struct ggml_cgraph * llm_build_baichaun(
2779
2950
 
2780
2951
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2781
2952
 
2782
- const float freq_base = hparams.rope_freq_base;
2783
- const float freq_scale = hparams.rope_freq_scale;
2953
+ const float freq_base = cparams.rope_freq_base;
2954
+ const float freq_scale = cparams.rope_freq_scale;
2784
2955
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2785
2956
 
2786
2957
  const int n_gpu_layers = model.n_gpu_layers;
2787
2958
 
2959
+ const int32_t n_tokens = batch.n_tokens;
2960
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
2961
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
2962
+
2963
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
2964
+
2788
2965
  auto & buf_compute = lctx.buf_compute;
2789
2966
 
2790
2967
  struct ggml_init_params params = {
@@ -2802,12 +2979,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2802
2979
  struct ggml_tensor * cur;
2803
2980
  struct ggml_tensor * inpL;
2804
2981
 
2805
- if (tokens) {
2806
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2982
+ if (batch.token) {
2983
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2807
2984
 
2808
2985
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2809
2986
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2810
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2987
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2811
2988
  }
2812
2989
  ggml_set_name(inp_tokens, "inp_tokens");
2813
2990
 
@@ -2817,11 +2994,11 @@ static struct ggml_cgraph * llm_build_baichaun(
2817
2994
  GGML_ASSERT(false && "not implemented");
2818
2995
  #endif
2819
2996
 
2820
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2997
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2821
2998
 
2822
2999
  ggml_allocr_alloc(lctx.alloc, inpL);
2823
3000
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2824
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3001
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2825
3002
  }
2826
3003
  }
2827
3004
 
@@ -2830,9 +3007,6 @@ static struct ggml_cgraph * llm_build_baichaun(
2830
3007
 
2831
3008
  // offload functions set the tensor output backend to GPU
2832
3009
  // tensors are GPU-accelerated if any input or the output has been offloaded
2833
- //
2834
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2835
- // in that case ggml_cuda_assign_buffers has no effect
2836
3010
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2837
3011
  offload_func_t offload_func_kq = llama_nop;
2838
3012
  offload_func_t offload_func_v = llama_nop;
@@ -2849,12 +3023,75 @@ static struct ggml_cgraph * llm_build_baichaun(
2849
3023
  }
2850
3024
  #endif // GGML_USE_CUBLAS
2851
3025
 
3026
+ // KQ_scale
2852
3027
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3028
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2853
3029
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2854
3030
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2855
3031
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2856
3032
  }
2857
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3033
+
3034
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3035
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3036
+ offload_func_kq(KQ_mask);
3037
+ ggml_set_name(KQ_mask, "KQ_mask");
3038
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3039
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3040
+ float * data = (float *) KQ_mask->data;
3041
+ memset(data, 0, ggml_nbytes(KQ_mask));
3042
+
3043
+ for (int h = 0; h < 1; ++h) {
3044
+ for (int j = 0; j < n_tokens; ++j) {
3045
+ const llama_pos pos = batch.pos[j];
3046
+ const llama_seq_id seq_id = batch.seq_id[j];
3047
+
3048
+ for (int i = 0; i < n_kv; ++i) {
3049
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3050
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3051
+ }
3052
+ }
3053
+ }
3054
+ }
3055
+ }
3056
+
3057
+ // KQ_pos - contains the positions
3058
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3059
+ offload_func_kq(KQ_pos);
3060
+ ggml_set_name(KQ_pos, "KQ_pos");
3061
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3062
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3063
+ int * data = (int *) KQ_pos->data;
3064
+ for (int i = 0; i < n_tokens; ++i) {
3065
+ data[i] = batch.pos[i];
3066
+ }
3067
+ }
3068
+
3069
+ // shift the entire K-cache if needed
3070
+ if (do_rope_shift) {
3071
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3072
+ offload_func_kq(K_shift);
3073
+ ggml_set_name(K_shift, "K_shift");
3074
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3075
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3076
+ int * data = (int *) K_shift->data;
3077
+ for (int i = 0; i < n_ctx; ++i) {
3078
+ data[i] = kv_self.cells[i].delta;
3079
+ }
3080
+ }
3081
+
3082
+ for (int il = 0; il < n_layer; ++il) {
3083
+ struct ggml_tensor * tmp =
3084
+ ggml_rope_custom_inplace(ctx0,
3085
+ ggml_view_3d(ctx0, kv_self.k,
3086
+ n_embd_head, n_head_kv, n_ctx,
3087
+ ggml_element_size(kv_self.k)*n_embd_head,
3088
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3089
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3090
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
3091
+ offload_func_kq(tmp);
3092
+ ggml_build_forward_expand(gf, tmp);
3093
+ }
3094
+ }
2858
3095
 
2859
3096
  for (int il = 0; il < n_layer; ++il) {
2860
3097
  ggml_format_name(inpL, "layer_inp_%d", il);
@@ -2896,12 +3133,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2896
3133
  struct ggml_tensor * Qcur;
2897
3134
  switch (model.type) {
2898
3135
  case MODEL_7B:
2899
- Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2900
- Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
3136
+ Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
3137
+ Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2901
3138
  break;
2902
3139
  case MODEL_13B:
2903
- Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2904
- Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
3140
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
3141
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
2905
3142
  break;
2906
3143
  default:
2907
3144
  GGML_ASSERT(false);
@@ -2915,23 +3152,23 @@ static struct ggml_cgraph * llm_build_baichaun(
2915
3152
 
2916
3153
  // store key and value to memory
2917
3154
  {
2918
- // compute the transposed [N, n_embd] V matrix
3155
+ // compute the transposed [n_tokens, n_embd] V matrix
2919
3156
 
2920
3157
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2921
3158
  offload_func_v(tmpv);
2922
3159
  ggml_set_name(tmpv, "tmpv");
2923
3160
 
2924
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
3161
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2925
3162
  offload_func_v(Vcur);
2926
3163
  ggml_set_name(Vcur, "Vcur");
2927
3164
 
2928
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3165
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2929
3166
  offload_func_kq(k);
2930
3167
  ggml_set_name(k, "k");
2931
3168
 
2932
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3169
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2933
3170
  ( n_ctx)*ggml_element_size(kv_self.v),
2934
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3171
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2935
3172
  offload_func_v(v);
2936
3173
  ggml_set_name(v, "v");
2937
3174
 
@@ -2946,7 +3183,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2946
3183
 
2947
3184
  struct ggml_tensor * K =
2948
3185
  ggml_view_3d(ctx0, kv_self.k,
2949
- n_embd_head, n_past + N, n_head_kv,
3186
+ n_embd_head, n_kv, n_head_kv,
2950
3187
  ggml_element_size(kv_self.k)*n_embd_gqa,
2951
3188
  ggml_element_size(kv_self.k)*n_embd_head,
2952
3189
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2959,8 +3196,8 @@ static struct ggml_cgraph * llm_build_baichaun(
2959
3196
  ggml_set_name(KQ, "KQ");
2960
3197
 
2961
3198
  // KQ_scaled = KQ / sqrt(n_embd_head)
2962
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2963
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3199
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3200
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2964
3201
  offload_func_kq(KQ_scaled);
2965
3202
  ggml_set_name(KQ_scaled, "KQ_scaled");
2966
3203
 
@@ -2969,58 +3206,44 @@ static struct ggml_cgraph * llm_build_baichaun(
2969
3206
 
2970
3207
  switch (model.type) {
2971
3208
  case MODEL_7B:
2972
- KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3209
+ KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2973
3210
  break;
2974
3211
  case MODEL_13B:
2975
- KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
3212
+ // TODO: replace with ggml_add()
3213
+ KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
2976
3214
  ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2977
- KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
3215
+ KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
2978
3216
  break;
2979
3217
  default:
2980
3218
  GGML_ASSERT(false);
2981
3219
  }
2982
- // KQ_masked = mask_past(KQ_scaled)
2983
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2984
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2985
- // offload_func_kq(KQ_masked);
2986
- // ggml_set_name(KQ_masked, "KQ_masked");
2987
3220
 
2988
3221
  // KQ = soft_max(KQ_masked)
2989
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3222
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2990
3223
  offload_func_v(KQ_soft_max);
2991
3224
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2992
3225
 
2993
3226
  // split cached V into n_head heads
2994
3227
  struct ggml_tensor * V =
2995
3228
  ggml_view_3d(ctx0, kv_self.v,
2996
- n_past + N, n_embd_head, n_head_kv,
3229
+ n_kv, n_embd_head, n_head_kv,
2997
3230
  ggml_element_size(kv_self.v)*n_ctx,
2998
3231
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2999
3232
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3000
3233
  offload_func_v(V);
3001
3234
  ggml_set_name(V, "V");
3002
3235
 
3003
- #if 1
3004
3236
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3005
3237
  offload_func_v(KQV);
3006
3238
  ggml_set_name(KQV, "KQV");
3007
- #else
3008
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3009
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3010
- // is there a better way?
3011
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
3012
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
3013
- #endif
3014
3239
 
3015
3240
  // KQV_merged = KQV.permute(0, 2, 1, 3)
3016
3241
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3017
3242
  offload_func_v(KQV_merged);
3018
3243
  ggml_set_name(KQV_merged, "KQV_merged");
3019
3244
 
3020
- // cur = KQV_merged.contiguous().view(n_embd, N)
3021
- cur = ggml_cpy(ctx0,
3022
- KQV_merged,
3023
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3245
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3246
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3024
3247
  offload_func_v(cur);
3025
3248
  ggml_set_name(cur, "KQV_merged_contiguous");
3026
3249
 
@@ -3113,17 +3336,10 @@ static struct ggml_cgraph * llm_build_baichaun(
3113
3336
 
3114
3337
  static struct ggml_cgraph * llm_build_falcon(
3115
3338
  llama_context & lctx,
3116
- const llama_token * tokens,
3117
- const float * embd,
3118
- int n_tokens,
3119
- int n_past) {
3120
-
3121
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3122
-
3123
- const int N = n_tokens;
3124
-
3339
+ const llama_batch & batch) {
3125
3340
  const auto & model = lctx.model;
3126
3341
  const auto & hparams = model.hparams;
3342
+ const auto & cparams = lctx.cparams;
3127
3343
 
3128
3344
  const auto & kv_self = lctx.kv_self;
3129
3345
 
@@ -3131,7 +3347,7 @@ static struct ggml_cgraph * llm_build_falcon(
3131
3347
 
3132
3348
  const int64_t n_embd = hparams.n_embd;
3133
3349
  const int64_t n_layer = hparams.n_layer;
3134
- const int64_t n_ctx = hparams.n_ctx;
3350
+ const int64_t n_ctx = cparams.n_ctx;
3135
3351
  const int64_t n_head = hparams.n_head;
3136
3352
  const int64_t n_head_kv = hparams.n_head_kv;
3137
3353
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -3139,12 +3355,21 @@ static struct ggml_cgraph * llm_build_falcon(
3139
3355
 
3140
3356
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3141
3357
 
3142
- const float freq_base = hparams.rope_freq_base;
3143
- const float freq_scale = hparams.rope_freq_scale;
3358
+ const float freq_base = cparams.rope_freq_base;
3359
+ const float freq_scale = cparams.rope_freq_scale;
3144
3360
  const float norm_eps = hparams.f_norm_eps;
3145
3361
 
3146
3362
  const int n_gpu_layers = model.n_gpu_layers;
3147
3363
 
3364
+ const int32_t n_tokens = batch.n_tokens;
3365
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3366
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3367
+
3368
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3369
+
3370
+ //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
3371
+ // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
3372
+
3148
3373
  auto & buf_compute = lctx.buf_compute;
3149
3374
 
3150
3375
  struct ggml_init_params params = {
@@ -3162,12 +3387,12 @@ static struct ggml_cgraph * llm_build_falcon(
3162
3387
  struct ggml_tensor * cur;
3163
3388
  struct ggml_tensor * inpL;
3164
3389
 
3165
- if (tokens) {
3166
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3390
+ if (batch.token) {
3391
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3167
3392
 
3168
3393
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3169
3394
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3170
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3395
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3171
3396
  }
3172
3397
  ggml_set_name(inp_tokens, "inp_tokens");
3173
3398
 
@@ -3177,11 +3402,11 @@ static struct ggml_cgraph * llm_build_falcon(
3177
3402
  GGML_ASSERT(false && "not implemented");
3178
3403
  #endif
3179
3404
 
3180
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3405
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3181
3406
 
3182
3407
  ggml_allocr_alloc(lctx.alloc, inpL);
3183
3408
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3184
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3409
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3185
3410
  }
3186
3411
  }
3187
3412
 
@@ -3190,9 +3415,6 @@ static struct ggml_cgraph * llm_build_falcon(
3190
3415
 
3191
3416
  // offload functions set the tensor output backend to GPU
3192
3417
  // tensors are GPU-accelerated if any input or the output has been offloaded
3193
- //
3194
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
3195
- // in that case ggml_cuda_assign_buffers has no effect
3196
3418
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3197
3419
  offload_func_t offload_func_kq = llama_nop;
3198
3420
  offload_func_t offload_func_v = llama_nop;
@@ -3209,12 +3431,75 @@ static struct ggml_cgraph * llm_build_falcon(
3209
3431
  }
3210
3432
  #endif // GGML_USE_CUBLAS
3211
3433
 
3434
+ // KQ_scale
3212
3435
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3436
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3213
3437
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3214
3438
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3215
3439
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3216
3440
  }
3217
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3441
+
3442
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3443
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3444
+ offload_func_kq(KQ_mask);
3445
+ ggml_set_name(KQ_mask, "KQ_mask");
3446
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3447
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3448
+ float * data = (float *) KQ_mask->data;
3449
+ memset(data, 0, ggml_nbytes(KQ_mask));
3450
+
3451
+ for (int h = 0; h < 1; ++h) {
3452
+ for (int j = 0; j < n_tokens; ++j) {
3453
+ const llama_pos pos = batch.pos[j];
3454
+ const llama_seq_id seq_id = batch.seq_id[j];
3455
+
3456
+ for (int i = 0; i < n_kv; ++i) {
3457
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3458
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3459
+ }
3460
+ }
3461
+ }
3462
+ }
3463
+ }
3464
+
3465
+ // KQ_pos - contains the positions
3466
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3467
+ offload_func_kq(KQ_pos);
3468
+ ggml_set_name(KQ_pos, "KQ_pos");
3469
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3470
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3471
+ int * data = (int *) KQ_pos->data;
3472
+ for (int i = 0; i < n_tokens; ++i) {
3473
+ data[i] = batch.pos[i];
3474
+ }
3475
+ }
3476
+
3477
+ // shift the entire K-cache if needed
3478
+ if (do_rope_shift) {
3479
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3480
+ offload_func_kq(K_shift);
3481
+ ggml_set_name(K_shift, "K_shift");
3482
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3483
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3484
+ int * data = (int *) K_shift->data;
3485
+ for (int i = 0; i < n_ctx; ++i) {
3486
+ data[i] = kv_self.cells[i].delta;
3487
+ }
3488
+ }
3489
+
3490
+ for (int il = 0; il < n_layer; ++il) {
3491
+ struct ggml_tensor * tmp =
3492
+ ggml_rope_custom_inplace(ctx0,
3493
+ ggml_view_3d(ctx0, kv_self.k,
3494
+ n_embd_head, n_head_kv, n_ctx,
3495
+ ggml_element_size(kv_self.k)*n_embd_head,
3496
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3497
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3498
+ K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
3499
+ offload_func_kq(tmp);
3500
+ ggml_build_forward_expand(gf, tmp);
3501
+ }
3502
+ }
3218
3503
 
3219
3504
  for (int il = 0; il < n_layer; ++il) {
3220
3505
  struct ggml_tensor * attn_norm;
@@ -3271,45 +3556,45 @@ static struct ggml_cgraph * llm_build_falcon(
3271
3556
  // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
3272
3557
  // non-contiguous views is added for the rope operator
3273
3558
  struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
3274
- ctx0, cur, n_embd_head, n_head, N,
3559
+ ctx0, cur, n_embd_head, n_head, n_tokens,
3275
3560
  wsize * n_embd_head,
3276
3561
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3277
3562
  0));
3278
3563
  offload_func_kq(tmpq);
3279
3564
 
3280
3565
  struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3281
- ctx0, cur, n_embd_head, n_head_kv, N,
3566
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3282
3567
  wsize * n_embd_head,
3283
3568
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3284
3569
  wsize * n_embd_head * n_head));
3285
3570
  offload_func_kq(tmpk);
3286
3571
 
3287
3572
  struct ggml_tensor * tmpv = ggml_view_3d(
3288
- ctx0, cur, n_embd_head, n_head_kv, N,
3573
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3289
3574
  wsize * n_embd_head,
3290
3575
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3291
3576
  wsize * n_embd_head * (n_head + n_head_kv));
3292
3577
  offload_func_v(tmpv);
3293
3578
 
3294
3579
  // using mode = 2 for neox mode
3295
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3580
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3296
3581
  offload_func_kq(Qcur);
3297
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3582
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3298
3583
  offload_func_kq(Kcur);
3299
3584
 
3300
3585
  {
3301
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
3586
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3302
3587
  offload_func_v(Vcur);
3303
3588
  offload_func_v(Vcur->src[0]->src[0]);
3304
3589
  ggml_set_name(Vcur, "Vcur");
3305
3590
 
3306
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3591
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3307
3592
  offload_func_kq(k);
3308
3593
  ggml_set_name(k, "k");
3309
3594
 
3310
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3595
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3311
3596
  ( n_ctx)*ggml_element_size(kv_self.v),
3312
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3597
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3313
3598
  offload_func_v(v);
3314
3599
 
3315
3600
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -3322,7 +3607,7 @@ static struct ggml_cgraph * llm_build_falcon(
3322
3607
 
3323
3608
  struct ggml_tensor * K =
3324
3609
  ggml_view_3d(ctx0, kv_self.k,
3325
- n_embd_head, n_past + N, n_head_kv,
3610
+ n_embd_head, n_kv, n_head_kv,
3326
3611
  ggml_element_size(kv_self.k)*n_embd_gqa,
3327
3612
  ggml_element_size(kv_self.k)*n_embd_head,
3328
3613
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3333,21 +3618,21 @@ static struct ggml_cgraph * llm_build_falcon(
3333
3618
  offload_func_kq(KQ);
3334
3619
  ggml_set_name(KQ, "KQ");
3335
3620
 
3336
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3621
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3337
3622
  offload_func_kq(KQ_scaled);
3338
3623
  ggml_set_name(KQ_scaled, "KQ_scaled");
3339
3624
 
3340
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3625
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3341
3626
  offload_func_kq(KQ_masked);
3342
3627
  ggml_set_name(KQ_masked, "KQ_masked");
3343
3628
 
3344
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3629
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3345
3630
  offload_func_v(KQ_soft_max);
3346
3631
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3347
3632
 
3348
3633
  struct ggml_tensor * V =
3349
3634
  ggml_view_3d(ctx0, kv_self.v,
3350
- n_past + N, n_embd_head, n_head_kv,
3635
+ n_kv, n_embd_head, n_head_kv,
3351
3636
  ggml_element_size(kv_self.v)*n_ctx,
3352
3637
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3353
3638
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3362,7 +3647,7 @@ static struct ggml_cgraph * llm_build_falcon(
3362
3647
  offload_func_v(KQV_merged);
3363
3648
  ggml_set_name(KQV_merged, "KQV_merged");
3364
3649
 
3365
- cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3650
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3366
3651
  offload_func_v(cur);
3367
3652
  ggml_set_name(cur, "KQV_merged_contiguous");
3368
3653
 
@@ -3420,17 +3705,10 @@ static struct ggml_cgraph * llm_build_falcon(
3420
3705
 
3421
3706
  static struct ggml_cgraph * llm_build_starcoder(
3422
3707
  llama_context & lctx,
3423
- const llama_token * tokens,
3424
- const float * embd,
3425
- int n_tokens,
3426
- int n_past) {
3427
-
3428
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3429
-
3430
- const int N = n_tokens;
3431
-
3708
+ const llama_batch & batch) {
3432
3709
  const auto & model = lctx.model;
3433
3710
  const auto & hparams = model.hparams;
3711
+ const auto & cparams = lctx.cparams;
3434
3712
 
3435
3713
  const auto & kv_self = lctx.kv_self;
3436
3714
 
@@ -3438,7 +3716,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3438
3716
 
3439
3717
  const int64_t n_embd = hparams.n_embd;
3440
3718
  const int64_t n_layer = hparams.n_layer;
3441
- const int64_t n_ctx = hparams.n_ctx;
3719
+ const int64_t n_ctx = cparams.n_ctx;
3442
3720
  const int64_t n_head = hparams.n_head;
3443
3721
  const int64_t n_head_kv = hparams.n_head_kv;
3444
3722
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -3446,7 +3724,11 @@ static struct ggml_cgraph * llm_build_starcoder(
3446
3724
 
3447
3725
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3448
3726
 
3449
- const float norm_eps = hparams.f_norm_eps;
3727
+ const float norm_eps = hparams.f_norm_eps;
3728
+
3729
+ const int32_t n_tokens = batch.n_tokens;
3730
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3731
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3450
3732
 
3451
3733
  auto & buf_compute = lctx.buf_compute;
3452
3734
 
@@ -3467,12 +3749,12 @@ static struct ggml_cgraph * llm_build_starcoder(
3467
3749
  struct ggml_tensor * position;
3468
3750
  struct ggml_tensor * inpL;
3469
3751
 
3470
- if (tokens) {
3471
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3752
+ if (batch.token) {
3753
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3472
3754
 
3473
3755
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3474
3756
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3475
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3757
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3476
3758
  }
3477
3759
  ggml_set_name(inp_tokens, "inp_tokens");
3478
3760
 
@@ -3482,21 +3764,21 @@ static struct ggml_cgraph * llm_build_starcoder(
3482
3764
  GGML_ASSERT(false && "not implemented");
3483
3765
  #endif
3484
3766
 
3485
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3767
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3486
3768
 
3487
3769
  ggml_allocr_alloc(lctx.alloc, token);
3488
3770
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3489
- memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
3771
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
3490
3772
  }
3491
3773
  }
3492
3774
 
3493
3775
  {
3494
3776
  // Compute position embeddings.
3495
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3777
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3496
3778
  ggml_allocr_alloc(lctx.alloc, inp_positions);
3497
3779
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3498
- for (int i = 0; i < N; ++i) {
3499
- ((int32_t *) inp_positions->data)[i] = n_past + i;
3780
+ for (int i = 0; i < n_tokens; ++i) {
3781
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
3500
3782
  }
3501
3783
  }
3502
3784
  ggml_set_name(inp_positions, "inp_positions");
@@ -3504,12 +3786,35 @@ static struct ggml_cgraph * llm_build_starcoder(
3504
3786
  position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3505
3787
  }
3506
3788
 
3789
+ // KQ_scale
3507
3790
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3791
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3508
3792
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3509
3793
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3510
3794
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3511
3795
  }
3512
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3796
+
3797
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3798
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3799
+ ggml_set_name(KQ_mask, "KQ_mask");
3800
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3801
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3802
+ float * data = (float *) KQ_mask->data;
3803
+ memset(data, 0, ggml_nbytes(KQ_mask));
3804
+
3805
+ for (int h = 0; h < 1; ++h) {
3806
+ for (int j = 0; j < n_tokens; ++j) {
3807
+ const llama_pos pos = batch.pos[j];
3808
+ const llama_seq_id seq_id = batch.seq_id[j];
3809
+
3810
+ for (int i = 0; i < n_kv; ++i) {
3811
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3812
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3813
+ }
3814
+ }
3815
+ }
3816
+ }
3817
+ }
3513
3818
 
3514
3819
  inpL = ggml_add(ctx0, token, position);
3515
3820
  ggml_set_name(inpL, "inpL");
@@ -3525,23 +3830,23 @@ static struct ggml_cgraph * llm_build_starcoder(
3525
3830
  // Self Attention
3526
3831
  cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
3527
3832
 
3528
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
3529
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
3530
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3833
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
3834
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
3835
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3531
3836
 
3532
3837
  struct ggml_tensor * Qcur = tmpq;
3533
3838
  struct ggml_tensor * Kcur = tmpk;
3534
3839
 
3535
3840
  {
3536
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
3841
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3537
3842
  ggml_set_name(Vcur, "Vcur");
3538
3843
 
3539
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3844
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3540
3845
  ggml_set_name(k, "k");
3541
3846
 
3542
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3847
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3543
3848
  ( n_ctx)*ggml_element_size(kv_self.v),
3544
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3849
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3545
3850
 
3546
3851
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3547
3852
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3551,13 +3856,13 @@ static struct ggml_cgraph * llm_build_starcoder(
3551
3856
  ggml_permute(ctx0,
3552
3857
  ggml_cpy(ctx0,
3553
3858
  Qcur,
3554
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
3859
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
3555
3860
  0, 2, 1, 3);
3556
3861
  ggml_set_name(Q, "Q");
3557
3862
 
3558
3863
  struct ggml_tensor * K =
3559
3864
  ggml_view_3d(ctx0, kv_self.k,
3560
- n_embd_head, n_past + N, n_head_kv,
3865
+ n_embd_head, n_kv, n_head_kv,
3561
3866
  ggml_element_size(kv_self.k)*n_embd_gqa,
3562
3867
  ggml_element_size(kv_self.k)*n_embd_head,
3563
3868
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3568,12 +3873,12 @@ static struct ggml_cgraph * llm_build_starcoder(
3568
3873
  ggml_set_name(KQ, "KQ");
3569
3874
 
3570
3875
  // KQ_scaled = KQ / sqrt(n_embd_head)
3571
- // KQ_scaled shape [n_past + N, N, n_head, 1]
3876
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3572
3877
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3573
3878
  ggml_set_name(KQ_scaled, "KQ_scaled");
3574
3879
 
3575
3880
  // KQ_masked = mask_past(KQ_scaled)
3576
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3881
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3577
3882
  ggml_set_name(KQ_masked, "KQ_masked");
3578
3883
 
3579
3884
  // KQ = soft_max(KQ_masked)
@@ -3583,7 +3888,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3583
3888
  // split cached V into n_head heads
3584
3889
  struct ggml_tensor * V =
3585
3890
  ggml_view_3d(ctx0, kv_self.v,
3586
- n_past + N, n_embd_head, n_head_kv,
3891
+ n_kv, n_embd_head, n_head_kv,
3587
3892
  ggml_element_size(kv_self.v)*n_ctx,
3588
3893
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3589
3894
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3596,10 +3901,8 @@ static struct ggml_cgraph * llm_build_starcoder(
3596
3901
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3597
3902
  ggml_set_name(KQV_merged, "KQV_merged");
3598
3903
 
3599
- // cur = KQV_merged.contiguous().view(n_embd, N)
3600
- cur = ggml_cpy(ctx0,
3601
- KQV_merged,
3602
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3904
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3905
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3603
3906
  ggml_set_name(cur, "KQV_merged_contiguous");
3604
3907
  }
3605
3908
 
@@ -3649,10 +3952,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3649
3952
 
3650
3953
  static struct ggml_cgraph * llama_build_graph(
3651
3954
  llama_context & lctx,
3652
- const llama_token * tokens,
3653
- const float * embd,
3654
- int n_tokens,
3655
- int n_past) {
3955
+ const llama_batch & batch) {
3656
3956
  const auto & model = lctx.model;
3657
3957
 
3658
3958
  struct ggml_cgraph * result = NULL;
@@ -3660,76 +3960,117 @@ static struct ggml_cgraph * llama_build_graph(
3660
3960
  switch (model.arch) {
3661
3961
  case LLM_ARCH_LLAMA:
3662
3962
  {
3663
- result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
3963
+ result = llm_build_llama(lctx, batch);
3664
3964
  } break;
3665
3965
  case LLM_ARCH_BAICHUAN:
3666
3966
  {
3667
- result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
3967
+ result = llm_build_baichaun(lctx, batch);
3668
3968
  } break;
3669
3969
  case LLM_ARCH_FALCON:
3670
3970
  {
3671
- result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
3971
+ result = llm_build_falcon(lctx, batch);
3672
3972
  } break;
3673
3973
  case LLM_ARCH_STARCODER:
3674
3974
  {
3675
- result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
3975
+ result = llm_build_starcoder(lctx, batch);
3676
3976
  } break;
3677
3977
  default:
3678
3978
  GGML_ASSERT(false);
3679
- };
3979
+ }
3680
3980
 
3681
3981
  return result;
3682
3982
  }
3683
3983
 
3684
- // evaluate the transformer
3984
+ // decode a batch of tokens by evaluating the transformer
3685
3985
  //
3686
3986
  // - lctx: llama context
3687
- // - tokens: new batch of tokens to process
3688
- // - embd embeddings input
3689
- // - n_tokens number of tokens
3690
- // - n_past: the context size so far
3987
+ // - batch: batch to evaluate
3691
3988
  // - n_threads: number of threads to use
3692
3989
  //
3693
- static bool llama_eval_internal(
3990
+ // return 0 on success
3991
+ // return positive int on warning
3992
+ // return negative int on error
3993
+ //
3994
+ static int llama_decode_internal(
3694
3995
  llama_context & lctx,
3695
- const llama_token * tokens,
3696
- const float * embd,
3697
- int n_tokens,
3698
- int n_past,
3699
- int n_threads,
3700
- const char * cgraph_fname) {
3996
+ llama_batch batch) {
3997
+ const uint32_t n_tokens = batch.n_tokens;
3701
3998
 
3702
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3999
+ if (n_tokens == 0) {
4000
+ LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
4001
+ return -1;
4002
+ }
3703
4003
 
3704
- GGML_ASSERT(n_tokens > 0);
3705
- GGML_ASSERT(n_past >= 0);
3706
- // TODO: keep the values of n_batch and n_ctx
3707
- // GGML_ASSERT(n_tokens <= n_batch);
3708
- // GGML_ASSERT(n_past + n_tokens <= n_ctx);
4004
+ const auto & model = lctx.model;
4005
+ const auto & hparams = model.hparams;
4006
+ const auto & cparams = lctx.cparams;
4007
+
4008
+ const auto n_batch = cparams.n_batch;
4009
+
4010
+ GGML_ASSERT(n_tokens <= n_batch);
4011
+
4012
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
4013
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
3709
4014
 
3710
4015
  const int64_t t_start_us = ggml_time_us();
3711
4016
 
3712
4017
  #ifdef GGML_USE_MPI
3713
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
4018
+ // TODO: needs fix after #3228
4019
+ GGML_ASSERT(false && "not implemented");
4020
+ //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
3714
4021
  #endif
3715
4022
 
3716
4023
  GGML_ASSERT(n_threads > 0);
3717
4024
 
3718
- const int N = n_tokens;
3719
-
3720
- const auto & model = lctx.model;
3721
- const auto & hparams = model.hparams;
3722
-
3723
- const auto & kv_self = lctx.kv_self;
4025
+ auto & kv_self = lctx.kv_self;
3724
4026
 
3725
4027
  GGML_ASSERT(!!kv_self.ctx);
3726
4028
 
3727
4029
  const int64_t n_embd = hparams.n_embd;
3728
4030
  const int64_t n_vocab = hparams.n_vocab;
3729
4031
 
4032
+ // helpers for smoother batch API transition
4033
+ // after deprecating the llama_eval calls, these will be removed
4034
+ std::vector<llama_pos> pos;
4035
+ std::vector<llama_seq_id> seq_id;
4036
+
4037
+ if (batch.pos == nullptr) {
4038
+ pos.resize(n_tokens);
4039
+ for (uint32_t i = 0; i < n_tokens; i++) {
4040
+ pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
4041
+ }
4042
+
4043
+ batch.pos = pos.data();
4044
+ }
4045
+
4046
+ if (batch.seq_id == nullptr) {
4047
+ seq_id.resize(n_tokens);
4048
+ for (uint32_t i = 0; i < n_tokens; i++) {
4049
+ seq_id[i] = batch.all_seq_id;
4050
+ }
4051
+
4052
+ batch.seq_id = seq_id.data();
4053
+ }
4054
+
4055
+ // we always start to search for a free slot from the start of the cache
4056
+ // TODO: better strategies can be implemented
4057
+ kv_self.head = 0;
4058
+
4059
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
4060
+ return 1;
4061
+ }
4062
+
4063
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
4064
+ // after enough generations, the benefit from this heuristic disappears
4065
+ // if we start defragmenting the cache, the benefit from this will be more important
4066
+ //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
4067
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
4068
+
4069
+ //printf("kv_self.n = %d\n", kv_self.n);
4070
+
3730
4071
  ggml_allocr_reset(lctx.alloc);
3731
4072
 
3732
- ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
4073
+ ggml_cgraph * gf = llama_build_graph(lctx, batch);
3733
4074
 
3734
4075
  ggml_allocr_alloc_graph(lctx.alloc, gf);
3735
4076
 
@@ -3738,6 +4079,7 @@ static bool llama_eval_internal(
3738
4079
  ggml_tensor * node = gf->leafs[i];
3739
4080
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
3740
4081
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
4082
+ ggml_cuda_copy_to_device(node);
3741
4083
  }
3742
4084
  }
3743
4085
 
@@ -3747,6 +4089,8 @@ static bool llama_eval_internal(
3747
4089
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
3748
4090
  }
3749
4091
  }
4092
+
4093
+ ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
3750
4094
  #endif
3751
4095
 
3752
4096
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3756,7 +4100,7 @@ static bool llama_eval_internal(
3756
4100
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
3757
4101
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
3758
4102
  // with the BLAS calls. need a better solution
3759
- if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
4103
+ if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
3760
4104
  n_threads = std::min(4, n_threads);
3761
4105
  }
3762
4106
 
@@ -3795,12 +4139,9 @@ static bool llama_eval_internal(
3795
4139
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
3796
4140
  #endif
3797
4141
 
3798
- // update kv token count
3799
- lctx.kv_self.n = n_past + N;
3800
-
3801
- if (cgraph_fname) {
3802
- ggml_graph_export(gf, cgraph_fname);
3803
- }
4142
+ // update the kv ring buffer
4143
+ lctx.kv_self.head += n_tokens;
4144
+ lctx.kv_self.has_shift = false;
3804
4145
 
3805
4146
  #ifdef GGML_PERF
3806
4147
  // print timing information per ggml operation (for debugging purposes)
@@ -3817,13 +4158,20 @@ static bool llama_eval_internal(
3817
4158
  {
3818
4159
  auto & logits_out = lctx.logits;
3819
4160
 
3820
- if (lctx.logits_all) {
3821
- logits_out.resize(n_vocab * N);
3822
- memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
4161
+ if (batch.logits) {
4162
+ logits_out.resize(n_vocab * n_tokens);
4163
+ for (uint32_t i = 0; i < n_tokens; i++) {
4164
+ if (batch.logits[i] == 0) {
4165
+ continue;
4166
+ }
4167
+ memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
4168
+ }
4169
+ } else if (lctx.logits_all) {
4170
+ logits_out.resize(n_vocab * n_tokens);
4171
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
3823
4172
  } else {
3824
- // return result for just the last token
3825
4173
  logits_out.resize(n_vocab);
3826
- memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
4174
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
3827
4175
  }
3828
4176
  }
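
Logit output now follows the batch: when batch.logits is provided only the flagged rows are copied out, and row i lands at offset i*n_vocab; logits_all still copies every row; otherwise only the last token's row survives, as before. A hedged sketch of requesting and reading back a single row (treating batch.logits as the int8_t flag array of this version's llama_batch is an assumption, as is the helper name):

    #include "llama.h"
    #include <cstdint>
    #include <vector>

    static std::vector<float> logits_for_last_token(llama_context * ctx, llama_batch batch) {
        std::vector<int8_t> want(batch.n_tokens, 0);
        want.back() = 1;                 // request logits only for the final token
        batch.logits = want.data();

        if (llama_decode(ctx, batch) != 0) { // 0 = success
            return {};
        }

        const int     n_vocab = llama_n_vocab(llama_get_model(ctx));
        const float * row     = llama_get_logits(ctx) + (size_t) (batch.n_tokens - 1) * n_vocab;
        return std::vector<float>(row, row + n_vocab);
    }
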
3829
4177
 
@@ -3832,20 +4180,27 @@ static bool llama_eval_internal(
3832
4180
  auto & embedding_out = lctx.embedding;
3833
4181
 
3834
4182
  embedding_out.resize(n_embd);
3835
- memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
4183
+ memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
3836
4184
  }
3837
4185
 
3838
4186
  // measure the performance only for the single-token evals
3839
- if (N == 1) {
4187
+ if (n_tokens == 1) {
3840
4188
  lctx.t_eval_us += ggml_time_us() - t_start_us;
3841
4189
  lctx.n_eval++;
3842
4190
  }
3843
- else if (N > 1) {
4191
+ else if (n_tokens > 1) {
3844
4192
  lctx.t_p_eval_us += ggml_time_us() - t_start_us;
3845
- lctx.n_p_eval += N;
4193
+ lctx.n_p_eval += n_tokens;
3846
4194
  }
3847
4195
 
3848
- return true;
4196
+ // get a more accurate load time, upon first eval
4197
+ // TODO: fix this
4198
+ if (!lctx.has_evaluated_once) {
4199
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
4200
+ lctx.has_evaluated_once = true;
4201
+ }
4202
+
4203
+ return 0;
3849
4204
  }
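
Compared to the old llama_eval_internal, which returned a bool and took tokens, embeddings, n_past and n_threads explicitly, llama_decode_internal consumes a llama_batch and reports 0 on success, a positive value for a recoverable condition (no free KV slot) and a negative value on error. A minimal sketch of the corresponding call-site migration in client code (eval_chunk is a made-up helper name):

    #include "llama.h"
    #include <cstdint>

    // Before (0.5.x): llama_eval(ctx, tokens, n_tokens, n_past, n_threads);
    // After  (0.6.0): thread counts live in llama_context_params and the
    // starting position travels with the batch.
    static bool eval_chunk(llama_context * ctx, llama_token * tokens, int32_t n_tokens, int32_t n_past) {
        return llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, /*seq_id =*/ 0)) == 0;
    }
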
3850
4205
 
3851
4206
  //
@@ -4266,7 +4621,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
4266
4621
  llm_tokenizer_bpe tokenizer(vocab);
4267
4622
  tokenizer.tokenize(raw_text, output);
4268
4623
  } break;
4269
- };
4624
+ }
4270
4625
 
4271
4626
  return output;
4272
4627
  }
@@ -4670,6 +5025,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
4670
5025
  // sampling
4671
5026
  //
4672
5027
 
5028
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
5029
+ if (seed == LLAMA_DEFAULT_SEED) {
5030
+ seed = time(NULL);
5031
+ }
5032
+ ctx->rng.seed(seed);
5033
+ }
5034
+
4673
5035
  void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
4674
5036
  GGML_ASSERT(candidates->size > 0);
4675
5037
 
@@ -4878,7 +5240,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
4878
5240
  }
4879
5241
  }
4880
5242
 
4881
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5243
+ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
4882
5244
  const int64_t t_start_sample_us = ggml_time_us();
4883
5245
 
4884
5246
  for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4890,6 +5252,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
4890
5252
  }
4891
5253
  }
4892
5254
 
5255
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5256
+ llama_sample_temp(ctx, candidates_p, temp);
5257
+ }
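
llama_sample_temperature is renamed to llama_sample_temp and the old symbol is kept as a thin forwarding wrapper, so existing callers keep compiling; a public llama_set_rng_seed is also added above so the sampling RNG can be reseeded after the context exists. A small usage sketch (llama_sample_token belongs to the existing sampling API and is not part of this hunk):

    #include "llama.h"

    // Assumes `cands` was filled from llama_get_logits(ctx) in the usual way.
    static llama_token sample_with_temp(llama_context * ctx, llama_token_data_array * cands) {
        llama_set_rng_seed(ctx, 1234);        // new public reseed helper
        llama_sample_temp(ctx, cands, 0.8f);  // formerly llama_sample_temperature()
        return llama_sample_token(ctx, cands);
    }
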
5258
+
4893
5259
  void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
4894
5260
  if (last_tokens_size == 0 || penalty == 1.0f) {
4895
5261
  return;
@@ -5013,7 +5379,7 @@ void llama_sample_classifier_free_guidance(
5013
5379
 
5014
5380
  GGML_ASSERT(ctx);
5015
5381
 
5016
- auto n_vocab = llama_n_vocab(ctx);
5382
+ auto n_vocab = llama_n_vocab(llama_get_model(ctx));
5017
5383
 
5018
5384
  GGML_ASSERT(n_vocab == (int)candidates->size);
5019
5385
  GGML_ASSERT(!candidates->sorted);
@@ -5042,7 +5408,7 @@ void llama_sample_classifier_free_guidance(
5042
5408
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
5043
5409
  GGML_ASSERT(ctx);
5044
5410
 
5045
- auto N = float(llama_n_vocab(ctx));
5411
+ auto N = float(llama_n_vocab(llama_get_model(ctx)));
5046
5412
  int64_t t_start_sample_us;
5047
5413
  t_start_sample_us = ggml_time_us();
5048
5414
 
@@ -5229,7 +5595,7 @@ struct llama_logit_info {
5229
5595
  };
5230
5596
  llama_logit_info(llama_context * ctx)
5231
5597
  : logits(llama_get_logits(ctx))
5232
- , n_vocab(llama_n_vocab(ctx))
5598
+ , n_vocab(llama_n_vocab(llama_get_model(ctx)))
5233
5599
  , max_l(*std::max_element(logits, logits + n_vocab))
5234
5600
  , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
5235
5601
  { }
@@ -5267,7 +5633,6 @@ struct llama_beam_search_data {
5267
5633
  size_t n_beams;
5268
5634
  int n_past;
5269
5635
  int n_predict;
5270
- int n_threads;
5271
5636
  std::vector<llama_beam> beams;
5272
5637
  std::vector<llama_beam> next_beams;
5273
5638
 
@@ -5277,12 +5642,11 @@ struct llama_beam_search_data {
5277
5642
  // Used to communicate to/from callback on beams state.
5278
5643
  std::vector<llama_beam_view> beam_views;
5279
5644
 
5280
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
5645
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
5281
5646
  : ctx(ctx)
5282
5647
  , n_beams(n_beams)
5283
5648
  , n_past(n_past)
5284
5649
  , n_predict(n_predict)
5285
- , n_threads(n_threads)
5286
5650
  , beam_views(n_beams) {
5287
5651
  beams.reserve(n_beams);
5288
5652
  next_beams.reserve(n_beams);
@@ -5319,7 +5683,7 @@ struct llama_beam_search_data {
5319
5683
  } else {
5320
5684
  // beam is not at end-of-sentence, so branch with next top_k tokens.
5321
5685
  if (!beam.tokens.empty()) {
5322
- llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
5686
+ llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
5323
5687
  }
5324
5688
  llama_logit_info logit_info(ctx);
5325
5689
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5393,7 +5757,7 @@ struct llama_beam_search_data {
5393
5757
  callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
5394
5758
  update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
5395
5759
  if (common_prefix_length) {
5396
- llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
5760
+ llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
5397
5761
  n_past += common_prefix_length;
5398
5762
  }
5399
5763
  // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5434,11 +5798,11 @@ struct llama_beam_search_data {
5434
5798
 
5435
5799
  void llama_beam_search(llama_context * ctx,
5436
5800
  llama_beam_search_callback_fn_t callback, void * callback_data,
5437
- size_t n_beams, int n_past, int n_predict, int n_threads) {
5801
+ size_t n_beams, int n_past, int n_predict) {
5438
5802
  assert(ctx);
5439
5803
  const int64_t t_start_sample_us = ggml_time_us();
5440
5804
 
5441
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
5805
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
5442
5806
 
5443
5807
  beam_search_data.loop(callback, callback_data);
5444
5808
 
@@ -5658,11 +6022,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5658
6022
  nthread = std::thread::hardware_concurrency();
5659
6023
  }
5660
6024
 
5661
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
6025
+ llama_model_loader ml(fname_inp, /*use_mmap*/ false);
5662
6026
 
5663
6027
  llama_model model;
5664
- llm_load_arch(*ml, model);
5665
- llm_load_hparams(*ml, model, 0, 0, 0);
6028
+ llm_load_arch(ml, model);
6029
+ llm_load_hparams(ml, model);
5666
6030
 
5667
6031
  if (params->only_copy) {
5668
6032
  ftype = model.ftype;
@@ -5672,7 +6036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5672
6036
  struct gguf_context * ctx_out = gguf_init_empty();
5673
6037
 
5674
6038
  // copy the KV pairs from the input file
5675
- gguf_set_kv (ctx_out, ml->ctx_gguf);
6039
+ gguf_set_kv (ctx_out, ml.ctx_gguf);
5676
6040
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
5677
6041
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
5678
6042
 
@@ -5680,8 +6044,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5680
6044
  int n_attention_wv = 0;
5681
6045
  int n_feed_forward_w2 = 0;
5682
6046
 
5683
- for (int i = 0; i < ml->n_tensors; ++i) {
5684
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6047
+ for (int i = 0; i < ml.n_tensors; ++i) {
6048
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5685
6049
 
5686
6050
  const std::string name = ggml_get_name(meta);
5687
6051
 
@@ -5717,8 +6081,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5717
6081
  std::vector<no_init<float>> f32_conv_buf;
5718
6082
 
5719
6083
  // populate the original tensors so we get an initial meta data
5720
- for (int i = 0; i < ml->n_tensors; ++i) {
5721
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6084
+ for (int i = 0; i < ml.n_tensors; ++i) {
6085
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5722
6086
  gguf_add_tensor(ctx_out, meta);
5723
6087
  }
5724
6088
 
@@ -5731,8 +6095,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5731
6095
  // placeholder for the meta data
5732
6096
  ::zeros(fout, meta_size);
5733
6097
 
5734
- for (int i = 0; i < ml->n_tensors; ++i) {
5735
- struct ggml_tensor * tensor = ml->get_tensor_meta(i);
6098
+ for (int i = 0; i < ml.n_tensors; ++i) {
6099
+ struct ggml_tensor * tensor = ml.get_tensor_meta(i);
5736
6100
 
5737
6101
  const std::string name = ggml_get_name(tensor);
5738
6102
 
@@ -5740,10 +6104,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5740
6104
  read_data.resize(ggml_nbytes(tensor));
5741
6105
  }
5742
6106
  tensor->data = read_data.data();
5743
- ml->load_data_for(tensor);
6107
+ ml.load_data_for(tensor);
5744
6108
 
5745
6109
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
5746
- ++idx, ml->n_tensors,
6110
+ ++idx, ml.n_tensors,
5747
6111
  ggml_get_name(tensor),
5748
6112
  llama_format_tensor_shape(tensor).c_str(),
5749
6113
  ggml_type_name(tensor->type));
@@ -5893,9 +6257,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5893
6257
  }
5894
6258
  }
5895
6259
 
5896
- // TODO: after the GGUF PR, this likely won't work and needs to be updated
5897
6260
  static int llama_apply_lora_from_file_internal(
5898
- const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
6261
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
5899
6262
  ) {
5900
6263
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
5901
6264
 
@@ -5924,7 +6287,7 @@ static int llama_apply_lora_from_file_internal(
5924
6287
  int32_t lora_alpha;
5925
6288
  fin.read((char *) &lora_r, sizeof(lora_r));
5926
6289
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
5927
- float scaling = (float)lora_alpha / (float)lora_r;
6290
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
5928
6291
 
5929
6292
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
5930
6293
 
@@ -6140,9 +6503,10 @@ static int llama_apply_lora_from_file_internal(
6140
6503
  ggml_set_name(r, "r_cpy");
6141
6504
  }
6142
6505
 
6143
- struct ggml_cgraph gf = ggml_build_forward(r);
6506
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
6507
+ ggml_build_forward_expand(gf, r);
6144
6508
 
6145
- ggml_graph_compute_helper(work_buffer, &gf, n_threads);
6509
+ ggml_graph_compute_helper(work_buffer, gf, n_threads);
6146
6510
 
6147
6511
  // we won't need these tensors again, reset the context to save memory
6148
6512
  ggml_free(lora_ctx);
@@ -6171,27 +6535,16 @@ static int llama_apply_lora_from_file_internal(
6171
6535
  //
6172
6536
  // interface implementation
6173
6537
  //
6174
-
6175
- struct llama_context_params llama_context_default_params() {
6176
- struct llama_context_params result = {
6177
- /*.seed =*/ LLAMA_DEFAULT_SEED,
6178
- /*.n_ctx =*/ 512,
6179
- /*.n_batch =*/ 512,
6538
+ struct llama_model_params llama_model_default_params() {
6539
+ struct llama_model_params result = {
6180
6540
  /*.n_gpu_layers =*/ 0,
6181
6541
  /*.main_gpu =*/ 0,
6182
6542
  /*.tensor_split =*/ nullptr,
6183
- /*.rope_freq_base =*/ 0.0f,
6184
- /*.rope_freq_scale =*/ 0.0f,
6185
6543
  /*.progress_callback =*/ nullptr,
6186
6544
  /*.progress_callback_user_data =*/ nullptr,
6187
- /*.low_vram =*/ false,
6188
- /*.mul_mat_q =*/ true,
6189
- /*.f16_kv =*/ true,
6190
- /*.logits_all =*/ false,
6191
6545
  /*.vocab_only =*/ false,
6192
6546
  /*.use_mmap =*/ true,
6193
6547
  /*.use_mlock =*/ false,
6194
- /*.embedding =*/ false,
6195
6548
  };
6196
6549
 
6197
6550
  #ifdef GGML_USE_METAL
@@ -6201,6 +6554,24 @@ struct llama_context_params llama_context_default_params() {
6201
6554
  return result;
6202
6555
  }
6203
6556
 
6557
+ struct llama_context_params llama_context_default_params() {
6558
+ struct llama_context_params result = {
6559
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
6560
+ /*.n_ctx =*/ 512,
6561
+ /*.n_batch =*/ 512,
6562
+ /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
6563
+ /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
6564
+ /*.rope_freq_base =*/ 0.0f,
6565
+ /*.rope_freq_scale =*/ 0.0f,
6566
+ /*.mul_mat_q =*/ true,
6567
+ /*.f16_kv =*/ true,
6568
+ /*.logits_all =*/ false,
6569
+ /*.embedding =*/ false,
6570
+ };
6571
+
6572
+ return result;
6573
+ }
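
Initialization is now split across two structs: llama_model_params (GPU layers, tensor split, mmap/mlock, vocab_only, progress callback) goes to llama_load_model_from_file, while llama_context_params keeps the per-context knobs (seed, n_ctx, n_batch, thread counts, RoPE overrides, f16_kv, logits_all, embedding) for llama_new_context_with_model. Passing 0 for n_ctx or the RoPE values falls back to the model's trained settings, as the llama_new_context_with_model hunk further down shows. A hedged end-to-end sketch (path and thread counts are placeholders):

    #include "llama.h"
    #include <cstdio>

    static llama_context * init_ctx(const char * path_model) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0;                // CPU-only in this sketch

        llama_model * model = llama_load_model_from_file(path_model, mparams);
        if (!model) {
            fprintf(stderr, "failed to load %s\n", path_model);
            return nullptr;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 0;             // 0 => use the trained context length
        cparams.n_threads       = 8;             // single-token generation
        cparams.n_threads_batch = 8;             // prompt / batch processing
        cparams.rope_freq_base  = 0.0f;          // 0 => use the value stored in the GGUF

        return llama_new_context_with_model(model, cparams);
    }

The internal llama_init_from_file shortcut is removed further down as well, leaving this two-step load as the supported path.
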
6574
+
6204
6575
  struct llama_model_quantize_params llama_model_quantize_default_params() {
6205
6576
  struct llama_model_quantize_params result = {
6206
6577
  /*.nthread =*/ 0,
@@ -6256,13 +6627,11 @@ int64_t llama_time_us(void) {
6256
6627
 
6257
6628
  struct llama_model * llama_load_model_from_file(
6258
6629
  const char * path_model,
6259
- struct llama_context_params params) {
6630
+ struct llama_model_params params) {
6260
6631
  ggml_time_init();
6261
6632
 
6262
6633
  llama_model * model = new llama_model;
6263
6634
 
6264
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
6265
-
6266
6635
  unsigned cur_percentage = 0;
6267
6636
  if (params.progress_callback == NULL) {
6268
6637
  params.progress_callback_user_data = &cur_percentage;
@@ -6279,9 +6648,9 @@ struct llama_model * llama_load_model_from_file(
6279
6648
  };
6280
6649
  }
6281
6650
 
6282
- if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
6283
- params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
6284
- params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
6651
+ if (!llama_model_load(path_model, *model, params.n_gpu_layers,
6652
+ params.main_gpu, params.tensor_split,
6653
+ params.use_mmap, params.use_mlock, params.vocab_only,
6285
6654
  params.progress_callback, params.progress_callback_user_data)) {
6286
6655
  LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
6287
6656
  delete model;
@@ -6305,18 +6674,33 @@ struct llama_context * llama_new_context_with_model(
6305
6674
 
6306
6675
  llama_context * ctx = new llama_context(*model);
6307
6676
 
6677
+ const auto & hparams = model->hparams;
6678
+ auto & cparams = ctx->cparams;
6679
+
6680
+ cparams.n_batch = params.n_batch;
6681
+ cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
6682
+ cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
6683
+ cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
6684
+ cparams.n_threads = params.n_threads;
6685
+ cparams.n_threads_batch = params.n_threads_batch;
6686
+ cparams.mul_mat_q = params.mul_mat_q;
6687
+
6308
6688
  if (params.seed == LLAMA_DEFAULT_SEED) {
6309
6689
  params.seed = time(NULL);
6310
6690
  }
6311
6691
 
6692
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
6693
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
6694
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
6695
+
6312
6696
  ctx->rng = std::mt19937(params.seed);
6313
6697
  ctx->logits_all = params.logits_all;
6314
6698
 
6315
6699
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
6316
6700
 
6317
6701
  // reserve memory for context buffers
6318
- if (!params.vocab_only) {
6319
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
6702
+ if (!hparams.vocab_only) {
6703
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
6320
6704
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
6321
6705
  llama_free(ctx);
6322
6706
  return nullptr;
@@ -6327,11 +6711,9 @@ struct llama_context * llama_new_context_with_model(
6327
6711
  LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
6328
6712
  }
6329
6713
 
6330
- const auto & hparams = ctx->model.hparams;
6331
-
6332
6714
  // resized during inference
6333
6715
  if (params.logits_all) {
6334
- ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
6716
+ ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
6335
6717
  } else {
6336
6718
  ctx->logits.reserve(hparams.n_vocab);
6337
6719
  }
@@ -6349,26 +6731,28 @@ struct llama_context * llama_new_context_with_model(
6349
6731
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
6350
6732
 
6351
6733
  // build worst-case graph
6352
- int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
6353
- int n_past = hparams.n_ctx - n_tokens;
6734
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
6735
+ int n_past = cparams.n_ctx - n_tokens;
6354
6736
  llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
6355
- ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
6737
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
6738
+
6356
6739
  #ifdef GGML_USE_METAL
6357
- if (params.n_gpu_layers > 0) {
6740
+ if (model->n_gpu_layers > 0) {
6358
6741
  ctx->ctx_metal = ggml_metal_init(1);
6359
6742
  if (!ctx->ctx_metal) {
6360
6743
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6361
6744
  llama_free(ctx);
6362
6745
  return NULL;
6363
6746
  }
6364
- ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6365
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6747
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
6748
+ //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6749
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6366
6750
  }
6367
6751
  #endif
6368
6752
  // measure memory requirements for the graph
6369
6753
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
6370
6754
 
6371
- LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6755
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6372
6756
 
6373
6757
  // recreate allocator with exact memory requirements
6374
6758
  ggml_allocr_free(ctx->alloc);
@@ -6377,28 +6761,46 @@ struct llama_context * llama_new_context_with_model(
  ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
  #ifdef GGML_USE_METAL
  if (ctx->ctx_metal) {
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
  }
  #endif
  #ifdef GGML_USE_CUBLAS
- if (params.low_vram) {
- LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
- ggml_cuda_set_scratch_size(0); // disable scratch
- } else {
- ggml_cuda_set_scratch_size(alloc_size);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+ ggml_cuda_set_scratch_size(alloc_size);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+
+ // calculate total VRAM usage
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+ size += ggml_nbytes(t);
+ }
+ };
+ size_t model_vram_size = 0;
+ for (const auto & kv : model->tensors_by_name) {
+ add_tensor(kv.second, model_vram_size);
  }
+
+ size_t kv_vram_size = 0;
+ add_tensor(ctx->kv_self.k, kv_vram_size);
+ add_tensor(ctx->kv_self.v, kv_vram_size);
+
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
+ size_t total_vram_size = model_vram_size + ctx_vram_size;
+
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+ total_vram_size / 1024.0 / 1024.0,
+ model_vram_size / 1024.0 / 1024.0,
+ ctx_vram_size / 1024.0 / 1024.0);
  #endif
  }

  #ifdef GGML_USE_METAL
- if (params.n_gpu_layers > 0) {
+ if (model->n_gpu_layers > 0) {
  // this allocates all Metal resources and memory buffers

  void * data_ptr = NULL;
  size_t data_size = 0;

- if (params.use_mmap) {
+ if (ctx->model.mapping) {
  data_ptr = ctx->model.mapping->addr;
  data_size = ctx->model.mapping->size;
  } else {
@@ -6417,11 +6819,8 @@ struct llama_context * llama_new_context_with_model(
  return NULL; \
  }

- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
-
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
  #undef LLAMA_METAL_CHECK_BUF
  }
@@ -6433,8 +6832,10 @@ struct llama_context * llama_new_context_with_model(

  if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
  // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
- const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
- while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+ // TODO: needs fix after #3228
+ GGML_ASSERT(false && "not implemented");
+ //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
+ //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
  llama_backend_free();
  exit(1);
  }
@@ -6443,63 +6844,37 @@ struct llama_context * llama_new_context_with_model(
  return ctx;
  }

- static struct llama_context * llama_init_from_file(
- const char * path_model,
- struct llama_context_params params) {
- struct llama_model * model = llama_load_model_from_file(path_model, params);
- if (!model) {
- return nullptr;
- }
-
- struct llama_context * ctx = llama_new_context_with_model(model, params);
- ctx->model_owner = true;
-
- return ctx;
- }
-
  void llama_free(struct llama_context * ctx) {
  delete ctx;
  }

- int llama_n_vocab(const struct llama_context * ctx) {
- return llama_model_n_vocab(&ctx->model);
+ const llama_model * llama_get_model(const struct llama_context * ctx) {
+ return &ctx->model;
  }

  int llama_n_ctx(const struct llama_context * ctx) {
- return llama_model_n_ctx(&ctx->model);
+ return ctx->cparams.n_ctx;
  }

- int llama_n_ctx_train(const struct llama_context * ctx) {
- return llama_model_n_ctx_train(&ctx->model);
+ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+ return model->vocab.type;
  }

- int llama_n_embd(const struct llama_context * ctx) {
- return llama_model_n_embd(&ctx->model);
- }
-
- enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
- return ctx->model.vocab.type;
- }
-
- int llama_model_n_vocab(const struct llama_model * model) {
+ int llama_n_vocab(const struct llama_model * model) {
  return model->vocab.id_to_token.size();
  }

- int llama_model_n_ctx(const struct llama_model * model) {
- return model->hparams.n_ctx;
- }
-
- int llama_model_n_ctx_train(const struct llama_model * model) {
+ int llama_n_ctx_train(const struct llama_model * model) {
  return model->hparams.n_ctx_train;
  }

- int llama_model_n_embd(const struct llama_model * model) {
+ int llama_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }

  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
- model->name.c_str(),
+ llama_model_arch_name(model->arch).c_str(),
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }
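
Note: the getters above now either read from the context parameters (llama_n_ctx) or take the model directly (llama_n_vocab, llama_n_embd, llama_n_ctx_train, llama_vocab_type), with llama_get_model bridging from a context to its model. A minimal caller-side sketch, assuming a context ctx created elsewhere; print_model_info is a hypothetical helper, not part of the library:

#include <cstdio>
#include "llama.h"

// Hypothetical helper: query basic properties through the
// model-based getters shown in the hunk above.
static void print_model_info(struct llama_context * ctx) {
    const struct llama_model * model = llama_get_model(ctx);

    printf("n_vocab     = %d\n", llama_n_vocab(model));
    printf("n_embd      = %d\n", llama_n_embd(model));
    printf("n_ctx       = %d\n", llama_n_ctx(ctx));         // per-context value (cparams.n_ctx)
    printf("n_ctx_train = %d\n", llama_n_ctx_train(model)); // context length the model was trained with

    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));
    printf("desc        = %s\n", desc);
}
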
@@ -6520,6 +6895,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
  return nparams;
  }

+ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+ return ggml_get_tensor(model->ctx, name);
+ }
+
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
@@ -6533,18 +6912,18 @@ int llama_model_quantize(
  }
  }

- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
  }
  }

- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
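
Note: both LoRA entry points gain a scale argument between the adapter path and the optional base model path. A hedged usage sketch; the adapter path, scale value, and thread count below are placeholders:

#include <cstdio>
#include "llama.h"

// Sketch: apply a LoRA adapter with the new scale parameter.
// "lora-adapter.bin", 0.75f and 4 threads are illustrative only.
static bool apply_lora(const struct llama_model * model) {
    const int err = llama_model_apply_lora_from_file(
        model,
        "lora-adapter.bin", // path_lora (placeholder)
        0.75f,              // scale: weight given to the adapter deltas
        NULL,               // path_base_model: optional higher-precision base
        4);                 // n_threads
    if (err != 0) {
        fprintf(stderr, "failed to apply lora adapter\n");
        return false;
    }
    return true;
}
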
@@ -6552,16 +6931,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  }
  }
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->kv_self.n;
+ return ctx->kv_self.head;
  }

- #define LLAMA_MAX_RNG_STATE (64*1024)
+ void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
+ }

- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
- if (seed == LLAMA_DEFAULT_SEED) {
- seed = time(NULL);
- }
- ctx->rng.seed(seed);
+ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+ }
+
+ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
+ }
+
+ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
+ llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
+ }
+
+ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+ llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
  }

  // Returns the *maximum* size of the state
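
Note: these thin wrappers forward to ctx->kv_self and replace the old single-stream view of the KV cache with per-sequence management. A rough caller-side sketch, assuming ctx already holds decoded sequences; the sequence ids and position ranges are arbitrary examples, and the ranges are treated here as half-open ([p0, p1)):

#include "llama.h"

// Illustrative only: trim and reuse KV cache contents per sequence.
static void manage_cache(struct llama_context * ctx) {
    // forget positions 128..255 of sequence 1
    llama_kv_cache_seq_rm(ctx, /*seq_id=*/1, /*p0=*/128, /*p1=*/256);

    // share the first 64 cached positions of sequence 0 with sequence 2
    llama_kv_cache_seq_cp(ctx, /*seq_id_src=*/0, /*seq_id_dst=*/2, /*p0=*/0, /*p1=*/64);

    // keep only sequence 0, discarding cells that belong to other sequences
    llama_kv_cache_seq_keep(ctx, /*seq_id=*/0);

    // slide positions 64..127 of sequence 0 back by 64, e.g. after discarding an old prefix
    llama_kv_cache_seq_shift(ctx, /*seq_id=*/0, /*p0=*/64, /*p1=*/128, /*delta=*/-64);
}
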
@@ -6649,6 +7039,16 @@ struct llama_data_file_context : llama_data_context {
  *
  */
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+ // TODO: does not support multi-sequence states
+ {
+ const auto & kv_self = ctx->kv_self;
+ for (uint32_t i = 0; i < kv_self.head; ++i) {
+ GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
+ GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
+ GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
+ }
+ }
+
  // copy rng
  {
  std::stringstream rng_ss;
@@ -6699,12 +7099,14 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  {
  const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
+ const auto & cparams = ctx->cparams;
+
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd_gqa();
- const int n_ctx = hparams.n_ctx;
+ const int n_ctx = cparams.n_ctx;

  const size_t kv_size = kv_self.buf.size;
- const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+ const int kv_ntok = kv_self.head;

  data_ctx->write(&kv_size, sizeof(kv_size));
  data_ctx->write(&kv_ntok, sizeof(kv_ntok));
@@ -6807,9 +7209,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  {
  const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
+ const auto & cparams = ctx->cparams;
+
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd_gqa();
- const int n_ctx = hparams.n_ctx;
+ const int n_ctx = cparams.n_ctx;

  size_t kv_size;
  int kv_ntok;
@@ -6848,7 +7252,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  ggml_free(cpy_ctx);
  }

- ctx->kv_self.n = kv_ntok;
+ ctx->kv_self.head = kv_ntok;
+ ctx->kv_self.size = kv_size;
  }

  const size_t nread = inp - src;
@@ -6943,64 +7348,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi

  int llama_eval(
  struct llama_context * ctx,
- const llama_token * tokens,
- int n_tokens,
- int n_past,
- int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
- return 1;
- }
+ llama_token * tokens,
+ int32_t n_tokens,
+ int n_past) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

- // get a more accurate load time, upon first eval
- // TODO: fix this
- if (!ctx->has_evaluated_once) {
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
- ctx->has_evaluated_once = true;
+ const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }

- return 0;
+ return ret;
  }

  int llama_eval_embd(
  struct llama_context * ctx,
- const float * embd,
- int n_tokens,
- int n_past,
- int n_threads) {
- if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
- return 1;
- }
+ float * embd,
+ int32_t n_tokens,
+ int n_past) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

- // get a more accurate load time, upon first eval
- // TODO: fix this
- if (!ctx->has_evaluated_once) {
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
- ctx->has_evaluated_once = true;
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+
+ const int ret = llama_decode_internal(*ctx, batch);
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }

- return 0;
+ return ret;
  }

- int llama_eval_export(struct llama_context * ctx, const char * fname) {
- const int n_batch = 1;
- const int n_ctx = 512 - n_batch;
+ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+ ctx->cparams.n_threads = n_threads;
+ ctx->cparams.n_threads_batch = n_threads_batch;
+ }
+
+ struct llama_batch llama_batch_get_one(
+ llama_token * tokens,
+ int32_t n_tokens,
+ llama_pos pos_0,
+ llama_seq_id seq_id) {
+ return {
+ /*n_tokens =*/ n_tokens,
+ /*tokens =*/ tokens,
+ /*embd =*/ nullptr,
+ /*pos =*/ nullptr,
+ /*seq_id =*/ nullptr,
+ /*logits =*/ nullptr,
+ /*all_pos_0 =*/ pos_0,
+ /*all_pos_1 =*/ 1,
+ /*all_seq_id =*/ seq_id,
+ };
+ }

- const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
+ llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

- if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
- return 1;
+ if (embd) {
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+ } else {
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
  }

- return 0;
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
+ batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
+
+ return batch;
+ }
+
+ void llama_batch_free(struct llama_batch batch) {
+ if (batch.token) free(batch.token);
+ if (batch.embd) free(batch.embd);
+ if (batch.pos) free(batch.pos);
+ if (batch.seq_id) free(batch.seq_id);
+ if (batch.logits) free(batch.logits);
+ }
+
+ int llama_decode(
+ struct llama_context * ctx,
+ struct llama_batch batch) {
+ const int ret = llama_decode_internal(*ctx, batch);
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+ }
+
+ return ret;
  }

  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }

+ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+ return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
+ }
+
  float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }
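
Note: llama_eval is now a thin compatibility shim over llama_decode_internal; new callers are expected to build a llama_batch and call llama_decode. A minimal single-sequence sketch, assuming ctx exists and prompt_tokens already holds a tokenized prompt; eval_prompt and the thread counts are illustrative:

#include <cstdio>
#include <vector>
#include "llama.h"

// Sketch: evaluate a prompt with the batch-based API from the hunk above.
static int eval_prompt(struct llama_context * ctx, std::vector<llama_token> & prompt_tokens) {
    // generation and batch thread counts now live on the context
    llama_set_n_threads(ctx, 8, 8);

    // single-sequence convenience batch: positions start at 0, seq_id 0
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(),
                                            (int32_t) prompt_tokens.size(),
                                            /*pos_0=*/0, /*seq_id=*/0);

    const int ret = llama_decode(ctx, batch);
    if (ret != 0) {
        fprintf(stderr, "llama_decode returned %d\n", ret);
        return ret;
    }

    // llama_batch_get_one requests no per-token logits,
    // so only the last token's logits are stored here
    const float * logits = llama_get_logits(ctx);
    (void) logits;
    return 0;
}

For multi-sequence work, llama_batch_init allocates a writable batch whose token/pos/seq_id/logits arrays are filled in by the caller and released with llama_batch_free; llama_get_logits_ith then indexes the logits rows that were requested.
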
@@ -7030,16 +7473,6 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
  }

  int llama_tokenize(
- struct llama_context * ctx,
- const char * text,
- int text_len,
- llama_token * tokens,
- int n_max_tokens,
- bool add_bos) {
- return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
- }
-
- int llama_tokenize_with_model(
  const struct llama_model * model,
  const char * text,
  int text_len,
@@ -7060,13 +7493,9 @@ int llama_tokenize_with_model(
  return res.size();
  }

- int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
- return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
- }
-
  // does not write null-terminator to buf
- int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
- if (0 <= token && token < llama_model_n_vocab(model)) {
+ int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+ if (0 <= token && token < llama_n_vocab(model)) {
  if (llama_is_normal_token(model->vocab, token)) {
  std::string result = model->vocab.id_to_token[token].text;
  if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
@@ -7086,7 +7515,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke
  buf[2] = '\x85';
  return 3;
  } else if (llama_is_control_token(model->vocab, token)) {
- ;
+ // do nothing
  } else if (llama_is_byte_token(model->vocab, token)) {
  if (length < 1) {
  return -1;
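
Note: tokenization and detokenization now hang off the model rather than the context; the context overloads are gone and the *_with_model names drop their suffix. A rough caller-side sketch; tokenize_text and token_to_text are hypothetical wrappers, and the buffer bounds are assumptions (both functions return a negative count when the caller's buffer is too small):

#include <string>
#include <vector>
#include "llama.h"

// Sketch: model-based tokenization helpers (names are illustrative).
static std::vector<llama_token> tokenize_text(const struct llama_model * model,
                                              const std::string & text, bool add_bos) {
    // assumption: one token per byte plus BOS is a safe upper bound
    std::vector<llama_token> tokens(text.size() + 1);
    const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                 tokens.data(), (int) tokens.size(), add_bos);
    tokens.resize(n < 0 ? 0 : n);
    return tokens;
}

static std::string token_to_text(const struct llama_model * model, llama_token token) {
    char buf[64];
    const int n = llama_token_to_piece(model, token, buf, (int) sizeof(buf)); // no null terminator
    return n < 0 ? std::string() : std::string(buf, (size_t) n);
}
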
@@ -7194,12 +7623,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
  return ctx->model.tensors_by_name;
  }

- void llama_log_set(llama_log_callback log_callback, void * user_data) {
+ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
  g_state.log_callback_user_data = user_data;
  }

- static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+ static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
  va_list args_copy;
  va_copy(args_copy, args);
  char buffer[128];
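
Note: logging is now routed through ggml's callback type, so llama.cpp and ggml share one log-level enum and one callback signature. A small sketch of installing a custom logger; my_log_callback is a hypothetical name:

#include <cstdio>
#include "llama.h"

// Sketch: forward only warnings and errors to stderr.
static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
        fputs(text, stderr);
    }
}

// during start-up, before loading any model:
//   llama_log_set(my_log_callback, NULL);
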
@@ -7216,14 +7645,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
  va_end(args_copy);
  }

- static void llama_log_internal(llama_log_level level, const char * format, ...) {
+ static void llama_log_internal(ggml_log_level level, const char * format, ...) {
  va_list args;
  va_start(args, format);
  llama_log_internal_v(level, format, args);
  va_end(args);
  }

- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
  (void) level;
  (void) user_data;
  fputs(text, stderr);