llama_cpp 0.5.3 → 0.6.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -72,6 +72,7 @@
72
72
  #include <sstream>
73
73
  #include <thread>
74
74
  #include <unordered_map>
75
+ #include <set>
75
76
 
76
77
  #if defined(_MSC_VER)
77
78
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -92,12 +93,12 @@
92
93
  //
93
94
 
94
95
  LLAMA_ATTRIBUTE_FORMAT(2, 3)
95
- static void llama_log_internal (llama_log_level level, const char* format, ...);
96
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
96
+ static void llama_log_internal (ggml_log_level level, const char* format, ...);
97
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
97
98
 
98
- #define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
99
- #define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
100
- #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
99
+ #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
100
+ #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
101
+ #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
101
102
 
102
103
  //
103
104
  // helpers
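
Note on the logging change above (llama_log_level → ggml_log_level, LLAMA_LOG_LEVEL_* → GGML_LOG_LEVEL_*): the LLAMA_LOG_* macros forward a printf-style call to an internal helper, which formats the message and hands it to a user-replaceable callback. A minimal standalone sketch of that pattern follows; the names and level values are purely illustrative, not the real ggml/llama API.

    // Standalone sketch of a level-tagged logging path with a user-settable callback.
    #include <cstdarg>
    #include <cstdio>

    enum log_level { LOG_LEVEL_ERROR = 2, LOG_LEVEL_WARN = 3, LOG_LEVEL_INFO = 4 };

    typedef void (*log_callback)(log_level level, const char * text, void * user_data);

    static void log_callback_default(log_level /*level*/, const char * text, void * /*user_data*/) {
        fputs(text, stderr);   // default sink: unbuffered-ish write to stderr
    }

    static log_callback g_log_callback  = log_callback_default;
    static void *       g_log_user_data = nullptr;

    static void log_internal(log_level level, const char * format, ...) {
        char buf[1024];
        va_list args;
        va_start(args, format);
        vsnprintf(buf, sizeof(buf), format, args);   // format once, then hand off
        va_end(args);
        g_log_callback(level, buf, g_log_user_data);
    }

    #define LOG_INFO(...)  log_internal(LOG_LEVEL_INFO,  __VA_ARGS__)
    #define LOG_ERROR(...) log_internal(LOG_LEVEL_ERROR, __VA_ARGS__)

    int main() {
        LOG_INFO("loaded %d layers\n", 32);   // routed through the default stderr callback
        return 0;
    }
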
@@ -166,13 +167,13 @@ enum llm_arch {
166
167
  };
167
168
 
168
169
  static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
169
- { LLM_ARCH_LLAMA, "llama" },
170
- { LLM_ARCH_FALCON, "falcon" },
171
- { LLM_ARCH_GPT2, "gpt2" },
172
- { LLM_ARCH_GPTJ, "gptj" },
173
- { LLM_ARCH_GPTNEOX, "gptneox" },
174
- { LLM_ARCH_MPT, "mpt" },
175
- { LLM_ARCH_BAICHUAN, "baichuan" },
170
+ { LLM_ARCH_LLAMA, "llama" },
171
+ { LLM_ARCH_FALCON, "falcon" },
172
+ { LLM_ARCH_GPT2, "gpt2" },
173
+ { LLM_ARCH_GPTJ, "gptj" },
174
+ { LLM_ARCH_GPTNEOX, "gptneox" },
175
+ { LLM_ARCH_MPT, "mpt" },
176
+ { LLM_ARCH_BAICHUAN, "baichuan" },
176
177
  { LLM_ARCH_STARCODER, "starcoder" },
177
178
  };
178
179
 
@@ -221,16 +222,16 @@ enum llm_kv {
221
222
  };
222
223
 
223
224
  static std::map<llm_kv, std::string> LLM_KV_NAMES = {
224
- { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
225
- { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
226
- { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
227
- { LLM_KV_GENERAL_NAME, "general.name" },
228
- { LLM_KV_GENERAL_AUTHOR, "general.author" },
229
- { LLM_KV_GENERAL_URL, "general.url" },
230
- { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
231
- { LLM_KV_GENERAL_LICENSE, "general.license" },
232
- { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" },
233
- { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" },
225
+ { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
226
+ { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
227
+ { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
228
+ { LLM_KV_GENERAL_NAME, "general.name" },
229
+ { LLM_KV_GENERAL_AUTHOR, "general.author" },
230
+ { LLM_KV_GENERAL_URL, "general.url" },
231
+ { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
232
+ { LLM_KV_GENERAL_LICENSE, "general.license" },
233
+ { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
234
+ { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
234
235
 
235
236
  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
236
237
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -448,7 +449,7 @@ struct LLM_TN {
448
449
  //
449
450
 
450
451
  #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
451
- { \
452
+ do { \
452
453
  const std::string skey(key); \
453
454
  const int kid = gguf_find_key(ctx, skey.c_str()); \
454
455
  if (kid >= 0) { \
@@ -460,7 +461,7 @@ struct LLM_TN {
460
461
  } else if (req) { \
461
462
  throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
462
463
  } \
463
- }
464
+ } while (0)
464
465
 
465
466
  //
466
467
  // ggml helpers
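
A brief aside on the do { ... } while (0) wrapper added to GGUF_GET_KEY above: it turns a multi-statement macro body into a single statement, so the macro composes safely with if/else and requires a terminating semicolon at the call site. A standalone sketch with illustrative macros (not llama.cpp code):

    // Why multi-statement macros are wrapped in do { ... } while (0).
    #include <cstdio>

    // BAD: expands to two statements; only the first is governed by an if,
    // and a following 'else' no longer pairs with that if (compile error).
    #define LOG_TWICE_BAD(msg)  printf("%s\n", msg); printf("%s\n", msg)

    // GOOD: expands to exactly one statement and needs the caller's ';'.
    #define LOG_TWICE_GOOD(msg) do { printf("%s\n", msg); printf("%s\n", msg); } while (0)

    int main() {
        bool verbose = false;
        if (verbose)
            LOG_TWICE_GOOD("hello");   // single statement, semicolon required
        else
            printf("quiet\n");         // with LOG_TWICE_BAD this 'else' would not compile
        return 0;
    }
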
@@ -881,10 +882,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
881
882
 
882
883
  static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
883
884
  std::vector<char> result(8, 0);
884
- const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
885
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
885
886
  if (n_tokens < 0) {
886
887
  result.resize(-n_tokens);
887
- int check = llama_token_to_piece(ctx, token, result.data(), result.size());
888
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
888
889
  GGML_ASSERT(check == -n_tokens);
889
890
  } else {
890
891
  result.resize(n_tokens);
@@ -899,7 +900,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
899
900
 
900
901
  struct llama_state {
901
902
  // We save the log callback globally
902
- llama_log_callback log_callback = llama_log_callback_default;
903
+ ggml_log_callback log_callback = llama_log_callback_default;
903
904
  void * log_callback_user_data = nullptr;
904
905
  };
905
906
 
@@ -925,9 +926,9 @@ static const size_t MB = kB*kB;
925
926
  static const size_t GB = kB*kB*kB;
926
927
 
927
928
  struct llama_hparams {
929
+ bool vocab_only;
928
930
  uint32_t n_vocab;
929
931
  uint32_t n_ctx_train; // context size the model was trained on
930
- uint32_t n_ctx; // context size used during inference
931
932
  uint32_t n_embd;
932
933
  uint32_t n_head;
933
934
  uint32_t n_head_kv;
@@ -938,8 +939,8 @@ struct llama_hparams {
938
939
  float f_norm_eps;
939
940
  float f_norm_rms_eps;
940
941
 
941
- float rope_freq_base;
942
- float rope_freq_scale;
942
+ float rope_freq_base_train;
943
+ float rope_freq_scale_train;
943
944
 
944
945
  bool operator!=(const llama_hparams & other) const {
945
946
  return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -956,15 +957,18 @@ struct llama_hparams {
956
957
  uint32_t n_embd_gqa() const {
957
958
  return n_embd/n_gqa();
958
959
  }
960
+ };
959
961
 
960
- size_t kv_size() const {
961
- size_t result = 2ull;
962
- result *= (size_t) n_embd_gqa();
963
- result *= (size_t) n_ctx;
964
- result *= (size_t) n_layer;
965
- result *= sizeof(ggml_fp16_t);
966
- return result;
967
- }
962
+ struct llama_cparams {
963
+ uint32_t n_ctx; // context size used during inference
964
+ uint32_t n_batch;
965
+ uint32_t n_threads; // number of threads to use for generation
966
+ uint32_t n_threads_batch; // number of threads to use for batch processing
967
+
968
+ float rope_freq_base;
969
+ float rope_freq_scale;
970
+
971
+ bool mul_mat_q;
968
972
  };
969
973
 
970
974
  struct llama_layer {
@@ -999,7 +1003,29 @@ struct llama_layer {
999
1003
  struct ggml_tensor * b3; // ffn_up
1000
1004
  };
1001
1005
 
1006
+ struct llama_kv_cell {
1007
+ llama_pos pos = -1;
1008
+ llama_pos delta = 0;
1009
+
1010
+ std::set<llama_seq_id> seq_id;
1011
+
1012
+ bool has_seq_id(const llama_seq_id & id) const {
1013
+ return seq_id.find(id) != seq_id.end();
1014
+ }
1015
+ };
1016
+
1017
+ // ring-buffer of cached KV data
1002
1018
  struct llama_kv_cache {
1019
+ bool has_shift = false;
1020
+
1021
+ uint32_t head = 0;
1022
+ uint32_t size = 0;
1023
+
1024
+ // computed before each graph build
1025
+ uint32_t n = 0;
1026
+
1027
+ std::vector<llama_kv_cell> cells;
1028
+
1003
1029
  struct ggml_tensor * k = NULL;
1004
1030
  struct ggml_tensor * v = NULL;
1005
1031
 
@@ -1007,8 +1033,6 @@ struct llama_kv_cache {
1007
1033
 
1008
1034
  llama_buffer buf;
1009
1035
 
1010
- int n; // number of tokens currently in the cache
1011
-
1012
1036
  ~llama_kv_cache() {
1013
1037
  if (ctx) {
1014
1038
  ggml_free(ctx);
@@ -1122,11 +1146,8 @@ struct llama_model {
1122
1146
  };
1123
1147
 
1124
1148
  struct llama_context {
1125
- llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
1149
+ llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
1126
1150
  ~llama_context() {
1127
- if (model_owner) {
1128
- delete &model;
1129
- }
1130
1151
  #ifdef GGML_USE_METAL
1131
1152
  if (ctx_metal) {
1132
1153
  ggml_metal_free(ctx_metal);
@@ -1137,27 +1158,26 @@ struct llama_context {
1137
1158
  }
1138
1159
  }
1139
1160
 
1161
+ llama_cparams cparams;
1162
+
1163
+ const llama_model & model;
1164
+
1165
+ // key + value cache for the self attention
1166
+ struct llama_kv_cache kv_self;
1167
+
1140
1168
  std::mt19937 rng;
1141
1169
 
1142
1170
  bool has_evaluated_once = false;
1143
1171
 
1172
+ int64_t t_start_us;
1173
+ int64_t t_load_us;
1144
1174
  int64_t t_sample_us = 0;
1145
- int64_t t_eval_us = 0;
1146
1175
  int64_t t_p_eval_us = 0;
1176
+ int64_t t_eval_us = 0;
1147
1177
 
1148
1178
  int32_t n_sample = 0; // number of tokens sampled
1149
- int32_t n_eval = 0; // number of eval calls
1150
1179
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1151
-
1152
- const llama_model & model;
1153
-
1154
- bool model_owner = false;
1155
-
1156
- int64_t t_load_us;
1157
- int64_t t_start_us;
1158
-
1159
- // key + value cache for the self attention
1160
- struct llama_kv_cache kv_self;
1180
+ int32_t n_eval = 0; // number of eval calls
1161
1181
 
1162
1182
  // decode output (2-dimensional array: [n_tokens][n_vocab])
1163
1183
  std::vector<float> logits;
@@ -1192,16 +1212,23 @@ static bool llama_kv_cache_init(
1192
1212
  const struct llama_hparams & hparams,
1193
1213
  struct llama_kv_cache & cache,
1194
1214
  ggml_type wtype,
1195
- int n_ctx,
1215
+ uint32_t n_ctx,
1196
1216
  int n_gpu_layers) {
1197
- const int n_embd = hparams.n_embd_gqa();
1198
- const int n_layer = hparams.n_layer;
1217
+ const uint32_t n_embd = hparams.n_embd_gqa();
1218
+ const uint32_t n_layer = hparams.n_layer;
1199
1219
 
1200
1220
  const int64_t n_mem = n_layer*n_ctx;
1201
1221
  const int64_t n_elements = n_embd*n_mem;
1202
1222
 
1223
+ cache.has_shift = false;
1224
+
1225
+ cache.head = 0;
1226
+ cache.size = n_ctx;
1227
+
1228
+ cache.cells.clear();
1229
+ cache.cells.resize(n_ctx);
1230
+
1203
1231
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1204
- cache.n = 0;
1205
1232
 
1206
1233
  struct ggml_init_params params;
1207
1234
  params.mem_size = cache.buf.size;
@@ -1222,17 +1249,154 @@ static bool llama_kv_cache_init(
1222
1249
 
1223
1250
  (void) n_gpu_layers;
1224
1251
  #ifdef GGML_USE_CUBLAS
1225
- if (n_gpu_layers > n_layer + 1) {
1252
+ size_t vram_kv_cache = 0;
1253
+
1254
+ if (n_gpu_layers > (int)n_layer + 1) {
1226
1255
  ggml_cuda_assign_buffers_no_scratch(cache.v);
1256
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1257
+ vram_kv_cache += ggml_nbytes(cache.v);
1227
1258
  }
1228
- if (n_gpu_layers > n_layer + 2) {
1259
+ if (n_gpu_layers > (int)n_layer + 2) {
1229
1260
  ggml_cuda_assign_buffers_no_scratch(cache.k);
1261
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1262
+ vram_kv_cache += ggml_nbytes(cache.k);
1263
+ }
1264
+ if (vram_kv_cache > 0) {
1265
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1230
1266
  }
1231
1267
  #endif // GGML_USE_CUBLAS
1232
1268
 
1233
1269
  return true;
1234
1270
  }
1235
1271
 
1272
+ // find an empty slot of size "n_tokens" in the cache
1273
+ // updates the cache head
1274
+ static bool llama_kv_cache_find_slot(
1275
+ struct llama_kv_cache & cache,
1276
+ const struct llama_batch & batch) {
1277
+ const uint32_t n_ctx = cache.size;
1278
+ const uint32_t n_tokens = batch.n_tokens;
1279
+
1280
+ if (n_tokens > n_ctx) {
1281
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
1282
+ return false;
1283
+ }
1284
+
1285
+ uint32_t n_tested = 0;
1286
+
1287
+ while (true) {
1288
+ if (cache.head + n_tokens > n_ctx) {
1289
+ cache.head = 0;
1290
+ n_tested += n_ctx - cache.head;
1291
+ continue;
1292
+ }
1293
+
1294
+ bool found = true;
1295
+ for (uint32_t i = 0; i < n_tokens; i++) {
1296
+ if (cache.cells[cache.head + i].pos >= 0) {
1297
+ found = false;
1298
+ cache.head += i + 1;
1299
+ n_tested += i + 1;
1300
+ break;
1301
+ }
1302
+ }
1303
+
1304
+ if (found) {
1305
+ break;
1306
+ }
1307
+
1308
+ if (n_tested >= n_ctx) {
1309
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
1310
+ return false;
1311
+ }
1312
+ }
1313
+
1314
+ for (uint32_t i = 0; i < n_tokens; i++) {
1315
+ cache.cells[cache.head + i].pos = batch.pos[i];
1316
+ cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
1317
+ }
1318
+
1319
+ return true;
1320
+ }
1321
+
1322
+ // find how many cells are currently in use
1323
+ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
1324
+ for (uint32_t i = cache.size - 1; i > 0; --i) {
1325
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
1326
+ return i + 1;
1327
+ }
1328
+ }
1329
+
1330
+ return 0;
1331
+ }
1332
+
1333
+ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
1334
+ if (c0 < 0) c0 = 0;
1335
+ if (c1 < 0) c1 = cache.size;
1336
+
1337
+ for (int32_t i = c0; i < c1; ++i) {
1338
+ cache.cells[i].pos = -1;
1339
+ cache.cells[i].seq_id.clear();
1340
+ }
1341
+ }
1342
+
1343
+ static void llama_kv_cache_seq_rm(
1344
+ struct llama_kv_cache & cache,
1345
+ llama_seq_id seq_id,
1346
+ llama_pos p0,
1347
+ llama_pos p1) {
1348
+ for (uint32_t i = 0; i < cache.size; ++i) {
1349
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1350
+ cache.cells[i].seq_id.erase(seq_id);
1351
+ if (cache.cells[i].seq_id.empty()) {
1352
+ cache.cells[i].pos = -1;
1353
+ }
1354
+ }
1355
+ }
1356
+ }
1357
+
1358
+ static void llama_kv_cache_seq_cp(
1359
+ struct llama_kv_cache & cache,
1360
+ llama_seq_id seq_id_src,
1361
+ llama_seq_id seq_id_dst,
1362
+ llama_pos p0,
1363
+ llama_pos p1) {
1364
+ for (uint32_t i = 0; i < cache.size; ++i) {
1365
+ if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1366
+ cache.cells[i].seq_id.insert(seq_id_dst);
1367
+ }
1368
+ }
1369
+ }
1370
+
1371
+ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
1372
+ for (uint32_t i = 0; i < cache.size; ++i) {
1373
+ if (!cache.cells[i].has_seq_id(seq_id)) {
1374
+ cache.cells[i].pos = -1;
1375
+ cache.cells[i].seq_id.clear();
1376
+ }
1377
+ }
1378
+ }
1379
+
1380
+ static void llama_kv_cache_seq_shift(
1381
+ struct llama_kv_cache & cache,
1382
+ llama_seq_id seq_id,
1383
+ llama_pos p0,
1384
+ llama_pos p1,
1385
+ llama_pos delta) {
1386
+ for (uint32_t i = 0; i < cache.size; ++i) {
1387
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1388
+ cache.cells[i].pos += delta;
1389
+ if (cache.cells[i].pos < 0) {
1390
+ cache.cells[i].pos = -1;
1391
+ cache.cells[i].seq_id.clear();
1392
+ } else {
1393
+ cache.has_shift = true;
1394
+ cache.cells[i].delta = delta;
1395
+ }
1396
+ }
1397
+ }
1398
+ }
1399
+
1236
1400
  //
1237
1401
  // model loading and saving
1238
1402
  //
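
The cell-based KV cache introduced above (llama_kv_cell, llama_kv_cache_find_slot, llama_kv_cache_seq_rm, ...) tracks, per cell, the stored position and the set of sequences that reference it. A minimal standalone sketch of the same bookkeeping follows, assuming simplified types (plain ints, no ggml tensors) rather than the real structs:

    // Simplified cell-based KV cache: slot finding and per-sequence removal.
    #include <cstdio>
    #include <set>
    #include <vector>

    struct Cell {
        int pos = -1;            // -1 means the cell is free
        std::set<int> seq_id;    // sequences that reference this cell
    };

    struct Cache {
        size_t head = 0;
        std::vector<Cell> cells;

        explicit Cache(size_t n) : cells(n) {}

        // find a contiguous run of n free cells, starting the search at head (wrapping once)
        bool find_slot(size_t n) {
            if (n > cells.size()) {
                return false;
            }
            size_t tested = 0;
            while (true) {
                if (head + n > cells.size()) {
                    tested += cells.size() - head;
                    head = 0;
                    continue;
                }
                bool found = true;
                for (size_t i = 0; i < n; ++i) {
                    if (cells[head + i].pos >= 0) {   // occupied: restart after it
                        found = false;
                        head   += i + 1;
                        tested += i + 1;
                        break;
                    }
                }
                if (found) {
                    return true;
                }
                if (tested >= cells.size()) {
                    return false;
                }
            }
        }

        // forget cells of one sequence in the position range [p0, p1)
        void seq_rm(int seq, int p0, int p1) {
            for (auto & c : cells) {
                if (c.seq_id.count(seq) && c.pos >= p0 && c.pos < p1) {
                    c.seq_id.erase(seq);
                    if (c.seq_id.empty()) {
                        c.pos = -1;   // no sequence uses it any more: the cell is free again
                    }
                }
            }
        }
    };

    int main() {
        Cache cache(8);
        if (cache.find_slot(4)) {
            for (int i = 0; i < 4; ++i) {                 // store 4 tokens of sequence 0
                cache.cells[cache.head + i].pos = i;
                cache.cells[cache.head + i].seq_id.insert(0);
            }
        }
        cache.seq_rm(/*seq=*/0, /*p0=*/2, /*p1=*/4);      // drop positions 2..3 of sequence 0
        for (size_t i = 0; i < cache.cells.size(); ++i) {
            printf("cell %zu: pos = %d\n", i, cache.cells[i].pos);
        }
        return 0;
    }
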
@@ -1554,7 +1718,7 @@ struct llama_model_loader {
1554
1718
  lmlock->grow_to(size_lock);
1555
1719
  }
1556
1720
  break;
1557
- #if defined(GGML_USE_CUBLAS)
1721
+ #ifdef GGML_USE_CUBLAS
1558
1722
  case GGML_BACKEND_GPU:
1559
1723
  case GGML_BACKEND_GPU_SPLIT:
1560
1724
  // old code:
@@ -1587,7 +1751,15 @@ struct llama_model_loader {
1587
1751
  // load LLaMA models
1588
1752
  //
1589
1753
 
1590
- static std::string llama_model_ftype_name(enum llama_ftype ftype) {
1754
+ static std::string llama_model_arch_name(llm_arch arch) {
1755
+ auto it = LLM_ARCH_NAMES.find(arch);
1756
+ if (it == LLM_ARCH_NAMES.end()) {
1757
+ return "unknown";
1758
+ }
1759
+ return it->second;
1760
+ }
1761
+
1762
+ static std::string llama_model_ftype_name(llama_ftype ftype) {
1591
1763
  if (ftype & LLAMA_FTYPE_GUESSED) {
1592
1764
  return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
1593
1765
  }
@@ -1643,10 +1815,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
1643
1815
 
1644
1816
  static void llm_load_hparams(
1645
1817
  llama_model_loader & ml,
1646
- llama_model & model,
1647
- int n_ctx,
1648
- float rope_freq_base,
1649
- float rope_freq_scale) {
1818
+ llama_model & model) {
1650
1819
  struct gguf_context * ctx = ml.ctx_gguf;
1651
1820
 
1652
1821
  const auto kv = LLM_KV(model.arch);
@@ -1657,29 +1826,25 @@ static void llm_load_hparams(
1657
1826
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
1658
1827
 
1659
1828
  // get hparams kv
1660
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
1661
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
1662
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
1663
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
1664
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
1665
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
1829
+ GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
1830
+ GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
1831
+ GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
1832
+ GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
1833
+ GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
1834
+ GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
1666
1835
 
1667
1836
  // n_head_kv is optional, default to n_head
1668
1837
  hparams.n_head_kv = hparams.n_head;
1669
1838
  GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
1670
1839
 
1671
1840
  // rope_freq_base (optional)
1672
- if (rope_freq_base == 0.0f) {
1673
- rope_freq_base = 10000.0f;
1674
- GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1675
- }
1841
+ hparams.rope_freq_base_train = 10000.0f;
1842
+ GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1676
1843
 
1677
1844
  // rope_freq_scale (inverse of the kv) is optional
1678
- if (rope_freq_scale == 0.0f) {
1679
- float ropescale = 1.0f;
1680
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1681
- rope_freq_scale = 1.0f/ropescale;
1682
- }
1845
+ float ropescale = 1.0f;
1846
+ GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1847
+ hparams.rope_freq_scale_train = 1.0f/ropescale;
1683
1848
 
1684
1849
  // sanity check for n_rot (optional)
1685
1850
  {
@@ -1743,13 +1908,9 @@ static void llm_load_hparams(
1743
1908
  }
1744
1909
  } break;
1745
1910
  default: (void)0;
1746
- };
1911
+ }
1747
1912
 
1748
1913
  model.ftype = ml.ftype;
1749
-
1750
- hparams.n_ctx = n_ctx;
1751
- hparams.rope_freq_base = rope_freq_base;
1752
- hparams.rope_freq_scale = rope_freq_scale;
1753
1914
  }
1754
1915
 
1755
1916
  // TODO: This should probably be in llama.h
@@ -1770,20 +1931,18 @@ static void llm_load_vocab(
1770
1931
  throw std::runtime_error("cannot find tokenizer vocab in model file\n");
1771
1932
  }
1772
1933
 
1934
+ const float * scores = nullptr;
1773
1935
  const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
1774
- if (score_idx == -1) {
1775
- throw std::runtime_error("cannot find tokenizer scores in model file\n");
1936
+ if (score_idx != -1) {
1937
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1776
1938
  }
1777
1939
 
1778
- const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1779
-
1940
+ const int * toktypes = nullptr;
1780
1941
  const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
1781
- if (toktype_idx == -1) {
1782
- throw std::runtime_error("cannot find token type list in GGUF file\n");
1942
+ if (toktype_idx != -1) {
1943
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1783
1944
  }
1784
1945
 
1785
- const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1786
-
1787
1946
  // determine vocab type
1788
1947
  {
1789
1948
  std::string tokenizer_name;
@@ -1851,8 +2010,8 @@ static void llm_load_vocab(
1851
2010
 
1852
2011
  auto & token_data = vocab.id_to_token[i];
1853
2012
  token_data.text = std::move(word);
1854
- token_data.score = scores[i];
1855
- token_data.type = (llama_token_type) toktypes[i];
2013
+ token_data.score = scores ? scores[i] : 0.0f;
2014
+ token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
1856
2015
  }
1857
2016
 
1858
2017
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -1875,31 +2034,30 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1875
2034
  const auto & vocab = model.vocab;
1876
2035
 
1877
2036
  // hparams
1878
- LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
1879
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
1880
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
1881
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1882
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
1883
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
1884
- LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1885
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
1886
- LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
1887
- LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1888
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
1889
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1890
- LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1891
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
1892
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
1893
- LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
1894
- LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1895
- LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1896
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
1897
- LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
1898
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
2037
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
2038
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
2039
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
2040
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
2041
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
2042
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
2043
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
2044
+ LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
2045
+ LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
2046
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
2047
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
2048
+ LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2049
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2050
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2051
+ LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2052
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2053
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
2054
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
2055
+ LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
2056
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
1899
2057
  if (ml.n_bytes < GB) {
1900
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2058
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1901
2059
  } else {
1902
- LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2060
+ LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1903
2061
  }
1904
2062
 
1905
2063
  // general kv
@@ -1917,13 +2075,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1917
2075
  static void llm_load_tensors(
1918
2076
  llama_model_loader & ml,
1919
2077
  llama_model & model,
1920
- int n_batch,
1921
2078
  int n_gpu_layers,
1922
2079
  int main_gpu,
1923
2080
  const float * tensor_split,
1924
- const bool mul_mat_q,
1925
- bool low_vram,
1926
- ggml_type memory_type,
1927
2081
  bool use_mlock,
1928
2082
  llama_progress_callback progress_callback,
1929
2083
  void * progress_callback_user_data) {
@@ -1962,11 +2116,9 @@ static void llm_load_tensors(
1962
2116
  }
1963
2117
 
1964
2118
  (void) main_gpu;
1965
- (void) mul_mat_q;
1966
- #if defined(GGML_USE_CUBLAS)
2119
+ #ifdef GGML_USE_CUBLAS
1967
2120
  LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
1968
2121
  ggml_cuda_set_main_device(main_gpu);
1969
- ggml_cuda_set_mul_mat_q(mul_mat_q);
1970
2122
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1971
2123
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1972
2124
  #elif defined(GGML_USE_CLBLAST)
@@ -2001,9 +2153,9 @@ static void llm_load_tensors(
2001
2153
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2002
2154
  // on Windows however this is detrimental unless everything is on the GPU
2003
2155
  #ifndef _WIN32
2004
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2156
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2005
2157
  #else
2006
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2158
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2007
2159
  #endif // _WIN32
2008
2160
 
2009
2161
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2067,9 +2219,9 @@ static void llm_load_tensors(
2067
2219
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2068
2220
  // on Windows however this is detrimental unless everything is on the GPU
2069
2221
  #ifndef _WIN32
2070
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2222
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2071
2223
  #else
2072
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2224
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2073
2225
  #endif // _WIN32
2074
2226
 
2075
2227
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2137,9 +2289,9 @@ static void llm_load_tensors(
2137
2289
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2138
2290
  // on Windows however this is detrimental unless everything is on the GPU
2139
2291
  #ifndef _WIN32
2140
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2292
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2141
2293
  #else
2142
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2294
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2143
2295
  #endif // _WIN32
2144
2296
 
2145
2297
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2214,9 +2366,9 @@ static void llm_load_tensors(
2214
2366
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2215
2367
  // on Windows however this is detrimental unless everything is on the GPU
2216
2368
  #ifndef _WIN32
2217
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2369
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2218
2370
  #else
2219
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2371
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2220
2372
  #endif // _WIN32
2221
2373
 
2222
2374
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2281,27 +2433,19 @@ static void llm_load_tensors(
2281
2433
  } break;
2282
2434
  default:
2283
2435
  throw std::runtime_error("unknown architecture");
2284
- };
2436
+ }
2285
2437
  }
2286
2438
 
2287
2439
  ml.done_getting_tensors();
2288
2440
 
2289
2441
  // print memory requirements
2290
2442
  {
2291
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
2292
-
2293
2443
  // this is the total memory required to run the inference
2294
2444
  size_t mem_required =
2295
2445
  ctx_size +
2296
2446
  mmapped_size - vram_weights; // weights in VRAM not in memory
2297
2447
 
2298
- // this is the memory required by one llama_state
2299
- const size_t mem_required_state = scale*hparams.kv_size();
2300
-
2301
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
2302
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
2303
-
2304
- (void) n_batch;
2448
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2305
2449
 
2306
2450
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2307
2451
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2310,36 +2454,17 @@ static void llm_load_tensors(
2310
2454
  if (n_gpu_layers > (int) hparams.n_layer) {
2311
2455
  LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2312
2456
  }
2313
- size_t vram_kv_cache = 0;
2314
2457
 
2315
2458
  #ifdef GGML_USE_CUBLAS
2316
2459
  const int max_backend_supported_layers = hparams.n_layer + 3;
2317
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
2318
- if (n_gpu_layers > (int) hparams.n_layer + 1) {
2319
- if (low_vram) {
2320
- LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
2321
- } else {
2322
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
2323
- vram_kv_cache += hparams.kv_size() / 2;
2324
- }
2325
- }
2326
- if (n_gpu_layers > (int) hparams.n_layer + 2) {
2327
- if (low_vram) {
2328
- LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
2329
- } else {
2330
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
2331
- vram_kv_cache += hparams.kv_size() / 2;
2332
- }
2333
- }
2460
+ const int max_offloadable_layers = hparams.n_layer + 3;
2334
2461
  #elif defined(GGML_USE_CLBLAST)
2335
2462
  const int max_backend_supported_layers = hparams.n_layer + 1;
2336
2463
  const int max_offloadable_layers = hparams.n_layer + 1;
2337
2464
  #endif // GGML_USE_CUBLAS
2338
2465
 
2339
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
2340
- __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2341
- LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
2342
- __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
2466
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2467
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2343
2468
  #else
2344
2469
  (void) n_gpu_layers;
2345
2470
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2352,7 +2477,7 @@ static void llm_load_tensors(
2352
2477
  }
2353
2478
 
2354
2479
  (void) tensor_split;
2355
- #if defined(GGML_USE_CUBLAS)
2480
+ #ifdef GGML_USE_CUBLAS
2356
2481
  {
2357
2482
  ggml_cuda_set_tensor_split(tensor_split);
2358
2483
  }
@@ -2374,29 +2499,24 @@ static void llm_load_tensors(
2374
2499
  static bool llama_model_load(
2375
2500
  const std::string & fname,
2376
2501
  llama_model & model,
2377
- int n_ctx,
2378
- int n_batch,
2379
2502
  int n_gpu_layers,
2380
2503
  int main_gpu,
2381
2504
  const float * tensor_split,
2382
- const bool mul_mat_q,
2383
- float rope_freq_base,
2384
- float rope_freq_scale,
2385
- bool low_vram,
2386
- ggml_type memory_type,
2387
2505
  bool use_mmap,
2388
2506
  bool use_mlock,
2389
2507
  bool vocab_only,
2390
2508
  llama_progress_callback progress_callback,
2391
2509
  void *progress_callback_user_data) {
2392
2510
  try {
2393
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
2511
+ llama_model_loader ml(fname, use_mmap);
2512
+
2513
+ model.hparams.vocab_only = vocab_only;
2394
2514
 
2395
- llm_load_arch (*ml, model);
2396
- llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
2397
- llm_load_vocab (*ml, model);
2515
+ llm_load_arch (ml, model);
2516
+ llm_load_hparams(ml, model);
2517
+ llm_load_vocab (ml, model);
2398
2518
 
2399
- llm_load_print_meta(*ml, model);
2519
+ llm_load_print_meta(ml, model);
2400
2520
 
2401
2521
  if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2402
2522
  throw std::runtime_error("vocab size mismatch");
@@ -2408,8 +2528,8 @@ static bool llama_model_load(
2408
2528
  }
2409
2529
 
2410
2530
  llm_load_tensors(
2411
- *ml, model, n_batch, n_gpu_layers,
2412
- main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
2531
+ ml, model, n_gpu_layers,
2532
+ main_gpu, tensor_split,
2413
2533
  use_mlock, progress_callback, progress_callback_user_data);
2414
2534
  } catch (const std::exception & err) {
2415
2535
  LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2421,17 +2541,10 @@ static bool llama_model_load(
2421
2541
 
2422
2542
  static struct ggml_cgraph * llm_build_llama(
2423
2543
  llama_context & lctx,
2424
- const llama_token * tokens,
2425
- const float * embd,
2426
- int n_tokens,
2427
- int n_past) {
2428
-
2429
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2430
-
2431
- const int N = n_tokens;
2432
-
2544
+ const llama_batch & batch) {
2433
2545
  const auto & model = lctx.model;
2434
2546
  const auto & hparams = model.hparams;
2547
+ const auto & cparams = lctx.cparams;
2435
2548
 
2436
2549
  const auto & kv_self = lctx.kv_self;
2437
2550
 
@@ -2439,7 +2552,7 @@ static struct ggml_cgraph * llm_build_llama(
2439
2552
 
2440
2553
  const int64_t n_embd = hparams.n_embd;
2441
2554
  const int64_t n_layer = hparams.n_layer;
2442
- const int64_t n_ctx = hparams.n_ctx;
2555
+ const int64_t n_ctx = cparams.n_ctx;
2443
2556
  const int64_t n_head = hparams.n_head;
2444
2557
  const int64_t n_head_kv = hparams.n_head_kv;
2445
2558
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2447,12 +2560,20 @@ static struct ggml_cgraph * llm_build_llama(
2447
2560
 
2448
2561
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2449
2562
 
2450
- const float freq_base = hparams.rope_freq_base;
2451
- const float freq_scale = hparams.rope_freq_scale;
2563
+ const float freq_base = cparams.rope_freq_base;
2564
+ const float freq_scale = cparams.rope_freq_scale;
2452
2565
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2453
2566
 
2454
2567
  const int n_gpu_layers = model.n_gpu_layers;
2455
2568
 
2569
+ const int32_t n_tokens = batch.n_tokens;
2570
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
2571
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
2572
+
2573
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
2574
+
2575
+ //printf("n_kv = %d\n", n_kv);
2576
+
2456
2577
  auto & buf_compute = lctx.buf_compute;
2457
2578
 
2458
2579
  struct ggml_init_params params = {
@@ -2470,12 +2591,12 @@ static struct ggml_cgraph * llm_build_llama(
2470
2591
  struct ggml_tensor * cur;
2471
2592
  struct ggml_tensor * inpL;
2472
2593
 
2473
- if (tokens) {
2474
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2594
+ if (batch.token) {
2595
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2475
2596
 
2476
2597
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2477
2598
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2478
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2599
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2479
2600
  }
2480
2601
  ggml_set_name(inp_tokens, "inp_tokens");
2481
2602
 
@@ -2485,11 +2606,11 @@ static struct ggml_cgraph * llm_build_llama(
2485
2606
  GGML_ASSERT(false && "not implemented");
2486
2607
  #endif
2487
2608
 
2488
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2609
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2489
2610
 
2490
2611
  ggml_allocr_alloc(lctx.alloc, inpL);
2491
2612
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2492
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
2613
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2493
2614
  }
2494
2615
  }
2495
2616
 
@@ -2498,9 +2619,6 @@ static struct ggml_cgraph * llm_build_llama(
2498
2619
 
2499
2620
  // offload functions set the tensor output backend to GPU
2500
2621
  // tensors are GPU-accelerated if any input or the output has been offloaded
2501
- //
2502
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2503
- // in that case ggml_cuda_assign_buffers has no effect
2504
2622
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2505
2623
  offload_func_t offload_func_kq = llama_nop;
2506
2624
  offload_func_t offload_func_v = llama_nop;
@@ -2517,12 +2635,75 @@ static struct ggml_cgraph * llm_build_llama(
2517
2635
  }
2518
2636
  #endif // GGML_USE_CUBLAS
2519
2637
 
2638
+ // KQ_scale
2520
2639
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2640
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2521
2641
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2522
2642
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2523
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2643
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
2644
+ }
2645
+
2646
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2647
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2648
+ offload_func_kq(KQ_mask);
2649
+ ggml_set_name(KQ_mask, "KQ_mask");
2650
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
2651
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2652
+ float * data = (float *) KQ_mask->data;
2653
+ memset(data, 0, ggml_nbytes(KQ_mask));
2654
+
2655
+ for (int h = 0; h < 1; ++h) {
2656
+ for (int j = 0; j < n_tokens; ++j) {
2657
+ const llama_pos pos = batch.pos[j];
2658
+ const llama_seq_id seq_id = batch.seq_id[j];
2659
+
2660
+ for (int i = 0; i < n_kv; ++i) {
2661
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
2662
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
2663
+ }
2664
+ }
2665
+ }
2666
+ }
2667
+ }
2668
+
2669
+ // KQ_pos - contains the positions
2670
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2671
+ offload_func_kq(KQ_pos);
2672
+ ggml_set_name(KQ_pos, "KQ_pos");
2673
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
2674
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2675
+ int * data = (int *) KQ_pos->data;
2676
+ for (int i = 0; i < n_tokens; ++i) {
2677
+ data[i] = batch.pos[i];
2678
+ }
2679
+ }
2680
+
2681
+ // shift the entire K-cache if needed
2682
+ if (do_rope_shift) {
2683
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
2684
+ offload_func_kq(K_shift);
2685
+ ggml_set_name(K_shift, "K_shift");
2686
+ ggml_allocr_alloc(lctx.alloc, K_shift);
2687
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2688
+ int * data = (int *) K_shift->data;
2689
+ for (int i = 0; i < n_ctx; ++i) {
2690
+ data[i] = kv_self.cells[i].delta;
2691
+ }
2692
+ }
2693
+
2694
+ for (int il = 0; il < n_layer; ++il) {
2695
+ struct ggml_tensor * tmp =
2696
+ ggml_rope_custom_inplace(ctx0,
2697
+ ggml_view_3d(ctx0, kv_self.k,
2698
+ n_embd_head, n_head_kv, n_ctx,
2699
+ ggml_element_size(kv_self.k)*n_embd_head,
2700
+ ggml_element_size(kv_self.k)*n_embd_gqa,
2701
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
2702
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
2703
+ offload_func_kq(tmp);
2704
+ ggml_build_forward_expand(gf, tmp);
2705
+ }
2524
2706
  }
2525
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2526
2707
 
2527
2708
  for (int il = 0; il < n_layer; ++il) {
2528
2709
  ggml_format_name(inpL, "layer_inp_%d", il);
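
The KQ_mask tensor built above replaces the previous ggml_diag_mask_inf_inplace call: instead of masking by n_past, each query token gets an additive mask row with -INFINITY for every KV cell it must not attend to (a cell of another sequence, or a strictly later position). A standalone sketch of that masking rule using plain arrays (no ggml), with made-up batch contents:

    // Additive causal/sequence mask: -inf entries vanish in the subsequent softmax.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_tokens = 3;                               // tokens in the current batch
        const int n_kv     = 5;                               // KV cells visible to this batch
        const std::vector<int> batch_pos    = {2, 3, 4};      // position of each batch token
        const std::vector<int> batch_seq_id = {0, 0, 0};      // sequence of each batch token
        const std::vector<int> cell_pos     = {0, 1, 2, 3, 4};  // position stored in each cell
        const std::vector<int> cell_seq_id  = {0, 0, 0, 0, 0};  // sequence stored in each cell

        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                // a token may only attend to cells of its own sequence at positions <= its own
                if (cell_seq_id[i] != batch_seq_id[j] || cell_pos[i] > batch_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }

        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                printf("%s", std::isinf(mask[j*n_kv + i]) ? "  -inf" : "   0.0");
            }
            printf("\n");
        }
        return 0;
    }
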
@@ -2560,33 +2741,33 @@ static struct ggml_cgraph * llm_build_llama(
2560
2741
  offload_func_kq(tmpq);
2561
2742
  ggml_set_name(tmpq, "tmpq");
2562
2743
 
2563
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2744
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2564
2745
  offload_func_kq(Kcur);
2565
2746
  ggml_set_name(Kcur, "Kcur");
2566
2747
 
2567
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2748
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2568
2749
  offload_func_kq(Qcur);
2569
2750
  ggml_set_name(Qcur, "Qcur");
2570
2751
 
2571
2752
  // store key and value to memory
2572
2753
  {
2573
- // compute the transposed [N, n_embd] V matrix
2754
+ // compute the transposed [n_tokens, n_embd] V matrix
2574
2755
 
2575
2756
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2576
2757
  offload_func_v(tmpv);
2577
2758
  ggml_set_name(tmpv, "tmpv");
2578
2759
 
2579
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2760
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2580
2761
  offload_func_v(Vcur);
2581
2762
  ggml_set_name(Vcur, "Vcur");
2582
2763
 
2583
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2764
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2584
2765
  offload_func_kq(k);
2585
2766
  ggml_set_name(k, "k");
2586
2767
 
2587
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2768
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2588
2769
  ( n_ctx)*ggml_element_size(kv_self.v),
2589
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2770
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2590
2771
  offload_func_v(v);
2591
2772
  ggml_set_name(v, "v");
2592
2773
 
@@ -2601,7 +2782,7 @@ static struct ggml_cgraph * llm_build_llama(
2601
2782
 
2602
2783
  struct ggml_tensor * K =
2603
2784
  ggml_view_3d(ctx0, kv_self.k,
2604
- n_embd_head, n_past + N, n_head_kv,
2785
+ n_embd_head, n_kv, n_head_kv,
2605
2786
  ggml_element_size(kv_self.k)*n_embd_gqa,
2606
2787
  ggml_element_size(kv_self.k)*n_embd_head,
2607
2788
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2614,25 +2795,25 @@ static struct ggml_cgraph * llm_build_llama(
2614
2795
  ggml_set_name(KQ, "KQ");
2615
2796
 
2616
2797
  // KQ_scaled = KQ / sqrt(n_embd_head)
2617
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2618
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2798
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
2799
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2619
2800
  offload_func_kq(KQ_scaled);
2620
2801
  ggml_set_name(KQ_scaled, "KQ_scaled");
2621
2802
 
2622
2803
  // KQ_masked = mask_past(KQ_scaled)
2623
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2804
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2624
2805
  offload_func_kq(KQ_masked);
2625
2806
  ggml_set_name(KQ_masked, "KQ_masked");
2626
2807
 
2627
2808
  // KQ = soft_max(KQ_masked)
2628
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2809
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2629
2810
  offload_func_v(KQ_soft_max);
2630
2811
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2631
2812
 
2632
2813
  // split cached V into n_head heads
2633
2814
  struct ggml_tensor * V =
2634
2815
  ggml_view_3d(ctx0, kv_self.v,
2635
- n_past + N, n_embd_head, n_head_kv,
2816
+ n_kv, n_embd_head, n_head_kv,
2636
2817
  ggml_element_size(kv_self.v)*n_ctx,
2637
2818
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2638
2819
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -2647,7 +2828,7 @@ static struct ggml_cgraph * llm_build_llama(
2647
2828
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2648
2829
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2649
2830
  // is there a better way?
2650
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2831
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
2651
2832
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2652
2833
  #endif
2653
2834
 
@@ -2656,10 +2837,8 @@ static struct ggml_cgraph * llm_build_llama(
2656
2837
  offload_func_v(KQV_merged);
2657
2838
  ggml_set_name(KQV_merged, "KQV_merged");
2658
2839
 
2659
- // cur = KQV_merged.contiguous().view(n_embd, N)
2660
- cur = ggml_cpy(ctx0,
2661
- KQV_merged,
2662
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2840
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
2841
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
2663
2842
  offload_func_v(cur);
2664
2843
  ggml_set_name(cur, "KQV_merged_contiguous");
2665
2844
 
@@ -2750,20 +2929,12 @@ static struct ggml_cgraph * llm_build_llama(
2750
2929
  return gf;
2751
2930
  }
2752
2931
 
2753
-
2754
2932
  static struct ggml_cgraph * llm_build_baichaun(
2755
2933
  llama_context & lctx,
2756
- const llama_token * tokens,
2757
- const float * embd,
2758
- int n_tokens,
2759
- int n_past) {
2760
-
2761
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2762
-
2763
- const int N = n_tokens;
2764
-
2934
+ const llama_batch & batch) {
2765
2935
  const auto & model = lctx.model;
2766
2936
  const auto & hparams = model.hparams;
2937
+ const auto & cparams = lctx.cparams;
2767
2938
 
2768
2939
  const auto & kv_self = lctx.kv_self;
2769
2940
 
@@ -2771,7 +2942,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2771
2942
 
2772
2943
  const int64_t n_embd = hparams.n_embd;
2773
2944
  const int64_t n_layer = hparams.n_layer;
2774
- const int64_t n_ctx = hparams.n_ctx;
2945
+ const int64_t n_ctx = cparams.n_ctx;
2775
2946
  const int64_t n_head = hparams.n_head;
2776
2947
  const int64_t n_head_kv = hparams.n_head_kv;
2777
2948
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2779,12 +2950,18 @@ static struct ggml_cgraph * llm_build_baichaun(
2779
2950
 
2780
2951
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2781
2952
 
2782
- const float freq_base = hparams.rope_freq_base;
2783
- const float freq_scale = hparams.rope_freq_scale;
2953
+ const float freq_base = cparams.rope_freq_base;
2954
+ const float freq_scale = cparams.rope_freq_scale;
2784
2955
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2785
2956
 
2786
2957
  const int n_gpu_layers = model.n_gpu_layers;
2787
2958
 
2959
+ const int32_t n_tokens = batch.n_tokens;
2960
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
2961
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
2962
+
2963
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
2964
+
2788
2965
  auto & buf_compute = lctx.buf_compute;
2789
2966
 
2790
2967
  struct ggml_init_params params = {
@@ -2802,12 +2979,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2802
2979
  struct ggml_tensor * cur;
2803
2980
  struct ggml_tensor * inpL;
2804
2981
 
2805
- if (tokens) {
2806
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2982
+ if (batch.token) {
2983
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2807
2984
 
2808
2985
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2809
2986
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2810
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2987
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2811
2988
  }
2812
2989
  ggml_set_name(inp_tokens, "inp_tokens");
2813
2990
 
@@ -2817,11 +2994,11 @@ static struct ggml_cgraph * llm_build_baichaun(
2817
2994
  GGML_ASSERT(false && "not implemented");
2818
2995
  #endif
2819
2996
 
2820
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2997
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2821
2998
 
2822
2999
  ggml_allocr_alloc(lctx.alloc, inpL);
2823
3000
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2824
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3001
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2825
3002
  }
2826
3003
  }
2827
3004
 
@@ -2830,9 +3007,6 @@ static struct ggml_cgraph * llm_build_baichaun(
2830
3007
 
2831
3008
  // offload functions set the tensor output backend to GPU
2832
3009
  // tensors are GPU-accelerated if any input or the output has been offloaded
2833
- //
2834
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2835
- // in that case ggml_cuda_assign_buffers has no effect
2836
3010
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2837
3011
  offload_func_t offload_func_kq = llama_nop;
2838
3012
  offload_func_t offload_func_v = llama_nop;
@@ -2849,12 +3023,75 @@ static struct ggml_cgraph * llm_build_baichaun(
2849
3023
  }
2850
3024
  #endif // GGML_USE_CUBLAS
2851
3025
 
3026
+ // KQ_scale
2852
3027
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3028
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2853
3029
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2854
3030
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2855
3031
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2856
3032
  }
2857
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3033
+
3034
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3035
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3036
+ offload_func_kq(KQ_mask);
3037
+ ggml_set_name(KQ_mask, "KQ_mask");
3038
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3039
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3040
+ float * data = (float *) KQ_mask->data;
3041
+ memset(data, 0, ggml_nbytes(KQ_mask));
3042
+
3043
+ for (int h = 0; h < 1; ++h) {
3044
+ for (int j = 0; j < n_tokens; ++j) {
3045
+ const llama_pos pos = batch.pos[j];
3046
+ const llama_seq_id seq_id = batch.seq_id[j];
3047
+
3048
+ for (int i = 0; i < n_kv; ++i) {
3049
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3050
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3051
+ }
3052
+ }
3053
+ }
3054
+ }
3055
+ }
3056
+
3057
+ // KQ_pos - contains the positions
3058
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3059
+ offload_func_kq(KQ_pos);
3060
+ ggml_set_name(KQ_pos, "KQ_pos");
3061
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3062
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3063
+ int * data = (int *) KQ_pos->data;
3064
+ for (int i = 0; i < n_tokens; ++i) {
3065
+ data[i] = batch.pos[i];
3066
+ }
3067
+ }
3068
+
3069
+ // shift the entire K-cache if needed
3070
+ if (do_rope_shift) {
3071
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3072
+ offload_func_kq(K_shift);
3073
+ ggml_set_name(K_shift, "K_shift");
3074
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3075
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3076
+ int * data = (int *) K_shift->data;
3077
+ for (int i = 0; i < n_ctx; ++i) {
3078
+ data[i] = kv_self.cells[i].delta;
3079
+ }
3080
+ }
3081
+
3082
+ for (int il = 0; il < n_layer; ++il) {
3083
+ struct ggml_tensor * tmp =
3084
+ ggml_rope_custom_inplace(ctx0,
3085
+ ggml_view_3d(ctx0, kv_self.k,
3086
+ n_embd_head, n_head_kv, n_ctx,
3087
+ ggml_element_size(kv_self.k)*n_embd_head,
3088
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3089
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3090
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
3091
+ offload_func_kq(tmp);
3092
+ ggml_build_forward_expand(gf, tmp);
3093
+ }
3094
+ }
2858
3095
 
2859
3096
  for (int il = 0; il < n_layer; ++il) {
2860
3097
  ggml_format_name(inpL, "layer_inp_%d", il);
@@ -2896,12 +3133,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2896
3133
  struct ggml_tensor * Qcur;
2897
3134
  switch (model.type) {
2898
3135
  case MODEL_7B:
2899
- Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2900
- Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
3136
+ Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
3137
+ Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2901
3138
  break;
2902
3139
  case MODEL_13B:
2903
- Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2904
- Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
3140
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
3141
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
2905
3142
  break;
2906
3143
  default:
2907
3144
  GGML_ASSERT(false);
@@ -2915,23 +3152,23 @@ static struct ggml_cgraph * llm_build_baichaun(
2915
3152
 
2916
3153
  // store key and value to memory
2917
3154
  {
2918
- // compute the transposed [N, n_embd] V matrix
3155
+ // compute the transposed [n_tokens, n_embd] V matrix
2919
3156
 
2920
3157
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2921
3158
  offload_func_v(tmpv);
2922
3159
  ggml_set_name(tmpv, "tmpv");
2923
3160
 
2924
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
3161
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2925
3162
  offload_func_v(Vcur);
2926
3163
  ggml_set_name(Vcur, "Vcur");
2927
3164
 
2928
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3165
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2929
3166
  offload_func_kq(k);
2930
3167
  ggml_set_name(k, "k");
2931
3168
 
2932
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3169
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2933
3170
  ( n_ctx)*ggml_element_size(kv_self.v),
2934
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3171
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2935
3172
  offload_func_v(v);
2936
3173
  ggml_set_name(v, "v");
2937
3174
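
The K/V stores above are now offset by kv_head, the first free cache slot found for the batch, instead of n_past, which decouples the physical cache layout from the logical token positions. A rough sketch of the offset arithmetic behind the 1-D K view, using hypothetical helper names that mirror the parameters in the view call:

#include <cstddef>

// byte offset of the first K element written for layer `il`, assuming the
// layout used above: n_embd_gqa elements per cell, n_ctx cells per layer
static size_t k_write_offset(size_t elem_size, int n_embd_gqa, int n_ctx, int il, int kv_head) {
    return elem_size * (size_t) n_embd_gqa * ((size_t) il*n_ctx + kv_head);
}

// number of elements written for the current batch
static size_t k_write_count(int n_tokens, int n_embd_gqa) {
    return (size_t) n_tokens * n_embd_gqa;
}
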
 
@@ -2946,7 +3183,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2946
3183
 
2947
3184
  struct ggml_tensor * K =
2948
3185
  ggml_view_3d(ctx0, kv_self.k,
2949
- n_embd_head, n_past + N, n_head_kv,
3186
+ n_embd_head, n_kv, n_head_kv,
2950
3187
  ggml_element_size(kv_self.k)*n_embd_gqa,
2951
3188
  ggml_element_size(kv_self.k)*n_embd_head,
2952
3189
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2959,8 +3196,8 @@ static struct ggml_cgraph * llm_build_baichaun(
2959
3196
  ggml_set_name(KQ, "KQ");
2960
3197
 
2961
3198
  // KQ_scaled = KQ / sqrt(n_embd_head)
2962
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2963
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3199
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3200
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2964
3201
  offload_func_kq(KQ_scaled);
2965
3202
  ggml_set_name(KQ_scaled, "KQ_scaled");
2966
3203
 
@@ -2969,58 +3206,44 @@ static struct ggml_cgraph * llm_build_baichaun(
2969
3206
 
2970
3207
  switch (model.type) {
2971
3208
  case MODEL_7B:
2972
- KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3209
+ KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2973
3210
  break;
2974
3211
  case MODEL_13B:
2975
- KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
3212
+ // TODO: replace with ggml_add()
3213
+ KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
2976
3214
  ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2977
- KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
3215
+ KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
2978
3216
  break;
2979
3217
  default:
2980
3218
  GGML_ASSERT(false);
2981
3219
  }
2982
- // KQ_masked = mask_past(KQ_scaled)
2983
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2984
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2985
- // offload_func_kq(KQ_masked);
2986
- // ggml_set_name(KQ_masked, "KQ_masked");
2987
3220
 
2988
3221
  // KQ = soft_max(KQ_masked)
2989
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3222
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2990
3223
  offload_func_v(KQ_soft_max);
2991
3224
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2992
3225
 
2993
3226
  // split cached V into n_head heads
2994
3227
  struct ggml_tensor * V =
2995
3228
  ggml_view_3d(ctx0, kv_self.v,
2996
- n_past + N, n_embd_head, n_head_kv,
3229
+ n_kv, n_embd_head, n_head_kv,
2997
3230
  ggml_element_size(kv_self.v)*n_ctx,
2998
3231
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2999
3232
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3000
3233
  offload_func_v(V);
3001
3234
  ggml_set_name(V, "V");
3002
3235
 
3003
- #if 1
3004
3236
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3005
3237
  offload_func_v(KQV);
3006
3238
  ggml_set_name(KQV, "KQV");
3007
- #else
3008
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3009
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3010
- // is there a better way?
3011
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
3012
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
3013
- #endif
3014
3239
 
3015
3240
  // KQV_merged = KQV.permute(0, 2, 1, 3)
3016
3241
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3017
3242
  offload_func_v(KQV_merged);
3018
3243
  ggml_set_name(KQV_merged, "KQV_merged");
3019
3244
 
3020
- // cur = KQV_merged.contiguous().view(n_embd, N)
3021
- cur = ggml_cpy(ctx0,
3022
- KQV_merged,
3023
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3245
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3246
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3024
3247
  offload_func_v(cur);
3025
3248
  ggml_set_name(cur, "KQV_merged_contiguous");
3026
3249
 
@@ -3113,17 +3336,10 @@ static struct ggml_cgraph * llm_build_baichaun(
3113
3336
 
3114
3337
  static struct ggml_cgraph * llm_build_falcon(
3115
3338
  llama_context & lctx,
3116
- const llama_token * tokens,
3117
- const float * embd,
3118
- int n_tokens,
3119
- int n_past) {
3120
-
3121
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3122
-
3123
- const int N = n_tokens;
3124
-
3339
+ const llama_batch & batch) {
3125
3340
  const auto & model = lctx.model;
3126
3341
  const auto & hparams = model.hparams;
3342
+ const auto & cparams = lctx.cparams;
3127
3343
 
3128
3344
  const auto & kv_self = lctx.kv_self;
3129
3345
 
@@ -3131,7 +3347,7 @@ static struct ggml_cgraph * llm_build_falcon(
3131
3347
 
3132
3348
  const int64_t n_embd = hparams.n_embd;
3133
3349
  const int64_t n_layer = hparams.n_layer;
3134
- const int64_t n_ctx = hparams.n_ctx;
3350
+ const int64_t n_ctx = cparams.n_ctx;
3135
3351
  const int64_t n_head = hparams.n_head;
3136
3352
  const int64_t n_head_kv = hparams.n_head_kv;
3137
3353
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -3139,12 +3355,21 @@ static struct ggml_cgraph * llm_build_falcon(
3139
3355
 
3140
3356
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3141
3357
 
3142
- const float freq_base = hparams.rope_freq_base;
3143
- const float freq_scale = hparams.rope_freq_scale;
3358
+ const float freq_base = cparams.rope_freq_base;
3359
+ const float freq_scale = cparams.rope_freq_scale;
3144
3360
  const float norm_eps = hparams.f_norm_eps;
3145
3361
 
3146
3362
  const int n_gpu_layers = model.n_gpu_layers;
3147
3363
 
3364
+ const int32_t n_tokens = batch.n_tokens;
3365
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3366
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3367
+
3368
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3369
+
3370
+ //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
3371
+ // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
3372
+
3148
3373
  auto & buf_compute = lctx.buf_compute;
3149
3374
 
3150
3375
  struct ggml_init_params params = {
@@ -3162,12 +3387,12 @@ static struct ggml_cgraph * llm_build_falcon(
3162
3387
  struct ggml_tensor * cur;
3163
3388
  struct ggml_tensor * inpL;
3164
3389
 
3165
- if (tokens) {
3166
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3390
+ if (batch.token) {
3391
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3167
3392
 
3168
3393
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3169
3394
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3170
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3395
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3171
3396
  }
3172
3397
  ggml_set_name(inp_tokens, "inp_tokens");
3173
3398
 
@@ -3177,11 +3402,11 @@ static struct ggml_cgraph * llm_build_falcon(
3177
3402
  GGML_ASSERT(false && "not implemented");
3178
3403
  #endif
3179
3404
 
3180
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3405
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3181
3406
 
3182
3407
  ggml_allocr_alloc(lctx.alloc, inpL);
3183
3408
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3184
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3409
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3185
3410
  }
3186
3411
  }
3187
3412
 
@@ -3190,9 +3415,6 @@ static struct ggml_cgraph * llm_build_falcon(
3190
3415
 
3191
3416
  // offload functions set the tensor output backend to GPU
3192
3417
  // tensors are GPU-accelerated if any input or the output has been offloaded
3193
- //
3194
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
3195
- // in that case ggml_cuda_assign_buffers has no effect
3196
3418
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3197
3419
  offload_func_t offload_func_kq = llama_nop;
3198
3420
  offload_func_t offload_func_v = llama_nop;
@@ -3209,12 +3431,75 @@ static struct ggml_cgraph * llm_build_falcon(
3209
3431
  }
3210
3432
  #endif // GGML_USE_CUBLAS
3211
3433
 
3434
+ // KQ_scale
3212
3435
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3436
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3213
3437
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3214
3438
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3215
3439
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3216
3440
  }
3217
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3441
+
3442
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3443
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3444
+ offload_func_kq(KQ_mask);
3445
+ ggml_set_name(KQ_mask, "KQ_mask");
3446
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3447
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3448
+ float * data = (float *) KQ_mask->data;
3449
+ memset(data, 0, ggml_nbytes(KQ_mask));
3450
+
3451
+ for (int h = 0; h < 1; ++h) {
3452
+ for (int j = 0; j < n_tokens; ++j) {
3453
+ const llama_pos pos = batch.pos[j];
3454
+ const llama_seq_id seq_id = batch.seq_id[j];
3455
+
3456
+ for (int i = 0; i < n_kv; ++i) {
3457
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3458
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3459
+ }
3460
+ }
3461
+ }
3462
+ }
3463
+ }
3464
+
3465
+ // KQ_pos - contains the positions
3466
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3467
+ offload_func_kq(KQ_pos);
3468
+ ggml_set_name(KQ_pos, "KQ_pos");
3469
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3470
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3471
+ int * data = (int *) KQ_pos->data;
3472
+ for (int i = 0; i < n_tokens; ++i) {
3473
+ data[i] = batch.pos[i];
3474
+ }
3475
+ }
3476
+
3477
+ // shift the entire K-cache if needed
3478
+ if (do_rope_shift) {
3479
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3480
+ offload_func_kq(K_shift);
3481
+ ggml_set_name(K_shift, "K_shift");
3482
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3483
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3484
+ int * data = (int *) K_shift->data;
3485
+ for (int i = 0; i < n_ctx; ++i) {
3486
+ data[i] = kv_self.cells[i].delta;
3487
+ }
3488
+ }
3489
+
3490
+ for (int il = 0; il < n_layer; ++il) {
3491
+ struct ggml_tensor * tmp =
3492
+ ggml_rope_custom_inplace(ctx0,
3493
+ ggml_view_3d(ctx0, kv_self.k,
3494
+ n_embd_head, n_head_kv, n_ctx,
3495
+ ggml_element_size(kv_self.k)*n_embd_head,
3496
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3497
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3498
+ K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
3499
+ offload_func_kq(tmp);
3500
+ ggml_build_forward_expand(gf, tmp);
3501
+ }
3502
+ }
3218
3503
 
3219
3504
  for (int il = 0; il < n_layer; ++il) {
3220
3505
  struct ggml_tensor * attn_norm;
@@ -3271,45 +3556,45 @@ static struct ggml_cgraph * llm_build_falcon(
3271
3556
  // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
3272
3557
  // non-contiguous views is added for the rope operator
3273
3558
  struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
3274
- ctx0, cur, n_embd_head, n_head, N,
3559
+ ctx0, cur, n_embd_head, n_head, n_tokens,
3275
3560
  wsize * n_embd_head,
3276
3561
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3277
3562
  0));
3278
3563
  offload_func_kq(tmpq);
3279
3564
 
3280
3565
  struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3281
- ctx0, cur, n_embd_head, n_head_kv, N,
3566
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3282
3567
  wsize * n_embd_head,
3283
3568
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3284
3569
  wsize * n_embd_head * n_head));
3285
3570
  offload_func_kq(tmpk);
3286
3571
 
3287
3572
  struct ggml_tensor * tmpv = ggml_view_3d(
3288
- ctx0, cur, n_embd_head, n_head_kv, N,
3573
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3289
3574
  wsize * n_embd_head,
3290
3575
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3291
3576
  wsize * n_embd_head * (n_head + n_head_kv));
3292
3577
  offload_func_v(tmpv);
3293
3578
 
3294
3579
  // using mode = 2 for neox mode
3295
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3580
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3296
3581
  offload_func_kq(Qcur);
3297
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3582
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3298
3583
  offload_func_kq(Kcur);
3299
3584
 
3300
3585
  {
3301
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
3586
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3302
3587
  offload_func_v(Vcur);
3303
3588
  offload_func_v(Vcur->src[0]->src[0]);
3304
3589
  ggml_set_name(Vcur, "Vcur");
3305
3590
 
3306
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3591
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3307
3592
  offload_func_kq(k);
3308
3593
  ggml_set_name(k, "k");
3309
3594
 
3310
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3595
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3311
3596
  ( n_ctx)*ggml_element_size(kv_self.v),
3312
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3597
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3313
3598
  offload_func_v(v);
3314
3599
 
3315
3600
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -3322,7 +3607,7 @@ static struct ggml_cgraph * llm_build_falcon(
3322
3607
 
3323
3608
  struct ggml_tensor * K =
3324
3609
  ggml_view_3d(ctx0, kv_self.k,
3325
- n_embd_head, n_past + N, n_head_kv,
3610
+ n_embd_head, n_kv, n_head_kv,
3326
3611
  ggml_element_size(kv_self.k)*n_embd_gqa,
3327
3612
  ggml_element_size(kv_self.k)*n_embd_head,
3328
3613
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3333,21 +3618,21 @@ static struct ggml_cgraph * llm_build_falcon(
3333
3618
  offload_func_kq(KQ);
3334
3619
  ggml_set_name(KQ, "KQ");
3335
3620
 
3336
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3621
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3337
3622
  offload_func_kq(KQ_scaled);
3338
3623
  ggml_set_name(KQ_scaled, "KQ_scaled");
3339
3624
 
3340
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3625
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3341
3626
  offload_func_kq(KQ_masked);
3342
3627
  ggml_set_name(KQ_masked, "KQ_masked");
3343
3628
 
3344
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3629
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3345
3630
  offload_func_v(KQ_soft_max);
3346
3631
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3347
3632
 
3348
3633
  struct ggml_tensor * V =
3349
3634
  ggml_view_3d(ctx0, kv_self.v,
3350
- n_past + N, n_embd_head, n_head_kv,
3635
+ n_kv, n_embd_head, n_head_kv,
3351
3636
  ggml_element_size(kv_self.v)*n_ctx,
3352
3637
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3353
3638
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3362,7 +3647,7 @@ static struct ggml_cgraph * llm_build_falcon(
3362
3647
  offload_func_v(KQV_merged);
3363
3648
  ggml_set_name(KQV_merged, "KQV_merged");
3364
3649
 
3365
- cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3650
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3366
3651
  offload_func_v(cur);
3367
3652
  ggml_set_name(cur, "KQV_merged_contiguous");
3368
3653
 
@@ -3420,17 +3705,10 @@ static struct ggml_cgraph * llm_build_falcon(
3420
3705
 
3421
3706
  static struct ggml_cgraph * llm_build_starcoder(
3422
3707
  llama_context & lctx,
3423
- const llama_token * tokens,
3424
- const float * embd,
3425
- int n_tokens,
3426
- int n_past) {
3427
-
3428
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3429
-
3430
- const int N = n_tokens;
3431
-
3708
+ const llama_batch & batch) {
3432
3709
  const auto & model = lctx.model;
3433
3710
  const auto & hparams = model.hparams;
3711
+ const auto & cparams = lctx.cparams;
3434
3712
 
3435
3713
  const auto & kv_self = lctx.kv_self;
3436
3714
 
@@ -3438,7 +3716,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3438
3716
 
3439
3717
  const int64_t n_embd = hparams.n_embd;
3440
3718
  const int64_t n_layer = hparams.n_layer;
3441
- const int64_t n_ctx = hparams.n_ctx;
3719
+ const int64_t n_ctx = cparams.n_ctx;
3442
3720
  const int64_t n_head = hparams.n_head;
3443
3721
  const int64_t n_head_kv = hparams.n_head_kv;
3444
3722
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -3446,7 +3724,11 @@ static struct ggml_cgraph * llm_build_starcoder(
3446
3724
 
3447
3725
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3448
3726
 
3449
- const float norm_eps = hparams.f_norm_eps;
3727
+ const float norm_eps = hparams.f_norm_eps;
3728
+
3729
+ const int32_t n_tokens = batch.n_tokens;
3730
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3731
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3450
3732
 
3451
3733
  auto & buf_compute = lctx.buf_compute;
3452
3734
 
@@ -3467,12 +3749,12 @@ static struct ggml_cgraph * llm_build_starcoder(
3467
3749
  struct ggml_tensor * position;
3468
3750
  struct ggml_tensor * inpL;
3469
3751
 
3470
- if (tokens) {
3471
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3752
+ if (batch.token) {
3753
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3472
3754
 
3473
3755
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3474
3756
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3475
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3757
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3476
3758
  }
3477
3759
  ggml_set_name(inp_tokens, "inp_tokens");
3478
3760
 
@@ -3482,21 +3764,21 @@ static struct ggml_cgraph * llm_build_starcoder(
3482
3764
  GGML_ASSERT(false && "not implemented");
3483
3765
  #endif
3484
3766
 
3485
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3767
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3486
3768
 
3487
3769
  ggml_allocr_alloc(lctx.alloc, token);
3488
3770
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3489
- memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
3771
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
3490
3772
  }
3491
3773
  }
3492
3774
 
3493
3775
  {
3494
3776
  // Compute position embeddings.
3495
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3777
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3496
3778
  ggml_allocr_alloc(lctx.alloc, inp_positions);
3497
3779
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3498
- for (int i = 0; i < N; ++i) {
3499
- ((int32_t *) inp_positions->data)[i] = n_past + i;
3780
+ for (int i = 0; i < n_tokens; ++i) {
3781
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
3500
3782
  }
3501
3783
  }
3502
3784
  ggml_set_name(inp_positions, "inp_positions");
@@ -3504,12 +3786,35 @@ static struct ggml_cgraph * llm_build_starcoder(
3504
3786
  position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3505
3787
  }
3506
3788
 
3789
+ // KQ_scale
3507
3790
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3791
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3508
3792
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3509
3793
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3510
3794
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3511
3795
  }
3512
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3796
+
3797
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3798
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3799
+ ggml_set_name(KQ_mask, "KQ_mask");
3800
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3801
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3802
+ float * data = (float *) KQ_mask->data;
3803
+ memset(data, 0, ggml_nbytes(KQ_mask));
3804
+
3805
+ for (int h = 0; h < 1; ++h) {
3806
+ for (int j = 0; j < n_tokens; ++j) {
3807
+ const llama_pos pos = batch.pos[j];
3808
+ const llama_seq_id seq_id = batch.seq_id[j];
3809
+
3810
+ for (int i = 0; i < n_kv; ++i) {
3811
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3812
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3813
+ }
3814
+ }
3815
+ }
3816
+ }
3817
+ }
3513
3818
 
3514
3819
  inpL = ggml_add(ctx0, token, position);
3515
3820
  ggml_set_name(inpL, "inpL");
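
StarCoder uses learned absolute position embeddings, and with the batch API the embedding rows are gathered by each token's batch.pos value instead of n_past + i. A plain-array sketch of that gather (a stand-in for the ggml_get_rows call on model.pos_embeddings above, not the library implementation):

#include <algorithm>
#include <cstdint>
#include <vector>

// out[j] = pos_embeddings[batch_pos[j]]; pos_embeddings is row-major
// with n_embd floats per position
static std::vector<float> gather_position_rows(
        const std::vector<float> & pos_embeddings, int n_embd,
        const int32_t * batch_pos, int n_tokens) {
    std::vector<float> out((size_t) n_tokens * n_embd);
    for (int j = 0; j < n_tokens; ++j) {
        const float * row = pos_embeddings.data() + (size_t) batch_pos[j] * n_embd;
        std::copy(row, row + n_embd, out.begin() + (size_t) j * n_embd);
    }
    return out;
}
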
@@ -3525,23 +3830,23 @@ static struct ggml_cgraph * llm_build_starcoder(
3525
3830
  // Self Attention
3526
3831
  cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
3527
3832
 
3528
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
3529
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
3530
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3833
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
3834
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
3835
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3531
3836
 
3532
3837
  struct ggml_tensor * Qcur = tmpq;
3533
3838
  struct ggml_tensor * Kcur = tmpk;
3534
3839
 
3535
3840
  {
3536
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
3841
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3537
3842
  ggml_set_name(Vcur, "Vcur");
3538
3843
 
3539
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3844
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3540
3845
  ggml_set_name(k, "k");
3541
3846
 
3542
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3847
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3543
3848
  ( n_ctx)*ggml_element_size(kv_self.v),
3544
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3849
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3545
3850
 
3546
3851
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3547
3852
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3551,13 +3856,13 @@ static struct ggml_cgraph * llm_build_starcoder(
3551
3856
  ggml_permute(ctx0,
3552
3857
  ggml_cpy(ctx0,
3553
3858
  Qcur,
3554
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
3859
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
3555
3860
  0, 2, 1, 3);
3556
3861
  ggml_set_name(Q, "Q");
3557
3862
 
3558
3863
  struct ggml_tensor * K =
3559
3864
  ggml_view_3d(ctx0, kv_self.k,
3560
- n_embd_head, n_past + N, n_head_kv,
3865
+ n_embd_head, n_kv, n_head_kv,
3561
3866
  ggml_element_size(kv_self.k)*n_embd_gqa,
3562
3867
  ggml_element_size(kv_self.k)*n_embd_head,
3563
3868
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3568,12 +3873,12 @@ static struct ggml_cgraph * llm_build_starcoder(
3568
3873
  ggml_set_name(KQ, "KQ");
3569
3874
 
3570
3875
  // KQ_scaled = KQ / sqrt(n_embd_head)
3571
- // KQ_scaled shape [n_past + N, N, n_head, 1]
3876
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3572
3877
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3573
3878
  ggml_set_name(KQ_scaled, "KQ_scaled");
3574
3879
 
3575
3880
  // KQ_masked = mask_past(KQ_scaled)
3576
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3881
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3577
3882
  ggml_set_name(KQ_masked, "KQ_masked");
3578
3883
 
3579
3884
  // KQ = soft_max(KQ_masked)
@@ -3583,7 +3888,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3583
3888
  // split cached V into n_head heads
3584
3889
  struct ggml_tensor * V =
3585
3890
  ggml_view_3d(ctx0, kv_self.v,
3586
- n_past + N, n_embd_head, n_head_kv,
3891
+ n_kv, n_embd_head, n_head_kv,
3587
3892
  ggml_element_size(kv_self.v)*n_ctx,
3588
3893
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3589
3894
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3596,10 +3901,8 @@ static struct ggml_cgraph * llm_build_starcoder(
3596
3901
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3597
3902
  ggml_set_name(KQV_merged, "KQV_merged");
3598
3903
 
3599
- // cur = KQV_merged.contiguous().view(n_embd, N)
3600
- cur = ggml_cpy(ctx0,
3601
- KQV_merged,
3602
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3904
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3905
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3603
3906
  ggml_set_name(cur, "KQV_merged_contiguous");
3604
3907
  }
3605
3908
 
@@ -3649,10 +3952,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3649
3952
 
3650
3953
  static struct ggml_cgraph * llama_build_graph(
3651
3954
  llama_context & lctx,
3652
- const llama_token * tokens,
3653
- const float * embd,
3654
- int n_tokens,
3655
- int n_past) {
3955
+ const llama_batch & batch) {
3656
3956
  const auto & model = lctx.model;
3657
3957
 
3658
3958
  struct ggml_cgraph * result = NULL;
@@ -3660,76 +3960,117 @@ static struct ggml_cgraph * llama_build_graph(
3660
3960
  switch (model.arch) {
3661
3961
  case LLM_ARCH_LLAMA:
3662
3962
  {
3663
- result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
3963
+ result = llm_build_llama(lctx, batch);
3664
3964
  } break;
3665
3965
  case LLM_ARCH_BAICHUAN:
3666
3966
  {
3667
- result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
3967
+ result = llm_build_baichaun(lctx, batch);
3668
3968
  } break;
3669
3969
  case LLM_ARCH_FALCON:
3670
3970
  {
3671
- result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
3971
+ result = llm_build_falcon(lctx, batch);
3672
3972
  } break;
3673
3973
  case LLM_ARCH_STARCODER:
3674
3974
  {
3675
- result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
3975
+ result = llm_build_starcoder(lctx, batch);
3676
3976
  } break;
3677
3977
  default:
3678
3978
  GGML_ASSERT(false);
3679
- };
3979
+ }
3680
3980
 
3681
3981
  return result;
3682
3982
  }
3683
3983
 
3684
- // evaluate the transformer
3984
+ // decode a batch of tokens by evaluating the transformer
3685
3985
  //
3686
3986
  // - lctx: llama context
3687
- // - tokens: new batch of tokens to process
3688
- // - embd embeddings input
3689
- // - n_tokens number of tokens
3690
- // - n_past: the context size so far
3987
+ // - batch: batch to evaluate
3691
3988
  // - n_threads: number of threads to use
3692
3989
  //
3693
- static bool llama_eval_internal(
3990
+ // return 0 on success
3991
+ // return positive int on warning
3992
+ // return negative int on error
3993
+ //
3994
+ static int llama_decode_internal(
3694
3995
  llama_context & lctx,
3695
- const llama_token * tokens,
3696
- const float * embd,
3697
- int n_tokens,
3698
- int n_past,
3699
- int n_threads,
3700
- const char * cgraph_fname) {
3996
+ llama_batch batch) {
3997
+ const uint32_t n_tokens = batch.n_tokens;
3701
3998
 
3702
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3999
+ if (n_tokens == 0) {
4000
+ LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
4001
+ return -1;
4002
+ }
3703
4003
 
3704
- GGML_ASSERT(n_tokens > 0);
3705
- GGML_ASSERT(n_past >= 0);
3706
- // TODO: keep the values of n_batch and n_ctx
3707
- // GGML_ASSERT(n_tokens <= n_batch);
3708
- // GGML_ASSERT(n_past + n_tokens <= n_ctx);
4004
+ const auto & model = lctx.model;
4005
+ const auto & hparams = model.hparams;
4006
+ const auto & cparams = lctx.cparams;
4007
+
4008
+ const auto n_batch = cparams.n_batch;
4009
+
4010
+ GGML_ASSERT(n_tokens <= n_batch);
4011
+
4012
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
4013
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
3709
4014
 
3710
4015
  const int64_t t_start_us = ggml_time_us();
3711
4016
 
3712
4017
  #ifdef GGML_USE_MPI
3713
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
4018
+ // TODO: needs fix after #3228
4019
+ GGML_ASSERT(false && "not implemented");
4020
+ //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
3714
4021
  #endif
3715
4022
 
3716
4023
  GGML_ASSERT(n_threads > 0);
3717
4024
 
3718
- const int N = n_tokens;
3719
-
3720
- const auto & model = lctx.model;
3721
- const auto & hparams = model.hparams;
3722
-
3723
- const auto & kv_self = lctx.kv_self;
4025
+ auto & kv_self = lctx.kv_self;
3724
4026
 
3725
4027
  GGML_ASSERT(!!kv_self.ctx);
3726
4028
 
3727
4029
  const int64_t n_embd = hparams.n_embd;
3728
4030
  const int64_t n_vocab = hparams.n_vocab;
3729
4031
 
4032
+ // helpers for smoother batch API transition

4033
+ // after deprecating the llama_eval calls, these will be removed
4034
+ std::vector<llama_pos> pos;
4035
+ std::vector<llama_seq_id> seq_id;
4036
+
4037
+ if (batch.pos == nullptr) {
4038
+ pos.resize(n_tokens);
4039
+ for (uint32_t i = 0; i < n_tokens; i++) {
4040
+ pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
4041
+ }
4042
+
4043
+ batch.pos = pos.data();
4044
+ }
4045
+
4046
+ if (batch.seq_id == nullptr) {
4047
+ seq_id.resize(n_tokens);
4048
+ for (uint32_t i = 0; i < n_tokens; i++) {
4049
+ seq_id[i] = batch.all_seq_id;
4050
+ }
4051
+
4052
+ batch.seq_id = seq_id.data();
4053
+ }
4054
+
4055
+ // we always start to search for a free slot from the start of the cache
4056
+ // TODO: better strategies can be implemented
4057
+ kv_self.head = 0;
4058
+
4059
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
4060
+ return 1;
4061
+ }
4062
+
4063
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
4064
+ // after enough generations, the benefit from this heuristic disappears
4065
+ // if we start defragmenting the cache, the benefit from this will be more important
4066
+ //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
4067
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
4068
+
4069
+ //printf("kv_self.n = %d\n", kv_self.n);
4070
+
3730
4071
  ggml_allocr_reset(lctx.alloc);
3731
4072
 
3732
- ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
4073
+ ggml_cgraph * gf = llama_build_graph(lctx, batch);
3733
4074
 
3734
4075
  ggml_allocr_alloc_graph(lctx.alloc, gf);
3735
4076
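
llama_eval_internal becomes llama_decode_internal: it takes a llama_batch, fills in default positions and sequence ids when the caller leaves them null, finds a KV-cache slot, and reports status through an int (0 on success, positive when no cache slot could be found, negative on error). A hedged caller-side sketch using only API calls that appear in this diff (llama_decode, llama_batch_get_one); the prompt tokens are assumed to have been produced elsewhere:

#include "llama.h"

#include <cstdio>
#include <vector>

// decode a run of tokens starting at position n_past, all on sequence 0
static bool decode_tokens(llama_context * ctx, std::vector<llama_token> & tokens, int n_past) {
    // llama_batch_get_one() wraps a plain token array: positions start at
    // n_past and every token is assigned sequence id 0
    const int ret = llama_decode(ctx, llama_batch_get_one(tokens.data(), (int) tokens.size(), n_past, 0));
    if (ret < 0) {
        fprintf(stderr, "llama_decode failed with error %d\n", ret);
        return false;
    }
    if (ret > 0) {
        // positive return: no KV-cache slot for the whole batch; retry with fewer tokens
        fprintf(stderr, "llama_decode returned %d (no KV cache slot found)\n", ret);
        return false;
    }
    return true;
}
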
 
@@ -3738,6 +4079,7 @@ static bool llama_eval_internal(
3738
4079
  ggml_tensor * node = gf->leafs[i];
3739
4080
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
3740
4081
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
4082
+ ggml_cuda_copy_to_device(node);
3741
4083
  }
3742
4084
  }
3743
4085
 
@@ -3747,6 +4089,8 @@ static bool llama_eval_internal(
3747
4089
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
3748
4090
  }
3749
4091
  }
4092
+
4093
+ ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
3750
4094
  #endif
3751
4095
 
3752
4096
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3756,7 +4100,7 @@ static bool llama_eval_internal(
3756
4100
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
3757
4101
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
3758
4102
  // with the BLAS calls. need a better solution
3759
- if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
4103
+ if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
3760
4104
  n_threads = std::min(4, n_threads);
3761
4105
  }
3762
4106
 
@@ -3795,12 +4139,9 @@ static bool llama_eval_internal(
3795
4139
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
3796
4140
  #endif
3797
4141
 
3798
- // update kv token count
3799
- lctx.kv_self.n = n_past + N;
3800
-
3801
- if (cgraph_fname) {
3802
- ggml_graph_export(gf, cgraph_fname);
3803
- }
4142
+ // update the kv ring buffer
4143
+ lctx.kv_self.head += n_tokens;
4144
+ lctx.kv_self.has_shift = false;
3804
4145
 
3805
4146
  #ifdef GGML_PERF
3806
4147
  // print timing information per ggml operation (for debugging purposes)
@@ -3817,13 +4158,20 @@ static bool llama_eval_internal(
3817
4158
  {
3818
4159
  auto & logits_out = lctx.logits;
3819
4160
 
3820
- if (lctx.logits_all) {
3821
- logits_out.resize(n_vocab * N);
3822
- memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
4161
+ if (batch.logits) {
4162
+ logits_out.resize(n_vocab * n_tokens);
4163
+ for (uint32_t i = 0; i < n_tokens; i++) {
4164
+ if (batch.logits[i] == 0) {
4165
+ continue;
4166
+ }
4167
+ memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
4168
+ }
4169
+ } else if (lctx.logits_all) {
4170
+ logits_out.resize(n_vocab * n_tokens);
4171
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
3823
4172
  } else {
3824
- // return result for just the last token
3825
4173
  logits_out.resize(n_vocab);
3826
- memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
4174
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
3827
4175
  }
3828
4176
  }
3829
4177
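
With batch.logits set, logits are written out per token: row i of the buffer returned by llama_get_logits() holds the logits of the i-th batch token, and only rows whose flag was non-zero are filled. A small sketch of reading one such row; it assumes the token at index i was decoded with its logits flag set:

#include "llama.h"

#include <cstddef>
#include <vector>

// copy the logits row produced for batch position i
static std::vector<float> logits_for_token(llama_context * ctx, const llama_model * model, int i) {
    const int n_vocab = llama_n_vocab(model);   // n_vocab now comes from the model, not the context
    const float * all = llama_get_logits(ctx);  // laid out as n_tokens rows of n_vocab floats
    return std::vector<float>(all + (size_t) i*n_vocab, all + (size_t) (i + 1)*n_vocab);
}
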
 
@@ -3832,20 +4180,27 @@ static bool llama_eval_internal(
3832
4180
  auto & embedding_out = lctx.embedding;
3833
4181
 
3834
4182
  embedding_out.resize(n_embd);
3835
- memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
4183
+ memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
3836
4184
  }
3837
4185
 
3838
4186
  // measure the performance only for the single-token evals
3839
- if (N == 1) {
4187
+ if (n_tokens == 1) {
3840
4188
  lctx.t_eval_us += ggml_time_us() - t_start_us;
3841
4189
  lctx.n_eval++;
3842
4190
  }
3843
- else if (N > 1) {
4191
+ else if (n_tokens > 1) {
3844
4192
  lctx.t_p_eval_us += ggml_time_us() - t_start_us;
3845
- lctx.n_p_eval += N;
4193
+ lctx.n_p_eval += n_tokens;
3846
4194
  }
3847
4195
 
3848
- return true;
4196
+ // get a more accurate load time, upon first eval
4197
+ // TODO: fix this
4198
+ if (!lctx.has_evaluated_once) {
4199
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
4200
+ lctx.has_evaluated_once = true;
4201
+ }
4202
+
4203
+ return 0;
3849
4204
  }
3850
4205
 
3851
4206
  //
@@ -4266,7 +4621,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
4266
4621
  llm_tokenizer_bpe tokenizer(vocab);
4267
4622
  tokenizer.tokenize(raw_text, output);
4268
4623
  } break;
4269
- };
4624
+ }
4270
4625
 
4271
4626
  return output;
4272
4627
  }
@@ -4670,6 +5025,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
4670
5025
  // sampling
4671
5026
  //
4672
5027
 
5028
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
5029
+ if (seed == LLAMA_DEFAULT_SEED) {
5030
+ seed = time(NULL);
5031
+ }
5032
+ ctx->rng.seed(seed);
5033
+ }
5034
+
4673
5035
  void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
4674
5036
  GGML_ASSERT(candidates->size > 0);
4675
5037
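
llama_set_rng_seed() is new in this release: it re-seeds the context RNG used by the stochastic samplers, falling back to time(NULL) when given LLAMA_DEFAULT_SEED. A short usage sketch:

#include "llama.h"

#include <cstdint>

// make subsequent sampling draws reproducible for this context
static void reset_sampling_seed(llama_context * ctx, uint32_t seed) {
    llama_set_rng_seed(ctx, seed); // pass LLAMA_DEFAULT_SEED to seed from the clock instead
}
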
 
@@ -4878,7 +5240,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
4878
5240
  }
4879
5241
  }
4880
5242
 
4881
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5243
+ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
4882
5244
  const int64_t t_start_sample_us = ggml_time_us();
4883
5245
 
4884
5246
  for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4890,6 +5252,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
4890
5252
  }
4891
5253
  }
4892
5254
 
5255
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5256
+ llama_sample_temp(ctx, candidates_p, temp);
5257
+ }
5258
+
4893
5259
  void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
4894
5260
  if (last_tokens_size == 0 || penalty == 1.0f) {
4895
5261
  return;
@@ -5013,7 +5379,7 @@ void llama_sample_classifier_free_guidance(
5013
5379
 
5014
5380
  GGML_ASSERT(ctx);
5015
5381
 
5016
- auto n_vocab = llama_n_vocab(ctx);
5382
+ auto n_vocab = llama_n_vocab(llama_get_model(ctx));
5017
5383
 
5018
5384
  GGML_ASSERT(n_vocab == (int)candidates->size);
5019
5385
  GGML_ASSERT(!candidates->sorted);
@@ -5042,7 +5408,7 @@ void llama_sample_classifier_free_guidance(
5042
5408
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
5043
5409
  GGML_ASSERT(ctx);
5044
5410
 
5045
- auto N = float(llama_n_vocab(ctx));
5411
+ auto N = float(llama_n_vocab(llama_get_model(ctx)));
5046
5412
  int64_t t_start_sample_us;
5047
5413
  t_start_sample_us = ggml_time_us();
5048
5414
 
@@ -5229,7 +5595,7 @@ struct llama_logit_info {
5229
5595
  };
5230
5596
  llama_logit_info(llama_context * ctx)
5231
5597
  : logits(llama_get_logits(ctx))
5232
- , n_vocab(llama_n_vocab(ctx))
5598
+ , n_vocab(llama_n_vocab(llama_get_model(ctx)))
5233
5599
  , max_l(*std::max_element(logits, logits + n_vocab))
5234
5600
  , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
5235
5601
  { }
@@ -5267,7 +5633,6 @@ struct llama_beam_search_data {
5267
5633
  size_t n_beams;
5268
5634
  int n_past;
5269
5635
  int n_predict;
5270
- int n_threads;
5271
5636
  std::vector<llama_beam> beams;
5272
5637
  std::vector<llama_beam> next_beams;
5273
5638
 
@@ -5277,12 +5642,11 @@ struct llama_beam_search_data {
5277
5642
  // Used to communicate to/from callback on beams state.
5278
5643
  std::vector<llama_beam_view> beam_views;
5279
5644
 
5280
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
5645
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
5281
5646
  : ctx(ctx)
5282
5647
  , n_beams(n_beams)
5283
5648
  , n_past(n_past)
5284
5649
  , n_predict(n_predict)
5285
- , n_threads(n_threads)
5286
5650
  , beam_views(n_beams) {
5287
5651
  beams.reserve(n_beams);
5288
5652
  next_beams.reserve(n_beams);
@@ -5319,7 +5683,7 @@ struct llama_beam_search_data {
5319
5683
  } else {
5320
5684
  // beam is not at end-of-sentence, so branch with next top_k tokens.
5321
5685
  if (!beam.tokens.empty()) {
5322
- llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
5686
+ llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
5323
5687
  }
5324
5688
  llama_logit_info logit_info(ctx);
5325
5689
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5393,7 +5757,7 @@ struct llama_beam_search_data {
5393
5757
  callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
5394
5758
  update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
5395
5759
  if (common_prefix_length) {
5396
- llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
5760
+ llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
5397
5761
  n_past += common_prefix_length;
5398
5762
  }
5399
5763
  // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5434,11 +5798,11 @@ struct llama_beam_search_data {
5434
5798
 
5435
5799
  void llama_beam_search(llama_context * ctx,
5436
5800
  llama_beam_search_callback_fn_t callback, void * callback_data,
5437
- size_t n_beams, int n_past, int n_predict, int n_threads) {
5801
+ size_t n_beams, int n_past, int n_predict) {
5438
5802
  assert(ctx);
5439
5803
  const int64_t t_start_sample_us = ggml_time_us();
5440
5804
 
5441
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
5805
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
5442
5806
 
5443
5807
  beam_search_data.loop(callback, callback_data);
5444
5808
 
@@ -5658,11 +6022,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5658
6022
  nthread = std::thread::hardware_concurrency();
5659
6023
  }
5660
6024
 
5661
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
6025
+ llama_model_loader ml(fname_inp, /*use_mmap*/ false);
5662
6026
 
5663
6027
  llama_model model;
5664
- llm_load_arch(*ml, model);
5665
- llm_load_hparams(*ml, model, 0, 0, 0);
6028
+ llm_load_arch(ml, model);
6029
+ llm_load_hparams(ml, model);
5666
6030
 
5667
6031
  if (params->only_copy) {
5668
6032
  ftype = model.ftype;
@@ -5672,7 +6036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5672
6036
  struct gguf_context * ctx_out = gguf_init_empty();
5673
6037
 
5674
6038
  // copy the KV pairs from the input file
5675
- gguf_set_kv (ctx_out, ml->ctx_gguf);
6039
+ gguf_set_kv (ctx_out, ml.ctx_gguf);
5676
6040
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
5677
6041
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
5678
6042
 
@@ -5680,8 +6044,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5680
6044
  int n_attention_wv = 0;
5681
6045
  int n_feed_forward_w2 = 0;
5682
6046
 
5683
- for (int i = 0; i < ml->n_tensors; ++i) {
5684
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6047
+ for (int i = 0; i < ml.n_tensors; ++i) {
6048
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5685
6049
 
5686
6050
  const std::string name = ggml_get_name(meta);
5687
6051
 
@@ -5717,8 +6081,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5717
6081
  std::vector<no_init<float>> f32_conv_buf;
5718
6082
 
5719
6083
  // populate the original tensors so we get an initial meta data
5720
- for (int i = 0; i < ml->n_tensors; ++i) {
5721
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6084
+ for (int i = 0; i < ml.n_tensors; ++i) {
6085
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5722
6086
  gguf_add_tensor(ctx_out, meta);
5723
6087
  }
5724
6088
 
@@ -5731,8 +6095,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5731
6095
  // placeholder for the meta data
5732
6096
  ::zeros(fout, meta_size);
5733
6097
 
5734
- for (int i = 0; i < ml->n_tensors; ++i) {
5735
- struct ggml_tensor * tensor = ml->get_tensor_meta(i);
6098
+ for (int i = 0; i < ml.n_tensors; ++i) {
6099
+ struct ggml_tensor * tensor = ml.get_tensor_meta(i);
5736
6100
 
5737
6101
  const std::string name = ggml_get_name(tensor);
5738
6102
 
@@ -5740,10 +6104,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5740
6104
  read_data.resize(ggml_nbytes(tensor));
5741
6105
  }
5742
6106
  tensor->data = read_data.data();
5743
- ml->load_data_for(tensor);
6107
+ ml.load_data_for(tensor);
5744
6108
 
5745
6109
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
5746
- ++idx, ml->n_tensors,
6110
+ ++idx, ml.n_tensors,
5747
6111
  ggml_get_name(tensor),
5748
6112
  llama_format_tensor_shape(tensor).c_str(),
5749
6113
  ggml_type_name(tensor->type));
@@ -5893,9 +6257,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5893
6257
  }
5894
6258
  }
5895
6259
 
5896
- // TODO: after the GGUF PR, this likely won't work and needs to be updated
5897
6260
  static int llama_apply_lora_from_file_internal(
5898
- const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
6261
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
5899
6262
  ) {
5900
6263
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
5901
6264
 
@@ -5924,7 +6287,7 @@ static int llama_apply_lora_from_file_internal(
5924
6287
  int32_t lora_alpha;
5925
6288
  fin.read((char *) &lora_r, sizeof(lora_r));
5926
6289
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
5927
- float scaling = (float)lora_alpha / (float)lora_r;
6290
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
5928
6291
 
5929
6292
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
5930
6293
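
llama_apply_lora_from_file_internal() now takes a user scale that multiplies the adapter's own alpha/r ratio, so the blend applied to each weight becomes scale * lora_alpha / lora_r. A one-function sketch of the arithmetic shown above:

#include <cstdint>

// effective LoRA scaling: the adapter delta (B*A) is mixed in with this factor
static float lora_effective_scaling(float scale, int32_t lora_alpha, int32_t lora_r) {
    return scale * (float) lora_alpha / (float) lora_r;
}
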
 
@@ -6140,9 +6503,10 @@ static int llama_apply_lora_from_file_internal(
6140
6503
  ggml_set_name(r, "r_cpy");
6141
6504
  }
6142
6505
 
6143
- struct ggml_cgraph gf = ggml_build_forward(r);
6506
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
6507
+ ggml_build_forward_expand(gf, r);
6144
6508
 
6145
- ggml_graph_compute_helper(work_buffer, &gf, n_threads);
6509
+ ggml_graph_compute_helper(work_buffer, gf, n_threads);
6146
6510
 
6147
6511
  // we won't need these tensors again, reset the context to save memory
6148
6512
  ggml_free(lora_ctx);
@@ -6171,27 +6535,16 @@ static int llama_apply_lora_from_file_internal(
6171
6535
  //
6172
6536
  // interface implementation
6173
6537
  //
6174
-
6175
- struct llama_context_params llama_context_default_params() {
6176
- struct llama_context_params result = {
6177
- /*.seed =*/ LLAMA_DEFAULT_SEED,
6178
- /*.n_ctx =*/ 512,
6179
- /*.n_batch =*/ 512,
6538
+ struct llama_model_params llama_model_default_params() {
6539
+ struct llama_model_params result = {
6180
6540
  /*.n_gpu_layers =*/ 0,
6181
6541
  /*.main_gpu =*/ 0,
6182
6542
  /*.tensor_split =*/ nullptr,
6183
- /*.rope_freq_base =*/ 0.0f,
6184
- /*.rope_freq_scale =*/ 0.0f,
6185
6543
  /*.progress_callback =*/ nullptr,
6186
6544
  /*.progress_callback_user_data =*/ nullptr,
6187
- /*.low_vram =*/ false,
6188
- /*.mul_mat_q =*/ true,
6189
- /*.f16_kv =*/ true,
6190
- /*.logits_all =*/ false,
6191
6545
  /*.vocab_only =*/ false,
6192
6546
  /*.use_mmap =*/ true,
6193
6547
  /*.use_mlock =*/ false,
6194
- /*.embedding =*/ false,
6195
6548
  };
6196
6549
 
6197
6550
  #ifdef GGML_USE_METAL
@@ -6201,6 +6554,24 @@ struct llama_context_params llama_context_default_params() {
6201
6554
  return result;
6202
6555
  }
6203
6556
 
6557
+ struct llama_context_params llama_context_default_params() {
6558
+ struct llama_context_params result = {
6559
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
6560
+ /*.n_ctx =*/ 512,
6561
+ /*.n_batch =*/ 512,
6562
+ /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
6563
+ /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
6564
+ /*.rope_freq_base =*/ 0.0f,
6565
+ /*.rope_freq_scale =*/ 0.0f,
6566
+ /*.mul_mat_q =*/ true,
6567
+ /*.f16_kv =*/ true,
6568
+ /*.logits_all =*/ false,
6569
+ /*.embedding =*/ false,
6570
+ };
6571
+
6572
+ return result;
6573
+ }
6574
+
6204
6575
  struct llama_model_quantize_params llama_model_quantize_default_params() {
6205
6576
  struct llama_model_quantize_params result = {
6206
6577
  /*.nthread =*/ 0,
@@ -6256,13 +6627,11 @@ int64_t llama_time_us(void) {
6256
6627
 
6257
6628
  struct llama_model * llama_load_model_from_file(
6258
6629
  const char * path_model,
6259
- struct llama_context_params params) {
6630
+ struct llama_model_params params) {
6260
6631
  ggml_time_init();
6261
6632
 
6262
6633
  llama_model * model = new llama_model;
6263
6634
 
6264
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
6265
-
6266
6635
  unsigned cur_percentage = 0;
6267
6636
  if (params.progress_callback == NULL) {
6268
6637
  params.progress_callback_user_data = &cur_percentage;
@@ -6279,9 +6648,9 @@ struct llama_model * llama_load_model_from_file(
6279
6648
  };
6280
6649
  }
6281
6650
 
6282
- if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
6283
- params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
6284
- params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
6651
+ if (!llama_model_load(path_model, *model, params.n_gpu_layers,
6652
+ params.main_gpu, params.tensor_split,
6653
+ params.use_mmap, params.use_mlock, params.vocab_only,
6285
6654
  params.progress_callback, params.progress_callback_user_data)) {
6286
6655
  LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
6287
6656
  delete model;
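
Loading is now split in two: llama_load_model_from_file() takes the new llama_model_params (GPU layers, mmap/mlock, vocab_only), while per-context settings such as n_ctx, thread counts and RoPE frequency overrides moved into llama_context_params for llama_new_context_with_model(). A hedged sketch of the new flow; the model path is a placeholder and error handling is minimal:

#include "llama.h"

// open a model with the split model/context parameters introduced here
static llama_context * open_model(const char * path /* e.g. "model.gguf" */) {
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0;                  // model-level option

    llama_model * model = llama_load_model_from_file(path, mparams);
    if (model == NULL) {
        return NULL;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 0;               // 0 = use the model's trained context length
    cparams.n_threads       = 4;               // threads for single-token generation
    cparams.n_threads_batch = 4;               // threads for prompt/batch processing

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return NULL;
    }
    return ctx;
}
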
@@ -6305,18 +6674,33 @@ struct llama_context * llama_new_context_with_model(
6305
6674
 
6306
6675
  llama_context * ctx = new llama_context(*model);
6307
6676
 
6677
+ const auto & hparams = model->hparams;
6678
+ auto & cparams = ctx->cparams;
6679
+
6680
+ cparams.n_batch = params.n_batch;
6681
+ cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
6682
+ cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
6683
+ cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
6684
+ cparams.n_threads = params.n_threads;
6685
+ cparams.n_threads_batch = params.n_threads_batch;
6686
+ cparams.mul_mat_q = params.mul_mat_q;
6687
+
6308
6688
  if (params.seed == LLAMA_DEFAULT_SEED) {
6309
6689
  params.seed = time(NULL);
6310
6690
  }
6311
6691
 
6692
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
6693
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
6694
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
6695
+
6312
6696
  ctx->rng = std::mt19937(params.seed);
6313
6697
  ctx->logits_all = params.logits_all;
6314
6698
 
6315
6699
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
6316
6700
 
6317
6701
  // reserve memory for context buffers
6318
- if (!params.vocab_only) {
6319
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
6702
+ if (!hparams.vocab_only) {
6703
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
6320
6704
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
6321
6705
  llama_free(ctx);
6322
6706
  return nullptr;
@@ -6327,11 +6711,9 @@ struct llama_context * llama_new_context_with_model(
6327
6711
  LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
6328
6712
  }
6329
6713
 
6330
- const auto & hparams = ctx->model.hparams;
6331
-
6332
6714
  // resized during inference
6333
6715
  if (params.logits_all) {
6334
- ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
6716
+ ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
6335
6717
  } else {
6336
6718
  ctx->logits.reserve(hparams.n_vocab);
6337
6719
  }
@@ -6349,26 +6731,28 @@ struct llama_context * llama_new_context_with_model(
6349
6731
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
6350
6732
 
6351
6733
  // build worst-case graph
6352
- int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
6353
- int n_past = hparams.n_ctx - n_tokens;
6734
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
6735
+ int n_past = cparams.n_ctx - n_tokens;
6354
6736
  llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
6355
- ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
6737
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
6738
+
6356
6739
  #ifdef GGML_USE_METAL
6357
- if (params.n_gpu_layers > 0) {
6740
+ if (model->n_gpu_layers > 0) {
6358
6741
  ctx->ctx_metal = ggml_metal_init(1);
6359
6742
  if (!ctx->ctx_metal) {
6360
6743
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6361
6744
  llama_free(ctx);
6362
6745
  return NULL;
6363
6746
  }
6364
- ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6365
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6747
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
6748
+ //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6749
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6366
6750
  }
6367
6751
  #endif
6368
6752
  // measure memory requirements for the graph
6369
6753
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
6370
6754
 
6371
- LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6755
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6372
6756
 
6373
6757
  // recreate allocator with exact memory requirements
6374
6758
  ggml_allocr_free(ctx->alloc);
@@ -6377,28 +6761,46 @@ struct llama_context * llama_new_context_with_model(
  ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
  #ifdef GGML_USE_METAL
  if (ctx->ctx_metal) {
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
  }
  #endif
  #ifdef GGML_USE_CUBLAS
- if (params.low_vram) {
- LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
- ggml_cuda_set_scratch_size(0); // disable scratch
- } else {
- ggml_cuda_set_scratch_size(alloc_size);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+ ggml_cuda_set_scratch_size(alloc_size);
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+
+ // calculate total VRAM usage
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+ size += ggml_nbytes(t);
+ }
+ };
+ size_t model_vram_size = 0;
+ for (const auto & kv : model->tensors_by_name) {
+ add_tensor(kv.second, model_vram_size);
  }
+
+ size_t kv_vram_size = 0;
+ add_tensor(ctx->kv_self.k, kv_vram_size);
+ add_tensor(ctx->kv_self.v, kv_vram_size);
+
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
+ size_t total_vram_size = model_vram_size + ctx_vram_size;
+
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+ total_vram_size / 1024.0 / 1024.0,
+ model_vram_size / 1024.0 / 1024.0,
+ ctx_vram_size / 1024.0 / 1024.0);
  #endif
  }
 
  #ifdef GGML_USE_METAL
- if (params.n_gpu_layers > 0) {
+ if (model->n_gpu_layers > 0) {
  // this allocates all Metal resources and memory buffers
 
  void * data_ptr = NULL;
  size_t data_size = 0;
 
- if (params.use_mmap) {
+ if (ctx->model.mapping) {
  data_ptr = ctx->model.mapping->addr;
  data_size = ctx->model.mapping->size;
  } else {
@@ -6417,11 +6819,8 @@ struct llama_context * llama_new_context_with_model(
  return NULL; \
  }
 
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
-
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
-
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
  #undef LLAMA_METAL_CHECK_BUF
  }
@@ -6433,8 +6832,10 @@ struct llama_context * llama_new_context_with_model(
 
  if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
  // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
- const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
- while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
+ // TODO: needs fix after #3228
+ GGML_ASSERT(false && "not implemented");
+ //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
+ //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
  llama_backend_free();
  exit(1);
  }
@@ -6443,63 +6844,37 @@ struct llama_context * llama_new_context_with_model(
  return ctx;
  }
 
- static struct llama_context * llama_init_from_file(
- const char * path_model,
- struct llama_context_params params) {
- struct llama_model * model = llama_load_model_from_file(path_model, params);
- if (!model) {
- return nullptr;
- }
-
- struct llama_context * ctx = llama_new_context_with_model(model, params);
- ctx->model_owner = true;
-
- return ctx;
- }
-
  void llama_free(struct llama_context * ctx) {
  delete ctx;
  }
 
- int llama_n_vocab(const struct llama_context * ctx) {
- return llama_model_n_vocab(&ctx->model);
+ const llama_model * llama_get_model(const struct llama_context * ctx) {
+ return &ctx->model;
  }
 
  int llama_n_ctx(const struct llama_context * ctx) {
- return llama_model_n_ctx(&ctx->model);
+ return ctx->cparams.n_ctx;
  }
 
- int llama_n_ctx_train(const struct llama_context * ctx) {
- return llama_model_n_ctx_train(&ctx->model);
+ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+ return model->vocab.type;
  }
 
- int llama_n_embd(const struct llama_context * ctx) {
- return llama_model_n_embd(&ctx->model);
- }
-
- enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
- return ctx->model.vocab.type;
- }
-
- int llama_model_n_vocab(const struct llama_model * model) {
+ int llama_n_vocab(const struct llama_model * model) {
  return model->vocab.id_to_token.size();
  }
 
- int llama_model_n_ctx(const struct llama_model * model) {
- return model->hparams.n_ctx;
- }
-
- int llama_model_n_ctx_train(const struct llama_model * model) {
+ int llama_n_ctx_train(const struct llama_model * model) {
  return model->hparams.n_ctx_train;
  }
 
- int llama_model_n_embd(const struct llama_model * model) {
+ int llama_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }
 
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  return snprintf(buf, buf_size, "%s %s %s",
- model->name.c_str(),
+ llama_model_arch_name(model->arch).c_str(),
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }
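
Note: the hunk above removes the context-based accessors and replaces them with model-based ones, plus a new llama_get_model(). As an illustrative sketch only (not part of this diff), code built against 0.6.0 might query these values like so, assuming ctx is an already-created llama_context *:

    // sketch: 0.6.0-style accessors (assumes a valid ctx)
    #include "llama.h"
    #include <stdio.h>

    static void print_model_info(struct llama_context * ctx) {
        const struct llama_model * model = llama_get_model(ctx); // model handle is now queried separately
        printf("n_vocab = %d\n", llama_n_vocab(model));          // vocabulary size comes from the model
        printf("n_embd  = %d\n", llama_n_embd(model));           // embedding width comes from the model
        printf("n_ctx   = %d\n", llama_n_ctx(ctx));              // context length is now a context parameter
    }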
@@ -6520,6 +6895,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
  return nparams;
  }
 
+ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+ return ggml_get_tensor(model->ctx, name);
+ }
+
  int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
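
Note: llama_get_model_tensor() above is a new entry point that looks a tensor up by name in the model's ggml context. A hedged caller sketch; the tensor name is just an example GGUF-style name, not taken from this diff:

    // sketch: fetching a model tensor by name (tensor name is illustrative)
    #include "llama.h"
    #include "ggml.h"
    #include <stdio.h>

    static void inspect_tensor(struct llama_model * model) {
        struct ggml_tensor * t = llama_get_model_tensor(model, "token_embd.weight");
        if (t != NULL) {
            printf("token_embd.weight: %lld elements\n", (long long) ggml_nelements(t));
        }
    }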
@@ -6533,18 +6912,18 @@ int llama_model_quantize(
  }
  }
 
- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
  }
  }
 
- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
  try {
- return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  return 1;
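
Note: both LoRA entry points gain a float scale argument in this hunk. A sketch of a caller under that assumption; the adapter path is a placeholder:

    // sketch: applying a LoRA adapter with the new scale argument (path is a placeholder)
    #include "llama.h"
    #include <stddef.h>

    static int apply_adapter(const struct llama_model * model) {
        // scale 1.0f applies the adapter at full strength; NULL means no separate base model; 4 threads
        return llama_model_apply_lora_from_file(model, "lora-adapter.bin", 1.0f, NULL, 4);
    }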
@@ -6552,16 +6931,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
  }
  }
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
- return ctx->kv_self.n;
+ return ctx->kv_self.head;
  }
 
- #define LLAMA_MAX_RNG_STATE (64*1024)
+ void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
+ }
 
- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
- if (seed == LLAMA_DEFAULT_SEED) {
- seed = time(NULL);
- }
- ctx->rng.seed(seed);
+ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+ }
+
+ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
+ }
+
+ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
+ llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
+ }
+
+ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+ llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
  }
 
  // Returns the *maximum* size of the state
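
Note: the wrappers above expose per-sequence KV cache management on the public context. A rough composition sketch; the sequence ids and position ranges are illustrative, and ranges are assumed half-open [p0, p1):

    // sketch: per-sequence KV cache housekeeping (ids and positions are illustrative)
    #include "llama.h"

    static void prune_cache(struct llama_context * ctx) {
        llama_kv_cache_seq_cp(ctx, 0, 1, 0, 32);        // let seq 1 reuse the first 32 positions of seq 0
        llama_kv_cache_seq_rm(ctx, 1, 32, 64);          // drop positions [32, 64) from seq 1
        llama_kv_cache_seq_shift(ctx, 0, 64, 128, -32); // slide positions [64, 128) of seq 0 back by 32
        llama_kv_cache_seq_keep(ctx, 0);                // keep only sequence 0 in the cache
    }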
@@ -6649,6 +7039,16 @@ struct llama_data_file_context : llama_data_context {
  *
  */
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+ // TODO: does not support multi-sequence states
+ {
+ const auto & kv_self = ctx->kv_self;
+ for (uint32_t i = 0; i < kv_self.head; ++i) {
+ GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
+ GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
+ GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
+ }
+ }
+
  // copy rng
  {
  std::stringstream rng_ss;
@@ -6699,12 +7099,14 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  {
  const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
+ const auto & cparams = ctx->cparams;
+
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd_gqa();
- const int n_ctx = hparams.n_ctx;
+ const int n_ctx = cparams.n_ctx;
 
  const size_t kv_size = kv_self.buf.size;
- const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+ const int kv_ntok = kv_self.head;
 
  data_ctx->write(&kv_size, sizeof(kv_size));
  data_ctx->write(&kv_ntok, sizeof(kv_ntok));
@@ -6807,9 +7209,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  {
  const auto & kv_self = ctx->kv_self;
  const auto & hparams = ctx->model.hparams;
+ const auto & cparams = ctx->cparams;
+
  const int n_layer = hparams.n_layer;
  const int n_embd = hparams.n_embd_gqa();
- const int n_ctx = hparams.n_ctx;
+ const int n_ctx = cparams.n_ctx;
 
  size_t kv_size;
  int kv_ntok;
@@ -6848,7 +7252,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  ggml_free(cpy_ctx);
  }
 
- ctx->kv_self.n = kv_ntok;
+ ctx->kv_self.head = kv_ntok;
+ ctx->kv_self.size = kv_size;
  }
 
  const size_t nread = inp - src;
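
Note: state (de)serialization above now round-trips kv_self.head and kv_self.size instead of kv_self.n. Purely as a sketch of the surrounding public state API (whose signatures these hunks do not change), a save/restore round trip could look like:

    // sketch: snapshot and restore of context state via the existing state API
    #include "llama.h"
    #include <stdint.h>
    #include <stdlib.h>

    static void snapshot_roundtrip(struct llama_context * ctx) {
        const size_t max_size = llama_get_state_size(ctx); // upper bound on serialized size
        uint8_t * buf = (uint8_t *) malloc(max_size);
        if (buf == NULL) {
            return;
        }
        llama_copy_state_data(ctx, buf); // serializes rng, logits, embeddings and the KV cache
        // ... decode more tokens, then roll back:
        llama_set_state_data(ctx, buf);  // restores kv_self.head/size as written above
        free(buf);
    }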
@@ -6943,64 +7348,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 
  int llama_eval(
  struct llama_context * ctx,
- const llama_token * tokens,
- int n_tokens,
- int n_past,
- int n_threads) {
- if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
- return 1;
- }
+ llama_token * tokens,
+ int32_t n_tokens,
+ int n_past) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
 
- // get a more accurate load time, upon first eval
- // TODO: fix this
- if (!ctx->has_evaluated_once) {
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
- ctx->has_evaluated_once = true;
+ const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }
 
- return 0;
+ return ret;
  }
 
  int llama_eval_embd(
  struct llama_context * ctx,
- const float * embd,
- int n_tokens,
- int n_past,
- int n_threads) {
- if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
- return 1;
- }
+ float * embd,
+ int32_t n_tokens,
+ int n_past) {
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
 
- // get a more accurate load time, upon first eval
- // TODO: fix this
- if (!ctx->has_evaluated_once) {
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
- ctx->has_evaluated_once = true;
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+
+ const int ret = llama_decode_internal(*ctx, batch);
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  }
 
- return 0;
+ return ret;
  }
 
- int llama_eval_export(struct llama_context * ctx, const char * fname) {
- const int n_batch = 1;
- const int n_ctx = 512 - n_batch;
+ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+ ctx->cparams.n_threads = n_threads;
+ ctx->cparams.n_threads_batch = n_threads_batch;
+ }
+
+ struct llama_batch llama_batch_get_one(
+ llama_token * tokens,
+ int32_t n_tokens,
+ llama_pos pos_0,
+ llama_seq_id seq_id) {
+ return {
+ /*n_tokens =*/ n_tokens,
+ /*tokens =*/ tokens,
+ /*embd =*/ nullptr,
+ /*pos =*/ nullptr,
+ /*seq_id =*/ nullptr,
+ /*logits =*/ nullptr,
+ /*all_pos_0 =*/ pos_0,
+ /*all_pos_1 =*/ 1,
+ /*all_seq_id =*/ seq_id,
+ };
+ }
 
- const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
+ llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
 
- if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
- return 1;
+ if (embd) {
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
+ } else {
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
  }
 
- return 0;
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
+ batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
+
+ return batch;
+ }
+
+ void llama_batch_free(struct llama_batch batch) {
+ if (batch.token) free(batch.token);
+ if (batch.embd) free(batch.embd);
+ if (batch.pos) free(batch.pos);
+ if (batch.seq_id) free(batch.seq_id);
+ if (batch.logits) free(batch.logits);
+ }
+
+ int llama_decode(
+ struct llama_context * ctx,
+ struct llama_batch batch) {
+ const int ret = llama_decode_internal(*ctx, batch);
+ if (ret < 0) {
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
+ }
+
+ return ret;
  }
 
  float * llama_get_logits(struct llama_context * ctx) {
  return ctx->logits.data();
  }
 
+ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+ return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
+ }
+
  float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }
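
Note: llama_eval is now a thin wrapper over the llama_batch/llama_decode path added in this hunk. A minimal decode sketch built only from the functions introduced above (prompt contents and buffer sizes are illustrative, error handling trimmed):

    // sketch: feeding a prompt through the new batch API
    #include "llama.h"

    static int decode_prompt(struct llama_context * ctx, const llama_token * prompt, int32_t n_prompt) {
        struct llama_batch batch = llama_batch_init(n_prompt, /*embd =*/ 0); // token batch

        batch.n_tokens = n_prompt;
        for (int32_t i = 0; i < n_prompt; ++i) {
            batch.token[i]  = prompt[i];
            batch.pos[i]    = i;                   // absolute positions
            batch.seq_id[i] = 0;                   // a single sequence, id 0
            batch.logits[i] = (i == n_prompt - 1); // only request logits for the last token
        }

        const int ret = llama_decode(ctx, batch);
        if (ret == 0) {
            const float * last_logits = llama_get_logits_ith(ctx, n_prompt - 1);
            (void) last_logits;                    // sample the next token from these
        }

        llama_batch_free(batch);
        return ret;
    }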
@@ -7030,16 +7473,6 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
  }
 
  int llama_tokenize(
- struct llama_context * ctx,
- const char * text,
- int text_len,
- llama_token * tokens,
- int n_max_tokens,
- bool add_bos) {
- return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
- }
-
- int llama_tokenize_with_model(
  const struct llama_model * model,
  const char * text,
  int text_len,
@@ -7060,13 +7493,9 @@ int llama_tokenize_with_model(
  return res.size();
  }
 
- int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
- return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
- }
-
  // does not write null-terminator to buf
- int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
- if (0 <= token && token < llama_model_n_vocab(model)) {
+ int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+ if (0 <= token && token < llama_n_vocab(model)) {
  if (llama_is_normal_token(model->vocab, token)) {
  std::string result = model->vocab.id_to_token[token].text;
  if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
@@ -7086,7 +7515,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke
  buf[2] = '\x85';
  return 3;
  } else if (llama_is_control_token(model->vocab, token)) {
- ;
+ // do nothing
  } else if (llama_is_byte_token(model->vocab, token)) {
  if (length < 1) {
  return -1;
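
Note: llama_tokenize and llama_token_to_piece now take the llama_model directly (the *_with_model variants are folded in above). A hedged round-trip sketch, assuming the remaining tokenize parameters (tokens, n_max_tokens, add_bos) keep the shape shown in the removed wrapper:

    // sketch: tokenize with the model-based API and print the pieces back
    #include "llama.h"
    #include <stdio.h>
    #include <string.h>

    static void tokenize_demo(const struct llama_model * model, const char * text) {
        llama_token tokens[256];
        const int n = llama_tokenize(model, text, (int) strlen(text), tokens, 256, /*add_bos =*/ true);
        if (n < 0) {
            return; // buffer too small
        }
        for (int i = 0; i < n; ++i) {
            char piece[64];
            const int len = llama_token_to_piece(model, tokens[i], piece, (int) sizeof(piece));
            if (len > 0) {
                printf("%.*s", len, piece); // note: no null terminator is written to piece
            }
        }
        printf("\n");
    }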
@@ -7194,12 +7623,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
  return ctx->model.tensors_by_name;
  }
 
- void llama_log_set(llama_log_callback log_callback, void * user_data) {
+ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
  g_state.log_callback_user_data = user_data;
  }
 
- static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+ static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
  va_list args_copy;
  va_copy(args_copy, args);
  char buffer[128];
@@ -7216,14 +7645,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
  va_end(args_copy);
  }
 
- static void llama_log_internal(llama_log_level level, const char * format, ...) {
+ static void llama_log_internal(ggml_log_level level, const char * format, ...) {
  va_list args;
  va_start(args, format);
  llama_log_internal_v(level, format, args);
  va_end(args);
  }
 
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
  (void) level;
  (void) user_data;
  fputs(text, stderr);