llama_cpp 0.5.2 → 0.6.0

@@ -1,3 +1,4 @@
+ #define LLAMA_API_INTERNAL
  #include "llama.h"

  #include "ggml.h"
@@ -71,6 +72,7 @@
  #include <sstream>
  #include <thread>
  #include <unordered_map>
+ #include <set>

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -91,12 +93,12 @@
  //

  LLAMA_ATTRIBUTE_FORMAT(2, 3)
- static void llama_log_internal        (llama_log_level level, const char* format, ...);
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+ static void llama_log_internal        (ggml_log_level level, const char* format, ...);
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);

- #define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
- #define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
- #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+ #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+ #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+ #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)

  //
  // helpers
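In this release the logging plumbing moves from llama-specific types to the ggml ones: the internal helpers now take a ggml_log_level and the LLAMA_LOG_* macros forward GGML_LOG_LEVEL_* values. A minimal sketch of hooking a custom callback from application code, assuming the llama_log_set(ggml_log_callback, void *) entry point declared in the bundled llama.h:

    #include <cstdio>
    #include "llama.h"

    // Forward warnings and errors to stderr, drop informational messages.
    // Sketch only - verify the llama_log_set signature against the bundled llama.h.
    static void my_log_cb(ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
            fputs(text, stderr);
        }
    }

    int main() {
        llama_log_set(my_log_cb, nullptr);
        // ... load a model, create a context, run inference ...
        return 0;
    }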
@@ -108,7 +110,7 @@ static size_t utf8_len(char src) {
  return lookup[highbits];
  }

- void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
  std::string result;
  for (size_t pos = 0; ; pos += search.length()) {
  auto new_pos = s.find(search, pos);
@@ -160,17 +162,19 @@ enum llm_arch {
  LLM_ARCH_GPTJ,
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
+ LLM_ARCH_STARCODER,
  LLM_ARCH_UNKNOWN,
  };

  static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN,"baichuan" },
+ { LLM_ARCH_LLAMA,     "llama"     },
+ { LLM_ARCH_FALCON,    "falcon"    },
+ { LLM_ARCH_GPT2,      "gpt2"      },
+ { LLM_ARCH_GPTJ,      "gptj"      },
+ { LLM_ARCH_GPTNEOX,   "gptneox"   },
+ { LLM_ARCH_MPT,       "mpt"       },
+ { LLM_ARCH_BAICHUAN,  "baichuan"  },
+ { LLM_ARCH_STARCODER, "starcoder" },
  };

  enum llm_kv {
@@ -218,16 +222,16 @@ enum llm_kv {
  };

  static std::map<llm_kv, std::string> LLM_KV_NAMES = {
- { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
- { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
- { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
- { LLM_KV_GENERAL_NAME, "general.name" },
- { LLM_KV_GENERAL_AUTHOR, "general.author" },
- { LLM_KV_GENERAL_URL, "general.url" },
- { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
- { LLM_KV_GENERAL_LICENSE, "general.license" },
- { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" },
- { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" },
+ { LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"                  },
+ { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version"          },
+ { LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"                     },
+ { LLM_KV_GENERAL_NAME,                 "general.name"                          },
+ { LLM_KV_GENERAL_AUTHOR,               "general.author"                        },
+ { LLM_KV_GENERAL_URL,                  "general.url"                           },
+ { LLM_KV_GENERAL_DESCRIPTION,          "general.description"                   },
+ { LLM_KV_GENERAL_LICENSE,              "general.license"                       },
+ { LLM_KV_GENERAL_SOURCE_URL,           "general.source.url"                    },
+ { LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source.huggingface.repository" },

  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -376,6 +380,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  },
  },
+ {
+ LLM_ARCH_STARCODER,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -430,7 +449,7 @@ struct LLM_TN {
  //

  #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
- { \
+ do { \
  const std::string skey(key); \
  const int kid = gguf_find_key(ctx, skey.c_str()); \
  if (kid >= 0) { \
@@ -442,7 +461,7 @@ struct LLM_TN {
  } else if (req) { \
  throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
  } \
- }
+ } while (0)

  //
  // ggml helpers
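The GGUF_GET_KEY body is now wrapped in do { ... } while (0) instead of bare braces. The idiom turns a multi-statement macro into a single statement, so the semicolon the caller writes after it no longer breaks if/else chains. A small standalone illustration of the pitfall with a hypothetical macro (not the real one):

    #include <cstdio>

    // With plain braces, "RESET_IF_BIG(x); else ..." expands to "{ ... }; else ..." - a syntax error.
    // The do/while(0) wrapper absorbs the trailing semicolon and keeps the if/else pairing intact.
    #define RESET_IF_BIG(x) do { if ((x) > 10) { printf("resetting %d\n", (x)); (x) = 0; } } while (0)

    static void tick(int & counter) {
        if (counter % 2 == 0)
            RESET_IF_BIG(counter);   // legal: the whole macro is one statement
        else
            ++counter;
    }

    int main() {
        int c = 12;
        tick(c);             // prints "resetting 12", c becomes 0
        printf("%d\n", c);
        return 0;
    }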
@@ -680,6 +699,7 @@ struct llama_mmap {
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
  fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
+ }
  }
  #else
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -862,10 +882,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default

  static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
  GGML_ASSERT(check == -n_tokens);
  } else {
  result.resize(n_tokens);
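llama_token_to_piece now takes the model instead of the context, but the helper keeps the same two-pass contract: a negative return value is the negated buffer size that would have been needed, so the caller resizes and calls again. A self-contained sketch of that contract against a toy converter (the stub below is illustrative, not the real llama.cpp API):

    #include <cassert>
    #include <cstring>
    #include <string>
    #include <vector>

    // Same contract as llama_token_to_piece: write into buf and return the byte count,
    // or return the negated required size if buf is too small.
    static int token_to_piece_stub(int token, char * buf, int len) {
        const std::string piece = " token-" + std::to_string(token);
        if ((int) piece.size() > len) {
            return -(int) piece.size();
        }
        memcpy(buf, piece.data(), piece.size());
        return (int) piece.size();
    }

    static std::string token_to_string(int token) {
        std::vector<char> buf(8, 0);                                     // optimistic small buffer
        int n = token_to_piece_stub(token, buf.data(), (int) buf.size());
        if (n < 0) {
            buf.resize(-n);                                              // grow to the reported size
            n = token_to_piece_stub(token, buf.data(), (int) buf.size());
            assert(n == (int) buf.size());
        }
        return std::string(buf.data(), n);
    }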
@@ -880,7 +900,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to

  struct llama_state {
  // We save the log callback globally
- llama_log_callback log_callback = llama_log_callback_default;
+ ggml_log_callback log_callback = llama_log_callback_default;
  void * log_callback_user_data = nullptr;
  };

@@ -889,9 +909,11 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_1B,
  MODEL_3B,
  MODEL_7B,
  MODEL_13B,
+ MODEL_15B,
  MODEL_30B,
  MODEL_34B,
  MODEL_40B,
@@ -901,24 +923,24 @@ enum e_model {

  static const size_t kB = 1024;
  static const size_t MB = kB*kB;
+ static const size_t GB = kB*kB*kB;

- // default hparams (LLaMA 7B)
  struct llama_hparams {
- uint32_t n_vocab = 32000;
- uint32_t n_ctx_train = 2048; // the context size used during training
- uint32_t n_ctx = 512; // the context size used during inference
- uint32_t n_embd = 4096;
- uint32_t n_head = 32;
- uint32_t n_head_kv = 32;
- uint32_t n_layer = 32;
- uint32_t n_rot = 64;
- uint32_t n_ff = 11008;
-
- float f_norm_eps = 1e-5;
- float f_norm_rms_eps = 1e-5;
-
- float rope_freq_base = 10000.0f;
- float rope_freq_scale = 1.0f;
+ bool vocab_only;
+ uint32_t n_vocab;
+ uint32_t n_ctx_train; // context size the model was trained on
+ uint32_t n_embd;
+ uint32_t n_head;
+ uint32_t n_head_kv;
+ uint32_t n_layer;
+ uint32_t n_rot;
+ uint32_t n_ff;
+
+ float f_norm_eps;
+ float f_norm_rms_eps;
+
+ float rope_freq_base_train;
+ float rope_freq_scale_train;

  bool operator!=(const llama_hparams & other) const {
  return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -935,15 +957,18 @@ struct llama_hparams {
  uint32_t n_embd_gqa() const {
  return n_embd/n_gqa();
  }
+ };

- size_t kv_size() const {
- size_t result = 2ull;
- result *= (size_t) n_embd_gqa();
- result *= (size_t) n_ctx;
- result *= (size_t) n_layer;
- result *= sizeof(ggml_fp16_t);
- return result;
- }
+ struct llama_cparams {
+ uint32_t n_ctx; // context size used during inference
+ uint32_t n_batch;
+ uint32_t n_threads; // number of threads to use for generation
+ uint32_t n_threads_batch; // number of threads to use for batch processing
+
+ float rope_freq_base;
+ float rope_freq_scale;
+
+ bool mul_mat_q;
  };

  struct llama_layer {
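The old llama_hparams mixed what the GGUF file says about the model with per-run settings; it is now split so that llama_hparams carries only train-time metadata while the new llama_cparams holds the inference-time context configuration (context size, batch size, threads, RoPE overrides). The *_train suffix suggests that an unset context parameter falls back to the trained value; the exact wiring lives in code not shown in this hunk, so the sketch below is an assumption, using trimmed-down local copies of the two structs:

    #include <cstdint>

    // Trimmed copies of the structs above, for illustration only.
    struct hparams_t { float rope_freq_base_train; float rope_freq_scale_train; };
    struct cparams_t { uint32_t n_ctx; float rope_freq_base; float rope_freq_scale; };

    // Assumed policy: a zero user setting means "use whatever the model was trained with".
    static cparams_t make_cparams(const hparams_t & hp, uint32_t n_ctx, float freq_base, float freq_scale) {
        cparams_t cp;
        cp.n_ctx           = n_ctx;
        cp.rope_freq_base  = freq_base  != 0.0f ? freq_base  : hp.rope_freq_base_train;
        cp.rope_freq_scale = freq_scale != 0.0f ? freq_scale : hp.rope_freq_scale_train;
        return cp;
    }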
@@ -960,16 +985,47 @@ struct llama_layer {
  struct ggml_tensor * wo;
  struct ggml_tensor * wqkv;

+ // attention bias
+ struct ggml_tensor * bo;
+ struct ggml_tensor * bqkv;
+
  // normalization
  struct ggml_tensor * ffn_norm;
+ struct ggml_tensor * ffn_norm_b;

  // ff
  struct ggml_tensor * w1; // ffn_gate
  struct ggml_tensor * w2; // ffn_down
  struct ggml_tensor * w3; // ffn_up
+
+ // ff bias
+ struct ggml_tensor * b2; // ffn_down
+ struct ggml_tensor * b3; // ffn_up
+ };
+
+ struct llama_kv_cell {
+ llama_pos pos = -1;
+ llama_pos delta = 0;
+
+ std::set<llama_seq_id> seq_id;
+
+ bool has_seq_id(const llama_seq_id & id) const {
+ return seq_id.find(id) != seq_id.end();
+ }
  };

+ // ring-buffer of cached KV data
  struct llama_kv_cache {
+ bool has_shift = false;
+
+ uint32_t head = 0;
+ uint32_t size = 0;
+
+ // computed before each graph build
+ uint32_t n = 0;
+
+ std::vector<llama_kv_cell> cells;
+
  struct ggml_tensor * k = NULL;
  struct ggml_tensor * v = NULL;

@@ -977,8 +1033,6 @@ struct llama_kv_cache {

  llama_buffer buf;

- int n; // number of tokens currently in the cache
-
  ~llama_kv_cache() {
  if (ctx) {
  ggml_free(ctx);
@@ -1040,10 +1094,11 @@ struct llama_model {

  std::string name = "n/a";

- llama_hparams hparams;
+ llama_hparams hparams = {};
  llama_vocab vocab;

  struct ggml_tensor * tok_embeddings;
+ struct ggml_tensor * pos_embeddings;

  struct ggml_tensor * output_norm;
  struct ggml_tensor * output_norm_b;
@@ -1091,11 +1146,8 @@ struct llama_model {
  };

  struct llama_context {
- llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+ llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
  ~llama_context() {
- if (model_owner) {
- delete &model;
- }
  #ifdef GGML_USE_METAL
  if (ctx_metal) {
  ggml_metal_free(ctx_metal);
@@ -1106,27 +1158,26 @@ struct llama_context {
  }
  }

+ llama_cparams cparams;
+
+ const llama_model & model;
+
+ // key + value cache for the self attention
+ struct llama_kv_cache kv_self;
+
  std::mt19937 rng;

  bool has_evaluated_once = false;

+ int64_t t_start_us;
+ int64_t t_load_us;
  int64_t t_sample_us = 0;
- int64_t t_eval_us = 0;
  int64_t t_p_eval_us = 0;
+ int64_t t_eval_us = 0;

  int32_t n_sample = 0; // number of tokens sampled
- int32_t n_eval = 0; // number of eval calls
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
- const llama_model & model;
-
- bool model_owner = false;
-
- int64_t t_load_us;
- int64_t t_start_us;
-
- // key + value cache for the self attention
- struct llama_kv_cache kv_self;
+ int32_t n_eval = 0; // number of eval calls

  // decode output (2-dimensional array: [n_tokens][n_vocab])
  std::vector<float> logits;
@@ -1161,16 +1212,23 @@ static bool llama_kv_cache_init(
  const struct llama_hparams & hparams,
  struct llama_kv_cache & cache,
  ggml_type wtype,
- int n_ctx,
+ uint32_t n_ctx,
  int n_gpu_layers) {
- const int n_embd = hparams.n_embd_gqa();
- const int n_layer = hparams.n_layer;
+ const uint32_t n_embd = hparams.n_embd_gqa();
+ const uint32_t n_layer = hparams.n_layer;

  const int64_t n_mem = n_layer*n_ctx;
  const int64_t n_elements = n_embd*n_mem;

+ cache.has_shift = false;
+
+ cache.head = 0;
+ cache.size = n_ctx;
+
+ cache.cells.clear();
+ cache.cells.resize(n_ctx);
+
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
- cache.n = 0;

  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
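The cache buffer is still sized from the same product as before: two tensors (K and V), each storing n_embd_gqa elements of wtype per token, per layer, for n_ctx tokens, plus a 2 MB safety margin. A quick worked example with illustrative numbers (a LLaMA-7B-style model, where n_head_kv == n_head so n_embd_gqa == n_embd):

    #include <cstdio>

    int main() {
        const double n_embd_gqa = 4096;
        const double n_layer    = 32;
        const double n_ctx      = 4096;
        const double type_size  = 2;    // f16 cache elements

        const double kv_bytes = 2 * n_embd_gqa * n_layer * n_ctx * type_size;
        printf("KV cache: %.2f MiB\n", kv_bytes / 1024.0 / 1024.0);   // 2048.00 MiB at these settings
        return 0;
    }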
@@ -1191,17 +1249,154 @@ static bool llama_kv_cache_init(

  (void) n_gpu_layers;
  #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer + 1) {
+ size_t vram_kv_cache = 0;
+
+ if (n_gpu_layers > (int)n_layer + 1) {
  ggml_cuda_assign_buffers_no_scratch(cache.v);
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.v);
  }
- if (n_gpu_layers > n_layer + 2) {
+ if (n_gpu_layers > (int)n_layer + 2) {
  ggml_cuda_assign_buffers_no_scratch(cache.k);
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.k);
+ }
+ if (vram_kv_cache > 0) {
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
  }
  #endif // GGML_USE_CUBLAS

  return true;
  }

+ // find an empty slot of size "n_tokens" in the cache
+ // updates the cache head
+ static bool llama_kv_cache_find_slot(
+ struct llama_kv_cache & cache,
+ const struct llama_batch & batch) {
+ const uint32_t n_ctx = cache.size;
+ const uint32_t n_tokens = batch.n_tokens;
+
+ if (n_tokens > n_ctx) {
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+ return false;
+ }
+
+ uint32_t n_tested = 0;
+
+ while (true) {
+ if (cache.head + n_tokens > n_ctx) {
+ cache.head = 0;
+ n_tested += n_ctx - cache.head;
+ continue;
+ }
+
+ bool found = true;
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ if (cache.cells[cache.head + i].pos >= 0) {
+ found = false;
+ cache.head += i + 1;
+ n_tested += i + 1;
+ break;
+ }
+ }
+
+ if (found) {
+ break;
+ }
+
+ if (n_tested >= n_ctx) {
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+ return false;
+ }
+ }
+
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ cache.cells[cache.head + i].pos = batch.pos[i];
+ cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+ }
+
+ return true;
+ }
+
+ // find how many cells are currently in use
+ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+ for (uint32_t i = cache.size - 1; i > 0; --i) {
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+ return i + 1;
+ }
+ }
+
+ return 0;
+ }
+
+ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+ if (c0 < 0) c0 = 0;
+ if (c1 < 0) c1 = cache.size;
+
+ for (int32_t i = c0; i < c1; ++i) {
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
+ }
+ }
+
+ static void llama_kv_cache_seq_rm(
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1) {
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ cache.cells[i].seq_id.erase(seq_id);
+ if (cache.cells[i].seq_id.empty()) {
+ cache.cells[i].pos = -1;
+ }
+ }
+ }
+ }
+
+ static void llama_kv_cache_seq_cp(
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1) {
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ cache.cells[i].seq_id.insert(seq_id_dst);
+ }
+ }
+ }
+
+ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (!cache.cells[i].has_seq_id(seq_id)) {
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
+ }
+ }
+ }
+
+ static void llama_kv_cache_seq_shift(
+ struct llama_kv_cache & cache,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta) {
+ for (uint32_t i = 0; i < cache.size; ++i) {
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+ cache.cells[i].pos += delta;
+ if (cache.cells[i].pos < 0) {
+ cache.cells[i].pos = -1;
+ cache.cells[i].seq_id.clear();
+ } else {
+ cache.has_shift = true;
+ cache.cells[i].delta = delta;
+ }
+ }
+ }
+ }
+
  //
  // model loading and saving
  //
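These helpers are the heart of the new cache design: instead of a single n_past counter, every cell remembers its position and the set of sequence ids that reference it, llama_kv_cache_find_slot looks for a contiguous run of free cells (wrapping head back to zero at the end of the buffer), and the _seq_* functions remove, copy, keep or shift entries purely by filtering on (seq_id, pos). A compact standalone model of the slot search on plain integers (simplified sketch, not the real implementation):

    #include <cstdint>
    #include <set>
    #include <vector>

    struct cell { int32_t pos = -1; std::set<int32_t> seq_id; };

    // Returns the index of the first run of n_tokens free cells, or -1 if there is no room.
    // Mirrors the wrap-around scan of llama_kv_cache_find_slot in spirit only.
    static int32_t find_slot(std::vector<cell> & cells, uint32_t & head, uint32_t n_tokens) {
        const uint32_t n_ctx = (uint32_t) cells.size();
        if (n_tokens > n_ctx) return -1;

        uint32_t n_tested = 0;
        while (true) {
            if (head + n_tokens > n_ctx) { n_tested += n_ctx - head; head = 0; continue; }

            bool found = true;
            for (uint32_t i = 0; i < n_tokens; i++) {
                if (cells[head + i].pos >= 0) {   // occupied - skip past it
                    head     += i + 1;
                    n_tested += i + 1;
                    found = false;
                    break;
                }
            }
            if (found) return (int32_t) head;
            if (n_tested >= n_ctx) return -1;     // scanned the whole ring, cache is full
        }
    }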
@@ -1244,6 +1439,7 @@ struct llama_model_loader {
  int n_created = 0;

  int64_t n_elements = 0;
+ size_t n_bytes = 0;

  bool use_mmap = false;

@@ -1276,6 +1472,7 @@ struct llama_model_loader {
  const char * name = gguf_get_tensor_name(ctx_gguf, i);
  struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
  n_elements += ggml_nelements(t);
+ n_bytes += ggml_nbytes(t);
  }

  LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1521,7 +1718,7 @@ struct llama_model_loader {
  lmlock->grow_to(size_lock);
  }
  break;
- #if defined(GGML_USE_CUBLAS)
+ #ifdef GGML_USE_CUBLAS
  case GGML_BACKEND_GPU:
  case GGML_BACKEND_GPU_SPLIT:
  // old code:
@@ -1554,7 +1751,15 @@ struct llama_model_loader {
  // load LLaMA models
  //

- std::string llama_model_ftype_name(enum llama_ftype ftype) {
+ static std::string llama_model_arch_name(llm_arch arch) {
+ auto it = LLM_ARCH_NAMES.find(arch);
+ if (it == LLM_ARCH_NAMES.end()) {
+ return "unknown";
+ }
+ return it->second;
+ }
+
+ static std::string llama_model_ftype_name(llama_ftype ftype) {
  if (ftype & LLAMA_FTYPE_GUESSED) {
  return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
  }
@@ -1587,9 +1792,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {

  static const char * llama_model_type_name(e_model type) {
  switch (type) {
+ case MODEL_1B: return "1B";
  case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
  case MODEL_13B: return "13B";
+ case MODEL_15B: return "15B";
  case MODEL_30B: return "30B";
  case MODEL_34B: return "34B";
  case MODEL_40B: return "40B";
@@ -1608,10 +1815,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {

  static void llm_load_hparams(
  llama_model_loader & ml,
- llama_model & model,
- int n_ctx,
- float rope_freq_base,
- float rope_freq_scale) {
+ llama_model & model) {
  struct gguf_context * ctx = ml.ctx_gguf;

  const auto kv = LLM_KV(model.arch);
@@ -1622,40 +1826,25 @@ static void llm_load_hparams(
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

  // get hparams kv
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
+ GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, kv(LLM_KV_TOKENIZER_LIST));
+ GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
+ GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+ GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+ GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+ GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));

  // n_head_kv is optional, default to n_head
  hparams.n_head_kv = hparams.n_head;
  GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

- // TODO: manually setting rope freq base and scale should override this
- // FIXME: partial fix when the param specified is not the default value, but
- // will not work for overriding the model value to the params default
-
- llama_context_params defaults = llama_context_default_params();
-
- // rope_freq_base
- {
- float ropebase = 10000.0f;
- GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
- if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
- rope_freq_base = ropebase;
- }
- }
+ // rope_freq_base (optional)
+ hparams.rope_freq_base_train = 10000.0f;
+ GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));

  // rope_freq_scale (inverse of the kv) is optional
- {
- float ropescale = 1.0f;
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
- if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
- rope_freq_scale = 1.0f/ropescale;
- }
- }
+ float ropescale = 1.0f;
+ GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+ hparams.rope_freq_scale_train = 1.0f/ropescale;

  // sanity check for n_rot (optional)
  {
@@ -1707,14 +1896,21 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_STARCODER:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 36: model.type = e_model::MODEL_3B; break;
+ case 42: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_15B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
- };
+ }

  model.ftype = ml.ftype;
-
- hparams.n_ctx = n_ctx;
- hparams.rope_freq_base = rope_freq_base;
- hparams.rope_freq_scale = rope_freq_scale;
  }

  // TODO: This should probably be in llama.h
@@ -1735,20 +1931,18 @@ static void llm_load_vocab(
  throw std::runtime_error("cannot find tokenizer vocab in model file\n");
  }

+ const float * scores = nullptr;
  const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
- if (score_idx == -1) {
- throw std::runtime_error("cannot find tokenizer scores in model file\n");
+ if (score_idx != -1) {
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
  }

- const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
+ const int * toktypes = nullptr;
  const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
- if (toktype_idx == -1) {
- throw std::runtime_error("cannot find token type list in GGUF file\n");
+ if (toktype_idx != -1) {
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
  }

- const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
  // determine vocab type
  {
  std::string tokenizer_name;
@@ -1816,8 +2010,8 @@ static void llm_load_vocab(

  auto & token_data = vocab.id_to_token[i];
  token_data.text = std::move(word);
- token_data.score = scores[i];
- token_data.type = (llama_token_type) toktypes[i];
+ token_data.score = scores ? scores[i] : 0.0f;
+ token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
  }

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -1840,27 +2034,31 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  const auto & vocab = model.vocab;

  // hparams
- LLAMA_LOG_INFO("%s: format         = %s\n", __func__, llama_file_version_name(ml.fver));
- LLAMA_LOG_INFO("%s: arch           = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
- LLAMA_LOG_INFO("%s: vocab type     = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
- LLAMA_LOG_INFO("%s: n_vocab        = %u\n", __func__, hparams.n_vocab);
- LLAMA_LOG_INFO("%s: n_merges       = %u\n", __func__, (int) vocab.bpe_ranks.size());
- LLAMA_LOG_INFO("%s: n_ctx_train    = %u\n", __func__, hparams.n_ctx_train);
- LLAMA_LOG_INFO("%s: n_ctx          = %u\n", __func__, hparams.n_ctx);
- LLAMA_LOG_INFO("%s: n_embd         = %u\n", __func__, hparams.n_embd);
- LLAMA_LOG_INFO("%s: n_head         = %u\n", __func__, hparams.n_head);
- LLAMA_LOG_INFO("%s: n_head_kv      = %u\n", __func__, hparams.n_head_kv);
- LLAMA_LOG_INFO("%s: n_layer        = %u\n", __func__, hparams.n_layer);
- LLAMA_LOG_INFO("%s: n_rot          = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
- LLAMA_LOG_INFO("%s: n_gqa          = %u\n", __func__, hparams.n_gqa());
- LLAMA_LOG_INFO("%s: f_norm_eps     = %.1e\n", __func__, hparams.f_norm_eps);
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
- LLAMA_LOG_INFO("%s: n_ff           = %u\n", __func__, hparams.n_ff);
- LLAMA_LOG_INFO("%s: freq_base      = %.1f\n", __func__, hparams.rope_freq_base);
- LLAMA_LOG_INFO("%s: freq_scale     = %g\n", __func__, hparams.rope_freq_scale);
- LLAMA_LOG_INFO("%s: model type     = %s\n", __func__, llama_model_type_name(model.type));
- LLAMA_LOG_INFO("%s: model ftype    = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
- LLAMA_LOG_INFO("%s: model size     = %.2f B\n", __func__, ml.n_elements*1e-9);
+ LLAMA_LOG_INFO("%s: format           = %s\n", __func__, llama_file_version_name(ml.fver));
+ LLAMA_LOG_INFO("%s: arch             = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+ LLAMA_LOG_INFO("%s: vocab type       = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+ LLAMA_LOG_INFO("%s: n_vocab          = %u\n", __func__, hparams.n_vocab);
+ LLAMA_LOG_INFO("%s: n_merges         = %u\n", __func__, (int) vocab.bpe_ranks.size());
+ LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n", __func__, hparams.n_ctx_train);
+ LLAMA_LOG_INFO("%s: n_embd           = %u\n", __func__, hparams.n_embd);
+ LLAMA_LOG_INFO("%s: n_head           = %u\n", __func__, hparams.n_head);
+ LLAMA_LOG_INFO("%s: n_head_kv        = %u\n", __func__, hparams.n_head_kv);
+ LLAMA_LOG_INFO("%s: n_layer          = %u\n", __func__, hparams.n_layer);
+ LLAMA_LOG_INFO("%s: n_rot            = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+ LLAMA_LOG_INFO("%s: n_gqa            = %u\n", __func__, hparams.n_gqa());
+ LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n", __func__, hparams.f_norm_eps);
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n", __func__, hparams.f_norm_rms_eps);
+ LLAMA_LOG_INFO("%s: n_ff             = %u\n", __func__, hparams.n_ff);
+ LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+ LLAMA_LOG_INFO("%s: model type       = %s\n", __func__, llama_model_type_name(model.type));
+ LLAMA_LOG_INFO("%s: model ftype      = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
+ LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+ if (ml.n_bytes < GB) {
+ LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ } else {
+ LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ }

  // general kv
  LLAMA_LOG_INFO("%s: general.name   = %s\n", __func__, model.name.c_str());
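With n_bytes tracked alongside n_elements, the loader can now report bits per weight: BPW = n_bytes * 8 / n_elements. As an illustrative (made-up) example, a ~7 billion parameter model stored in about 3.8 GiB comes out to roughly 4.7 BPW:

    #include <cstdio>

    int main() {
        const double n_elements = 7.0e9;                        // illustrative parameter count
        const double n_bytes    = 3.8 * 1024 * 1024 * 1024;     // illustrative file size (~3.8 GiB)
        printf("model size = %.2f GiB (%.2f BPW)\n",
               n_bytes / 1024.0 / 1024.0 / 1024.0, n_bytes * 8.0 / n_elements);   // ~4.66 BPW
        return 0;
    }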
@@ -1877,13 +2075,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  static void llm_load_tensors(
  llama_model_loader & ml,
  llama_model & model,
- int n_batch,
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
- const bool mul_mat_q,
- bool low_vram,
- ggml_type memory_type,
  bool use_mlock,
  llama_progress_callback progress_callback,
  void * progress_callback_user_data) {
@@ -1922,11 +2116,9 @@ static void llm_load_tensors(
  }

  (void) main_gpu;
- (void) mul_mat_q;
- #if defined(GGML_USE_CUBLAS)
+ #ifdef GGML_USE_CUBLAS
  LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
  ggml_cuda_set_main_device(main_gpu);
- ggml_cuda_set_mul_mat_q(mul_mat_q);
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
  #elif defined(GGML_USE_CLBLAST)
@@ -1961,9 +2153,9 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
  #else
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
  #endif // _WIN32

  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2027,9 +2219,9 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
  #else
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
  #endif // _WIN32

  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2097,9 +2289,9 @@ static void llm_load_tensors(
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
  // on Windows however this is detrimental unless everything is on the GPU
  #ifndef _WIN32
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
  #else
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
  #endif // _WIN32

  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2160,29 +2352,100 @@ static void llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_STARCODER:
+ {
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend backend_norm;
+ ggml_backend backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ vram_weights += ggml_nbytes(model.output_norm_b);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
+ }
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
- };
+ }
  }

  ml.done_getting_tensors();

  // print memory requirements
  {
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
  // this is the total memory required to run the inference
  size_t mem_required =
  ctx_size +
  mmapped_size - vram_weights; // weights in VRAM not in memory

- // this is the memory required by one llama_state
- const size_t mem_required_state = scale*hparams.kv_size();
-
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-
- (void) n_batch;
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);

  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2191,36 +2454,17 @@ static void llm_load_tensors(
  if (n_gpu_layers > (int) hparams.n_layer) {
  LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
  }
- size_t vram_kv_cache = 0;

  #ifdef GGML_USE_CUBLAS
  const int max_backend_supported_layers = hparams.n_layer + 3;
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
- if (n_gpu_layers > (int) hparams.n_layer + 1) {
- if (low_vram) {
- LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
- } else {
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
- vram_kv_cache += hparams.kv_size() / 2;
- }
- }
- if (n_gpu_layers > (int) hparams.n_layer + 2) {
- if (low_vram) {
- LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
- } else {
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
- vram_kv_cache += hparams.kv_size() / 2;
- }
- }
+ const int max_offloadable_layers = hparams.n_layer + 3;
  #elif defined(GGML_USE_CLBLAST)
  const int max_backend_supported_layers = hparams.n_layer + 1;
  const int max_offloadable_layers = hparams.n_layer + 1;
  #endif // GGML_USE_CUBLAS

- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
- __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
- __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
  #else
  (void) n_gpu_layers;
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2233,7 +2477,7 @@ static void llm_load_tensors(
  }

  (void) tensor_split;
- #if defined(GGML_USE_CUBLAS)
+ #ifdef GGML_USE_CUBLAS
  {
  ggml_cuda_set_tensor_split(tensor_split);
  }
@@ -2255,29 +2499,24 @@ static void llm_load_tensors(
  static bool llama_model_load(
  const std::string & fname,
  llama_model & model,
- int n_ctx,
- int n_batch,
  int n_gpu_layers,
  int main_gpu,
  const float * tensor_split,
- const bool mul_mat_q,
- float rope_freq_base,
- float rope_freq_scale,
- bool low_vram,
- ggml_type memory_type,
  bool use_mmap,
  bool use_mlock,
  bool vocab_only,
  llama_progress_callback progress_callback,
  void *progress_callback_user_data) {
  try {
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+ llama_model_loader ml(fname, use_mmap);
+
+ model.hparams.vocab_only = vocab_only;

- llm_load_arch   (*ml, model);
- llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
- llm_load_vocab  (*ml, model);
+ llm_load_arch   (ml, model);
+ llm_load_hparams(ml, model);
+ llm_load_vocab  (ml, model);

- llm_load_print_meta(*ml, model);
+ llm_load_print_meta(ml, model);

  if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
  throw std::runtime_error("vocab size mismatch");
@@ -2289,8 +2528,8 @@ static bool llama_model_load(
  }

  llm_load_tensors(
- *ml, model, n_batch, n_gpu_layers,
- main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+ ml, model, n_gpu_layers,
+ main_gpu, tensor_split,
  use_mlock, progress_callback, progress_callback_user_data);
  } catch (const std::exception & err) {
  LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2302,17 +2541,10 @@ static bool llama_model_load(

  static struct ggml_cgraph * llm_build_llama(
  llama_context & lctx,
- const llama_token * tokens,
- const float * embd,
- int n_tokens,
- int n_past) {
-
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
- const int N = n_tokens;
-
+ const llama_batch & batch) {
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;
+ const auto & cparams = lctx.cparams;

  const auto & kv_self = lctx.kv_self;

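The graph builders no longer take (tokens, embd, n_tokens, n_past); everything now arrives through a llama_batch, and the fields read below are n_tokens, token, embd, pos and seq_id. A minimal sketch of filling such a batch for a single-sequence prompt, using a trimmed-down struct that carries only those fields (the public llama_batch in llama.h may have more members):

    #include <cstdint>
    #include <vector>

    // Only the fields the graph builder reads; not the full public llama_batch.
    struct batch_view {
        int32_t         n_tokens;
        const int32_t * token;    // token ids (nullptr when feeding embeddings instead)
        const float   * embd;
        const int32_t * pos;      // absolute position of each token
        const int32_t * seq_id;   // sequence each token belongs to
    };

    static batch_view make_prompt_batch(const std::vector<int32_t> & prompt,
                                        std::vector<int32_t> & pos,
                                        std::vector<int32_t> & seq) {
        pos.resize(prompt.size());
        seq.resize(prompt.size());
        for (size_t i = 0; i < prompt.size(); ++i) {
            pos[i] = (int32_t) i;   // explicit positions replace the old n_past bookkeeping
            seq[i] = 0;             // everything belongs to sequence 0
        }
        return { (int32_t) prompt.size(), prompt.data(), nullptr, pos.data(), seq.data() };
    }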
@@ -2320,7 +2552,7 @@ static struct ggml_cgraph * llm_build_llama(

  const int64_t n_embd = hparams.n_embd;
  const int64_t n_layer = hparams.n_layer;
- const int64_t n_ctx = hparams.n_ctx;
+ const int64_t n_ctx = cparams.n_ctx;
  const int64_t n_head = hparams.n_head;
  const int64_t n_head_kv = hparams.n_head_kv;
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2328,12 +2560,20 @@ static struct ggml_cgraph * llm_build_llama(

  GGML_ASSERT(n_embd_head == hparams.n_rot);

- const float freq_base = hparams.rope_freq_base;
- const float freq_scale = hparams.rope_freq_scale;
+ const float freq_base = cparams.rope_freq_base;
+ const float freq_scale = cparams.rope_freq_scale;
  const float norm_rms_eps = hparams.f_norm_rms_eps;

  const int n_gpu_layers = model.n_gpu_layers;

+ const int32_t n_tokens = batch.n_tokens;
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+ //printf("n_kv = %d\n", n_kv);
+
  auto & buf_compute = lctx.buf_compute;

  struct ggml_init_params params = {
@@ -2351,12 +2591,12 @@ static struct ggml_cgraph * llm_build_llama(
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

- if (tokens) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ if (batch.token) {
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);

  ggml_allocr_alloc(lctx.alloc, inp_tokens);
  if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
  }
  ggml_set_name(inp_tokens, "inp_tokens");

@@ -2366,11 +2606,11 @@ static struct ggml_cgraph * llm_build_llama(
  GGML_ASSERT(false && "not implemented");
  #endif

- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);

  ggml_allocr_alloc(lctx.alloc, inpL);
  if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
  }
  }

@@ -2379,9 +2619,6 @@ static struct ggml_cgraph * llm_build_llama(

  // offload functions set the tensor output backend to GPU
  // tensors are GPU-accelerated if any input or the output has been offloaded
- //
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
- // in that case ggml_cuda_assign_buffers has no effect
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
  offload_func_t offload_func_kq = llama_nop;
  offload_func_t offload_func_v = llama_nop;
@@ -2398,12 +2635,75 @@ static struct ggml_cgraph * llm_build_llama(
  }
  #endif // GGML_USE_CUBLAS

+ // KQ_scale
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
  if (!ggml_allocr_is_measure(lctx.alloc)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+ }
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ offload_func_kq(KQ_mask);
+ ggml_set_name(KQ_mask, "KQ_mask");
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ float * data = (float *) KQ_mask->data;
+ memset(data, 0, ggml_nbytes(KQ_mask));
+
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ const llama_pos pos = batch.pos[j];
+ const llama_seq_id seq_id = batch.seq_id[j];
+
+ for (int i = 0; i < n_kv; ++i) {
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+ }
+ }
+ }
+ }
+ }
+
+ // KQ_pos - contains the positions
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ offload_func_kq(KQ_pos);
+ ggml_set_name(KQ_pos, "KQ_pos");
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ int * data = (int *) KQ_pos->data;
+ for (int i = 0; i < n_tokens; ++i) {
+ data[i] = batch.pos[i];
+ }
+ }
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+ offload_func_kq(K_shift);
+ ggml_set_name(K_shift, "K_shift");
+ ggml_allocr_alloc(lctx.alloc, K_shift);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ int * data = (int *) K_shift->data;
+ for (int i = 0; i < n_ctx; ++i) {
+ data[i] = kv_self.cells[i].delta;
+ }
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * tmp =
+ ggml_rope_custom_inplace(ctx0,
+ ggml_view_3d(ctx0, kv_self.k,
+ n_embd_head, n_head_kv, n_ctx,
+ ggml_element_size(kv_self.k)*n_embd_head,
+ ggml_element_size(kv_self.k)*n_embd_gqa,
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+ offload_func_kq(tmp);
+ ggml_build_forward_expand(gf, tmp);
+ }
  }
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

  for (int il = 0; il < n_layer; ++il) {
  ggml_format_name(inpL, "layer_inp_%d", il);
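The KQ_mask built above replaces ggml_diag_mask_inf: for query token j, a cache cell i contributes 0 if it belongs to the same sequence and sits at a position not later than pos[j], and -INFINITY otherwise; adding the mask to the scaled scores before the softmax reproduces causal attention independently per sequence. A standalone sketch of the same mask on plain floats (simplified to one head and one sequence id per cell):

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // mask[j*n_kv + i] == 0     -> query j may attend to cache cell i
    // mask[j*n_kv + i] == -INF  -> cell i is another sequence or lies in the future
    static std::vector<float> build_kq_mask(const std::vector<int32_t> & cell_pos,
                                            const std::vector<int32_t> & cell_seq,
                                            const std::vector<int32_t> & tok_pos,
                                            const std::vector<int32_t> & tok_seq) {
        const size_t n_kv     = cell_pos.size();
        const size_t n_tokens = tok_pos.size();
        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (size_t j = 0; j < n_tokens; ++j) {
            for (size_t i = 0; i < n_kv; ++i) {
                const bool same_seq = cell_seq[i] == tok_seq[j];
                const bool causal   = cell_pos[i] <= tok_pos[j];
                if (!same_seq || !causal) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }
        return mask;   // added to the attention scores right before the softmax
    }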
@@ -2441,33 +2741,33 @@ static struct ggml_cgraph * llm_build_llama(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");

- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");

- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");

  // store key and value to memory
  {
- // compute the transposed [N, n_embd] V matrix
+ // compute the transposed [n_tokens, n_embd] V matrix

  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  offload_func_v(tmpv);
  ggml_set_name(tmpv, "tmpv");

- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
  offload_func_v(Vcur);
  ggml_set_name(Vcur, "Vcur");

- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
  offload_func_kq(k);
  ggml_set_name(k, "k");

- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
  ( n_ctx)*ggml_element_size(kv_self.v),
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
  offload_func_v(v);
  ggml_set_name(v, "v");

@@ -2482,7 +2782,7 @@ static struct ggml_cgraph * llm_build_llama(

  struct ggml_tensor * K =
  ggml_view_3d(ctx0, kv_self.k,
- n_embd_head, n_past + N, n_head_kv,
+ n_embd_head, n_kv, n_head_kv,
  ggml_element_size(kv_self.k)*n_embd_gqa,
  ggml_element_size(kv_self.k)*n_embd_head,
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2495,25 +2795,25 @@ static struct ggml_cgraph * llm_build_llama(
  ggml_set_name(KQ, "KQ");

  // KQ_scaled = KQ / sqrt(n_embd_head)
- // KQ_scaled shape [n_past + N, N, n_head, 1]
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
  offload_func_kq(KQ_scaled);
  ggml_set_name(KQ_scaled, "KQ_scaled");

  // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
  offload_func_kq(KQ_masked);
  ggml_set_name(KQ_masked, "KQ_masked");

  // KQ = soft_max(KQ_masked)
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
  offload_func_v(KQ_soft_max);
  ggml_set_name(KQ_soft_max, "KQ_soft_max");

  // split cached V into n_head heads
  struct ggml_tensor * V =
  ggml_view_3d(ctx0, kv_self.v,
- n_past + N, n_embd_head, n_head_kv,
+ n_kv, n_embd_head, n_head_kv,
  ggml_element_size(kv_self.v)*n_ctx,
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -2528,7 +2828,7 @@ static struct ggml_cgraph * llm_build_llama(
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
  // is there a better way?
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
  #endif

@@ -2537,10 +2837,8 @@ static struct ggml_cgraph * llm_build_llama(
  offload_func_v(KQV_merged);
  ggml_set_name(KQV_merged, "KQV_merged");

- // cur = KQV_merged.contiguous().view(n_embd, N)
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
  offload_func_v(cur);
  ggml_set_name(cur, "KQV_merged_contiguous");

@@ -2631,20 +2929,12 @@ static struct ggml_cgraph * llm_build_llama(
  return gf;
  }

-
  static struct ggml_cgraph * llm_build_baichaun(
  llama_context & lctx,
- const llama_token * tokens,
- const float * embd,
- int n_tokens,
- int n_past) {
-
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
- const int N = n_tokens;
-
+ const llama_batch & batch) {
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;
+ const auto & cparams = lctx.cparams;

  const auto & kv_self = lctx.kv_self;

@@ -2652,7 +2942,7 @@ static struct ggml_cgraph * llm_build_baichaun(

  const int64_t n_embd = hparams.n_embd;
  const int64_t n_layer = hparams.n_layer;
- const int64_t n_ctx = hparams.n_ctx;
+ const int64_t n_ctx = cparams.n_ctx;
  const int64_t n_head = hparams.n_head;
  const int64_t n_head_kv = hparams.n_head_kv;
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2660,12 +2950,18 @@ static struct ggml_cgraph * llm_build_baichaun(

  GGML_ASSERT(n_embd_head == hparams.n_rot);

- const float freq_base = hparams.rope_freq_base;
- const float freq_scale = hparams.rope_freq_scale;
+ const float freq_base = cparams.rope_freq_base;
+ const float freq_scale = cparams.rope_freq_scale;
  const float norm_rms_eps = hparams.f_norm_rms_eps;

  const int n_gpu_layers = model.n_gpu_layers;

+ const int32_t n_tokens = batch.n_tokens;
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
  auto & buf_compute = lctx.buf_compute;

  struct ggml_init_params params = {
@@ -2683,12 +2979,12 @@ static struct ggml_cgraph * llm_build_baichaun(
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

- if (tokens) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ if (batch.token) {
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);

  ggml_allocr_alloc(lctx.alloc, inp_tokens);
  if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
  }
  ggml_set_name(inp_tokens, "inp_tokens");

@@ -2698,11 +2994,11 @@ static struct ggml_cgraph * llm_build_baichaun(
  GGML_ASSERT(false && "not implemented");
  #endif

- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);

  ggml_allocr_alloc(lctx.alloc, inpL);
  if (!ggml_allocr_is_measure(lctx.alloc)) {
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
  }
  }

@@ -2711,9 +3007,6 @@ static struct ggml_cgraph * llm_build_baichaun(

  // offload functions set the tensor output backend to GPU
  // tensors are GPU-accelerated if any input or the output has been offloaded
- //
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2716
- // in that case ggml_cuda_assign_buffers has no effect
2717
3010
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2718
3011
  offload_func_t offload_func_kq = llama_nop;
2719
3012
  offload_func_t offload_func_v = llama_nop;
@@ -2730,12 +3023,75 @@ static struct ggml_cgraph * llm_build_baichaun(
2730
3023
  }
2731
3024
  #endif // GGML_USE_CUBLAS
2732
3025
 
3026
+ // KQ_scale
2733
3027
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3028
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2734
3029
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2735
3030
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2736
3031
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2737
3032
  }
2738
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3033
+
3034
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3035
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3036
+ offload_func_kq(KQ_mask);
3037
+ ggml_set_name(KQ_mask, "KQ_mask");
3038
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3039
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3040
+ float * data = (float *) KQ_mask->data;
3041
+ memset(data, 0, ggml_nbytes(KQ_mask));
3042
+
3043
+ for (int h = 0; h < 1; ++h) {
3044
+ for (int j = 0; j < n_tokens; ++j) {
3045
+ const llama_pos pos = batch.pos[j];
3046
+ const llama_seq_id seq_id = batch.seq_id[j];
3047
+
3048
+ for (int i = 0; i < n_kv; ++i) {
3049
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3050
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3051
+ }
3052
+ }
3053
+ }
3054
+ }
3055
+ }
3056
+
3057
+ // KQ_pos - contains the positions
3058
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3059
+ offload_func_kq(KQ_pos);
3060
+ ggml_set_name(KQ_pos, "KQ_pos");
3061
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3062
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3063
+ int * data = (int *) KQ_pos->data;
3064
+ for (int i = 0; i < n_tokens; ++i) {
3065
+ data[i] = batch.pos[i];
3066
+ }
3067
+ }
3068
+
3069
+ // shift the entire K-cache if needed
3070
+ if (do_rope_shift) {
3071
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3072
+ offload_func_kq(K_shift);
3073
+ ggml_set_name(K_shift, "K_shift");
3074
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3075
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3076
+ int * data = (int *) K_shift->data;
3077
+ for (int i = 0; i < n_ctx; ++i) {
3078
+ data[i] = kv_self.cells[i].delta;
3079
+ }
3080
+ }
3081
+
3082
+ for (int il = 0; il < n_layer; ++il) {
3083
+ struct ggml_tensor * tmp =
3084
+ ggml_rope_custom_inplace(ctx0,
3085
+ ggml_view_3d(ctx0, kv_self.k,
3086
+ n_embd_head, n_head_kv, n_ctx,
3087
+ ggml_element_size(kv_self.k)*n_embd_head,
3088
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3089
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3090
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
3091
+ offload_func_kq(tmp);
3092
+ ggml_build_forward_expand(gf, tmp);
3093
+ }
3094
+ }
2739
3095
 
2740
3096
  for (int il = 0; il < n_layer; ++il) {
2741
3097
  ggml_format_name(inpL, "layer_inp_%d", il);
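The do_rope_shift block above re-ropes the cached K vectors in place by each cell's accumulated delta instead of recomputing the cache. This is valid because RoPE rotations compose additively: rotating a pair that was encoded at position p by a further delta gives the same values as encoding it directly at p + delta. A small self-contained check for one 2-D pair and one illustrative frequency (theta is arbitrary here and independent of ggml):

#include <cmath>
#include <cstdio>

// rotate one 2-D pair of a key by angle pos*theta, as RoPE does per frequency band
static void rope_pair(float & x, float & y, float pos, float theta) {
    const float a  = pos*theta;
    const float x0 = x, y0 = y;
    x = x0*std::cos(a) - y0*std::sin(a);
    y = x0*std::sin(a) + y0*std::cos(a);
}

int main() {
    const float theta = 0.1f;

    float x1 = 1.0f, y1 = 2.0f;   // rope at p = 7, then shift by delta = -3
    rope_pair(x1, y1, 7.0f, theta);
    rope_pair(x1, y1, -3.0f, theta);

    float x2 = 1.0f, y2 = 2.0f;   // rope directly at p + delta = 4
    rope_pair(x2, y2, 4.0f, theta);

    printf("%f %f  vs  %f %f\n", x1, y1, x2, y2); // equal up to float rounding
    return 0;
}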
@@ -2777,12 +3133,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2777
3133
  struct ggml_tensor * Qcur;
2778
3134
  switch (model.type) {
2779
3135
  case MODEL_7B:
2780
- Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2781
- Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
3136
+ Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
3137
+ Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2782
3138
  break;
2783
3139
  case MODEL_13B:
2784
- Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2785
- Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
3140
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
3141
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
2786
3142
  break;
2787
3143
  default:
2788
3144
  GGML_ASSERT(false);
@@ -2796,23 +3152,23 @@ static struct ggml_cgraph * llm_build_baichaun(
2796
3152
 
2797
3153
  // store key and value to memory
2798
3154
  {
2799
- // compute the transposed [N, n_embd] V matrix
3155
+ // compute the transposed [n_tokens, n_embd] V matrix
2800
3156
 
2801
3157
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2802
3158
  offload_func_v(tmpv);
2803
3159
  ggml_set_name(tmpv, "tmpv");
2804
3160
 
2805
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
3161
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2806
3162
  offload_func_v(Vcur);
2807
3163
  ggml_set_name(Vcur, "Vcur");
2808
3164
 
2809
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3165
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2810
3166
  offload_func_kq(k);
2811
3167
  ggml_set_name(k, "k");
2812
3168
 
2813
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3169
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2814
3170
  ( n_ctx)*ggml_element_size(kv_self.v),
2815
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3171
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2816
3172
  offload_func_v(v);
2817
3173
  ggml_set_name(v, "v");
2818
3174
 
@@ -2827,7 +3183,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2827
3183
 
2828
3184
  struct ggml_tensor * K =
2829
3185
  ggml_view_3d(ctx0, kv_self.k,
2830
- n_embd_head, n_past + N, n_head_kv,
3186
+ n_embd_head, n_kv, n_head_kv,
2831
3187
  ggml_element_size(kv_self.k)*n_embd_gqa,
2832
3188
  ggml_element_size(kv_self.k)*n_embd_head,
2833
3189
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2840,8 +3196,8 @@ static struct ggml_cgraph * llm_build_baichaun(
2840
3196
  ggml_set_name(KQ, "KQ");
2841
3197
 
2842
3198
  // KQ_scaled = KQ / sqrt(n_embd_head)
2843
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2844
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3199
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3200
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2845
3201
  offload_func_kq(KQ_scaled);
2846
3202
  ggml_set_name(KQ_scaled, "KQ_scaled");
2847
3203
 
@@ -2850,58 +3206,44 @@ static struct ggml_cgraph * llm_build_baichaun(
2850
3206
 
2851
3207
  switch (model.type) {
2852
3208
  case MODEL_7B:
2853
- KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3209
+ KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2854
3210
  break;
2855
3211
  case MODEL_13B:
2856
- KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
3212
+ // TODO: replace with ggml_add()
3213
+ KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
2857
3214
  ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2858
- KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
3215
+ KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
2859
3216
  break;
2860
3217
  default:
2861
3218
  GGML_ASSERT(false);
2862
3219
  }
2863
- // KQ_masked = mask_past(KQ_scaled)
2864
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2865
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2866
- // offload_func_kq(KQ_masked);
2867
- // ggml_set_name(KQ_masked, "KQ_masked");
2868
3220
 
2869
3221
  // KQ = soft_max(KQ_masked)
2870
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3222
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2871
3223
  offload_func_v(KQ_soft_max);
2872
3224
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2873
3225
 
2874
3226
  // split cached V into n_head heads
2875
3227
  struct ggml_tensor * V =
2876
3228
  ggml_view_3d(ctx0, kv_self.v,
2877
- n_past + N, n_embd_head, n_head_kv,
3229
+ n_kv, n_embd_head, n_head_kv,
2878
3230
  ggml_element_size(kv_self.v)*n_ctx,
2879
3231
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2880
3232
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
2881
3233
  offload_func_v(V);
2882
3234
  ggml_set_name(V, "V");
2883
3235
 
2884
- #if 1
2885
3236
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2886
3237
  offload_func_v(KQV);
2887
3238
  ggml_set_name(KQV, "KQV");
2888
- #else
2889
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2890
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2891
- // is there a better way?
2892
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2893
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2894
- #endif
2895
3239
 
2896
3240
  // KQV_merged = KQV.permute(0, 2, 1, 3)
2897
3241
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2898
3242
  offload_func_v(KQV_merged);
2899
3243
  ggml_set_name(KQV_merged, "KQV_merged");
2900
3244
 
2901
- // cur = KQV_merged.contiguous().view(n_embd, N)
2902
- cur = ggml_cpy(ctx0,
2903
- KQV_merged,
2904
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3245
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3246
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
2905
3247
  offload_func_v(cur);
2906
3248
  ggml_set_name(cur, "KQV_merged_contiguous");
2907
3249
 
@@ -2994,17 +3336,10 @@ static struct ggml_cgraph * llm_build_baichaun(
2994
3336
 
2995
3337
  static struct ggml_cgraph * llm_build_falcon(
2996
3338
  llama_context & lctx,
2997
- const llama_token * tokens,
2998
- const float * embd,
2999
- int n_tokens,
3000
- int n_past) {
3001
-
3002
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3003
-
3004
- const int N = n_tokens;
3005
-
3339
+ const llama_batch & batch) {
3006
3340
  const auto & model = lctx.model;
3007
3341
  const auto & hparams = model.hparams;
3342
+ const auto & cparams = lctx.cparams;
3008
3343
 
3009
3344
  const auto & kv_self = lctx.kv_self;
3010
3345
 
@@ -3012,7 +3347,7 @@ static struct ggml_cgraph * llm_build_falcon(
3012
3347
 
3013
3348
  const int64_t n_embd = hparams.n_embd;
3014
3349
  const int64_t n_layer = hparams.n_layer;
3015
- const int64_t n_ctx = hparams.n_ctx;
3350
+ const int64_t n_ctx = cparams.n_ctx;
3016
3351
  const int64_t n_head = hparams.n_head;
3017
3352
  const int64_t n_head_kv = hparams.n_head_kv;
3018
3353
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -3020,12 +3355,21 @@ static struct ggml_cgraph * llm_build_falcon(
3020
3355
 
3021
3356
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3022
3357
 
3023
- const float freq_base = hparams.rope_freq_base;
3024
- const float freq_scale = hparams.rope_freq_scale;
3358
+ const float freq_base = cparams.rope_freq_base;
3359
+ const float freq_scale = cparams.rope_freq_scale;
3025
3360
  const float norm_eps = hparams.f_norm_eps;
3026
3361
 
3027
3362
  const int n_gpu_layers = model.n_gpu_layers;
3028
3363
 
3364
+ const int32_t n_tokens = batch.n_tokens;
3365
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3366
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3367
+
3368
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3369
+
3370
+ //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
3371
+ // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
3372
+
3029
3373
  auto & buf_compute = lctx.buf_compute;
3030
3374
 
3031
3375
  struct ggml_init_params params = {
@@ -3043,12 +3387,12 @@ static struct ggml_cgraph * llm_build_falcon(
3043
3387
  struct ggml_tensor * cur;
3044
3388
  struct ggml_tensor * inpL;
3045
3389
 
3046
- if (tokens) {
3047
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3390
+ if (batch.token) {
3391
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3048
3392
 
3049
3393
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3050
3394
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3051
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3395
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3052
3396
  }
3053
3397
  ggml_set_name(inp_tokens, "inp_tokens");
3054
3398
 
@@ -3058,11 +3402,11 @@ static struct ggml_cgraph * llm_build_falcon(
3058
3402
  GGML_ASSERT(false && "not implemented");
3059
3403
  #endif
3060
3404
 
3061
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3405
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3062
3406
 
3063
3407
  ggml_allocr_alloc(lctx.alloc, inpL);
3064
3408
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3065
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3409
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3066
3410
  }
3067
3411
  }
3068
3412
 
@@ -3071,9 +3415,6 @@ static struct ggml_cgraph * llm_build_falcon(
3071
3415
 
3072
3416
  // offload functions set the tensor output backend to GPU
3073
3417
  // tensors are GPU-accelerated if any input or the output has been offloaded
3074
- //
3075
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
3076
- // in that case ggml_cuda_assign_buffers has no effect
3077
3418
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3078
3419
  offload_func_t offload_func_kq = llama_nop;
3079
3420
  offload_func_t offload_func_v = llama_nop;
@@ -3090,12 +3431,75 @@ static struct ggml_cgraph * llm_build_falcon(
3090
3431
  }
3091
3432
  #endif // GGML_USE_CUBLAS
3092
3433
 
3434
+ // KQ_scale
3093
3435
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3436
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3094
3437
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3095
3438
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3096
3439
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3097
3440
  }
3098
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3441
+
3442
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3443
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3444
+ offload_func_kq(KQ_mask);
3445
+ ggml_set_name(KQ_mask, "KQ_mask");
3446
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3447
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3448
+ float * data = (float *) KQ_mask->data;
3449
+ memset(data, 0, ggml_nbytes(KQ_mask));
3450
+
3451
+ for (int h = 0; h < 1; ++h) {
3452
+ for (int j = 0; j < n_tokens; ++j) {
3453
+ const llama_pos pos = batch.pos[j];
3454
+ const llama_seq_id seq_id = batch.seq_id[j];
3455
+
3456
+ for (int i = 0; i < n_kv; ++i) {
3457
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3458
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3459
+ }
3460
+ }
3461
+ }
3462
+ }
3463
+ }
3464
+
3465
+ // KQ_pos - contains the positions
3466
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3467
+ offload_func_kq(KQ_pos);
3468
+ ggml_set_name(KQ_pos, "KQ_pos");
3469
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3470
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3471
+ int * data = (int *) KQ_pos->data;
3472
+ for (int i = 0; i < n_tokens; ++i) {
3473
+ data[i] = batch.pos[i];
3474
+ }
3475
+ }
3476
+
3477
+ // shift the entire K-cache if needed
3478
+ if (do_rope_shift) {
3479
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3480
+ offload_func_kq(K_shift);
3481
+ ggml_set_name(K_shift, "K_shift");
3482
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3483
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3484
+ int * data = (int *) K_shift->data;
3485
+ for (int i = 0; i < n_ctx; ++i) {
3486
+ data[i] = kv_self.cells[i].delta;
3487
+ }
3488
+ }
3489
+
3490
+ for (int il = 0; il < n_layer; ++il) {
3491
+ struct ggml_tensor * tmp =
3492
+ ggml_rope_custom_inplace(ctx0,
3493
+ ggml_view_3d(ctx0, kv_self.k,
3494
+ n_embd_head, n_head_kv, n_ctx,
3495
+ ggml_element_size(kv_self.k)*n_embd_head,
3496
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3497
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3498
+ K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
3499
+ offload_func_kq(tmp);
3500
+ ggml_build_forward_expand(gf, tmp);
3501
+ }
3502
+ }
3099
3503
 
3100
3504
  for (int il = 0; il < n_layer; ++il) {
3101
3505
  struct ggml_tensor * attn_norm;
@@ -3152,148 +3556,395 @@ static struct ggml_cgraph * llm_build_falcon(
3152
3556
  // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
3153
3557
  // non-contiguous views is added for the rope operator
3154
3558
  struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
3155
- ctx0, cur, n_embd_head, n_head, N,
3559
+ ctx0, cur, n_embd_head, n_head, n_tokens,
3156
3560
  wsize * n_embd_head,
3157
3561
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3158
3562
  0));
3159
3563
  offload_func_kq(tmpq);
3160
3564
 
3161
- struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3162
- ctx0, cur, n_embd_head, n_head_kv, N,
3163
- wsize * n_embd_head,
3164
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3165
- wsize * n_embd_head * n_head));
3166
- offload_func_kq(tmpk);
3565
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3566
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3567
+ wsize * n_embd_head,
3568
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
3569
+ wsize * n_embd_head * n_head));
3570
+ offload_func_kq(tmpk);
3571
+
3572
+ struct ggml_tensor * tmpv = ggml_view_3d(
3573
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3574
+ wsize * n_embd_head,
3575
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
3576
+ wsize * n_embd_head * (n_head + n_head_kv));
3577
+ offload_func_v(tmpv);
3578
+
3579
+ // using mode = 2 for neox mode
3580
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3581
+ offload_func_kq(Qcur);
3582
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3583
+ offload_func_kq(Kcur);
3584
+
3585
+ {
3586
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3587
+ offload_func_v(Vcur);
3588
+ offload_func_v(Vcur->src[0]->src[0]);
3589
+ ggml_set_name(Vcur, "Vcur");
3590
+
3591
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3592
+ offload_func_kq(k);
3593
+ ggml_set_name(k, "k");
3594
+
3595
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3596
+ ( n_ctx)*ggml_element_size(kv_self.v),
3597
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3598
+ offload_func_v(v);
3599
+
3600
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3601
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3602
+ }
3603
+
3604
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
3605
+ offload_func_kq(Q);
3606
+ ggml_set_name(Q, "Q");
3607
+
3608
+ struct ggml_tensor * K =
3609
+ ggml_view_3d(ctx0, kv_self.k,
3610
+ n_embd_head, n_kv, n_head_kv,
3611
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3612
+ ggml_element_size(kv_self.k)*n_embd_head,
3613
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3614
+ offload_func_kq(K);
3615
+ ggml_set_name(K, "K");
3616
+
3617
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3618
+ offload_func_kq(KQ);
3619
+ ggml_set_name(KQ, "KQ");
3620
+
3621
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3622
+ offload_func_kq(KQ_scaled);
3623
+ ggml_set_name(KQ_scaled, "KQ_scaled");
3624
+
3625
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3626
+ offload_func_kq(KQ_masked);
3627
+ ggml_set_name(KQ_masked, "KQ_masked");
3628
+
3629
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3630
+ offload_func_v(KQ_soft_max);
3631
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
3632
+
3633
+ struct ggml_tensor * V =
3634
+ ggml_view_3d(ctx0, kv_self.v,
3635
+ n_kv, n_embd_head, n_head_kv,
3636
+ ggml_element_size(kv_self.v)*n_ctx,
3637
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3638
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3639
+ offload_func_v(V);
3640
+ ggml_set_name(V, "V");
3641
+
3642
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3643
+ offload_func_v(KQV);
3644
+ ggml_set_name(KQV, "KQV");
3645
+
3646
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3647
+ offload_func_v(KQV_merged);
3648
+ ggml_set_name(KQV_merged, "KQV_merged");
3649
+
3650
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3651
+ offload_func_v(cur);
3652
+ ggml_set_name(cur, "KQV_merged_contiguous");
3653
+
3654
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
3655
+ offload_func(cur);
3656
+ ggml_set_name(cur, "result_wo");
3657
+ }
3658
+
3659
+ struct ggml_tensor * attn_out = cur;
3660
+
3661
+ // feed forward
3662
+ {
3663
+ struct ggml_tensor * inpFF = attn_norm;
3664
+
3665
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
3666
+ offload_func(cur);
3667
+
3668
+ cur = ggml_gelu(ctx0, cur);
3669
+ offload_func(cur);
3670
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
3671
+ offload_func(cur);
3672
+ }
3673
+
3674
+ cur = ggml_add(ctx0, cur, attn_out);
3675
+ offload_func(cur);
3676
+ cur = ggml_add(ctx0, cur, inpL);
3677
+ offload_func(cur);
3678
+
3679
+ // input for next layer
3680
+ inpL = cur;
3681
+ }
3682
+
3683
+ cur = inpL;
3684
+
3685
+ // norm
3686
+ {
3687
+ cur = ggml_norm(ctx0, cur, norm_eps);
3688
+ offload_func_nr(cur);
3689
+
3690
+ cur = ggml_add(ctx0,
3691
+ ggml_mul(ctx0, cur, model.output_norm),
3692
+ model.output_norm_b);
3693
+ ggml_set_name(cur, "result_norm");
3694
+ }
3695
+
3696
+ cur = ggml_mul_mat(ctx0, model.output, cur);
3697
+ ggml_set_name(cur, "result_output");
3698
+
3699
+ ggml_build_forward_expand(gf, cur);
3700
+
3701
+ ggml_free(ctx0);
3702
+
3703
+ return gf;
3704
+ }
3705
+
3706
+ static struct ggml_cgraph * llm_build_starcoder(
3707
+ llama_context & lctx,
3708
+ const llama_batch & batch) {
3709
+ const auto & model = lctx.model;
3710
+ const auto & hparams = model.hparams;
3711
+ const auto & cparams = lctx.cparams;
3712
+
3713
+ const auto & kv_self = lctx.kv_self;
3714
+
3715
+ GGML_ASSERT(!!kv_self.ctx);
3716
+
3717
+ const int64_t n_embd = hparams.n_embd;
3718
+ const int64_t n_layer = hparams.n_layer;
3719
+ const int64_t n_ctx = cparams.n_ctx;
3720
+ const int64_t n_head = hparams.n_head;
3721
+ const int64_t n_head_kv = hparams.n_head_kv;
3722
+ const int64_t n_embd_head = hparams.n_embd_head();
3723
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
3724
+
3725
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
3726
+
3727
+ const float norm_eps = hparams.f_norm_eps;
3728
+
3729
+ const int32_t n_tokens = batch.n_tokens;
3730
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3731
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3732
+
3733
+ auto & buf_compute = lctx.buf_compute;
3734
+
3735
+ struct ggml_init_params params = {
3736
+ /*.mem_size =*/ buf_compute.size,
3737
+ /*.mem_buffer =*/ buf_compute.data,
3738
+ /*.no_alloc =*/ false,
3739
+ };
3740
+
3741
+ params.no_alloc = true;
3742
+
3743
+ struct ggml_context * ctx0 = ggml_init(params);
3744
+
3745
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
3746
+
3747
+ struct ggml_tensor * cur;
3748
+ struct ggml_tensor * token;
3749
+ struct ggml_tensor * position;
3750
+ struct ggml_tensor * inpL;
3751
+
3752
+ if (batch.token) {
3753
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3754
+
3755
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
3756
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3757
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3758
+ }
3759
+ ggml_set_name(inp_tokens, "inp_tokens");
3760
+
3761
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3762
+ } else {
3763
+ #ifdef GGML_USE_MPI
3764
+ GGML_ASSERT(false && "not implemented");
3765
+ #endif
3766
+
3767
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3768
+
3769
+ ggml_allocr_alloc(lctx.alloc, token);
3770
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3771
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
3772
+ }
3773
+ }
3774
+
3775
+ {
3776
+ // Compute position embeddings.
3777
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3778
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
3779
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3780
+ for (int i = 0; i < n_tokens; ++i) {
3781
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
3782
+ }
3783
+ }
3784
+ ggml_set_name(inp_positions, "inp_positions");
3785
+
3786
+ position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3787
+ }
3788
+
3789
+ // KQ_scale
3790
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3791
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3792
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
3793
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3794
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3795
+ }
3796
+
3797
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3798
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3799
+ ggml_set_name(KQ_mask, "KQ_mask");
3800
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3801
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3802
+ float * data = (float *) KQ_mask->data;
3803
+ memset(data, 0, ggml_nbytes(KQ_mask));
3804
+
3805
+ for (int h = 0; h < 1; ++h) {
3806
+ for (int j = 0; j < n_tokens; ++j) {
3807
+ const llama_pos pos = batch.pos[j];
3808
+ const llama_seq_id seq_id = batch.seq_id[j];
3809
+
3810
+ for (int i = 0; i < n_kv; ++i) {
3811
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3812
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3813
+ }
3814
+ }
3815
+ }
3816
+ }
3817
+ }
3818
+
3819
+ inpL = ggml_add(ctx0, token, position);
3820
+ ggml_set_name(inpL, "inpL");
3167
3821
 
3168
- struct ggml_tensor * tmpv = ggml_view_3d(
3169
- ctx0, cur, n_embd_head, n_head_kv, N,
3170
- wsize * n_embd_head,
3171
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3172
- wsize * n_embd_head * (n_head + n_head_kv));
3173
- offload_func_v(tmpv);
3822
+ for (int il = 0; il < n_layer; ++il) {
3823
+ {
3824
+ // Norm
3825
+ cur = ggml_norm(ctx0, inpL, norm_eps);
3826
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
3827
+ }
3174
3828
 
3175
- // using mode = 2 for neox mode
3176
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3177
- offload_func_kq(Qcur);
3178
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3179
- offload_func_kq(Kcur);
3829
+ {
3830
+ // Self Attention
3831
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
3832
+
3833
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
3834
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
3835
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3836
+
3837
+ struct ggml_tensor * Qcur = tmpq;
3838
+ struct ggml_tensor * Kcur = tmpk;
3180
3839
 
3181
3840
  {
3182
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
3183
- offload_func_v(Vcur);
3184
- offload_func_v(Vcur->src[0]->src[0]);
3841
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3185
3842
  ggml_set_name(Vcur, "Vcur");
3186
3843
 
3187
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3188
- offload_func_kq(k);
3844
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3189
3845
  ggml_set_name(k, "k");
3190
3846
 
3191
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3847
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3192
3848
  ( n_ctx)*ggml_element_size(kv_self.v),
3193
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3194
- offload_func_v(v);
3849
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3195
3850
 
3196
3851
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3197
3852
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3198
3853
  }
3199
3854
 
3200
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
3201
- offload_func_kq(Q);
3855
+ struct ggml_tensor * Q =
3856
+ ggml_permute(ctx0,
3857
+ ggml_cpy(ctx0,
3858
+ Qcur,
3859
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
3860
+ 0, 2, 1, 3);
3202
3861
  ggml_set_name(Q, "Q");
3203
3862
 
3204
3863
  struct ggml_tensor * K =
3205
3864
  ggml_view_3d(ctx0, kv_self.k,
3206
- n_embd_head, n_past + N, n_head_kv,
3865
+ n_embd_head, n_kv, n_head_kv,
3207
3866
  ggml_element_size(kv_self.k)*n_embd_gqa,
3208
3867
  ggml_element_size(kv_self.k)*n_embd_head,
3209
3868
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3210
- offload_func_kq(K);
3211
3869
  ggml_set_name(K, "K");
3212
3870
 
3871
+ // K * Q
3213
3872
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3214
- offload_func_kq(KQ);
3215
3873
  ggml_set_name(KQ, "KQ");
3216
3874
 
3875
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3876
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3217
3877
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3218
- offload_func_kq(KQ_scaled);
3219
3878
  ggml_set_name(KQ_scaled, "KQ_scaled");
3220
3879
 
3221
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3222
- offload_func_kq(KQ_masked);
3880
+ // KQ_masked = mask_past(KQ_scaled)
3881
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3223
3882
  ggml_set_name(KQ_masked, "KQ_masked");
3224
3883
 
3884
+ // KQ = soft_max(KQ_masked)
3225
3885
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3226
- offload_func_v(KQ_soft_max);
3227
3886
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3228
3887
 
3888
+ // split cached V into n_head heads
3229
3889
  struct ggml_tensor * V =
3230
3890
  ggml_view_3d(ctx0, kv_self.v,
3231
- n_past + N, n_embd_head, n_head_kv,
3891
+ n_kv, n_embd_head, n_head_kv,
3232
3892
  ggml_element_size(kv_self.v)*n_ctx,
3233
3893
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3234
3894
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3235
- offload_func_v(V);
3236
3895
  ggml_set_name(V, "V");
3237
3896
 
3238
3897
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3239
- offload_func_v(KQV);
3240
3898
  ggml_set_name(KQV, "KQV");
3241
3899
 
3900
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3242
3901
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3243
- offload_func_v(KQV_merged);
3244
3902
  ggml_set_name(KQV_merged, "KQV_merged");
3245
3903
 
3246
- cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3247
- offload_func_v(cur);
3904
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3905
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3248
3906
  ggml_set_name(cur, "KQV_merged_contiguous");
3249
-
3250
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
3251
- offload_func(cur);
3252
- ggml_set_name(cur, "result_wo");
3253
3907
  }
3254
3908
 
3255
- struct ggml_tensor * attn_out = cur;
3909
+ // Projection
3910
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
3256
3911
 
3257
- // feed forward
3912
+ // Add the input
3913
+ cur = ggml_add(ctx0, cur, inpL);
3914
+
3915
+ struct ggml_tensor * inpFF = cur;
3916
+
3917
+ // FF
3258
3918
  {
3259
- struct ggml_tensor * inpFF = attn_norm;
3919
+ // Norm
3920
+ {
3921
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
3922
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
3923
+ }
3260
3924
 
3261
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
3262
- offload_func(cur);
3925
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
3263
3926
 
3927
+ // GELU activation
3264
3928
  cur = ggml_gelu(ctx0, cur);
3265
- offload_func(cur);
3266
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
3267
- offload_func(cur);
3268
- }
3269
3929
 
3270
- cur = ggml_add(ctx0, cur, attn_out);
3271
- offload_func(cur);
3272
- cur = ggml_add(ctx0, cur, inpL);
3273
- offload_func(cur);
3930
+ // Projection
3931
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
3932
+ }
3274
3933
 
3275
- // input for next layer
3276
- inpL = cur;
3934
+ inpL = ggml_add(ctx0, cur, inpFF);
3277
3935
  }
3278
3936
 
3279
- cur = inpL;
3280
-
3281
- // norm
3937
+ // Output Norm
3282
3938
  {
3283
- cur = ggml_norm(ctx0, cur, norm_eps);
3284
- offload_func_nr(cur);
3285
-
3286
- cur = ggml_add(ctx0,
3287
- ggml_mul(ctx0, cur, model.output_norm),
3288
- model.output_norm_b);
3289
- ggml_set_name(cur, "result_norm");
3939
+ cur = ggml_norm(ctx0, inpL, norm_eps);
3940
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
3290
3941
  }
3942
+ ggml_set_name(cur, "result_norm");
3291
3943
 
3292
3944
  cur = ggml_mul_mat(ctx0, model.output, cur);
3293
3945
  ggml_set_name(cur, "result_output");
3294
3946
 
3295
3947
  ggml_build_forward_expand(gf, cur);
3296
-
3297
3948
  ggml_free(ctx0);
3298
3949
 
3299
3950
  return gf;
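Unlike the RoPE-based builders above it, the new llm_build_starcoder path has no rotary step: the layer input is the sum of a token embedding and a learned absolute position embedding, both looked up with ggml_get_rows from the per-token batch values. A plain-array sketch of that input computation (wte and wpe are hypothetical row-major [rows][n_embd] buffers standing in for model.tok_embeddings and model.pos_embeddings):

// inpL[i] = tok_embeddings[token[i]] + pos_embeddings[pos[i]]
static void starcoder_input(const float * wte, const float * wpe,
                            const int * tokens, const int * pos,
                            int n_tokens, int n_embd, float * inpL) {
    for (int i = 0; i < n_tokens; ++i) {
        for (int k = 0; k < n_embd; ++k) {
            inpL[i*n_embd + k] = wte[tokens[i]*n_embd + k] + wpe[pos[i]*n_embd + k];
        }
    }
}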
@@ -3301,10 +3952,7 @@ static struct ggml_cgraph * llm_build_falcon(
3301
3952
 
3302
3953
  static struct ggml_cgraph * llama_build_graph(
3303
3954
  llama_context & lctx,
3304
- const llama_token * tokens,
3305
- const float * embd,
3306
- int n_tokens,
3307
- int n_past) {
3955
+ const llama_batch & batch) {
3308
3956
  const auto & model = lctx.model;
3309
3957
 
3310
3958
  struct ggml_cgraph * result = NULL;
@@ -3312,72 +3960,117 @@ static struct ggml_cgraph * llama_build_graph(
3312
3960
  switch (model.arch) {
3313
3961
  case LLM_ARCH_LLAMA:
3314
3962
  {
3315
- result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
3963
+ result = llm_build_llama(lctx, batch);
3316
3964
  } break;
3317
3965
  case LLM_ARCH_BAICHUAN:
3318
3966
  {
3319
- result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
3967
+ result = llm_build_baichaun(lctx, batch);
3320
3968
  } break;
3321
3969
  case LLM_ARCH_FALCON:
3322
3970
  {
3323
- result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
3971
+ result = llm_build_falcon(lctx, batch);
3972
+ } break;
3973
+ case LLM_ARCH_STARCODER:
3974
+ {
3975
+ result = llm_build_starcoder(lctx, batch);
3324
3976
  } break;
3325
3977
  default:
3326
3978
  GGML_ASSERT(false);
3327
- };
3979
+ }
3328
3980
 
3329
3981
  return result;
3330
3982
  }
3331
3983
 
3332
- // evaluate the transformer
3984
+ // decode a batch of tokens by evaluating the transformer
3333
3985
  //
3334
3986
  // - lctx: llama context
3335
- // - tokens: new batch of tokens to process
3336
- // - embd embeddings input
3337
- // - n_tokens number of tokens
3338
- // - n_past: the context size so far
3987
+ // - batch: batch to evaluate
3339
3988
  // - n_threads: number of threads to use
3340
3989
  //
3341
- static bool llama_eval_internal(
3990
+ // return 0 on success
3991
+ // return positive int on warning
3992
+ // return negative int on error
3993
+ //
3994
+ static int llama_decode_internal(
3342
3995
  llama_context & lctx,
3343
- const llama_token * tokens,
3344
- const float * embd,
3345
- int n_tokens,
3346
- int n_past,
3347
- int n_threads,
3348
- const char * cgraph_fname) {
3996
+ llama_batch batch) {
3997
+ const uint32_t n_tokens = batch.n_tokens;
3998
+
3999
+ if (n_tokens == 0) {
4000
+ LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
4001
+ return -1;
4002
+ }
4003
+
4004
+ const auto & model = lctx.model;
4005
+ const auto & hparams = model.hparams;
4006
+ const auto & cparams = lctx.cparams;
3349
4007
 
3350
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
4008
+ const auto n_batch = cparams.n_batch;
3351
4009
 
3352
- GGML_ASSERT(n_tokens > 0);
3353
- GGML_ASSERT(n_past >= 0);
3354
- // TODO: keep the values of n_batch and n_ctx
3355
- // GGML_ASSERT(n_tokens <= n_batch);
3356
- // GGML_ASSERT(n_past + n_tokens <= n_ctx);
4010
+ GGML_ASSERT(n_tokens <= n_batch);
4011
+
4012
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
4013
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
3357
4014
 
3358
4015
  const int64_t t_start_us = ggml_time_us();
3359
4016
 
3360
4017
  #ifdef GGML_USE_MPI
3361
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
4018
+ // TODO: needs fix after #3228
4019
+ GGML_ASSERT(false && "not implemented");
4020
+ //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
3362
4021
  #endif
3363
4022
 
3364
4023
  GGML_ASSERT(n_threads > 0);
3365
4024
 
3366
- const int N = n_tokens;
3367
-
3368
- const auto & model = lctx.model;
3369
- const auto & hparams = model.hparams;
3370
-
3371
- const auto & kv_self = lctx.kv_self;
4025
+ auto & kv_self = lctx.kv_self;
3372
4026
 
3373
4027
  GGML_ASSERT(!!kv_self.ctx);
3374
4028
 
3375
4029
  const int64_t n_embd = hparams.n_embd;
3376
4030
  const int64_t n_vocab = hparams.n_vocab;
3377
4031
 
4032
+ // helpers for smoother batch API transition
4033
+ // after deprecating the llama_eval calls, these will be removed
4034
+ std::vector<llama_pos> pos;
4035
+ std::vector<llama_seq_id> seq_id;
4036
+
4037
+ if (batch.pos == nullptr) {
4038
+ pos.resize(n_tokens);
4039
+ for (uint32_t i = 0; i < n_tokens; i++) {
4040
+ pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
4041
+ }
4042
+
4043
+ batch.pos = pos.data();
4044
+ }
4045
+
4046
+ if (batch.seq_id == nullptr) {
4047
+ seq_id.resize(n_tokens);
4048
+ for (uint32_t i = 0; i < n_tokens; i++) {
4049
+ seq_id[i] = batch.all_seq_id;
4050
+ }
4051
+
4052
+ batch.seq_id = seq_id.data();
4053
+ }
4054
+
4055
+ // we always start to search for a free slot from the start of the cache
4056
+ // TODO: better strategies can be implemented
4057
+ kv_self.head = 0;
4058
+
4059
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
4060
+ return 1;
4061
+ }
4062
+
4063
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
4064
+ // after enough generations, the benefit from this heuristic disappears
4065
+ // if we start defragmenting the cache, the benefit from this will be more important
4066
+ //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
4067
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
4068
+
4069
+ //printf("kv_self.n = %d\n", kv_self.n);
4070
+
3378
4071
  ggml_allocr_reset(lctx.alloc);
3379
4072
 
3380
- ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
4073
+ ggml_cgraph * gf = llama_build_graph(lctx, batch);
3381
4074
 
3382
4075
  ggml_allocr_alloc_graph(lctx.alloc, gf);
3383
4076
 
@@ -3386,6 +4079,7 @@ static bool llama_eval_internal(
3386
4079
  ggml_tensor * node = gf->leafs[i];
3387
4080
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
3388
4081
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
4082
+ ggml_cuda_copy_to_device(node);
3389
4083
  }
3390
4084
  }
3391
4085
 
@@ -3395,6 +4089,8 @@ static bool llama_eval_internal(
3395
4089
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
3396
4090
  }
3397
4091
  }
4092
+
4093
+ ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
3398
4094
  #endif
3399
4095
 
3400
4096
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3404,10 +4100,19 @@ static bool llama_eval_internal(
3404
4100
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
3405
4101
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
3406
4102
  // with the BLAS calls. need a better solution
3407
- if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
4103
+ if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
3408
4104
  n_threads = std::min(4, n_threads);
3409
4105
  }
3410
4106
 
4107
+ // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
4108
+ const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
4109
+ model.arch == LLM_ARCH_BAICHUAN ||
4110
+ model.arch == LLM_ARCH_FALCON;
4111
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
4112
+ if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
4113
+ n_threads = 1;
4114
+ }
4115
+
3411
4116
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
3412
4117
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
3413
4118
 
@@ -3423,10 +4128,6 @@ static bool llama_eval_internal(
3423
4128
  if (lctx.ctx_metal) {
3424
4129
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
3425
4130
  ggml_metal_graph_compute(lctx.ctx_metal, gf);
3426
- ggml_metal_get_tensor (lctx.ctx_metal, res);
3427
- if (!lctx.embedding.empty()) {
3428
- ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
3429
- }
3430
4131
  } else {
3431
4132
  ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
3432
4133
  }
@@ -3438,12 +4139,9 @@ static bool llama_eval_internal(
3438
4139
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
3439
4140
  #endif
3440
4141
 
3441
- // update kv token count
3442
- lctx.kv_self.n = n_past + N;
3443
-
3444
- if (cgraph_fname) {
3445
- ggml_graph_export(gf, cgraph_fname);
3446
- }
4142
+ // update the kv ring buffer
4143
+ lctx.kv_self.head += n_tokens;
4144
+ lctx.kv_self.has_shift = false;
3447
4145
 
3448
4146
  #ifdef GGML_PERF
3449
4147
  // print timing information per ggml operation (for debugging purposes)
@@ -3460,13 +4158,20 @@ static bool llama_eval_internal(
3460
4158
  {
3461
4159
  auto & logits_out = lctx.logits;
3462
4160
 
3463
- if (lctx.logits_all) {
3464
- logits_out.resize(n_vocab * N);
3465
- memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
4161
+ if (batch.logits) {
4162
+ logits_out.resize(n_vocab * n_tokens);
4163
+ for (uint32_t i = 0; i < n_tokens; i++) {
4164
+ if (batch.logits[i] == 0) {
4165
+ continue;
4166
+ }
4167
+ memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
4168
+ }
4169
+ } else if (lctx.logits_all) {
4170
+ logits_out.resize(n_vocab * n_tokens);
4171
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
3466
4172
  } else {
3467
- // return result for just the last token
3468
4173
  logits_out.resize(n_vocab);
3469
- memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
4174
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
3470
4175
  }
3471
4176
  }
3472
4177
 
@@ -3475,20 +4180,27 @@ static bool llama_eval_internal(
3475
4180
  auto & embedding_out = lctx.embedding;
3476
4181
 
3477
4182
  embedding_out.resize(n_embd);
3478
- memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
4183
+ memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
3479
4184
  }
3480
4185
 
3481
4186
  // measure the performance only for the single-token evals
3482
- if (N == 1) {
4187
+ if (n_tokens == 1) {
3483
4188
  lctx.t_eval_us += ggml_time_us() - t_start_us;
3484
4189
  lctx.n_eval++;
3485
4190
  }
3486
- else if (N > 1) {
4191
+ else if (n_tokens > 1) {
3487
4192
  lctx.t_p_eval_us += ggml_time_us() - t_start_us;
3488
- lctx.n_p_eval += N;
4193
+ lctx.n_p_eval += n_tokens;
3489
4194
  }
3490
4195
 
3491
- return true;
4196
+ // get a more accurate load time, upon first eval
4197
+ // TODO: fix this
4198
+ if (!lctx.has_evaluated_once) {
4199
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
4200
+ lctx.has_evaluated_once = true;
4201
+ }
4202
+
4203
+ return 0;
3492
4204
  }
3493
4205
 
3494
4206
  //
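With the batch API the caller also chooses which output rows to keep: when batch.logits is set, logits_out is sized n_vocab*n_tokens and only rows whose flag is non-zero are copied, so a generation loop typically asks for the last token only. A sketch, assuming a llama_batch whose pos/seq_id/logits arrays the caller has allocated and n_vocab taken from the model:

// request logits only for the last token of the batch
for (uint32_t i = 0; i < batch.n_tokens; ++i) {
    batch.logits[i] = 0;
}
batch.logits[batch.n_tokens - 1] = 1;

if (llama_decode(ctx, batch) == 0) {
    // the output buffer is laid out row by row: row i holds the n_vocab logits of token i
    const float * last_logits = llama_get_logits(ctx) + (size_t) n_vocab*(batch.n_tokens - 1);
    // ... sample from last_logits ...
}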
@@ -3909,7 +4621,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
3909
4621
  llm_tokenizer_bpe tokenizer(vocab);
3910
4622
  tokenizer.tokenize(raw_text, output);
3911
4623
  } break;
3912
- };
4624
+ }
3913
4625
 
3914
4626
  return output;
3915
4627
  }
@@ -3939,7 +4651,7 @@ struct llama_grammar_candidate {
3939
4651
 
3940
4652
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
3941
4653
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
3942
- std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
4654
+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
3943
4655
  const char * src,
3944
4656
  llama_partial_utf8 partial_start) {
3945
4657
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -4313,6 +5025,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
4313
5025
  // sampling
4314
5026
  //
4315
5027
 
5028
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
5029
+ if (seed == LLAMA_DEFAULT_SEED) {
5030
+ seed = time(NULL);
5031
+ }
5032
+ ctx->rng.seed(seed);
5033
+ }
5034
+
4316
5035
  void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
4317
5036
  GGML_ASSERT(candidates->size > 0);
4318
5037
 
@@ -4521,7 +5240,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
4521
5240
  }
4522
5241
  }
4523
5242
 
4524
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5243
+ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
4525
5244
  const int64_t t_start_sample_us = ggml_time_us();
4526
5245
 
4527
5246
  for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4533,6 +5252,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
4533
5252
  }
4534
5253
  }
4535
5254
 
5255
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5256
+ llama_sample_temp(ctx, candidates_p, temp);
5257
+ }
5258
+
4536
5259
  void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
4537
5260
  if (last_tokens_size == 0 || penalty == 1.0f) {
4538
5261
  return;
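Note that llama_sample_temperature is kept above as a thin alias for the renamed llama_sample_temp, so existing callers keep compiling while new code can use the shorter name. A minimal sketch, assuming cands is an already populated llama_token_data_array:

llama_sample_temp(ctx, &cands, 0.8f);        // new name
llama_sample_temperature(ctx, &cands, 0.8f); // old spelling, forwards to llama_sample_temp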
@@ -4656,7 +5379,7 @@ void llama_sample_classifier_free_guidance(
4656
5379
 
4657
5380
  GGML_ASSERT(ctx);
4658
5381
 
4659
- auto n_vocab = llama_n_vocab(ctx);
5382
+ auto n_vocab = llama_n_vocab(llama_get_model(ctx));
4660
5383
 
4661
5384
  GGML_ASSERT(n_vocab == (int)candidates->size);
4662
5385
  GGML_ASSERT(!candidates->sorted);
@@ -4685,7 +5408,7 @@ void llama_sample_classifier_free_guidance(
4685
5408
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
4686
5409
  GGML_ASSERT(ctx);
4687
5410
 
4688
- auto N = float(llama_n_vocab(ctx));
5411
+ auto N = float(llama_n_vocab(llama_get_model(ctx)));
4689
5412
  int64_t t_start_sample_us;
4690
5413
  t_start_sample_us = ggml_time_us();
4691
5414
 
@@ -4872,7 +5595,7 @@ struct llama_logit_info {
4872
5595
  };
4873
5596
  llama_logit_info(llama_context * ctx)
4874
5597
  : logits(llama_get_logits(ctx))
4875
- , n_vocab(llama_n_vocab(ctx))
5598
+ , n_vocab(llama_n_vocab(llama_get_model(ctx)))
4876
5599
  , max_l(*std::max_element(logits, logits + n_vocab))
4877
5600
  , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
4878
5601
  { }
@@ -4910,7 +5633,6 @@ struct llama_beam_search_data {
4910
5633
  size_t n_beams;
4911
5634
  int n_past;
4912
5635
  int n_predict;
4913
- int n_threads;
4914
5636
  std::vector<llama_beam> beams;
4915
5637
  std::vector<llama_beam> next_beams;
4916
5638
 
@@ -4920,12 +5642,11 @@ struct llama_beam_search_data {
4920
5642
  // Used to communicate to/from callback on beams state.
4921
5643
  std::vector<llama_beam_view> beam_views;
4922
5644
 
4923
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
5645
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
4924
5646
  : ctx(ctx)
4925
5647
  , n_beams(n_beams)
4926
5648
  , n_past(n_past)
4927
5649
  , n_predict(n_predict)
4928
- , n_threads(n_threads)
4929
5650
  , beam_views(n_beams) {
4930
5651
  beams.reserve(n_beams);
4931
5652
  next_beams.reserve(n_beams);
@@ -4962,7 +5683,7 @@ struct llama_beam_search_data {
4962
5683
  } else {
4963
5684
  // beam is not at end-of-sentence, so branch with next top_k tokens.
4964
5685
  if (!beam.tokens.empty()) {
4965
- llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
5686
+ llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
4966
5687
  }
4967
5688
  llama_logit_info logit_info(ctx);
4968
5689
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5036,7 +5757,7 @@ struct llama_beam_search_data {
5036
5757
  callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
5037
5758
  update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
5038
5759
  if (common_prefix_length) {
5039
- llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
5760
+ llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
5040
5761
  n_past += common_prefix_length;
5041
5762
  }
5042
5763
  // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5077,11 +5798,11 @@ struct llama_beam_search_data {
5077
5798
 
5078
5799
  void llama_beam_search(llama_context * ctx,
5079
5800
  llama_beam_search_callback_fn_t callback, void * callback_data,
5080
- size_t n_beams, int n_past, int n_predict, int n_threads) {
5801
+ size_t n_beams, int n_past, int n_predict) {
5081
5802
  assert(ctx);
5082
5803
  const int64_t t_start_sample_us = ggml_time_us();
5083
5804
 
5084
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
5805
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
5085
5806
 
5086
5807
  beam_search_data.loop(callback, callback_data);
5087
5808
 
@@ -5301,11 +6022,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5301
6022
  nthread = std::thread::hardware_concurrency();
5302
6023
  }
5303
6024
 
5304
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
6025
+ llama_model_loader ml(fname_inp, /*use_mmap*/ false);
5305
6026
 
5306
6027
  llama_model model;
5307
- llm_load_arch(*ml, model);
5308
- llm_load_hparams(*ml, model, 0, 0, 0);
6028
+ llm_load_arch(ml, model);
6029
+ llm_load_hparams(ml, model);
5309
6030
 
5310
6031
  if (params->only_copy) {
5311
6032
  ftype = model.ftype;
@@ -5315,7 +6036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5315
6036
  struct gguf_context * ctx_out = gguf_init_empty();
5316
6037
 
5317
6038
  // copy the KV pairs from the input file
5318
- gguf_set_kv (ctx_out, ml->ctx_gguf);
6039
+ gguf_set_kv (ctx_out, ml.ctx_gguf);
5319
6040
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
5320
6041
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
5321
6042
 
@@ -5323,8 +6044,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5323
6044
  int n_attention_wv = 0;
5324
6045
  int n_feed_forward_w2 = 0;
5325
6046
 
5326
- for (int i = 0; i < ml->n_tensors; ++i) {
5327
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6047
+ for (int i = 0; i < ml.n_tensors; ++i) {
6048
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5328
6049
 
5329
6050
  const std::string name = ggml_get_name(meta);
5330
6051
 
@@ -5360,8 +6081,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5360
6081
  std::vector<no_init<float>> f32_conv_buf;
5361
6082
 
5362
6083
  // populate the original tensors so we get an initial meta data
5363
- for (int i = 0; i < ml->n_tensors; ++i) {
5364
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6084
+ for (int i = 0; i < ml.n_tensors; ++i) {
6085
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5365
6086
  gguf_add_tensor(ctx_out, meta);
5366
6087
  }
5367
6088
 
@@ -5374,8 +6095,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5374
6095
  // placeholder for the meta data
5375
6096
  ::zeros(fout, meta_size);
5376
6097
 
5377
- for (int i = 0; i < ml->n_tensors; ++i) {
5378
- struct ggml_tensor * tensor = ml->get_tensor_meta(i);
6098
+ for (int i = 0; i < ml.n_tensors; ++i) {
6099
+ struct ggml_tensor * tensor = ml.get_tensor_meta(i);
5379
6100
 
5380
6101
  const std::string name = ggml_get_name(tensor);
5381
6102
 
@@ -5383,10 +6104,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5383
6104
  read_data.resize(ggml_nbytes(tensor));
5384
6105
  }
5385
6106
  tensor->data = read_data.data();
5386
- ml->load_data_for(tensor);
6107
+ ml.load_data_for(tensor);
5387
6108
 
5388
6109
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
5389
- ++idx, ml->n_tensors,
6110
+ ++idx, ml.n_tensors,
5390
6111
  ggml_get_name(tensor),
5391
6112
  llama_format_tensor_shape(tensor).c_str(),
5392
6113
  ggml_type_name(tensor->type));
@@ -5536,8 +6257,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5536
6257
  }
5537
6258
  }
5538
6259
 
5539
- // TODO: after the GGUF PR, this likely won't work and needs to be updated
5540
- int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
6260
+ static int llama_apply_lora_from_file_internal(
6261
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
6262
+ ) {
5541
6263
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
5542
6264
 
5543
6265
  const int64_t t_start_lora_us = ggml_time_us();
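
The internal LoRA loader is now static and takes a caller-supplied `scale`; as the next hunk shows, the effective per-tensor scaling becomes `scale * lora_alpha / lora_r` instead of `lora_alpha / lora_r`. A tiny sketch of that arithmetic (the sample values are made up):

// Effective LoRA scaling in 0.6.0: the user scale is multiplied into alpha/r.
// With lora_r = 16, lora_alpha = 32 and scale = 0.5:
//   old: scaling = 32 / 16       = 2.0
//   new: scaling = 0.5 * 32 / 16 = 1.0
static float lora_scaling(float scale, int32_t lora_alpha, int32_t lora_r) {
    return scale * (float) lora_alpha / (float) lora_r;
}
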
@@ -5565,7 +6287,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
5565
6287
  int32_t lora_alpha;
5566
6288
  fin.read((char *) &lora_r, sizeof(lora_r));
5567
6289
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
5568
- float scaling = (float)lora_alpha / (float)lora_r;
6290
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
5569
6291
 
5570
6292
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
5571
6293
 
@@ -5781,9 +6503,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
5781
6503
  ggml_set_name(r, "r_cpy");
5782
6504
  }
5783
6505
 
5784
- struct ggml_cgraph gf = ggml_build_forward(r);
6506
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
6507
+ ggml_build_forward_expand(gf, r);
5785
6508
 
5786
- ggml_graph_compute_helper(work_buffer, &gf, n_threads);
6509
+ ggml_graph_compute_helper(work_buffer, gf, n_threads);
5787
6510
 
5788
6511
  // we won't need these tensors again, reset the context to save memory
5789
6512
  ggml_free(lora_ctx);
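
The LoRA patch path now builds its ggml graph with `ggml_new_graph()` + `ggml_build_forward_expand()` instead of the removed `ggml_build_forward()`, so the graph object lives inside the ggml context rather than on the stack. A standalone sketch of that pattern on a trivial tensor — everything outside those two calls is illustrative, and `ggml_graph_compute_with_ctx` is assumed to be available in this ggml version:

#include "ggml.h"

static void graph_pattern_example(void) {
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4), 1.0f);
    struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4), 2.0f);
    struct ggml_tensor * r = ggml_add(ctx, a, b);

    // old: struct ggml_cgraph gf = ggml_build_forward(r);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);   // graph allocated inside the ggml context
    ggml_build_forward_expand(gf, r);                // expand it from the result tensor

    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);

    ggml_free(ctx);
}
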
@@ -5812,27 +6535,16 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
5812
6535
  //
5813
6536
  // interface implementation
5814
6537
  //
5815
-
5816
- struct llama_context_params llama_context_default_params() {
5817
- struct llama_context_params result = {
5818
- /*.seed =*/ LLAMA_DEFAULT_SEED,
5819
- /*.n_ctx =*/ 512,
5820
- /*.n_batch =*/ 512,
6538
+ struct llama_model_params llama_model_default_params() {
6539
+ struct llama_model_params result = {
5821
6540
  /*.n_gpu_layers =*/ 0,
5822
6541
  /*.main_gpu =*/ 0,
5823
6542
  /*.tensor_split =*/ nullptr,
5824
- /*.rope_freq_base =*/ 10000.0f,
5825
- /*.rope_freq_scale =*/ 1.0f,
5826
6543
  /*.progress_callback =*/ nullptr,
5827
6544
  /*.progress_callback_user_data =*/ nullptr,
5828
- /*.low_vram =*/ false,
5829
- /*.mul_mat_q =*/ true,
5830
- /*.f16_kv =*/ true,
5831
- /*.logits_all =*/ false,
5832
6545
  /*.vocab_only =*/ false,
5833
6546
  /*.use_mmap =*/ true,
5834
6547
  /*.use_mlock =*/ false,
5835
- /*.embedding =*/ false,
5836
6548
  };
5837
6549
 
5838
6550
  #ifdef GGML_USE_METAL
@@ -5842,6 +6554,24 @@ struct llama_context_params llama_context_default_params() {
5842
6554
  return result;
5843
6555
  }
5844
6556
 
6557
+ struct llama_context_params llama_context_default_params() {
6558
+ struct llama_context_params result = {
6559
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
6560
+ /*.n_ctx =*/ 512,
6561
+ /*.n_batch =*/ 512,
6562
+ /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
6563
+ /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
6564
+ /*.rope_freq_base =*/ 0.0f,
6565
+ /*.rope_freq_scale =*/ 0.0f,
6566
+ /*.mul_mat_q =*/ true,
6567
+ /*.f16_kv =*/ true,
6568
+ /*.logits_all =*/ false,
6569
+ /*.embedding =*/ false,
6570
+ };
6571
+
6572
+ return result;
6573
+ }
6574
+
5845
6575
  struct llama_model_quantize_params llama_model_quantize_default_params() {
5846
6576
  struct llama_model_quantize_params result = {
5847
6577
  /*.nthread =*/ 0,
@@ -5897,13 +6627,11 @@ int64_t llama_time_us(void) {
5897
6627
 
5898
6628
  struct llama_model * llama_load_model_from_file(
5899
6629
  const char * path_model,
5900
- struct llama_context_params params) {
6630
+ struct llama_model_params params) {
5901
6631
  ggml_time_init();
5902
6632
 
5903
6633
  llama_model * model = new llama_model;
5904
6634
 
5905
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
5906
-
5907
6635
  unsigned cur_percentage = 0;
5908
6636
  if (params.progress_callback == NULL) {
5909
6637
  params.progress_callback_user_data = &cur_percentage;
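
Model loading and context creation now take separate parameter structs, as the hunks above show: `llama_model_params` for `llama_load_model_from_file` and `llama_context_params` for `llama_new_context_with_model`. A minimal start-up sketch under that split (error handling is trimmed; `llama_backend_init` and `llama_free_model` are assumed to still be present in llama.h as in earlier releases):

#include "llama.h"

static llama_context * init_example(const char * model_path) {
    llama_backend_init(/*numa=*/ false);

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0;                        // model-level options live here now

    llama_model * model = llama_load_model_from_file(model_path, mparams);
    if (model == NULL) {
        return NULL;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 2048;                  // 0 would mean "use the model's n_ctx_train"
    cparams.n_threads       = 8;
    cparams.n_threads_batch = 8;                     // context-level options, including threads

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return NULL;
    }
    return ctx;
}
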
@@ -5920,9 +6648,9 @@ struct llama_model * llama_load_model_from_file(
5920
6648
  };
5921
6649
  }
5922
6650
 
5923
- if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
5924
- params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
5925
- params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
6651
+ if (!llama_model_load(path_model, *model, params.n_gpu_layers,
6652
+ params.main_gpu, params.tensor_split,
6653
+ params.use_mmap, params.use_mlock, params.vocab_only,
5926
6654
  params.progress_callback, params.progress_callback_user_data)) {
5927
6655
  LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
5928
6656
  delete model;
@@ -5946,18 +6674,33 @@ struct llama_context * llama_new_context_with_model(
5946
6674
 
5947
6675
  llama_context * ctx = new llama_context(*model);
5948
6676
 
6677
+ const auto & hparams = model->hparams;
6678
+ auto & cparams = ctx->cparams;
6679
+
6680
+ cparams.n_batch = params.n_batch;
6681
+ cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
6682
+ cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
6683
+ cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
6684
+ cparams.n_threads = params.n_threads;
6685
+ cparams.n_threads_batch = params.n_threads_batch;
6686
+ cparams.mul_mat_q = params.mul_mat_q;
6687
+
5949
6688
  if (params.seed == LLAMA_DEFAULT_SEED) {
5950
6689
  params.seed = time(NULL);
5951
6690
  }
5952
6691
 
6692
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
6693
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
6694
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
6695
+
5953
6696
  ctx->rng = std::mt19937(params.seed);
5954
6697
  ctx->logits_all = params.logits_all;
5955
6698
 
5956
6699
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
5957
6700
 
5958
6701
  // reserve memory for context buffers
5959
- if (!params.vocab_only) {
5960
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
6702
+ if (!hparams.vocab_only) {
6703
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
5961
6704
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
5962
6705
  llama_free(ctx);
5963
6706
  return nullptr;
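
Note how the new context parameters are resolved above: a value of 0 for `n_ctx`, `rope_freq_base` or `rope_freq_scale` now means "take the value the model was trained with" (`n_ctx_train`, `rope_freq_base_train`, `rope_freq_scale_train` from the GGUF metadata). A small sketch:

static llama_context_params make_cparams(void) {
    llama_context_params cparams = llama_context_default_params();
    // 0 means "inherit the value stored in the model's GGUF metadata":
    cparams.n_ctx           = 0;      // -> hparams.n_ctx_train
    cparams.rope_freq_base  = 0.0f;   // -> hparams.rope_freq_base_train
    cparams.rope_freq_scale = 0.0f;   // -> hparams.rope_freq_scale_train
    // any non-zero value overrides the trained default, e.g.:
    // cparams.rope_freq_scale = 0.5f;
    return cparams;
}
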
@@ -5968,11 +6711,9 @@ struct llama_context * llama_new_context_with_model(
5968
6711
  LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
5969
6712
  }
5970
6713
 
5971
- const auto & hparams = ctx->model.hparams;
5972
-
5973
6714
  // resized during inference
5974
6715
  if (params.logits_all) {
5975
- ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
6716
+ ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
5976
6717
  } else {
5977
6718
  ctx->logits.reserve(hparams.n_vocab);
5978
6719
  }
@@ -5990,26 +6731,28 @@ struct llama_context * llama_new_context_with_model(
5990
6731
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
5991
6732
 
5992
6733
  // build worst-case graph
5993
- int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
5994
- int n_past = hparams.n_ctx - n_tokens;
6734
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
6735
+ int n_past = cparams.n_ctx - n_tokens;
5995
6736
  llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
5996
- ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
6737
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
6738
+
5997
6739
  #ifdef GGML_USE_METAL
5998
- if (params.n_gpu_layers > 0) {
6740
+ if (model->n_gpu_layers > 0) {
5999
6741
  ctx->ctx_metal = ggml_metal_init(1);
6000
6742
  if (!ctx->ctx_metal) {
6001
6743
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6002
6744
  llama_free(ctx);
6003
6745
  return NULL;
6004
6746
  }
6005
- ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6006
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6747
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
6748
+ //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6749
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6007
6750
  }
6008
6751
  #endif
6009
6752
  // measure memory requirements for the graph
6010
6753
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
6011
6754
 
6012
- LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6755
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6013
6756
 
6014
6757
  // recreate allocator with exact memory requirements
6015
6758
  ggml_allocr_free(ctx->alloc);
@@ -6018,28 +6761,46 @@ struct llama_context * llama_new_context_with_model(
6018
6761
  ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
6019
6762
  #ifdef GGML_USE_METAL
6020
6763
  if (ctx->ctx_metal) {
6021
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6764
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6022
6765
  }
6023
6766
  #endif
6024
6767
  #ifdef GGML_USE_CUBLAS
6025
- if (params.low_vram) {
6026
- LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
6027
- ggml_cuda_set_scratch_size(0); // disable scratch
6028
- } else {
6029
- ggml_cuda_set_scratch_size(alloc_size);
6030
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
6768
+ ggml_cuda_set_scratch_size(alloc_size);
6769
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
6770
+
6771
+ // calculate total VRAM usage
6772
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
6773
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
6774
+ size += ggml_nbytes(t);
6775
+ }
6776
+ };
6777
+ size_t model_vram_size = 0;
6778
+ for (const auto & kv : model->tensors_by_name) {
6779
+ add_tensor(kv.second, model_vram_size);
6031
6780
  }
6781
+
6782
+ size_t kv_vram_size = 0;
6783
+ add_tensor(ctx->kv_self.k, kv_vram_size);
6784
+ add_tensor(ctx->kv_self.v, kv_vram_size);
6785
+
6786
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
6787
+ size_t total_vram_size = model_vram_size + ctx_vram_size;
6788
+
6789
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
6790
+ total_vram_size / 1024.0 / 1024.0,
6791
+ model_vram_size / 1024.0 / 1024.0,
6792
+ ctx_vram_size / 1024.0 / 1024.0);
6032
6793
  #endif
6033
6794
  }
6034
6795
 
6035
6796
  #ifdef GGML_USE_METAL
6036
- if (params.n_gpu_layers > 0) {
6797
+ if (model->n_gpu_layers > 0) {
6037
6798
  // this allocates all Metal resources and memory buffers
6038
6799
 
6039
6800
  void * data_ptr = NULL;
6040
6801
  size_t data_size = 0;
6041
6802
 
6042
- if (params.use_mmap) {
6803
+ if (ctx->model.mapping) {
6043
6804
  data_ptr = ctx->model.mapping->addr;
6044
6805
  data_size = ctx->model.mapping->size;
6045
6806
  } else {
@@ -6058,11 +6819,8 @@ struct llama_context * llama_new_context_with_model(
6058
6819
  return NULL; \
6059
6820
  }
6060
6821
 
6061
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
6062
-
6063
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
6064
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6065
-
6822
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
6823
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6066
6824
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
6067
6825
  #undef LLAMA_METAL_CHECK_BUF
6068
6826
  }
@@ -6074,8 +6832,10 @@ struct llama_context * llama_new_context_with_model(
6074
6832
 
6075
6833
  if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
6076
6834
  // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
6077
- const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
6078
- while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
6835
+ // TODO: needs fix after #3228
6836
+ GGML_ASSERT(false && "not implemented");
6837
+ //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
6838
+ //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
6079
6839
  llama_backend_free();
6080
6840
  exit(1);
6081
6841
  }
@@ -6084,63 +6844,37 @@ struct llama_context * llama_new_context_with_model(
6084
6844
  return ctx;
6085
6845
  }
6086
6846
 
6087
- struct llama_context * llama_init_from_file(
6088
- const char * path_model,
6089
- struct llama_context_params params) {
6090
- struct llama_model * model = llama_load_model_from_file(path_model, params);
6091
- if (!model) {
6092
- return nullptr;
6093
- }
6094
-
6095
- struct llama_context * ctx = llama_new_context_with_model(model, params);
6096
- ctx->model_owner = true;
6097
-
6098
- return ctx;
6099
- }
6100
-
6101
6847
  void llama_free(struct llama_context * ctx) {
6102
6848
  delete ctx;
6103
6849
  }
6104
6850
 
6105
- int llama_n_vocab(const struct llama_context * ctx) {
6106
- return llama_model_n_vocab(&ctx->model);
6851
+ const llama_model * llama_get_model(const struct llama_context * ctx) {
6852
+ return &ctx->model;
6107
6853
  }
6108
6854
 
6109
6855
  int llama_n_ctx(const struct llama_context * ctx) {
6110
- return llama_model_n_ctx(&ctx->model);
6111
- }
6112
-
6113
- int llama_n_ctx_train(const struct llama_context * ctx) {
6114
- return llama_model_n_ctx_train(&ctx->model);
6115
- }
6116
-
6117
- int llama_n_embd(const struct llama_context * ctx) {
6118
- return llama_model_n_embd(&ctx->model);
6856
+ return ctx->cparams.n_ctx;
6119
6857
  }
6120
6858
 
6121
- enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
6122
- return ctx->model.vocab.type;
6859
+ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
6860
+ return model->vocab.type;
6123
6861
  }
6124
6862
 
6125
- int llama_model_n_vocab(const struct llama_model * model) {
6863
+ int llama_n_vocab(const struct llama_model * model) {
6126
6864
  return model->vocab.id_to_token.size();
6127
6865
  }
6128
6866
 
6129
- int llama_model_n_ctx(const struct llama_model * model) {
6130
- return model->hparams.n_ctx;
6131
- }
6132
-
6133
- int llama_model_n_ctx_train(const struct llama_model * model) {
6867
+ int llama_n_ctx_train(const struct llama_model * model) {
6134
6868
  return model->hparams.n_ctx_train;
6135
6869
  }
6136
6870
 
6137
- int llama_model_n_embd(const struct llama_model * model) {
6871
+ int llama_n_embd(const struct llama_model * model) {
6138
6872
  return model->hparams.n_embd;
6139
6873
  }
6140
6874
 
6141
6875
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
6142
6876
  return snprintf(buf, buf_size, "%s %s %s",
6143
- model->name.c_str(),
6877
+ llama_model_arch_name(model->arch).c_str(),
6144
6878
  llama_model_type_name(model->type),
6145
6879
  llama_model_ftype_name(model->ftype).c_str());
6146
6880
  }
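
The hunk above reshuffles the introspection helpers: the `llama_model_n_*` variants are gone, `llama_n_vocab` / `llama_n_embd` / `llama_n_ctx_train` now take a `llama_model *`, `llama_n_ctx` reports the context's own size, and `llama_get_model` bridges from a context back to its model. A quick sketch of the updated calls:

#include "llama.h"
#include <cstdio>

static void print_dims(llama_context * ctx) {
    const llama_model * model = llama_get_model(ctx);

    const int n_vocab     = llama_n_vocab(model);      // was llama_model_n_vocab / llama_n_vocab(ctx)
    const int n_embd      = llama_n_embd(model);
    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx       = llama_n_ctx(ctx);          // now the *context* size (cparams.n_ctx)

    printf("vocab=%d embd=%d ctx=%d (trained on %d)\n", n_vocab, n_embd, n_ctx, n_ctx_train);
}
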
@@ -6161,6 +6895,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
6161
6895
  return nparams;
6162
6896
  }
6163
6897
 
6898
+ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
6899
+ return ggml_get_tensor(model->ctx, name);
6900
+ }
6901
+
6164
6902
  int llama_model_quantize(
6165
6903
  const char * fname_inp,
6166
6904
  const char * fname_out,
@@ -6174,18 +6912,18 @@ int llama_model_quantize(
6174
6912
  }
6175
6913
  }
6176
6914
 
6177
- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
6915
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
6178
6916
  try {
6179
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
6917
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
6180
6918
  } catch (const std::exception & err) {
6181
6919
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
6182
6920
  return 1;
6183
6921
  }
6184
6922
  }
6185
6923
 
6186
- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
6924
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
6187
6925
  try {
6188
- return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
6926
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
6189
6927
  } catch (const std::exception & err) {
6190
6928
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
6191
6929
  return 1;
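
Both public LoRA entry points gain a `scale` argument, forwarded to the internal loader shown earlier where the effective scaling becomes `scale * alpha / r`. A hedged call sketch with a hypothetical adapter path:

#include "llama.h"
#include <cstdio>

// Apply a LoRA adapter at half strength; NULL for path_base_model patches the
// already-loaded weights, and n_threads is used for the patching pass itself.
static bool apply_lora_half_strength(llama_model * model) {
    const int err = llama_model_apply_lora_from_file(
            model,
            "adapter/ggml-lora-f16.bin",   // hypothetical path
            /*scale=*/ 0.5f,
            /*path_base_model=*/ NULL,
            /*n_threads=*/ 8);
    if (err != 0) {
        fprintf(stderr, "failed to apply LoRA adapter\n");
    }
    return err == 0;
}
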
@@ -6193,16 +6931,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
6193
6931
  }
6194
6932
 
6195
6933
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
6196
- return ctx->kv_self.n;
6934
+ return ctx->kv_self.head;
6197
6935
  }
6198
6936
 
6199
- #define LLAMA_MAX_RNG_STATE (64*1024)
6937
+ void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
6938
+ llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
6939
+ }
6200
6940
 
6201
- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
6202
- if (seed == LLAMA_DEFAULT_SEED) {
6203
- seed = time(NULL);
6204
- }
6205
- ctx->rng.seed(seed);
6941
+ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
6942
+ llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
6943
+ }
6944
+
6945
+ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
6946
+ llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
6947
+ }
6948
+
6949
+ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
6950
+ llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
6951
+ }
6952
+
6953
+ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
6954
+ llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
6206
6955
  }
6207
6956
 
6208
6957
  // Returns the *maximum* size of the state
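
The per-token KV cache bookkeeping is replaced by sequence-aware operations; the wrappers above forward straight to the cache-level helpers. A sketch of how the new calls compose for context-window management — the positions, sequence ids and half-open `[p0, p1)` ranges here are how the examples in this release use them, but treat the exact values as illustrative:

#include "llama.h"

// Keep only sequence 0, drop n_discard positions after the kept prefix, and shift
// the remaining cells back so generation can continue ("context shifting").
static void shift_context(llama_context * ctx, int n_keep, int n_discard, int n_past) {
    llama_kv_cache_seq_keep (ctx, 0);                               // drop all other sequences
    llama_kv_cache_seq_rm   (ctx, 0, n_keep, n_keep + n_discard);   // remove [n_keep, n_keep + n_discard)
    llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_past,    // move the tail back by n_discard
                             -n_discard);
}

// Fork sequence 0 into sequence 1 up to position n_past (e.g. for parallel sampling):
//   llama_kv_cache_seq_cp(ctx, 0, 1, 0, n_past);
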
@@ -6289,7 +7038,17 @@ struct llama_data_file_context : llama_data_context {
6289
7038
  * llama_copy_state_data(ctx, &data_ctx);
6290
7039
  *
6291
7040
  */
6292
- void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
7041
+ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
7042
+ // TODO: does not support multi-sequence states
7043
+ {
7044
+ const auto & kv_self = ctx->kv_self;
7045
+ for (uint32_t i = 0; i < kv_self.head; ++i) {
7046
+ GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
7047
+ GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
7048
+ GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
7049
+ }
7050
+ }
7051
+
6293
7052
  // copy rng
6294
7053
  {
6295
7054
  std::stringstream rng_ss;
@@ -6340,12 +7099,14 @@ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_conte
6340
7099
  {
6341
7100
  const auto & kv_self = ctx->kv_self;
6342
7101
  const auto & hparams = ctx->model.hparams;
7102
+ const auto & cparams = ctx->cparams;
7103
+
6343
7104
  const int n_layer = hparams.n_layer;
6344
7105
  const int n_embd = hparams.n_embd_gqa();
6345
- const int n_ctx = hparams.n_ctx;
7106
+ const int n_ctx = cparams.n_ctx;
6346
7107
 
6347
7108
  const size_t kv_size = kv_self.buf.size;
6348
- const int kv_ntok = llama_get_kv_cache_token_count(ctx);
7109
+ const int kv_ntok = kv_self.head;
6349
7110
 
6350
7111
  data_ctx->write(&kv_size, sizeof(kv_size));
6351
7112
  data_ctx->write(&kv_ntok, sizeof(kv_ntok));
@@ -6448,9 +7209,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
6448
7209
  {
6449
7210
  const auto & kv_self = ctx->kv_self;
6450
7211
  const auto & hparams = ctx->model.hparams;
7212
+ const auto & cparams = ctx->cparams;
7213
+
6451
7214
  const int n_layer = hparams.n_layer;
6452
7215
  const int n_embd = hparams.n_embd_gqa();
6453
- const int n_ctx = hparams.n_ctx;
7216
+ const int n_ctx = cparams.n_ctx;
6454
7217
 
6455
7218
  size_t kv_size;
6456
7219
  int kv_ntok;
@@ -6489,7 +7252,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
6489
7252
  ggml_free(cpy_ctx);
6490
7253
  }
6491
7254
 
6492
- ctx->kv_self.n = kv_ntok;
7255
+ ctx->kv_self.head = kv_ntok;
7256
+ ctx->kv_self.size = kv_size;
6493
7257
  }
6494
7258
 
6495
7259
  const size_t nread = inp - src;
@@ -6584,64 +7348,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
6584
7348
 
6585
7349
  int llama_eval(
6586
7350
  struct llama_context * ctx,
6587
- const llama_token * tokens,
6588
- int n_tokens,
6589
- int n_past,
6590
- int n_threads) {
6591
- if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
6592
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6593
- return 1;
6594
- }
7351
+ llama_token * tokens,
7352
+ int32_t n_tokens,
7353
+ int n_past) {
7354
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
6595
7355
 
6596
- // get a more accurate load time, upon first eval
6597
- // TODO: fix this
6598
- if (!ctx->has_evaluated_once) {
6599
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
6600
- ctx->has_evaluated_once = true;
7356
+ const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
7357
+ if (ret < 0) {
7358
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
6601
7359
  }
6602
7360
 
6603
- return 0;
7361
+ return ret;
6604
7362
  }
6605
7363
 
6606
7364
  int llama_eval_embd(
6607
7365
  struct llama_context * ctx,
6608
- const float * embd,
6609
- int n_tokens,
6610
- int n_past,
6611
- int n_threads) {
6612
- if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
6613
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6614
- return 1;
6615
- }
7366
+ float * embd,
7367
+ int32_t n_tokens,
7368
+ int n_past) {
7369
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
6616
7370
 
6617
- // get a more accurate load time, upon first eval
6618
- // TODO: fix this
6619
- if (!ctx->has_evaluated_once) {
6620
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
6621
- ctx->has_evaluated_once = true;
7371
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
7372
+
7373
+ const int ret = llama_decode_internal(*ctx, batch);
7374
+ if (ret < 0) {
7375
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
6622
7376
  }
6623
7377
 
6624
- return 0;
7378
+ return ret;
6625
7379
  }
6626
7380
 
6627
- int llama_eval_export(struct llama_context * ctx, const char * fname) {
6628
- const int n_batch = 1;
6629
- const int n_ctx = 512 - n_batch;
7381
+ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
7382
+ ctx->cparams.n_threads = n_threads;
7383
+ ctx->cparams.n_threads_batch = n_threads_batch;
7384
+ }
7385
+
7386
+ struct llama_batch llama_batch_get_one(
7387
+ llama_token * tokens,
7388
+ int32_t n_tokens,
7389
+ llama_pos pos_0,
7390
+ llama_seq_id seq_id) {
7391
+ return {
7392
+ /*n_tokens =*/ n_tokens,
7393
+ /*tokens =*/ tokens,
7394
+ /*embd =*/ nullptr,
7395
+ /*pos =*/ nullptr,
7396
+ /*seq_id =*/ nullptr,
7397
+ /*logits =*/ nullptr,
7398
+ /*all_pos_0 =*/ pos_0,
7399
+ /*all_pos_1 =*/ 1,
7400
+ /*all_seq_id =*/ seq_id,
7401
+ };
7402
+ }
6630
7403
 
6631
- const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
7404
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
7405
+ llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
6632
7406
 
6633
- if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
6634
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6635
- return 1;
7407
+ if (embd) {
7408
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
7409
+ } else {
7410
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
6636
7411
  }
6637
7412
 
6638
- return 0;
7413
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
7414
+ batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
7415
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
7416
+
7417
+ return batch;
7418
+ }
7419
+
7420
+ void llama_batch_free(struct llama_batch batch) {
7421
+ if (batch.token) free(batch.token);
7422
+ if (batch.embd) free(batch.embd);
7423
+ if (batch.pos) free(batch.pos);
7424
+ if (batch.seq_id) free(batch.seq_id);
7425
+ if (batch.logits) free(batch.logits);
7426
+ }
7427
+
7428
+ int llama_decode(
7429
+ struct llama_context * ctx,
7430
+ struct llama_batch batch) {
7431
+ const int ret = llama_decode_internal(*ctx, batch);
7432
+ if (ret < 0) {
7433
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
7434
+ }
7435
+
7436
+ return ret;
6639
7437
  }
6640
7438
 
6641
7439
  float * llama_get_logits(struct llama_context * ctx) {
6642
7440
  return ctx->logits.data();
6643
7441
  }
6644
7442
 
7443
+ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
7444
+ return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
7445
+ }
7446
+
6645
7447
  float * llama_get_embeddings(struct llama_context * ctx) {
6646
7448
  return ctx->embedding.data();
6647
7449
  }
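
This hunk is the heart of the new decoding API: `llama_batch_init` / `llama_batch_free` allocate per-token position, sequence-id and logits arrays, `llama_decode` replaces `llama_eval` (which is kept only as a thin wrapper that first clears the cache past `n_past`), threads are set per context via `llama_set_n_threads`, and `llama_get_logits_ith` returns the logits row for batch position `i`. A hedged end-to-end sketch, assuming `ctx` and `model` were created as in the earlier initialization example; everything outside the `llama_*` calls is illustrative:

#include "llama.h"
#include <cstdio>
#include <vector>

// Feed a prompt with an explicit llama_batch and read the logits of its last token.
static bool decode_prompt(llama_context * ctx, const llama_model * model,
                          const std::vector<llama_token> & prompt) {
    llama_set_n_threads(ctx, /*n_threads=*/ 8, /*n_threads_batch=*/ 8);

    llama_batch batch = llama_batch_init((int32_t) prompt.size(), /*embd=*/ 0);
    batch.n_tokens = (int32_t) prompt.size();
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        batch.token [i] = prompt[i];
        batch.pos   [i] = i;        // absolute position in the sequence
        batch.seq_id[i] = 0;        // everything belongs to sequence 0
        batch.logits[i] = 0;        // no logits for this token...
    }
    batch.logits[batch.n_tokens - 1] = 1;   // ...except the last one

    const bool ok = llama_decode(ctx, batch) == 0;
    if (ok) {
        // logits row of the token at batch position n_tokens - 1 (the one that requested them)
        const float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        printf("n_vocab = %d, logit[0] = %f\n", llama_n_vocab(model), logits[0]);
    }

    llama_batch_free(batch);
    return ok;
}
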
@@ -6671,21 +7473,13 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
6671
7473
  }
6672
7474
 
6673
7475
  int llama_tokenize(
6674
- struct llama_context * ctx,
6675
- const char * text,
6676
- llama_token * tokens,
6677
- int n_max_tokens,
6678
- bool add_bos) {
6679
- return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
6680
- }
6681
-
6682
- int llama_tokenize_with_model(
6683
7476
  const struct llama_model * model,
6684
7477
  const char * text,
7478
+ int text_len,
6685
7479
  llama_token * tokens,
6686
7480
  int n_max_tokens,
6687
7481
  bool add_bos) {
6688
- auto res = llama_tokenize_internal(model->vocab, text, add_bos);
7482
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
6689
7483
 
6690
7484
  if (n_max_tokens < (int) res.size()) {
6691
7485
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -6699,13 +7493,9 @@ int llama_tokenize_with_model(
6699
7493
  return res.size();
6700
7494
  }
6701
7495
 
6702
- int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
6703
- return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
6704
- }
6705
-
6706
7496
  // does not write null-terminator to buf
6707
- int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
6708
- if (0 <= token && token < llama_model_n_vocab(model)) {
7497
+ int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
7498
+ if (0 <= token && token < llama_n_vocab(model)) {
6709
7499
  if (llama_is_normal_token(model->vocab, token)) {
6710
7500
  std::string result = model->vocab.id_to_token[token].text;
6711
7501
  if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
@@ -6725,7 +7515,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke
6725
7515
  buf[2] = '\x85';
6726
7516
  return 3;
6727
7517
  } else if (llama_is_control_token(model->vocab, token)) {
6728
- ;
7518
+ // do nothing
6729
7519
  } else if (llama_is_byte_token(model->vocab, token)) {
6730
7520
  if (length < 1) {
6731
7521
  return -1;
@@ -6827,16 +7617,18 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
6827
7617
  }
6828
7618
 
6829
7619
  // For internal test use
6830
- const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
7620
+ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
7621
+ struct llama_context * ctx
7622
+ ) {
6831
7623
  return ctx->model.tensors_by_name;
6832
7624
  }
6833
7625
 
6834
- void llama_log_set(llama_log_callback log_callback, void * user_data) {
7626
+ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
6835
7627
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
6836
7628
  g_state.log_callback_user_data = user_data;
6837
7629
  }
6838
7630
 
6839
- static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
7631
+ static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
6840
7632
  va_list args_copy;
6841
7633
  va_copy(args_copy, args);
6842
7634
  char buffer[128];
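
Logging now goes through ggml's callback type (`ggml_log_callback` / `ggml_log_level`) instead of the removed llama-specific ones, and the same callback style is also wired into Metal via `ggml_metal_log_set_callback` earlier in this diff. A sketch of installing a custom sink; the filtering policy is just an example:

#include "llama.h"
#include <cstdio>

// Forward llama.cpp / ggml log messages to stderr, skipping plain info messages.
static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_WARN) {
        fputs(text, stderr);
    }
}

// Install it once, before loading any model:
//   llama_log_set(my_log_callback, NULL);
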
@@ -6853,14 +7645,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
6853
7645
  va_end(args_copy);
6854
7646
  }
6855
7647
 
6856
- static void llama_log_internal(llama_log_level level, const char * format, ...) {
7648
+ static void llama_log_internal(ggml_log_level level, const char * format, ...) {
6857
7649
  va_list args;
6858
7650
  va_start(args, format);
6859
7651
  llama_log_internal_v(level, format, args);
6860
7652
  va_end(args);
6861
7653
  }
6862
7654
 
6863
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
7655
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
6864
7656
  (void) level;
6865
7657
  (void) user_data;
6866
7658
  fputs(text, stderr);