llama_cpp 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
+#define LLAMA_API_INTERNAL
 #include "llama.h"
 
 #include "ggml.h"
@@ -71,6 +72,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <set>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -91,12 +93,12 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (llama_log_level level, const char* format, ...);
-static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+static void llama_log_internal        (ggml_log_level level, const char* format, ...);
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
-#define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
-#define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
-#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 //
 // helpers
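
The logging internals above now reuse ggml's log-level enum, so a host application installs a single callback of type ggml_log_callback for both libraries. A minimal sketch of wiring that up through the public llama_log_set hook (the callback name and the filtering policy are illustrative, not part of this diff):

    #include "llama.h"
    #include <cstdio>

    // Forward llama.cpp/ggml log lines to stderr, dropping plain info messages.
    static void my_log_sink(ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_INFO) {
            return; // keep only warnings and errors
        }
        fputs(text, stderr);
    }

    // during startup (assuming this release's llama_log_set signature):
    //   llama_log_set(my_log_sink, nullptr);
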
@@ -108,7 +110,7 @@ static size_t utf8_len(char src) {
 return lookup[highbits];
 }
 
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
 std::string result;
 for (size_t pos = 0; ; pos += search.length()) {
 auto new_pos = s.find(search, pos);
@@ -160,17 +162,19 @@ enum llm_arch {
 LLM_ARCH_GPTJ,
 LLM_ARCH_GPTNEOX,
 LLM_ARCH_MPT,
+LLM_ARCH_STARCODER,
 LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-{ LLM_ARCH_LLAMA,   "llama"    },
-{ LLM_ARCH_FALCON,  "falcon"   },
-{ LLM_ARCH_GPT2,    "gpt2"     },
-{ LLM_ARCH_GPTJ,    "gptj"     },
-{ LLM_ARCH_GPTNEOX, "gptneox"  },
-{ LLM_ARCH_MPT,     "mpt"      },
-{ LLM_ARCH_BAICHUAN,"baichuan" },
+{ LLM_ARCH_LLAMA,     "llama"     },
+{ LLM_ARCH_FALCON,    "falcon"    },
+{ LLM_ARCH_GPT2,      "gpt2"      },
+{ LLM_ARCH_GPTJ,      "gptj"      },
+{ LLM_ARCH_GPTNEOX,   "gptneox"   },
+{ LLM_ARCH_MPT,       "mpt"       },
+{ LLM_ARCH_BAICHUAN,  "baichuan"  },
+{ LLM_ARCH_STARCODER, "starcoder" },
 };
 
 enum llm_kv {
@@ -218,16 +222,16 @@ enum llm_kv {
 };
 
 static std::map<llm_kv, std::string> LLM_KV_NAMES = {
-{ LLM_KV_GENERAL_ARCHITECTURE,         "general.architecture"         },
-{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
-{ LLM_KV_GENERAL_ALIGNMENT,            "general.alignment"            },
-{ LLM_KV_GENERAL_NAME,                 "general.name"                 },
-{ LLM_KV_GENERAL_AUTHOR,               "general.author"               },
-{ LLM_KV_GENERAL_URL,                  "general.url"                  },
-{ LLM_KV_GENERAL_DESCRIPTION,          "general.description"          },
-{ LLM_KV_GENERAL_LICENSE,              "general.license"              },
-{ LLM_KV_GENERAL_SOURCE_URL,           "general.source_url"           },
-{ LLM_KV_GENERAL_SOURCE_HF_REPO,       "general.source_hf_repo"       },
+{ LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"                  },
+{ LLM_KV_GENERAL_QUANTIZATION_VERSION,  "general.quantization_version"          },
+{ LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"                     },
+{ LLM_KV_GENERAL_NAME,                  "general.name"                          },
+{ LLM_KV_GENERAL_AUTHOR,                "general.author"                        },
+{ LLM_KV_GENERAL_URL,                   "general.url"                           },
+{ LLM_KV_GENERAL_DESCRIPTION,           "general.description"                   },
+{ LLM_KV_GENERAL_LICENSE,               "general.license"                       },
+{ LLM_KV_GENERAL_SOURCE_URL,            "general.source.url"                    },
+{ LLM_KV_GENERAL_SOURCE_HF_REPO,        "general.source.huggingface.repository" },
 
 { LLM_KV_CONTEXT_LENGTH,   "%s.context_length"   },
 { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -376,6 +380,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
 },
 },
+{
+LLM_ARCH_STARCODER,
+{
+{ LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+{ LLM_TENSOR_POS_EMBD,    "position_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT,      "output" },
+{ LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+{ LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+},
+},
 {
 LLM_ARCH_UNKNOWN,
 {
@@ -430,7 +449,7 @@ struct LLM_TN {
 //
 
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
 const std::string skey(key); \
 const int kid = gguf_find_key(ctx, skey.c_str()); \
 if (kid >= 0) { \
@@ -442,7 +461,7 @@ struct LLM_TN {
 } else if (req) { \
 throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
 } \
-}
+} while (0)
 
 //
 // ggml helpers
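
The GGUF_GET_KEY macro now wraps its multi-statement body in do { ... } while (0), the usual way to make a function-like macro behave as a single statement that requires a trailing semicolon. A contrived sketch of the if/else pitfall the idiom avoids (the helper and macro names here are made up for illustration):

    static void step_one()     {}
    static void step_two()     {}
    static void handle_error() {}

    #define LOAD_TWO_BRACED() { step_one(); step_two(); }             // bare braces
    #define LOAD_TWO_SAFE()   do { step_one(); step_two(); } while (0)

    static void demo(bool ok) {
        // The braced form expands to `{ ... };` and the extra `;` orphans the
        // following `else`, so this line would not compile:
        //   if (ok) LOAD_TWO_BRACED(); else handle_error();
        if (ok) LOAD_TWO_SAFE(); else handle_error();                 // fine
    }
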
@@ -680,6 +699,7 @@ struct llama_mmap {
 if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
 llama_format_win_err(GetLastError()).c_str());
+}
 }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -862,10 +882,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
 
 static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
 std::vector<char> result(8, 0);
-const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
 if (n_tokens < 0) {
 result.resize(-n_tokens);
-int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
 GGML_ASSERT(check == -n_tokens);
 } else {
 result.resize(n_tokens);
@@ -880,7 +900,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
 
 struct llama_state {
 // We save the log callback globally
-llama_log_callback log_callback = llama_log_callback_default;
+ggml_log_callback log_callback = llama_log_callback_default;
 void * log_callback_user_data = nullptr;
 };
 
@@ -889,9 +909,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
 MODEL_UNKNOWN,
+MODEL_1B,
 MODEL_3B,
 MODEL_7B,
 MODEL_13B,
+MODEL_15B,
 MODEL_30B,
 MODEL_34B,
 MODEL_40B,
@@ -901,24 +923,24 @@ enum e_model {
 
 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-uint32_t n_vocab = 32000;
-uint32_t n_ctx_train = 2048; // the context size used during training
-uint32_t n_ctx = 512; // the context size used during inference
-uint32_t n_embd = 4096;
-uint32_t n_head = 32;
-uint32_t n_head_kv = 32;
-uint32_t n_layer = 32;
-uint32_t n_rot = 64;
-uint32_t n_ff = 11008;
-
-float f_norm_eps = 1e-5;
-float f_norm_rms_eps = 1e-5;
-
-float rope_freq_base = 10000.0f;
-float rope_freq_scale = 1.0f;
+bool vocab_only;
+uint32_t n_vocab;
+uint32_t n_ctx_train; // context size the model was trained on
+uint32_t n_embd;
+uint32_t n_head;
+uint32_t n_head_kv;
+uint32_t n_layer;
+uint32_t n_rot;
+uint32_t n_ff;
+
+float f_norm_eps;
+float f_norm_rms_eps;
+
+float rope_freq_base_train;
+float rope_freq_scale_train;
 
 bool operator!=(const llama_hparams & other) const {
 return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -935,15 +957,18 @@ struct llama_hparams {
 uint32_t n_embd_gqa() const {
 return n_embd/n_gqa();
 }
+};
 
-size_t kv_size() const {
-size_t result = 2ull;
-result *= (size_t) n_embd_gqa();
-result *= (size_t) n_ctx;
-result *= (size_t) n_layer;
-result *= sizeof(ggml_fp16_t);
-return result;
-}
+struct llama_cparams {
+uint32_t n_ctx; // context size used during inference
+uint32_t n_batch;
+uint32_t n_threads; // number of threads to use for generation
+uint32_t n_threads_batch; // number of threads to use for batch processing
+
+float rope_freq_base;
+float rope_freq_scale;
+
+bool mul_mat_q;
 };
 
 struct llama_layer {
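
llama_hparams is now reduced to what the GGUF file states about the model, while the new llama_cparams carries per-context settings (n_ctx, batch size, thread counts, RoPE overrides). The public API in this release splits its parameter structs the same way; a hedged usage sketch, with struct and function names taken from this release's llama.h as I understand it (treat them as assumptions and check your header):

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0;                        // model-level choice

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) return 1;

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 4096;                  // context-level choices
        cparams.n_threads       = 8;
        cparams.n_threads_batch = 8;
        // leaving rope_freq_base/rope_freq_scale at their defaults is meant to
        // fall back to the *_train values stored in the model file

        llama_context * ctx = llama_new_context_with_model(model, cparams);
        // ... tokenize, decode, sample ...
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }
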
@@ -960,16 +985,47 @@ struct llama_layer {
 struct ggml_tensor * wo;
 struct ggml_tensor * wqkv;
 
+// attention bias
+struct ggml_tensor * bo;
+struct ggml_tensor * bqkv;
+
 // normalization
 struct ggml_tensor * ffn_norm;
+struct ggml_tensor * ffn_norm_b;
 
 // ff
 struct ggml_tensor * w1; // ffn_gate
 struct ggml_tensor * w2; // ffn_down
 struct ggml_tensor * w3; // ffn_up
+
+// ff bias
+struct ggml_tensor * b2; // ffn_down
+struct ggml_tensor * b3; // ffn_up
+};
+
+struct llama_kv_cell {
+llama_pos pos = -1;
+llama_pos delta = 0;
+
+std::set<llama_seq_id> seq_id;
+
+bool has_seq_id(const llama_seq_id & id) const {
+return seq_id.find(id) != seq_id.end();
+}
 };
 
+// ring-buffer of cached KV data
 struct llama_kv_cache {
+bool has_shift = false;
+
+uint32_t head = 0;
+uint32_t size = 0;
+
+// computed before each graph build
+uint32_t n = 0;
+
+std::vector<llama_kv_cell> cells;
+
 struct ggml_tensor * k = NULL;
 struct ggml_tensor * v = NULL;
 
@@ -977,8 +1033,6 @@ struct llama_kv_cache {
 
 llama_buffer buf;
 
-int n; // number of tokens currently in the cache
-
 ~llama_kv_cache() {
 if (ctx) {
 ggml_free(ctx);
@@ -1040,10 +1094,11 @@ struct llama_model {
 
 std::string name = "n/a";
 
-llama_hparams hparams;
+llama_hparams hparams = {};
 llama_vocab vocab;
 
 struct ggml_tensor * tok_embeddings;
+struct ggml_tensor * pos_embeddings;
 
 struct ggml_tensor * output_norm;
 struct ggml_tensor * output_norm_b;
@@ -1091,11 +1146,8 @@ struct llama_model {
 };
 
 struct llama_context {
-llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
 ~llama_context() {
-if (model_owner) {
-delete &model;
-}
 #ifdef GGML_USE_METAL
 if (ctx_metal) {
 ggml_metal_free(ctx_metal);
@@ -1106,27 +1158,26 @@ struct llama_context {
 }
 }
 
+llama_cparams cparams;
+
+const llama_model & model;
+
+// key + value cache for the self attention
+struct llama_kv_cache kv_self;
+
 std::mt19937 rng;
 
 bool has_evaluated_once = false;
 
+int64_t t_start_us;
+int64_t t_load_us;
 int64_t t_sample_us = 0;
-int64_t t_eval_us = 0;
 int64_t t_p_eval_us = 0;
+int64_t t_eval_us = 0;
 
 int32_t n_sample = 0; // number of tokens sampled
-int32_t n_eval = 0; // number of eval calls
 int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
-const llama_model & model;
-
-bool model_owner = false;
-
-int64_t t_load_us;
-int64_t t_start_us;
-
-// key + value cache for the self attention
-struct llama_kv_cache kv_self;
+int32_t n_eval = 0; // number of eval calls
 
 // decode output (2-dimensional array: [n_tokens][n_vocab])
 std::vector<float> logits;
@@ -1161,16 +1212,23 @@ static bool llama_kv_cache_init(
 const struct llama_hparams & hparams,
 struct llama_kv_cache & cache,
 ggml_type wtype,
-int n_ctx,
+uint32_t n_ctx,
 int n_gpu_layers) {
-const int n_embd = hparams.n_embd_gqa();
-const int n_layer = hparams.n_layer;
+const uint32_t n_embd = hparams.n_embd_gqa();
+const uint32_t n_layer = hparams.n_layer;
 
 const int64_t n_mem = n_layer*n_ctx;
 const int64_t n_elements = n_embd*n_mem;
 
+cache.has_shift = false;
+
+cache.head = 0;
+cache.size = n_ctx;
+
+cache.cells.clear();
+cache.cells.resize(n_ctx);
+
 cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-cache.n = 0;
 
 struct ggml_init_params params;
 params.mem_size = cache.buf.size;
@@ -1191,17 +1249,154 @@ static bool llama_kv_cache_init(
 
 (void) n_gpu_layers;
 #ifdef GGML_USE_CUBLAS
-if (n_gpu_layers > n_layer + 1) {
+size_t vram_kv_cache = 0;
+
+if (n_gpu_layers > (int)n_layer + 1) {
 ggml_cuda_assign_buffers_no_scratch(cache.v);
+LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+vram_kv_cache += ggml_nbytes(cache.v);
 }
-if (n_gpu_layers > n_layer + 2) {
+if (n_gpu_layers > (int)n_layer + 2) {
 ggml_cuda_assign_buffers_no_scratch(cache.k);
+LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+vram_kv_cache += ggml_nbytes(cache.k);
+}
+if (vram_kv_cache > 0) {
+LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
 }
 #endif // GGML_USE_CUBLAS
 
 return true;
 }
 
+// find an empty slot of size "n_tokens" in the cache
+// updates the cache head
+static bool llama_kv_cache_find_slot(
+struct llama_kv_cache & cache,
+const struct llama_batch & batch) {
+const uint32_t n_ctx = cache.size;
+const uint32_t n_tokens = batch.n_tokens;
+
+if (n_tokens > n_ctx) {
+LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+return false;
+}
+
+uint32_t n_tested = 0;
+
+while (true) {
+if (cache.head + n_tokens > n_ctx) {
+cache.head = 0;
+n_tested += n_ctx - cache.head;
+continue;
+}
+
+bool found = true;
+for (uint32_t i = 0; i < n_tokens; i++) {
+if (cache.cells[cache.head + i].pos >= 0) {
+found = false;
+cache.head += i + 1;
+n_tested += i + 1;
+break;
+}
+}
+
+if (found) {
+break;
+}
+
+if (n_tested >= n_ctx) {
+//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+return false;
+}
+}
+
+for (uint32_t i = 0; i < n_tokens; i++) {
+cache.cells[cache.head + i].pos = batch.pos[i];
+cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+}
+
+return true;
+}
+
+// find how many cells are currently in use
+static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+for (uint32_t i = cache.size - 1; i > 0; --i) {
+if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
+return i + 1;
+}
+}
+
+return 0;
+}
+
+static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
+if (c0 < 0) c0 = 0;
+if (c1 < 0) c1 = cache.size;
+
+for (int32_t i = c0; i < c1; ++i) {
+cache.cells[i].pos = -1;
+cache.cells[i].seq_id.clear();
+}
+}
+
+static void llama_kv_cache_seq_rm(
+struct llama_kv_cache & cache,
+llama_seq_id seq_id,
+llama_pos p0,
+llama_pos p1) {
+for (uint32_t i = 0; i < cache.size; ++i) {
+if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+cache.cells[i].seq_id.erase(seq_id);
+if (cache.cells[i].seq_id.empty()) {
+cache.cells[i].pos = -1;
+}
+}
+}
+}
+
+static void llama_kv_cache_seq_cp(
+struct llama_kv_cache & cache,
+llama_seq_id seq_id_src,
+llama_seq_id seq_id_dst,
+llama_pos p0,
+llama_pos p1) {
+for (uint32_t i = 0; i < cache.size; ++i) {
+if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+cache.cells[i].seq_id.insert(seq_id_dst);
+}
+}
+}
+
+static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+for (uint32_t i = 0; i < cache.size; ++i) {
+if (!cache.cells[i].has_seq_id(seq_id)) {
+cache.cells[i].pos = -1;
+cache.cells[i].seq_id.clear();
+}
+}
+}
+
+static void llama_kv_cache_seq_shift(
+struct llama_kv_cache & cache,
+llama_seq_id seq_id,
+llama_pos p0,
+llama_pos p1,
+llama_pos delta) {
+for (uint32_t i = 0; i < cache.size; ++i) {
+if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+cache.cells[i].pos += delta;
+if (cache.cells[i].pos < 0) {
+cache.cells[i].pos = -1;
+cache.cells[i].seq_id.clear();
+} else {
+cache.has_shift = true;
+cache.cells[i].delta = delta;
+}
+}
+}
+}
+
 //
 // model loading and saving
 //
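
The KV cache is now a ring buffer of cells, each tagged with a position and the set of sequence ids that still reference it; that bookkeeping is what enables parallel decoding and context shifting without re-evaluating the prompt. The static helpers above back the llama_kv_cache_* wrappers in the public header; a hedged sketch of the usual "drop the oldest part of the conversation and slide the rest back" move (wrapper names as exposed by this release's llama.h, the numbers are illustrative):

    #include "llama.h"

    // Free room for sequence 0 by discarding old tokens after a protected prefix
    // and shifting the remaining positions left by the same amount.
    static int shift_context(llama_context * ctx, int n_past, int n_keep) {
        const int n_discard = (n_past - n_keep) / 2;

        llama_kv_cache_seq_rm   (ctx, 0, n_keep,             n_keep + n_discard);
        llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_past, -n_discard);

        return n_past - n_discard;   // positions of future tokens continue from here
    }
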
@@ -1244,6 +1439,7 @@ struct llama_model_loader {
 int n_created = 0;
 
 int64_t n_elements = 0;
+size_t n_bytes = 0;
 
 bool use_mmap = false;
 
@@ -1276,6 +1472,7 @@ struct llama_model_loader {
 const char * name = gguf_get_tensor_name(ctx_gguf, i);
 struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
 n_elements += ggml_nelements(t);
+n_bytes += ggml_nbytes(t);
 }
 
 LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1521,7 +1718,7 @@ struct llama_model_loader {
 lmlock->grow_to(size_lock);
 }
 break;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
 case GGML_BACKEND_GPU:
 case GGML_BACKEND_GPU_SPLIT:
 // old code:
@@ -1554,7 +1751,15 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-std::string llama_model_ftype_name(enum llama_ftype ftype) {
+static std::string llama_model_arch_name(llm_arch arch) {
+auto it = LLM_ARCH_NAMES.find(arch);
+if (it == LLM_ARCH_NAMES.end()) {
+return "unknown";
+}
+return it->second;
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
 if (ftype & LLAMA_FTYPE_GUESSED) {
 return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
 }
@@ -1587,9 +1792,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
 switch (type) {
+case MODEL_1B: return "1B";
 case MODEL_3B: return "3B";
 case MODEL_7B: return "7B";
 case MODEL_13B: return "13B";
+case MODEL_15B: return "15B";
 case MODEL_30B: return "30B";
 case MODEL_34B: return "34B";
 case MODEL_40B: return "40B";
@@ -1608,10 +1815,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 
 static void llm_load_hparams(
 llama_model_loader & ml,
-llama_model & model,
-int n_ctx,
-float rope_freq_base,
-float rope_freq_scale) {
+llama_model & model) {
 struct gguf_context * ctx = ml.ctx_gguf;
 
 const auto kv = LLM_KV(model.arch);
@@ -1622,40 +1826,25 @@ static void llm_load_hparams(
 GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
 
 // get hparams kv
-GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
-GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
-GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
-GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
+GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, kv(LLM_KV_TOKENIZER_LIST));
+GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
+GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
 
 // n_head_kv is optional, default to n_head
 hparams.n_head_kv = hparams.n_head;
 GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-// TODO: manually setting rope freq base and scale should override this
-// FIXME: partial fix when the param specified is not the default value, but
-// will not work for overriding the model value to the params default
-
-llama_context_params defaults = llama_context_default_params();
-
-// rope_freq_base
-{
-float ropebase = 10000.0f;
-GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-rope_freq_base = ropebase;
-}
-}
+// rope_freq_base (optional)
+hparams.rope_freq_base_train = 10000.0f;
+GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
 
 // rope_freq_scale (inverse of the kv) is optional
-{
-float ropescale = 1.0f;
-GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-rope_freq_scale = 1.0f/ropescale;
-}
-}
+float ropescale = 1.0f;
+GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+hparams.rope_freq_scale_train = 1.0f/ropescale;
 
 // sanity check for n_rot (optional)
 {
@@ -1707,14 +1896,21 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_STARCODER:
+{
+GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+switch (hparams.n_layer) {
+case 24: model.type = e_model::MODEL_1B; break;
+case 36: model.type = e_model::MODEL_3B; break;
+case 42: model.type = e_model::MODEL_7B; break;
+case 40: model.type = e_model::MODEL_15B; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;
 default: (void)0;
-};
+}
 
 model.ftype = ml.ftype;
-
-hparams.n_ctx = n_ctx;
-hparams.rope_freq_base = rope_freq_base;
-hparams.rope_freq_scale = rope_freq_scale;
 }
 
 // TODO: This should probably be in llama.h
@@ -1735,20 +1931,18 @@ static void llm_load_vocab(
 throw std::runtime_error("cannot find tokenizer vocab in model file\n");
 }
 
+const float * scores = nullptr;
 const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-if (score_idx == -1) {
-throw std::runtime_error("cannot find tokenizer scores in model file\n");
+if (score_idx != -1) {
+scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
 }
 
-const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
+const int * toktypes = nullptr;
 const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-if (toktype_idx == -1) {
-throw std::runtime_error("cannot find token type list in GGUF file\n");
+if (toktype_idx != -1) {
+toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
 }
 
-const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
 // determine vocab type
 {
 std::string tokenizer_name;
@@ -1816,8 +2010,8 @@ static void llm_load_vocab(
 
 auto & token_data = vocab.id_to_token[i];
 token_data.text = std::move(word);
-token_data.score = scores[i];
-token_data.type = (llama_token_type) toktypes[i];
+token_data.score = scores ? scores[i] : 0.0f;
+token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
 }
 
 // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@@ -1840,27 +2034,31 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 const auto & vocab = model.vocab;
 
 // hparams
-LLAMA_LOG_INFO("%s: format         = %s\n", __func__, llama_file_version_name(ml.fver));
-LLAMA_LOG_INFO("%s: arch           = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
-LLAMA_LOG_INFO("%s: vocab type     = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
-LLAMA_LOG_INFO("%s: n_vocab        = %u\n", __func__, hparams.n_vocab);
-LLAMA_LOG_INFO("%s: n_merges       = %u\n", __func__, (int) vocab.bpe_ranks.size());
-LLAMA_LOG_INFO("%s: n_ctx_train    = %u\n", __func__, hparams.n_ctx_train);
-LLAMA_LOG_INFO("%s: n_ctx          = %u\n", __func__, hparams.n_ctx);
-LLAMA_LOG_INFO("%s: n_embd         = %u\n", __func__, hparams.n_embd);
-LLAMA_LOG_INFO("%s: n_head         = %u\n", __func__, hparams.n_head);
-LLAMA_LOG_INFO("%s: n_head_kv      = %u\n", __func__, hparams.n_head_kv);
-LLAMA_LOG_INFO("%s: n_layer        = %u\n", __func__, hparams.n_layer);
-LLAMA_LOG_INFO("%s: n_rot          = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
-LLAMA_LOG_INFO("%s: n_gqa          = %u\n", __func__, hparams.n_gqa());
-LLAMA_LOG_INFO("%s: f_norm_eps     = %.1e\n", __func__, hparams.f_norm_eps);
-LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
-LLAMA_LOG_INFO("%s: n_ff           = %u\n", __func__, hparams.n_ff);
-LLAMA_LOG_INFO("%s: freq_base      = %.1f\n", __func__, hparams.rope_freq_base);
-LLAMA_LOG_INFO("%s: freq_scale     = %g\n", __func__, hparams.rope_freq_scale);
-LLAMA_LOG_INFO("%s: model type     = %s\n", __func__, llama_model_type_name(model.type));
-LLAMA_LOG_INFO("%s: model ftype    = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-LLAMA_LOG_INFO("%s: model size     = %.2f B\n", __func__, ml.n_elements*1e-9);
+LLAMA_LOG_INFO("%s: format           = %s\n", __func__, llama_file_version_name(ml.fver));
+LLAMA_LOG_INFO("%s: arch             = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+LLAMA_LOG_INFO("%s: vocab type       = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+LLAMA_LOG_INFO("%s: n_vocab          = %u\n", __func__, hparams.n_vocab);
+LLAMA_LOG_INFO("%s: n_merges         = %u\n", __func__, (int) vocab.bpe_ranks.size());
+LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n", __func__, hparams.n_ctx_train);
+LLAMA_LOG_INFO("%s: n_embd           = %u\n", __func__, hparams.n_embd);
+LLAMA_LOG_INFO("%s: n_head           = %u\n", __func__, hparams.n_head);
+LLAMA_LOG_INFO("%s: n_head_kv        = %u\n", __func__, hparams.n_head_kv);
+LLAMA_LOG_INFO("%s: n_layer          = %u\n", __func__, hparams.n_layer);
+LLAMA_LOG_INFO("%s: n_rot            = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+LLAMA_LOG_INFO("%s: n_gqa            = %u\n", __func__, hparams.n_gqa());
+LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n", __func__, hparams.f_norm_eps);
+LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n", __func__, hparams.f_norm_rms_eps);
+LLAMA_LOG_INFO("%s: n_ff             = %u\n", __func__, hparams.n_ff);
+LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n", __func__, hparams.rope_freq_base_train);
+LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+LLAMA_LOG_INFO("%s: model type       = %s\n", __func__, llama_model_type_name(model.type));
+LLAMA_LOG_INFO("%s: model ftype      = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
+LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+if (ml.n_bytes < GB) {
+LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+} else {
+LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+}
 
 // general kv
 LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
@@ -1877,13 +2075,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 static void llm_load_tensors(
 llama_model_loader & ml,
 llama_model & model,
-int n_batch,
 int n_gpu_layers,
 int main_gpu,
 const float * tensor_split,
-const bool mul_mat_q,
-bool low_vram,
-ggml_type memory_type,
 bool use_mlock,
 llama_progress_callback progress_callback,
 void * progress_callback_user_data) {
@@ -1922,11 +2116,9 @@ static void llm_load_tensors(
 }
 
 (void) main_gpu;
-(void) mul_mat_q;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
 LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
 ggml_cuda_set_main_device(main_gpu);
-ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1961,9 +2153,9 @@ static void llm_load_tensors(
 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2027,9 +2219,9 @@ static void llm_load_tensors(
 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2097,9 +2289,9 @@ static void llm_load_tensors(
 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
-backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
 
 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2160,29 +2352,100 @@ static void llm_load_tensors(
 }
 }
 } break;
+case LLM_ARCH_STARCODER:
+{
+model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+// output
+{
+ggml_backend backend_norm;
+ggml_backend backend_output;
+
+if (n_gpu_layers > int(n_layer)) {
+// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+// on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+backend_norm = LLAMA_BACKEND_OFFLOAD;
+#else
+backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+} else {
+backend_norm = GGML_BACKEND_CPU;
+backend_output = GGML_BACKEND_CPU;
+}
+
+model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, backend_norm);
+model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+if (backend_norm == GGML_BACKEND_GPU) {
+vram_weights += ggml_nbytes(model.output_norm);
+vram_weights += ggml_nbytes(model.output_norm_b);
+}
+if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+vram_weights += ggml_nbytes(model.output);
+}
+}
+
+const uint32_t n_ff = hparams.n_ff;
+
+const int i_gpu_start = n_layer - n_gpu_layers;
+
+model.layers.resize(n_layer);
+
+for (uint32_t i = 0; i < n_layer; ++i) {
+const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+auto & layer = model.layers[i];
+
+layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);
+
+layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);
+
+layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},         backend_split);
+
+layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
+
+layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);
+
+layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff},         backend_split);
+
+if (backend == GGML_BACKEND_GPU) {
+vram_weights +=
+ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_norm_b)  +
+ggml_nbytes(layer.w2)        + ggml_nbytes(layer.b2)          +
+ggml_nbytes(layer.w3)        + ggml_nbytes(layer.b3);
+}
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
-};
+}
 }
 
 ml.done_getting_tensors();
 
 // print memory requirements
 {
-const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
 // this is the total memory required to run the inference
 size_t mem_required =
 ctx_size +
 mmapped_size - vram_weights; // weights in VRAM not in memory
 
-// this is the memory required by one llama_state
-const size_t mem_required_state = scale*hparams.kv_size();
-
-LLAMA_LOG_INFO("%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-
-(void) n_batch;
+LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2191,36 +2454,17 @@ static void llm_load_tensors(
 if (n_gpu_layers > (int) hparams.n_layer) {
 LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
 }
-size_t vram_kv_cache = 0;
 
 #ifdef GGML_USE_CUBLAS
 const int max_backend_supported_layers = hparams.n_layer + 3;
-const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
-if (n_gpu_layers > (int) hparams.n_layer + 1) {
-if (low_vram) {
-LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-} else {
-LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-vram_kv_cache += hparams.kv_size() / 2;
-}
-}
-if (n_gpu_layers > (int) hparams.n_layer + 2) {
-if (low_vram) {
-LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-} else {
-LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-vram_kv_cache += hparams.kv_size() / 2;
-}
-}
+const int max_offloadable_layers = hparams.n_layer + 3;
 #elif defined(GGML_USE_CLBLAST)
 const int max_backend_supported_layers = hparams.n_layer + 1;
 const int max_offloadable_layers = hparams.n_layer + 1;
 #endif // GGML_USE_CUBLAS
 
-LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
-__func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
-__func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
+LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
 (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2233,7 +2477,7 @@ static void llm_load_tensors(
 }
 
 (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
 {
 ggml_cuda_set_tensor_split(tensor_split);
 }
@@ -2255,29 +2499,24 @@ static void llm_load_tensors(
 static bool llama_model_load(
 const std::string & fname,
 llama_model & model,
-int n_ctx,
-int n_batch,
 int n_gpu_layers,
 int main_gpu,
 const float * tensor_split,
-const bool mul_mat_q,
-float rope_freq_base,
-float rope_freq_scale,
-bool low_vram,
-ggml_type memory_type,
 bool use_mmap,
 bool use_mlock,
 bool vocab_only,
 llama_progress_callback progress_callback,
 void *progress_callback_user_data) {
 try {
-std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+llama_model_loader ml(fname, use_mmap);
+
+model.hparams.vocab_only = vocab_only;
 
-llm_load_arch   (*ml, model);
-llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
-llm_load_vocab  (*ml, model);
+llm_load_arch   (ml, model);
+llm_load_hparams(ml, model);
+llm_load_vocab  (ml, model);
 
-llm_load_print_meta(*ml, model);
+llm_load_print_meta(ml, model);
 
 if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
 throw std::runtime_error("vocab size mismatch");
@@ -2289,8 +2528,8 @@ static bool llama_model_load(
 }
 
 llm_load_tensors(
-*ml, model, n_batch, n_gpu_layers,
-main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+ml, model, n_gpu_layers,
+main_gpu, tensor_split,
 use_mlock, progress_callback, progress_callback_user_data);
 } catch (const std::exception & err) {
 LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2302,17 +2541,10 @@ static bool llama_model_load(
 
 static struct ggml_cgraph * llm_build_llama(
 llama_context & lctx,
-const llama_token * tokens,
-const float * embd,
-int n_tokens,
-int n_past) {
-
-GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
-
-const int N = n_tokens;
-
+const llama_batch & batch) {
 const auto & model = lctx.model;
 const auto & hparams = model.hparams;
+const auto & cparams = lctx.cparams;
 
 const auto & kv_self = lctx.kv_self;
 
@@ -2320,7 +2552,7 @@ static struct ggml_cgraph * llm_build_llama(
 
 const int64_t n_embd = hparams.n_embd;
 const int64_t n_layer = hparams.n_layer;
-const int64_t n_ctx = hparams.n_ctx;
+const int64_t n_ctx = cparams.n_ctx;
 const int64_t n_head = hparams.n_head;
 const int64_t n_head_kv = hparams.n_head_kv;
 const int64_t n_embd_head = hparams.n_embd_head();
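
The graph builders no longer receive a bare token pointer plus n_past: a llama_batch supplies every token together with its position and sequence id, and the builder reads n_kv/kv_head from the cache itself. Callers correspondingly move from llama_eval to llama_decode with an explicit batch; a hedged sketch using the llama_batch_get_one convenience constructor (names per this release's llama.h; sampling and error handling omitted):

    #include "llama.h"

    // Feed a tokenized prompt as one batch, then push generated tokens one at a time.
    static void decode_sketch(llama_context * ctx, llama_token * tokens, int n_prompt) {
        int n_past = 0;

        llama_batch batch = llama_batch_get_one(tokens, n_prompt, n_past, 0); // sequence 0
        llama_decode(ctx, batch);
        n_past += n_prompt;

        llama_token next = 0;  // stand-in for a sampled token id
        llama_decode(ctx, llama_batch_get_one(&next, 1, n_past, 0));
        n_past += 1;
    }
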
@@ -2328,12 +2560,20 @@ static struct ggml_cgraph * llm_build_llama(
 
 GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-const float freq_base = hparams.rope_freq_base;
-const float freq_scale = hparams.rope_freq_scale;
+const float freq_base = cparams.rope_freq_base;
+const float freq_scale = cparams.rope_freq_scale;
 const float norm_rms_eps = hparams.f_norm_rms_eps;
 
 const int n_gpu_layers = model.n_gpu_layers;
 
+const int32_t n_tokens = batch.n_tokens;
+const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+//printf("n_kv = %d\n", n_kv);
+
 auto & buf_compute = lctx.buf_compute;
 
 struct ggml_init_params params = {
@@ -2351,12 +2591,12 @@ static struct ggml_cgraph * llm_build_llama(
 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
 
-if (tokens) {
-struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+if (batch.token) {
+struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 
 ggml_allocr_alloc(lctx.alloc, inp_tokens);
 if (!ggml_allocr_is_measure(lctx.alloc)) {
-memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
 }
 ggml_set_name(inp_tokens, "inp_tokens");
 
@@ -2366,11 +2606,11 @@ static struct ggml_cgraph * llm_build_llama(
 GGML_ASSERT(false && "not implemented");
 #endif
 
-inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
 
 ggml_allocr_alloc(lctx.alloc, inpL);
 if (!ggml_allocr_is_measure(lctx.alloc)) {
-memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
 }
 }
 
@@ -2379,9 +2619,6 @@ static struct ggml_cgraph * llm_build_llama(
 
 // offload functions set the tensor output backend to GPU
 // tensors are GPU-accelerated if any input or the output has been offloaded
-//
-// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
-// in that case ggml_cuda_assign_buffers has no effect
 offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
 offload_func_t offload_func_kq = llama_nop;
 offload_func_t offload_func_v  = llama_nop;
@@ -2398,12 +2635,75 @@ static struct ggml_cgraph * llm_build_llama(
 }
 #endif // GGML_USE_CUBLAS
 
+// KQ_scale
 struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 ggml_allocr_alloc(lctx.alloc, KQ_scale);
 if (!ggml_allocr_is_measure(lctx.alloc)) {
-ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+}
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+offload_func_kq(KQ_mask);
+ggml_set_name(KQ_mask, "KQ_mask");
+ggml_allocr_alloc(lctx.alloc, KQ_mask);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+float * data = (float *) KQ_mask->data;
+memset(data, 0, ggml_nbytes(KQ_mask));
+
+for (int h = 0; h < 1; ++h) {
+for (int j = 0; j < n_tokens; ++j) {
+const llama_pos    pos    = batch.pos[j];
+const llama_seq_id seq_id = batch.seq_id[j];
+
+for (int i = 0; i < n_kv; ++i) {
+if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+}
+}
+}
+}
+}
+
+// KQ_pos - contains the positions
+struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+offload_func_kq(KQ_pos);
+ggml_set_name(KQ_pos, "KQ_pos");
+ggml_allocr_alloc(lctx.alloc, KQ_pos);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+int * data = (int *) KQ_pos->data;
+for (int i = 0; i < n_tokens; ++i) {
+data[i] = batch.pos[i];
+}
+}
+
+// shift the entire K-cache if needed
+if (do_rope_shift) {
+struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+offload_func_kq(K_shift);
+ggml_set_name(K_shift, "K_shift");
+ggml_allocr_alloc(lctx.alloc, K_shift);
+if (!ggml_allocr_is_measure(lctx.alloc)) {
+int * data = (int *) K_shift->data;
+for (int i = 0; i < n_ctx; ++i) {
+data[i] = kv_self.cells[i].delta;
+}
+}
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * tmp =
+ggml_rope_custom_inplace(ctx0,
+ggml_view_3d(ctx0, kv_self.k,
+n_embd_head, n_head_kv, n_ctx,
+ggml_element_size(kv_self.k)*n_embd_head,
+ggml_element_size(kv_self.k)*n_embd_gqa,
+ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+offload_func_kq(tmp);
+ggml_build_forward_expand(gf, tmp);
+}
 }
-ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
 for (int il = 0; il < n_layer; ++il) {
 ggml_format_name(inpL, "layer_inp_%d", il);
@@ -2441,33 +2741,33 @@ static struct ggml_cgraph * llm_build_llama(
2441
2741
  offload_func_kq(tmpq);
2442
2742
  ggml_set_name(tmpq, "tmpq");
2443
2743
 
2444
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2744
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2445
2745
  offload_func_kq(Kcur);
2446
2746
  ggml_set_name(Kcur, "Kcur");
2447
2747
 
2448
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2748
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2449
2749
  offload_func_kq(Qcur);
2450
2750
  ggml_set_name(Qcur, "Qcur");
2451
2751
 
2452
2752
  // store key and value to memory
2453
2753
  {
2454
- // compute the transposed [N, n_embd] V matrix
2754
+ // compute the transposed [n_tokens, n_embd] V matrix
2455
2755
 
2456
2756
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2457
2757
  offload_func_v(tmpv);
2458
2758
  ggml_set_name(tmpv, "tmpv");
2459
2759
 
2460
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2760
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2461
2761
  offload_func_v(Vcur);
2462
2762
  ggml_set_name(Vcur, "Vcur");
2463
2763
 
2464
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2764
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2465
2765
  offload_func_kq(k);
2466
2766
  ggml_set_name(k, "k");
2467
2767
 
2468
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2768
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2469
2769
  ( n_ctx)*ggml_element_size(kv_self.v),
2470
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2770
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2471
2771
  offload_func_v(v);
2472
2772
  ggml_set_name(v, "v");
2473
2773
 
@@ -2482,7 +2782,7 @@ static struct ggml_cgraph * llm_build_llama(
2482
2782
 
2483
2783
  struct ggml_tensor * K =
2484
2784
  ggml_view_3d(ctx0, kv_self.k,
2485
- n_embd_head, n_past + N, n_head_kv,
2785
+ n_embd_head, n_kv, n_head_kv,
2486
2786
  ggml_element_size(kv_self.k)*n_embd_gqa,
2487
2787
  ggml_element_size(kv_self.k)*n_embd_head,
2488
2788
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2495,25 +2795,25 @@ static struct ggml_cgraph * llm_build_llama(
2495
2795
  ggml_set_name(KQ, "KQ");
2496
2796
 
2497
2797
  // KQ_scaled = KQ / sqrt(n_embd_head)
2498
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2499
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2798
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
2799
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2500
2800
  offload_func_kq(KQ_scaled);
2501
2801
  ggml_set_name(KQ_scaled, "KQ_scaled");
2502
2802
 
2503
2803
  // KQ_masked = mask_past(KQ_scaled)
2504
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2804
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2505
2805
  offload_func_kq(KQ_masked);
2506
2806
  ggml_set_name(KQ_masked, "KQ_masked");
2507
2807
 
2508
2808
  // KQ = soft_max(KQ_masked)
2509
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2809
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2510
2810
  offload_func_v(KQ_soft_max);
2511
2811
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2512
2812
 
2513
2813
  // split cached V into n_head heads
2514
2814
  struct ggml_tensor * V =
2515
2815
  ggml_view_3d(ctx0, kv_self.v,
2516
- n_past + N, n_embd_head, n_head_kv,
2816
+ n_kv, n_embd_head, n_head_kv,
2517
2817
  ggml_element_size(kv_self.v)*n_ctx,
2518
2818
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2519
2819
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -2528,7 +2828,7 @@ static struct ggml_cgraph * llm_build_llama(
2528
2828
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2529
2829
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2530
2830
  // is there a better way?
2531
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2831
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
2532
2832
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2533
2833
  #endif
2534
2834
 
@@ -2537,10 +2837,8 @@ static struct ggml_cgraph * llm_build_llama(
2537
2837
  offload_func_v(KQV_merged);
2538
2838
  ggml_set_name(KQV_merged, "KQV_merged");
2539
2839
 
2540
- // cur = KQV_merged.contiguous().view(n_embd, N)
2541
- cur = ggml_cpy(ctx0,
2542
- KQV_merged,
2543
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2840
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
2841
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
2544
2842
  offload_func_v(cur);
2545
2843
  ggml_set_name(cur, "KQV_merged_contiguous");
2546
2844
 
@@ -2631,20 +2929,12 @@ static struct ggml_cgraph * llm_build_llama(
2631
2929
  return gf;
2632
2930
  }
2633
2931
 
2634
-
2635
2932
  static struct ggml_cgraph * llm_build_baichaun(
2636
2933
  llama_context & lctx,
2637
- const llama_token * tokens,
2638
- const float * embd,
2639
- int n_tokens,
2640
- int n_past) {
2641
-
2642
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2643
-
2644
- const int N = n_tokens;
2645
-
2934
+ const llama_batch & batch) {
2646
2935
  const auto & model = lctx.model;
2647
2936
  const auto & hparams = model.hparams;
2937
+ const auto & cparams = lctx.cparams;
2648
2938
 
2649
2939
  const auto & kv_self = lctx.kv_self;
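As with llm_build_llama earlier in this diff, the Baichuan and Falcon builders now take a llama_batch instead of separate tokens/embd/n_tokens/n_past arguments. The batch carries per-token data (token or embd, pos, seq_id, optional logits flags), which is what the mask and position tensors below are filled from. For the common single-sequence case the batch can come from llama_batch_get_one, used later in this diff by the beam-search code. A rough usage sketch (error handling abbreviated; not code from the package):

    #include <vector>
    #include "llama.h"

    // feed a tokenized prompt as one batch starting at position 0, sequence 0
    static int feed_prompt(llama_context * ctx, std::vector<llama_token> & prompt) {
        llama_batch batch = llama_batch_get_one(prompt.data(), (int) prompt.size(),
                                                /*pos_0 =*/ 0, /*seq_id =*/ 0);
        // llama_decode returns 0 on success, > 0 if no KV slot was free for the
        // batch, < 0 on error
        return llama_decode(ctx, batch);
    }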
2650
2940
 
@@ -2652,7 +2942,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2652
2942
 
2653
2943
  const int64_t n_embd = hparams.n_embd;
2654
2944
  const int64_t n_layer = hparams.n_layer;
2655
- const int64_t n_ctx = hparams.n_ctx;
2945
+ const int64_t n_ctx = cparams.n_ctx;
2656
2946
  const int64_t n_head = hparams.n_head;
2657
2947
  const int64_t n_head_kv = hparams.n_head_kv;
2658
2948
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2660,12 +2950,18 @@ static struct ggml_cgraph * llm_build_baichaun(
2660
2950
 
2661
2951
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2662
2952
 
2663
- const float freq_base = hparams.rope_freq_base;
2664
- const float freq_scale = hparams.rope_freq_scale;
2953
+ const float freq_base = cparams.rope_freq_base;
2954
+ const float freq_scale = cparams.rope_freq_scale;
2665
2955
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2666
2956
 
2667
2957
  const int n_gpu_layers = model.n_gpu_layers;
2668
2958
 
2959
+ const int32_t n_tokens = batch.n_tokens;
2960
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
2961
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
2962
+
2963
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
2964
+
2669
2965
  auto & buf_compute = lctx.buf_compute;
2670
2966
 
2671
2967
  struct ggml_init_params params = {
@@ -2683,12 +2979,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2683
2979
  struct ggml_tensor * cur;
2684
2980
  struct ggml_tensor * inpL;
2685
2981
 
2686
- if (tokens) {
2687
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2982
+ if (batch.token) {
2983
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2688
2984
 
2689
2985
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2690
2986
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2691
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2987
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2692
2988
  }
2693
2989
  ggml_set_name(inp_tokens, "inp_tokens");
2694
2990
 
@@ -2698,11 +2994,11 @@ static struct ggml_cgraph * llm_build_baichaun(
2698
2994
  GGML_ASSERT(false && "not implemented");
2699
2995
  #endif
2700
2996
 
2701
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2997
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2702
2998
 
2703
2999
  ggml_allocr_alloc(lctx.alloc, inpL);
2704
3000
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2705
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3001
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2706
3002
  }
2707
3003
  }
2708
3004
 
@@ -2711,9 +3007,6 @@ static struct ggml_cgraph * llm_build_baichaun(
2711
3007
 
2712
3008
  // offload functions set the tensor output backend to GPU
2713
3009
  // tensors are GPU-accelerated if any input or the output has been offloaded
2714
- //
2715
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2716
- // in that case ggml_cuda_assign_buffers has no effect
2717
3010
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2718
3011
  offload_func_t offload_func_kq = llama_nop;
2719
3012
  offload_func_t offload_func_v = llama_nop;
@@ -2730,12 +3023,75 @@ static struct ggml_cgraph * llm_build_baichaun(
2730
3023
  }
2731
3024
  #endif // GGML_USE_CUBLAS
2732
3025
 
3026
+ // KQ_scale
2733
3027
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3028
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2734
3029
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2735
3030
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2736
3031
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2737
3032
  }
2738
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3033
+
3034
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3035
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3036
+ offload_func_kq(KQ_mask);
3037
+ ggml_set_name(KQ_mask, "KQ_mask");
3038
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3039
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3040
+ float * data = (float *) KQ_mask->data;
3041
+ memset(data, 0, ggml_nbytes(KQ_mask));
3042
+
3043
+ for (int h = 0; h < 1; ++h) {
3044
+ for (int j = 0; j < n_tokens; ++j) {
3045
+ const llama_pos pos = batch.pos[j];
3046
+ const llama_seq_id seq_id = batch.seq_id[j];
3047
+
3048
+ for (int i = 0; i < n_kv; ++i) {
3049
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3050
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3051
+ }
3052
+ }
3053
+ }
3054
+ }
3055
+ }
3056
+
3057
+ // KQ_pos - contains the positions
3058
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3059
+ offload_func_kq(KQ_pos);
3060
+ ggml_set_name(KQ_pos, "KQ_pos");
3061
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3062
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3063
+ int * data = (int *) KQ_pos->data;
3064
+ for (int i = 0; i < n_tokens; ++i) {
3065
+ data[i] = batch.pos[i];
3066
+ }
3067
+ }
3068
+
3069
+ // shift the entire K-cache if needed
3070
+ if (do_rope_shift) {
3071
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3072
+ offload_func_kq(K_shift);
3073
+ ggml_set_name(K_shift, "K_shift");
3074
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3075
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3076
+ int * data = (int *) K_shift->data;
3077
+ for (int i = 0; i < n_ctx; ++i) {
3078
+ data[i] = kv_self.cells[i].delta;
3079
+ }
3080
+ }
3081
+
3082
+ for (int il = 0; il < n_layer; ++il) {
3083
+ struct ggml_tensor * tmp =
3084
+ ggml_rope_custom_inplace(ctx0,
3085
+ ggml_view_3d(ctx0, kv_self.k,
3086
+ n_embd_head, n_head_kv, n_ctx,
3087
+ ggml_element_size(kv_self.k)*n_embd_head,
3088
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3089
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3090
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
3091
+ offload_func_kq(tmp);
3092
+ ggml_build_forward_expand(gf, tmp);
3093
+ }
3094
+ }
2739
3095
 
2740
3096
  for (int il = 0; il < n_layer; ++il) {
2741
3097
  ggml_format_name(inpL, "layer_inp_%d", il);
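The block above is the new K-cache shift pass: whenever the cache has been shifted (kv_self.has_shift), every cell carries a delta, and each layer's cached K slab is re-rotated by that delta with ggml_rope_custom_inplace so the stored keys match their new positions without being recomputed (RoPE rotations compose, so rotating by delta is equivalent to having encoded pos + delta in the first place). A sketch of how such deltas could be produced when a sequence is moved back by n_discard positions; the helper that actually records them lives outside this hunk, so the names here are illustrative:

    #include <vector>

    struct kv_cell_sketch { int pos; int delta; };

    static void shift_sequence_back(std::vector<kv_cell_sketch> & cells, int n_discard) {
        for (auto & c : cells) {
            if (c.pos >= n_discard) {
                c.pos   -= n_discard;
                c.delta -= n_discard;   // consumed by the K_shift tensor above
            }
        }
    }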
@@ -2777,12 +3133,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2777
3133
  struct ggml_tensor * Qcur;
2778
3134
  switch (model.type) {
2779
3135
  case MODEL_7B:
2780
- Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2781
- Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
3136
+ Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
3137
+ Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2782
3138
  break;
2783
3139
  case MODEL_13B:
2784
- Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2785
- Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
3140
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
3141
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
2786
3142
  break;
2787
3143
  default:
2788
3144
  GGML_ASSERT(false);
@@ -2796,23 +3152,23 @@ static struct ggml_cgraph * llm_build_baichaun(
2796
3152
 
2797
3153
  // store key and value to memory
2798
3154
  {
2799
- // compute the transposed [N, n_embd] V matrix
3155
+ // compute the transposed [n_tokens, n_embd] V matrix
2800
3156
 
2801
3157
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2802
3158
  offload_func_v(tmpv);
2803
3159
  ggml_set_name(tmpv, "tmpv");
2804
3160
 
2805
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
3161
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2806
3162
  offload_func_v(Vcur);
2807
3163
  ggml_set_name(Vcur, "Vcur");
2808
3164
 
2809
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3165
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2810
3166
  offload_func_kq(k);
2811
3167
  ggml_set_name(k, "k");
2812
3168
 
2813
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3169
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2814
3170
  ( n_ctx)*ggml_element_size(kv_self.v),
2815
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3171
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2816
3172
  offload_func_v(v);
2817
3173
  ggml_set_name(v, "v");
2818
3174
 
@@ -2827,7 +3183,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2827
3183
 
2828
3184
  struct ggml_tensor * K =
2829
3185
  ggml_view_3d(ctx0, kv_self.k,
2830
- n_embd_head, n_past + N, n_head_kv,
3186
+ n_embd_head, n_kv, n_head_kv,
2831
3187
  ggml_element_size(kv_self.k)*n_embd_gqa,
2832
3188
  ggml_element_size(kv_self.k)*n_embd_head,
2833
3189
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2840,8 +3196,8 @@ static struct ggml_cgraph * llm_build_baichaun(
2840
3196
  ggml_set_name(KQ, "KQ");
2841
3197
 
2842
3198
  // KQ_scaled = KQ / sqrt(n_embd_head)
2843
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2844
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3199
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3200
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2845
3201
  offload_func_kq(KQ_scaled);
2846
3202
  ggml_set_name(KQ_scaled, "KQ_scaled");
2847
3203
 
@@ -2850,58 +3206,44 @@ static struct ggml_cgraph * llm_build_baichaun(
2850
3206
 
2851
3207
  switch (model.type) {
2852
3208
  case MODEL_7B:
2853
- KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3209
+ KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2854
3210
  break;
2855
3211
  case MODEL_13B:
2856
- KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
3212
+ // TODO: replace with ggml_add()
3213
+ KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
2857
3214
  ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2858
- KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
3215
+ KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
2859
3216
  break;
2860
3217
  default:
2861
3218
  GGML_ASSERT(false);
2862
3219
  }
2863
- // KQ_masked = mask_past(KQ_scaled)
2864
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2865
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2866
- // offload_func_kq(KQ_masked);
2867
- // ggml_set_name(KQ_masked, "KQ_masked");
2868
3220
 
2869
3221
  // KQ = soft_max(KQ_masked)
2870
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3222
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2871
3223
  offload_func_v(KQ_soft_max);
2872
3224
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2873
3225
 
2874
3226
  // split cached V into n_head heads
2875
3227
  struct ggml_tensor * V =
2876
3228
  ggml_view_3d(ctx0, kv_self.v,
2877
- n_past + N, n_embd_head, n_head_kv,
3229
+ n_kv, n_embd_head, n_head_kv,
2878
3230
  ggml_element_size(kv_self.v)*n_ctx,
2879
3231
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2880
3232
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
2881
3233
  offload_func_v(V);
2882
3234
  ggml_set_name(V, "V");
2883
3235
 
2884
- #if 1
2885
3236
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2886
3237
  offload_func_v(KQV);
2887
3238
  ggml_set_name(KQV, "KQV");
2888
- #else
2889
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2890
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2891
- // is there a better way?
2892
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2893
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2894
- #endif
2895
3239
 
2896
3240
  // KQV_merged = KQV.permute(0, 2, 1, 3)
2897
3241
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2898
3242
  offload_func_v(KQV_merged);
2899
3243
  ggml_set_name(KQV_merged, "KQV_merged");
2900
3244
 
2901
- // cur = KQV_merged.contiguous().view(n_embd, N)
2902
- cur = ggml_cpy(ctx0,
2903
- KQV_merged,
2904
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3245
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3246
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
2905
3247
  offload_func_v(cur);
2906
3248
  ggml_set_name(cur, "KQV_merged_contiguous");
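For Baichuan-13B the causal ggml_diag_mask_inf is replaced by the same additive KQ_mask, while the ALiBi bias is kept (its n_past argument is now 0 because positions are handled through the mask and cache layout). ALiBi adds a per-head linear penalty proportional to how far a key lies behind the query, instead of rotary embeddings. A hedged reference sketch of the bias; ggml's exact slope schedule may differ slightly from this common parameterization:

    #include <cmath>

    // max_bias corresponds to the `8` passed to ggml_alibi above
    static float alibi_bias(int head, int n_head, float max_bias, int q_pos, int k_pos) {
        const float m = std::pow(2.0f, -max_bias * (float)(head + 1) / (float)n_head); // per-head slope
        return m * (float)(k_pos - q_pos); // keys further in the past get a more negative bias
    }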
2907
3249
 
@@ -2994,17 +3336,10 @@ static struct ggml_cgraph * llm_build_baichaun(
2994
3336
 
2995
3337
  static struct ggml_cgraph * llm_build_falcon(
2996
3338
  llama_context & lctx,
2997
- const llama_token * tokens,
2998
- const float * embd,
2999
- int n_tokens,
3000
- int n_past) {
3001
-
3002
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3003
-
3004
- const int N = n_tokens;
3005
-
3339
+ const llama_batch & batch) {
3006
3340
  const auto & model = lctx.model;
3007
3341
  const auto & hparams = model.hparams;
3342
+ const auto & cparams = lctx.cparams;
3008
3343
 
3009
3344
  const auto & kv_self = lctx.kv_self;
3010
3345
 
@@ -3012,7 +3347,7 @@ static struct ggml_cgraph * llm_build_falcon(
3012
3347
 
3013
3348
  const int64_t n_embd = hparams.n_embd;
3014
3349
  const int64_t n_layer = hparams.n_layer;
3015
- const int64_t n_ctx = hparams.n_ctx;
3350
+ const int64_t n_ctx = cparams.n_ctx;
3016
3351
  const int64_t n_head = hparams.n_head;
3017
3352
  const int64_t n_head_kv = hparams.n_head_kv;
3018
3353
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -3020,12 +3355,21 @@ static struct ggml_cgraph * llm_build_falcon(
3020
3355
 
3021
3356
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3022
3357
 
3023
- const float freq_base = hparams.rope_freq_base;
3024
- const float freq_scale = hparams.rope_freq_scale;
3358
+ const float freq_base = cparams.rope_freq_base;
3359
+ const float freq_scale = cparams.rope_freq_scale;
3025
3360
  const float norm_eps = hparams.f_norm_eps;
3026
3361
 
3027
3362
  const int n_gpu_layers = model.n_gpu_layers;
3028
3363
 
3364
+ const int32_t n_tokens = batch.n_tokens;
3365
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3366
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3367
+
3368
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3369
+
3370
+ //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
3371
+ // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
3372
+
3029
3373
  auto & buf_compute = lctx.buf_compute;
3030
3374
 
3031
3375
  struct ggml_init_params params = {
@@ -3043,12 +3387,12 @@ static struct ggml_cgraph * llm_build_falcon(
3043
3387
  struct ggml_tensor * cur;
3044
3388
  struct ggml_tensor * inpL;
3045
3389
 
3046
- if (tokens) {
3047
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3390
+ if (batch.token) {
3391
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3048
3392
 
3049
3393
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3050
3394
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3051
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3395
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3052
3396
  }
3053
3397
  ggml_set_name(inp_tokens, "inp_tokens");
3054
3398
 
@@ -3058,11 +3402,11 @@ static struct ggml_cgraph * llm_build_falcon(
3058
3402
  GGML_ASSERT(false && "not implemented");
3059
3403
  #endif
3060
3404
 
3061
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3405
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3062
3406
 
3063
3407
  ggml_allocr_alloc(lctx.alloc, inpL);
3064
3408
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3065
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3409
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3066
3410
  }
3067
3411
  }
3068
3412
 
@@ -3071,9 +3415,6 @@ static struct ggml_cgraph * llm_build_falcon(
3071
3415
 
3072
3416
  // offload functions set the tensor output backend to GPU
3073
3417
  // tensors are GPU-accelerated if any input or the output has been offloaded
3074
- //
3075
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
3076
- // in that case ggml_cuda_assign_buffers has no effect
3077
3418
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3078
3419
  offload_func_t offload_func_kq = llama_nop;
3079
3420
  offload_func_t offload_func_v = llama_nop;
@@ -3090,12 +3431,75 @@ static struct ggml_cgraph * llm_build_falcon(
3090
3431
  }
3091
3432
  #endif // GGML_USE_CUBLAS
3092
3433
 
3434
+ // KQ_scale
3093
3435
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3436
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3094
3437
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3095
3438
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3096
3439
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3097
3440
  }
3098
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3441
+
3442
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3443
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3444
+ offload_func_kq(KQ_mask);
3445
+ ggml_set_name(KQ_mask, "KQ_mask");
3446
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3447
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3448
+ float * data = (float *) KQ_mask->data;
3449
+ memset(data, 0, ggml_nbytes(KQ_mask));
3450
+
3451
+ for (int h = 0; h < 1; ++h) {
3452
+ for (int j = 0; j < n_tokens; ++j) {
3453
+ const llama_pos pos = batch.pos[j];
3454
+ const llama_seq_id seq_id = batch.seq_id[j];
3455
+
3456
+ for (int i = 0; i < n_kv; ++i) {
3457
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3458
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3459
+ }
3460
+ }
3461
+ }
3462
+ }
3463
+ }
3464
+
3465
+ // KQ_pos - contains the positions
3466
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3467
+ offload_func_kq(KQ_pos);
3468
+ ggml_set_name(KQ_pos, "KQ_pos");
3469
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3470
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3471
+ int * data = (int *) KQ_pos->data;
3472
+ for (int i = 0; i < n_tokens; ++i) {
3473
+ data[i] = batch.pos[i];
3474
+ }
3475
+ }
3476
+
3477
+ // shift the entire K-cache if needed
3478
+ if (do_rope_shift) {
3479
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3480
+ offload_func_kq(K_shift);
3481
+ ggml_set_name(K_shift, "K_shift");
3482
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3483
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3484
+ int * data = (int *) K_shift->data;
3485
+ for (int i = 0; i < n_ctx; ++i) {
3486
+ data[i] = kv_self.cells[i].delta;
3487
+ }
3488
+ }
3489
+
3490
+ for (int il = 0; il < n_layer; ++il) {
3491
+ struct ggml_tensor * tmp =
3492
+ ggml_rope_custom_inplace(ctx0,
3493
+ ggml_view_3d(ctx0, kv_self.k,
3494
+ n_embd_head, n_head_kv, n_ctx,
3495
+ ggml_element_size(kv_self.k)*n_embd_head,
3496
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3497
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3498
+ K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
3499
+ offload_func_kq(tmp);
3500
+ ggml_build_forward_expand(gf, tmp);
3501
+ }
3502
+ }
3099
3503
 
3100
3504
  for (int il = 0; il < n_layer; ++il) {
3101
3505
  struct ggml_tensor * attn_norm;
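Falcon gets the same KQ_mask/KQ_pos/K-shift treatment, but its rope calls (including the cache shift above) use mode 2, the GPT-NeoX convention, whereas LLaMA and Baichuan use mode 0. The difference is only in which dimensions are paired for the rotation; a small illustrative sketch of the pairing rule (not code from the package):

    #include <cmath>

    // rotate one (a, b) pair by angle theta
    static void rope_pair(float & a, float & b, float theta) {
        const float c = std::cos(theta), s = std::sin(theta);
        const float a0 = a, b0 = b;
        a = a0*c - b0*s;
        b = a0*s + b0*c;
    }

    // mode 0: pairs are interleaved neighbours   (x[2*i], x[2*i + 1])
    // mode 2: pairs are split halves of the head (x[i],   x[i + n_dims/2])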
@@ -3152,148 +3556,395 @@ static struct ggml_cgraph * llm_build_falcon(
3152
3556
  // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
3153
3557
  // non-contiguous views is added for the rope operator
3154
3558
  struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
3155
- ctx0, cur, n_embd_head, n_head, N,
3559
+ ctx0, cur, n_embd_head, n_head, n_tokens,
3156
3560
  wsize * n_embd_head,
3157
3561
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3158
3562
  0));
3159
3563
  offload_func_kq(tmpq);
3160
3564
 
3161
- struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3162
- ctx0, cur, n_embd_head, n_head_kv, N,
3163
- wsize * n_embd_head,
3164
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3165
- wsize * n_embd_head * n_head));
3166
- offload_func_kq(tmpk);
3565
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3566
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3567
+ wsize * n_embd_head,
3568
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
3569
+ wsize * n_embd_head * n_head));
3570
+ offload_func_kq(tmpk);
3571
+
3572
+ struct ggml_tensor * tmpv = ggml_view_3d(
3573
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3574
+ wsize * n_embd_head,
3575
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
3576
+ wsize * n_embd_head * (n_head + n_head_kv));
3577
+ offload_func_v(tmpv);
3578
+
3579
+ // using mode = 2 for neox mode
3580
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3581
+ offload_func_kq(Qcur);
3582
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3583
+ offload_func_kq(Kcur);
3584
+
3585
+ {
3586
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3587
+ offload_func_v(Vcur);
3588
+ offload_func_v(Vcur->src[0]->src[0]);
3589
+ ggml_set_name(Vcur, "Vcur");
3590
+
3591
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3592
+ offload_func_kq(k);
3593
+ ggml_set_name(k, "k");
3594
+
3595
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3596
+ ( n_ctx)*ggml_element_size(kv_self.v),
3597
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3598
+ offload_func_v(v);
3599
+
3600
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3601
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3602
+ }
3603
+
3604
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
3605
+ offload_func_kq(Q);
3606
+ ggml_set_name(Q, "Q");
3607
+
3608
+ struct ggml_tensor * K =
3609
+ ggml_view_3d(ctx0, kv_self.k,
3610
+ n_embd_head, n_kv, n_head_kv,
3611
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3612
+ ggml_element_size(kv_self.k)*n_embd_head,
3613
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3614
+ offload_func_kq(K);
3615
+ ggml_set_name(K, "K");
3616
+
3617
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3618
+ offload_func_kq(KQ);
3619
+ ggml_set_name(KQ, "KQ");
3620
+
3621
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3622
+ offload_func_kq(KQ_scaled);
3623
+ ggml_set_name(KQ_scaled, "KQ_scaled");
3624
+
3625
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3626
+ offload_func_kq(KQ_masked);
3627
+ ggml_set_name(KQ_masked, "KQ_masked");
3628
+
3629
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3630
+ offload_func_v(KQ_soft_max);
3631
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
3632
+
3633
+ struct ggml_tensor * V =
3634
+ ggml_view_3d(ctx0, kv_self.v,
3635
+ n_kv, n_embd_head, n_head_kv,
3636
+ ggml_element_size(kv_self.v)*n_ctx,
3637
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3638
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3639
+ offload_func_v(V);
3640
+ ggml_set_name(V, "V");
3641
+
3642
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3643
+ offload_func_v(KQV);
3644
+ ggml_set_name(KQV, "KQV");
3645
+
3646
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3647
+ offload_func_v(KQV_merged);
3648
+ ggml_set_name(KQV_merged, "KQV_merged");
3649
+
3650
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3651
+ offload_func_v(cur);
3652
+ ggml_set_name(cur, "KQV_merged_contiguous");
3653
+
3654
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
3655
+ offload_func(cur);
3656
+ ggml_set_name(cur, "result_wo");
3657
+ }
3658
+
3659
+ struct ggml_tensor * attn_out = cur;
3660
+
3661
+ // feed forward
3662
+ {
3663
+ struct ggml_tensor * inpFF = attn_norm;
3664
+
3665
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
3666
+ offload_func(cur);
3667
+
3668
+ cur = ggml_gelu(ctx0, cur);
3669
+ offload_func(cur);
3670
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
3671
+ offload_func(cur);
3672
+ }
3673
+
3674
+ cur = ggml_add(ctx0, cur, attn_out);
3675
+ offload_func(cur);
3676
+ cur = ggml_add(ctx0, cur, inpL);
3677
+ offload_func(cur);
3678
+
3679
+ // input for next layer
3680
+ inpL = cur;
3681
+ }
3682
+
3683
+ cur = inpL;
3684
+
3685
+ // norm
3686
+ {
3687
+ cur = ggml_norm(ctx0, cur, norm_eps);
3688
+ offload_func_nr(cur);
3689
+
3690
+ cur = ggml_add(ctx0,
3691
+ ggml_mul(ctx0, cur, model.output_norm),
3692
+ model.output_norm_b);
3693
+ ggml_set_name(cur, "result_norm");
3694
+ }
3695
+
3696
+ cur = ggml_mul_mat(ctx0, model.output, cur);
3697
+ ggml_set_name(cur, "result_output");
3698
+
3699
+ ggml_build_forward_expand(gf, cur);
3700
+
3701
+ ggml_free(ctx0);
3702
+
3703
+ return gf;
3704
+ }
3705
+
3706
+ static struct ggml_cgraph * llm_build_starcoder(
3707
+ llama_context & lctx,
3708
+ const llama_batch & batch) {
3709
+ const auto & model = lctx.model;
3710
+ const auto & hparams = model.hparams;
3711
+ const auto & cparams = lctx.cparams;
3712
+
3713
+ const auto & kv_self = lctx.kv_self;
3714
+
3715
+ GGML_ASSERT(!!kv_self.ctx);
3716
+
3717
+ const int64_t n_embd = hparams.n_embd;
3718
+ const int64_t n_layer = hparams.n_layer;
3719
+ const int64_t n_ctx = cparams.n_ctx;
3720
+ const int64_t n_head = hparams.n_head;
3721
+ const int64_t n_head_kv = hparams.n_head_kv;
3722
+ const int64_t n_embd_head = hparams.n_embd_head();
3723
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
3724
+
3725
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
3726
+
3727
+ const float norm_eps = hparams.f_norm_eps;
3728
+
3729
+ const int32_t n_tokens = batch.n_tokens;
3730
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3731
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3732
+
3733
+ auto & buf_compute = lctx.buf_compute;
3734
+
3735
+ struct ggml_init_params params = {
3736
+ /*.mem_size =*/ buf_compute.size,
3737
+ /*.mem_buffer =*/ buf_compute.data,
3738
+ /*.no_alloc =*/ false,
3739
+ };
3740
+
3741
+ params.no_alloc = true;
3742
+
3743
+ struct ggml_context * ctx0 = ggml_init(params);
3744
+
3745
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
3746
+
3747
+ struct ggml_tensor * cur;
3748
+ struct ggml_tensor * token;
3749
+ struct ggml_tensor * position;
3750
+ struct ggml_tensor * inpL;
3751
+
3752
+ if (batch.token) {
3753
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3754
+
3755
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
3756
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3757
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3758
+ }
3759
+ ggml_set_name(inp_tokens, "inp_tokens");
3760
+
3761
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3762
+ } else {
3763
+ #ifdef GGML_USE_MPI
3764
+ GGML_ASSERT(false && "not implemented");
3765
+ #endif
3766
+
3767
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3768
+
3769
+ ggml_allocr_alloc(lctx.alloc, token);
3770
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3771
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
3772
+ }
3773
+ }
3774
+
3775
+ {
3776
+ // Compute position embeddings.
3777
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3778
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
3779
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3780
+ for (int i = 0; i < n_tokens; ++i) {
3781
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
3782
+ }
3783
+ }
3784
+ ggml_set_name(inp_positions, "inp_positions");
3785
+
3786
+ position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3787
+ }
3788
+
3789
+ // KQ_scale
3790
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3791
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3792
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
3793
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3794
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3795
+ }
3796
+
3797
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3798
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3799
+ ggml_set_name(KQ_mask, "KQ_mask");
3800
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3801
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3802
+ float * data = (float *) KQ_mask->data;
3803
+ memset(data, 0, ggml_nbytes(KQ_mask));
3804
+
3805
+ for (int h = 0; h < 1; ++h) {
3806
+ for (int j = 0; j < n_tokens; ++j) {
3807
+ const llama_pos pos = batch.pos[j];
3808
+ const llama_seq_id seq_id = batch.seq_id[j];
3809
+
3810
+ for (int i = 0; i < n_kv; ++i) {
3811
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3812
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3813
+ }
3814
+ }
3815
+ }
3816
+ }
3817
+ }
3818
+
3819
+ inpL = ggml_add(ctx0, token, position);
3820
+ ggml_set_name(inpL, "inpL");
3167
3821
 
3168
- struct ggml_tensor * tmpv = ggml_view_3d(
3169
- ctx0, cur, n_embd_head, n_head_kv, N,
3170
- wsize * n_embd_head,
3171
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3172
- wsize * n_embd_head * (n_head + n_head_kv));
3173
- offload_func_v(tmpv);
3822
+ for (int il = 0; il < n_layer; ++il) {
3823
+ {
3824
+ // Norm
3825
+ cur = ggml_norm(ctx0, inpL, norm_eps);
3826
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
3827
+ }
3174
3828
 
3175
- // using mode = 2 for neox mode
3176
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3177
- offload_func_kq(Qcur);
3178
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
3179
- offload_func_kq(Kcur);
3829
+ {
3830
+ // Self Attention
3831
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
3832
+
3833
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
3834
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
3835
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3836
+
3837
+ struct ggml_tensor * Qcur = tmpq;
3838
+ struct ggml_tensor * Kcur = tmpk;
3180
3839
 
3181
3840
  {
3182
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
3183
- offload_func_v(Vcur);
3184
- offload_func_v(Vcur->src[0]->src[0]);
3841
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3185
3842
  ggml_set_name(Vcur, "Vcur");
3186
3843
 
3187
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3188
- offload_func_kq(k);
3844
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3189
3845
  ggml_set_name(k, "k");
3190
3846
 
3191
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3847
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3192
3848
  ( n_ctx)*ggml_element_size(kv_self.v),
3193
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3194
- offload_func_v(v);
3849
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3195
3850
 
3196
3851
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3197
3852
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3198
3853
  }
3199
3854
 
3200
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
3201
- offload_func_kq(Q);
3855
+ struct ggml_tensor * Q =
3856
+ ggml_permute(ctx0,
3857
+ ggml_cpy(ctx0,
3858
+ Qcur,
3859
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
3860
+ 0, 2, 1, 3);
3202
3861
  ggml_set_name(Q, "Q");
3203
3862
 
3204
3863
  struct ggml_tensor * K =
3205
3864
  ggml_view_3d(ctx0, kv_self.k,
3206
- n_embd_head, n_past + N, n_head_kv,
3865
+ n_embd_head, n_kv, n_head_kv,
3207
3866
  ggml_element_size(kv_self.k)*n_embd_gqa,
3208
3867
  ggml_element_size(kv_self.k)*n_embd_head,
3209
3868
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3210
- offload_func_kq(K);
3211
3869
  ggml_set_name(K, "K");
3212
3870
 
3871
+ // K * Q
3213
3872
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3214
- offload_func_kq(KQ);
3215
3873
  ggml_set_name(KQ, "KQ");
3216
3874
 
3875
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3876
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3217
3877
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3218
- offload_func_kq(KQ_scaled);
3219
3878
  ggml_set_name(KQ_scaled, "KQ_scaled");
3220
3879
 
3221
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3222
- offload_func_kq(KQ_masked);
3880
+ // KQ_masked = mask_past(KQ_scaled)
3881
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3223
3882
  ggml_set_name(KQ_masked, "KQ_masked");
3224
3883
 
3884
+ // KQ = soft_max(KQ_masked)
3225
3885
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3226
- offload_func_v(KQ_soft_max);
3227
3886
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3228
3887
 
3888
+ // split cached V into n_head heads
3229
3889
  struct ggml_tensor * V =
3230
3890
  ggml_view_3d(ctx0, kv_self.v,
3231
- n_past + N, n_embd_head, n_head_kv,
3891
+ n_kv, n_embd_head, n_head_kv,
3232
3892
  ggml_element_size(kv_self.v)*n_ctx,
3233
3893
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3234
3894
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3235
- offload_func_v(V);
3236
3895
  ggml_set_name(V, "V");
3237
3896
 
3238
3897
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3239
- offload_func_v(KQV);
3240
3898
  ggml_set_name(KQV, "KQV");
3241
3899
 
3900
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3242
3901
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3243
- offload_func_v(KQV_merged);
3244
3902
  ggml_set_name(KQV_merged, "KQV_merged");
3245
3903
 
3246
- cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3247
- offload_func_v(cur);
3904
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3905
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3248
3906
  ggml_set_name(cur, "KQV_merged_contiguous");
3249
-
3250
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
3251
- offload_func(cur);
3252
- ggml_set_name(cur, "result_wo");
3253
3907
  }
3254
3908
 
3255
- struct ggml_tensor * attn_out = cur;
3909
+ // Projection
3910
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
3256
3911
 
3257
- // feed forward
3912
+ // Add the input
3913
+ cur = ggml_add(ctx0, cur, inpL);
3914
+
3915
+ struct ggml_tensor * inpFF = cur;
3916
+
3917
+ // FF
3258
3918
  {
3259
- struct ggml_tensor * inpFF = attn_norm;
3919
+ // Norm
3920
+ {
3921
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
3922
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
3923
+ }
3260
3924
 
3261
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
3262
- offload_func(cur);
3925
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
3263
3926
 
3927
+ // GELU activation
3264
3928
  cur = ggml_gelu(ctx0, cur);
3265
- offload_func(cur);
3266
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
3267
- offload_func(cur);
3268
- }
3269
3929
 
3270
- cur = ggml_add(ctx0, cur, attn_out);
3271
- offload_func(cur);
3272
- cur = ggml_add(ctx0, cur, inpL);
3273
- offload_func(cur);
3930
+ // Projection
3931
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
3932
+ }
3274
3933
 
3275
- // input for next layer
3276
- inpL = cur;
3934
+ inpL = ggml_add(ctx0, cur, inpFF);
3277
3935
  }
3278
3936
 
3279
- cur = inpL;
3280
-
3281
- // norm
3937
+ // Output Norm
3282
3938
  {
3283
- cur = ggml_norm(ctx0, cur, norm_eps);
3284
- offload_func_nr(cur);
3285
-
3286
- cur = ggml_add(ctx0,
3287
- ggml_mul(ctx0, cur, model.output_norm),
3288
- model.output_norm_b);
3289
- ggml_set_name(cur, "result_norm");
3939
+ cur = ggml_norm(ctx0, inpL, norm_eps);
3940
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
3290
3941
  }
3942
+ ggml_set_name(cur, "result_norm");
3291
3943
 
3292
3944
  cur = ggml_mul_mat(ctx0, model.output, cur);
3293
3945
  ggml_set_name(cur, "result_output");
3294
3946
 
3295
3947
  ggml_build_forward_expand(gf, cur);
3296
-
3297
3948
  ggml_free(ctx0);
3298
3949
 
3299
3950
  return gf;
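llm_build_starcoder is the new graph builder for the starcoder architecture added in this release. It does not use RoPE at all: token embeddings and learned absolute position embeddings are looked up with ggml_get_rows and summed, and attention then proceeds with the same additive KQ_mask over the shared cache. A plain-array sketch of that input computation (names are illustrative):

    #include <cstdint>

    // inpL[:, j] = tok_emb[token[j]] + pos_emb[pos[j]]
    static void starcoder_input(const float * tok_emb, const float * pos_emb,
                                const int32_t * tokens, const int32_t * pos,
                                int n_tokens, int n_embd, float * out) {
        for (int j = 0; j < n_tokens; ++j) {
            for (int e = 0; e < n_embd; ++e) {
                out[(size_t) j*n_embd + e] = tok_emb[(size_t) tokens[j]*n_embd + e]
                                           + pos_emb[(size_t) pos[j]*n_embd + e];
            }
        }
    }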
@@ -3301,10 +3952,7 @@ static struct ggml_cgraph * llm_build_falcon(
3301
3952
 
3302
3953
  static struct ggml_cgraph * llama_build_graph(
3303
3954
  llama_context & lctx,
3304
- const llama_token * tokens,
3305
- const float * embd,
3306
- int n_tokens,
3307
- int n_past) {
3955
+ const llama_batch & batch) {
3308
3956
  const auto & model = lctx.model;
3309
3957
 
3310
3958
  struct ggml_cgraph * result = NULL;
@@ -3312,72 +3960,117 @@ static struct ggml_cgraph * llama_build_graph(
3312
3960
  switch (model.arch) {
3313
3961
  case LLM_ARCH_LLAMA:
3314
3962
  {
3315
- result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
3963
+ result = llm_build_llama(lctx, batch);
3316
3964
  } break;
3317
3965
  case LLM_ARCH_BAICHUAN:
3318
3966
  {
3319
- result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
3967
+ result = llm_build_baichaun(lctx, batch);
3320
3968
  } break;
3321
3969
  case LLM_ARCH_FALCON:
3322
3970
  {
3323
- result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
3971
+ result = llm_build_falcon(lctx, batch);
3972
+ } break;
3973
+ case LLM_ARCH_STARCODER:
3974
+ {
3975
+ result = llm_build_starcoder(lctx, batch);
3324
3976
  } break;
3325
3977
  default:
3326
3978
  GGML_ASSERT(false);
3327
- };
3979
+ }
3328
3980
 
3329
3981
  return result;
3330
3982
  }
3331
3983
 
3332
- // evaluate the transformer
3984
+ // decode a batch of tokens by evaluating the transformer
3333
3985
  //
3334
3986
  // - lctx: llama context
3335
- // - tokens: new batch of tokens to process
3336
- // - embd embeddings input
3337
- // - n_tokens number of tokens
3338
- // - n_past: the context size so far
3987
+ // - batch: batch to evaluate
3339
3988
  // - n_threads: number of threads to use
3340
3989
  //
3341
- static bool llama_eval_internal(
3990
+ // return 0 on success
3991
+ // return positive int on warning
3992
+ // return negative int on error
3993
+ //
3994
+ static int llama_decode_internal(
3342
3995
  llama_context & lctx,
3343
- const llama_token * tokens,
3344
- const float * embd,
3345
- int n_tokens,
3346
- int n_past,
3347
- int n_threads,
3348
- const char * cgraph_fname) {
3996
+ llama_batch batch) {
3997
+ const uint32_t n_tokens = batch.n_tokens;
3998
+
3999
+ if (n_tokens == 0) {
4000
+ LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
4001
+ return -1;
4002
+ }
4003
+
4004
+ const auto & model = lctx.model;
4005
+ const auto & hparams = model.hparams;
4006
+ const auto & cparams = lctx.cparams;
3349
4007
 
3350
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
4008
+ const auto n_batch = cparams.n_batch;
3351
4009
 
3352
- GGML_ASSERT(n_tokens > 0);
3353
- GGML_ASSERT(n_past >= 0);
3354
- // TODO: keep the values of n_batch and n_ctx
3355
- // GGML_ASSERT(n_tokens <= n_batch);
3356
- // GGML_ASSERT(n_past + n_tokens <= n_ctx);
4010
+ GGML_ASSERT(n_tokens <= n_batch);
4011
+
4012
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
4013
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
3357
4014
 
3358
4015
  const int64_t t_start_us = ggml_time_us();
3359
4016
 
3360
4017
  #ifdef GGML_USE_MPI
3361
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
4018
+ // TODO: needs fix after #3228
4019
+ GGML_ASSERT(false && "not implemented");
4020
+ //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
3362
4021
  #endif
3363
4022
 
3364
4023
  GGML_ASSERT(n_threads > 0);
3365
4024
 
3366
- const int N = n_tokens;
3367
-
3368
- const auto & model = lctx.model;
3369
- const auto & hparams = model.hparams;
3370
-
3371
- const auto & kv_self = lctx.kv_self;
4025
+ auto & kv_self = lctx.kv_self;
3372
4026
 
3373
4027
  GGML_ASSERT(!!kv_self.ctx);
3374
4028
 
3375
4029
  const int64_t n_embd = hparams.n_embd;
3376
4030
  const int64_t n_vocab = hparams.n_vocab;
3377
4031
 
4032
+ // helpers for smoother batch API transition
+ // helpers for smoother batch API transition
4033
+ // after deprecating the llama_eval calls, these will be removed
4034
+ std::vector<llama_pos> pos;
4035
+ std::vector<llama_seq_id> seq_id;
4036
+
4037
+ if (batch.pos == nullptr) {
4038
+ pos.resize(n_tokens);
4039
+ for (uint32_t i = 0; i < n_tokens; i++) {
4040
+ pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
4041
+ }
4042
+
4043
+ batch.pos = pos.data();
4044
+ }
4045
+
4046
+ if (batch.seq_id == nullptr) {
4047
+ seq_id.resize(n_tokens);
4048
+ for (uint32_t i = 0; i < n_tokens; i++) {
4049
+ seq_id[i] = batch.all_seq_id;
4050
+ }
4051
+
4052
+ batch.seq_id = seq_id.data();
4053
+ }
4054
+
4055
+ // we always start to search for a free slot from the start of the cache
4056
+ // TODO: better strategies can be implemented
4057
+ kv_self.head = 0;
4058
+
4059
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
4060
+ return 1;
4061
+ }
4062
+
4063
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
4064
+ // after enough generations, the benefit from this heuristic disappears
4065
+ // if we start defragmenting the cache, the benefit from this will be more important
4066
+ //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
4067
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
4068
+
4069
+ //printf("kv_self.n = %d\n", kv_self.n);
4070
+
3378
4071
  ggml_allocr_reset(lctx.alloc);
3379
4072
 
3380
- ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
4073
+ ggml_cgraph * gf = llama_build_graph(lctx, batch);
3381
4074
 
3382
4075
  ggml_allocr_alloc_graph(lctx.alloc, gf);
3383
4076
 
@@ -3386,6 +4079,7 @@ static bool llama_eval_internal(
3386
4079
  ggml_tensor * node = gf->leafs[i];
3387
4080
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
3388
4081
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
4082
+ ggml_cuda_copy_to_device(node);
3389
4083
  }
3390
4084
  }
3391
4085
 
@@ -3395,6 +4089,8 @@ static bool llama_eval_internal(
3395
4089
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
3396
4090
  }
3397
4091
  }
4092
+
4093
+ ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
3398
4094
  #endif
3399
4095
 
3400
4096
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3404,10 +4100,19 @@ static bool llama_eval_internal(
3404
4100
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
3405
4101
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
3406
4102
  // with the BLAS calls. need a better solution
3407
- if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
4103
+ if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
3408
4104
  n_threads = std::min(4, n_threads);
3409
4105
  }
3410
4106
 
4107
+ // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
4108
+ const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
4109
+ model.arch == LLM_ARCH_BAICHUAN ||
4110
+ model.arch == LLM_ARCH_FALCON;
4111
+ const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
4112
+ if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
4113
+ n_threads = 1;
4114
+ }
4115
+
3411
4116
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
3412
4117
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
3413
4118
 
@@ -3423,10 +4128,6 @@ static bool llama_eval_internal(
3423
4128
  if (lctx.ctx_metal) {
3424
4129
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
3425
4130
  ggml_metal_graph_compute(lctx.ctx_metal, gf);
3426
- ggml_metal_get_tensor (lctx.ctx_metal, res);
3427
- if (!lctx.embedding.empty()) {
3428
- ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
3429
- }
3430
4131
  } else {
3431
4132
  ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
3432
4133
  }
@@ -3438,12 +4139,9 @@ static bool llama_eval_internal(
3438
4139
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
3439
4140
  #endif
3440
4141
 
3441
- // update kv token count
3442
- lctx.kv_self.n = n_past + N;
3443
-
3444
- if (cgraph_fname) {
3445
- ggml_graph_export(gf, cgraph_fname);
3446
- }
4142
+ // update the kv ring buffer
4143
+ lctx.kv_self.head += n_tokens;
4144
+ lctx.kv_self.has_shift = false;
3447
4145
 
3448
4146
  #ifdef GGML_PERF
3449
4147
  // print timing information per ggml operation (for debugging purposes)
@@ -3460,13 +4158,20 @@ static bool llama_eval_internal(
3460
4158
  {
3461
4159
  auto & logits_out = lctx.logits;
3462
4160
 
3463
- if (lctx.logits_all) {
3464
- logits_out.resize(n_vocab * N);
3465
- memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
4161
+ if (batch.logits) {
4162
+ logits_out.resize(n_vocab * n_tokens);
4163
+ for (uint32_t i = 0; i < n_tokens; i++) {
4164
+ if (batch.logits[i] == 0) {
4165
+ continue;
4166
+ }
4167
+ memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
4168
+ }
4169
+ } else if (lctx.logits_all) {
4170
+ logits_out.resize(n_vocab * n_tokens);
4171
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
3466
4172
  } else {
3467
- // return result for just the last token
3468
4173
  logits_out.resize(n_vocab);
3469
- memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
4174
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
3470
4175
  }
3471
4176
  }
3472
4177
 
@@ -3475,20 +4180,27 @@ static bool llama_eval_internal(
3475
4180
  auto & embedding_out = lctx.embedding;
3476
4181
 
3477
4182
  embedding_out.resize(n_embd);
3478
- memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
4183
+ memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
3479
4184
  }
3480
4185
 
3481
4186
  // measure the performance only for the single-token evals
3482
- if (N == 1) {
4187
+ if (n_tokens == 1) {
3483
4188
  lctx.t_eval_us += ggml_time_us() - t_start_us;
3484
4189
  lctx.n_eval++;
3485
4190
  }
3486
- else if (N > 1) {
4191
+ else if (n_tokens > 1) {
3487
4192
  lctx.t_p_eval_us += ggml_time_us() - t_start_us;
3488
- lctx.n_p_eval += N;
4193
+ lctx.n_p_eval += n_tokens;
3489
4194
  }
3490
4195
 
3491
- return true;
4196
+ // get a more accurate load time, upon first eval
4197
+ // TODO: fix this
4198
+ if (!lctx.has_evaluated_once) {
4199
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
4200
+ lctx.has_evaluated_once = true;
4201
+ }
4202
+
4203
+ return 0;
3492
4204
  }
3493
4205
 
3494
4206
  //
@@ -3909,7 +4621,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
3909
4621
  llm_tokenizer_bpe tokenizer(vocab);
3910
4622
  tokenizer.tokenize(raw_text, output);
3911
4623
  } break;
3912
- };
4624
+ }
3913
4625
 
3914
4626
  return output;
3915
4627
  }
@@ -3939,7 +4651,7 @@ struct llama_grammar_candidate {
3939
4651
 
3940
4652
  // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
3941
4653
  // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
3942
- std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
4654
+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
3943
4655
  const char * src,
3944
4656
  llama_partial_utf8 partial_start) {
3945
4657
  static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -4313,6 +5025,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
4313
5025
  // sampling
4314
5026
  //
4315
5027
 
5028
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
5029
+ if (seed == LLAMA_DEFAULT_SEED) {
5030
+ seed = time(NULL);
5031
+ }
5032
+ ctx->rng.seed(seed);
5033
+ }
5034
+
4316
5035
  void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
4317
5036
  GGML_ASSERT(candidates->size > 0);
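llama_set_rng_seed moves the sampling RNG seed onto the context; passing LLAMA_DEFAULT_SEED falls back to the current time. A minimal usage sketch:

    #include <cstdint>
    #include "llama.h"

    static void reseed(llama_context * ctx, uint32_t seed) {
        llama_set_rng_seed(ctx, seed);                   // fixed seed: reproducible sampling
        // llama_set_rng_seed(ctx, LLAMA_DEFAULT_SEED); // time-based seed
    }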
4318
5037
 
@@ -4521,7 +5240,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
4521
5240
  }
4522
5241
  }
4523
5242
 
4524
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5243
+ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
4525
5244
  const int64_t t_start_sample_us = ggml_time_us();
4526
5245
 
4527
5246
  for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4533,6 +5252,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
4533
5252
  }
4534
5253
  }
4535
5254
 
5255
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5256
+ llama_sample_temp(ctx, candidates_p, temp);
5257
+ }
5258
+
4536
5259
  void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
4537
5260
  if (last_tokens_size == 0 || penalty == 1.0f) {
4538
5261
  return;
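The two hunks above rename llama_sample_temperature to llama_sample_temp and keep the old name as a thin compatibility wrapper. Both simply divide every candidate logit by temp, which after soft-max sharpens (temp < 1) or flattens (temp > 1) the distribution. A reference-only sketch of that effect:

    #include <vector>

    // divide logits by temp before soft-max: lower temp -> peakier distribution
    static void apply_temperature(std::vector<float> & logits, float temp) {
        for (float & l : logits) {
            l /= temp;
        }
    }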
@@ -4656,7 +5379,7 @@ void llama_sample_classifier_free_guidance(
4656
5379
 
4657
5380
  GGML_ASSERT(ctx);
4658
5381
 
4659
- auto n_vocab = llama_n_vocab(ctx);
5382
+ auto n_vocab = llama_n_vocab(llama_get_model(ctx));
4660
5383
 
4661
5384
  GGML_ASSERT(n_vocab == (int)candidates->size);
4662
5385
  GGML_ASSERT(!candidates->sorted);
@@ -4685,7 +5408,7 @@ void llama_sample_classifier_free_guidance(
4685
5408
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
4686
5409
  GGML_ASSERT(ctx);
4687
5410
 
4688
- auto N = float(llama_n_vocab(ctx));
5411
+ auto N = float(llama_n_vocab(llama_get_model(ctx)));
4689
5412
  int64_t t_start_sample_us;
4690
5413
  t_start_sample_us = ggml_time_us();
4691
5414
 
@@ -4872,7 +5595,7 @@ struct llama_logit_info {
4872
5595
  };
4873
5596
  llama_logit_info(llama_context * ctx)
4874
5597
  : logits(llama_get_logits(ctx))
4875
- , n_vocab(llama_n_vocab(ctx))
5598
+ , n_vocab(llama_n_vocab(llama_get_model(ctx)))
4876
5599
  , max_l(*std::max_element(logits, logits + n_vocab))
4877
5600
  , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
4878
5601
  { }
@@ -4910,7 +5633,6 @@ struct llama_beam_search_data {
4910
5633
  size_t n_beams;
4911
5634
  int n_past;
4912
5635
  int n_predict;
4913
- int n_threads;
4914
5636
  std::vector<llama_beam> beams;
4915
5637
  std::vector<llama_beam> next_beams;
4916
5638
 
@@ -4920,12 +5642,11 @@ struct llama_beam_search_data {
4920
5642
  // Used to communicate to/from callback on beams state.
4921
5643
  std::vector<llama_beam_view> beam_views;
4922
5644
 
4923
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
5645
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
4924
5646
  : ctx(ctx)
4925
5647
  , n_beams(n_beams)
4926
5648
  , n_past(n_past)
4927
5649
  , n_predict(n_predict)
4928
- , n_threads(n_threads)
4929
5650
  , beam_views(n_beams) {
4930
5651
  beams.reserve(n_beams);
4931
5652
  next_beams.reserve(n_beams);
@@ -4962,7 +5683,7 @@ struct llama_beam_search_data {
4962
5683
  } else {
4963
5684
  // beam is not at end-of-sentence, so branch with next top_k tokens.
4964
5685
  if (!beam.tokens.empty()) {
4965
- llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
5686
+ llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
4966
5687
  }
4967
5688
  llama_logit_info logit_info(ctx);
4968
5689
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5036,7 +5757,7 @@ struct llama_beam_search_data {
5036
5757
  callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
5037
5758
  update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
5038
5759
  if (common_prefix_length) {
5039
- llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
5760
+ llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
5040
5761
  n_past += common_prefix_length;
5041
5762
  }
5042
5763
  // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5077,11 +5798,11 @@ struct llama_beam_search_data {
5077
5798
 
5078
5799
  void llama_beam_search(llama_context * ctx,
5079
5800
  llama_beam_search_callback_fn_t callback, void * callback_data,
5080
- size_t n_beams, int n_past, int n_predict, int n_threads) {
5801
+ size_t n_beams, int n_past, int n_predict) {
5081
5802
  assert(ctx);
5082
5803
  const int64_t t_start_sample_us = ggml_time_us();
5083
5804
 
5084
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
5805
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
5085
5806
 
5086
5807
  beam_search_data.loop(callback, callback_data);
5087
5808
 
@@ -5301,11 +6022,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5301
6022
  nthread = std::thread::hardware_concurrency();
5302
6023
  }
5303
6024
 
5304
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
6025
+ llama_model_loader ml(fname_inp, /*use_mmap*/ false);
5305
6026
 
5306
6027
  llama_model model;
5307
- llm_load_arch(*ml, model);
5308
- llm_load_hparams(*ml, model, 0, 0, 0);
6028
+ llm_load_arch(ml, model);
6029
+ llm_load_hparams(ml, model);
5309
6030
 
5310
6031
  if (params->only_copy) {
5311
6032
  ftype = model.ftype;
@@ -5315,7 +6036,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5315
6036
  struct gguf_context * ctx_out = gguf_init_empty();
5316
6037
 
5317
6038
  // copy the KV pairs from the input file
5318
- gguf_set_kv (ctx_out, ml->ctx_gguf);
6039
+ gguf_set_kv (ctx_out, ml.ctx_gguf);
5319
6040
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
5320
6041
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
5321
6042
 
@@ -5323,8 +6044,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5323
6044
  int n_attention_wv = 0;
5324
6045
  int n_feed_forward_w2 = 0;
5325
6046
 
5326
- for (int i = 0; i < ml->n_tensors; ++i) {
5327
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6047
+ for (int i = 0; i < ml.n_tensors; ++i) {
6048
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5328
6049
 
5329
6050
  const std::string name = ggml_get_name(meta);
5330
6051
 
@@ -5360,8 +6081,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5360
6081
  std::vector<no_init<float>> f32_conv_buf;
5361
6082
 
5362
6083
  // populate the original tensors so we get an initial meta data
5363
- for (int i = 0; i < ml->n_tensors; ++i) {
5364
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6084
+ for (int i = 0; i < ml.n_tensors; ++i) {
6085
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5365
6086
  gguf_add_tensor(ctx_out, meta);
5366
6087
  }
5367
6088
 
@@ -5374,8 +6095,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5374
6095
  // placeholder for the meta data
5375
6096
  ::zeros(fout, meta_size);
5376
6097
 
5377
- for (int i = 0; i < ml->n_tensors; ++i) {
5378
- struct ggml_tensor * tensor = ml->get_tensor_meta(i);
6098
+ for (int i = 0; i < ml.n_tensors; ++i) {
6099
+ struct ggml_tensor * tensor = ml.get_tensor_meta(i);
5379
6100
 
5380
6101
  const std::string name = ggml_get_name(tensor);
5381
6102
 
@@ -5383,10 +6104,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5383
6104
  read_data.resize(ggml_nbytes(tensor));
5384
6105
  }
5385
6106
  tensor->data = read_data.data();
5386
- ml->load_data_for(tensor);
6107
+ ml.load_data_for(tensor);
5387
6108
 
5388
6109
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
5389
- ++idx, ml->n_tensors,
6110
+ ++idx, ml.n_tensors,
5390
6111
  ggml_get_name(tensor),
5391
6112
  llama_format_tensor_shape(tensor).c_str(),
5392
6113
  ggml_type_name(tensor->type));
@@ -5536,8 +6257,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5536
6257
  }
5537
6258
  }
5538
6259
 
5539
- // TODO: after the GGUF PR, this likely won't work and needs to be updated
5540
- int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
6260
+ static int llama_apply_lora_from_file_internal(
6261
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
6262
+ ) {
5541
6263
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
5542
6264
 
5543
6265
  const int64_t t_start_lora_us = ggml_time_us();
@@ -5565,7 +6287,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
5565
6287
  int32_t lora_alpha;
5566
6288
  fin.read((char *) &lora_r, sizeof(lora_r));
5567
6289
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
5568
- float scaling = (float)lora_alpha / (float)lora_r;
6290
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
5569
6291
 
5570
6292
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
5571
6293
 
@@ -5781,9 +6503,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
5781
6503
  ggml_set_name(r, "r_cpy");
5782
6504
  }
5783
6505
 
5784
- struct ggml_cgraph gf = ggml_build_forward(r);
6506
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
6507
+ ggml_build_forward_expand(gf, r);
5785
6508
 
5786
- ggml_graph_compute_helper(work_buffer, &gf, n_threads);
6509
+ ggml_graph_compute_helper(work_buffer, gf, n_threads);
5787
6510
 
5788
6511
  // we won't need these tensors again, reset the context to save memory
5789
6512
  ggml_free(lora_ctx);
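
The LoRA path also tracks a ggml API change: graphs are no longer built by value with ggml_build_forward, they are allocated inside a ggml context and then expanded. A standalone sketch of the new pattern (tensor shapes, the 16 MB pool and the use of ggml_graph_compute_with_ctx are assumptions for illustration, not code from this package):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        struct ggml_context * ctx0 = ggml_init(ip);

        struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8), 1.0f);
        struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8), 2.0f);
        struct ggml_tensor * c = ggml_add(ctx0, a, b);

        struct ggml_cgraph * gf = ggml_new_graph(ctx0);   // was: struct ggml_cgraph gf = ggml_build_forward(c);
        ggml_build_forward_expand(gf, c);

        ggml_graph_compute_with_ctx(ctx0, gf, /*n_threads*/ 4);

        ggml_free(ctx0);
        return 0;
    }
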
@@ -5812,27 +6535,16 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
5812
6535
  //
5813
6536
  // interface implementation
5814
6537
  //
5815
-
5816
- struct llama_context_params llama_context_default_params() {
5817
- struct llama_context_params result = {
5818
- /*.seed =*/ LLAMA_DEFAULT_SEED,
5819
- /*.n_ctx =*/ 512,
5820
- /*.n_batch =*/ 512,
6538
+ struct llama_model_params llama_model_default_params() {
6539
+ struct llama_model_params result = {
5821
6540
  /*.n_gpu_layers =*/ 0,
5822
6541
  /*.main_gpu =*/ 0,
5823
6542
  /*.tensor_split =*/ nullptr,
5824
- /*.rope_freq_base =*/ 10000.0f,
5825
- /*.rope_freq_scale =*/ 1.0f,
5826
6543
  /*.progress_callback =*/ nullptr,
5827
6544
  /*.progress_callback_user_data =*/ nullptr,
5828
- /*.low_vram =*/ false,
5829
- /*.mul_mat_q =*/ true,
5830
- /*.f16_kv =*/ true,
5831
- /*.logits_all =*/ false,
5832
6545
  /*.vocab_only =*/ false,
5833
6546
  /*.use_mmap =*/ true,
5834
6547
  /*.use_mlock =*/ false,
5835
- /*.embedding =*/ false,
5836
6548
  };
5837
6549
 
5838
6550
  #ifdef GGML_USE_METAL
@@ -5842,6 +6554,24 @@ struct llama_context_params llama_context_default_params() {
5842
6554
  return result;
5843
6555
  }
5844
6556
 
6557
+ struct llama_context_params llama_context_default_params() {
6558
+ struct llama_context_params result = {
6559
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
6560
+ /*.n_ctx =*/ 512,
6561
+ /*.n_batch =*/ 512,
6562
+ /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
6563
+ /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
6564
+ /*.rope_freq_base =*/ 0.0f,
6565
+ /*.rope_freq_scale =*/ 0.0f,
6566
+ /*.mul_mat_q =*/ true,
6567
+ /*.f16_kv =*/ true,
6568
+ /*.logits_all =*/ false,
6569
+ /*.embedding =*/ false,
6570
+ };
6571
+
6572
+ return result;
6573
+ }
6574
+
5845
6575
  struct llama_model_quantize_params llama_model_quantize_default_params() {
5846
6576
  struct llama_model_quantize_params result = {
5847
6577
  /*.nthread =*/ 0,
@@ -5897,13 +6627,11 @@ int64_t llama_time_us(void) {
5897
6627
 
5898
6628
  struct llama_model * llama_load_model_from_file(
5899
6629
  const char * path_model,
5900
- struct llama_context_params params) {
6630
+ struct llama_model_params params) {
5901
6631
  ggml_time_init();
5902
6632
 
5903
6633
  llama_model * model = new llama_model;
5904
6634
 
5905
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
5906
-
5907
6635
  unsigned cur_percentage = 0;
5908
6636
  if (params.progress_callback == NULL) {
5909
6637
  params.progress_callback_user_data = &cur_percentage;
@@ -5920,9 +6648,9 @@ struct llama_model * llama_load_model_from_file(
5920
6648
  };
5921
6649
  }
5922
6650
 
5923
- if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
5924
- params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
5925
- params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
6651
+ if (!llama_model_load(path_model, *model, params.n_gpu_layers,
6652
+ params.main_gpu, params.tensor_split,
6653
+ params.use_mmap, params.use_mlock, params.vocab_only,
5926
6654
  params.progress_callback, params.progress_callback_user_data)) {
5927
6655
  LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
5928
6656
  delete model;
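
Model loading is now configured with llama_model_params, while execution settings (context size, threads, rope scaling, batch size) live in llama_context_params, and llama_load_model_from_file takes the former. A minimal end-to-end sketch of the new setup; the file name and all values are illustrative, and the zero-means-training-default behaviour is the one handled in the following hunk:

    #include "llama.h"
    #include <cstdio>

    int main(void) {
        llama_backend_init(/*numa*/ false);

        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0;                      // offloading is a model-level setting now

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) { fprintf(stderr, "load failed\n"); return 1; }

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 0;                   // 0 = use the model's training context length
        cparams.rope_freq_base  = 0.0f;                // 0 = use the value stored in the GGUF metadata
        cparams.n_threads       = 8;                   // threads are per-context, not per-call
        cparams.n_threads_batch = 8;

        llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (!ctx) { llama_free_model(model); return 1; }

        // ... decode / generate here ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }
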
@@ -5946,18 +6674,33 @@ struct llama_context * llama_new_context_with_model(
5946
6674
 
5947
6675
  llama_context * ctx = new llama_context(*model);
5948
6676
 
6677
+ const auto & hparams = model->hparams;
6678
+ auto & cparams = ctx->cparams;
6679
+
6680
+ cparams.n_batch = params.n_batch;
6681
+ cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
6682
+ cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
6683
+ cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
6684
+ cparams.n_threads = params.n_threads;
6685
+ cparams.n_threads_batch = params.n_threads_batch;
6686
+ cparams.mul_mat_q = params.mul_mat_q;
6687
+
5949
6688
  if (params.seed == LLAMA_DEFAULT_SEED) {
5950
6689
  params.seed = time(NULL);
5951
6690
  }
5952
6691
 
6692
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
6693
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
6694
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
6695
+
5953
6696
  ctx->rng = std::mt19937(params.seed);
5954
6697
  ctx->logits_all = params.logits_all;
5955
6698
 
5956
6699
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
5957
6700
 
5958
6701
  // reserve memory for context buffers
5959
- if (!params.vocab_only) {
5960
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
6702
+ if (!hparams.vocab_only) {
6703
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
5961
6704
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
5962
6705
  llama_free(ctx);
5963
6706
  return nullptr;
@@ -5968,11 +6711,9 @@ struct llama_context * llama_new_context_with_model(
5968
6711
  LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
5969
6712
  }
5970
6713
 
5971
- const auto & hparams = ctx->model.hparams;
5972
-
5973
6714
  // resized during inference
5974
6715
  if (params.logits_all) {
5975
- ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
6716
+ ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
5976
6717
  } else {
5977
6718
  ctx->logits.reserve(hparams.n_vocab);
5978
6719
  }
@@ -5990,26 +6731,28 @@ struct llama_context * llama_new_context_with_model(
5990
6731
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
5991
6732
 
5992
6733
  // build worst-case graph
5993
- int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
5994
- int n_past = hparams.n_ctx - n_tokens;
6734
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
6735
+ int n_past = cparams.n_ctx - n_tokens;
5995
6736
  llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
5996
- ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
6737
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
6738
+
5997
6739
  #ifdef GGML_USE_METAL
5998
- if (params.n_gpu_layers > 0) {
6740
+ if (model->n_gpu_layers > 0) {
5999
6741
  ctx->ctx_metal = ggml_metal_init(1);
6000
6742
  if (!ctx->ctx_metal) {
6001
6743
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6002
6744
  llama_free(ctx);
6003
6745
  return NULL;
6004
6746
  }
6005
- ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6006
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6747
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
6748
+ //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6749
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6007
6750
  }
6008
6751
  #endif
6009
6752
  // measure memory requirements for the graph
6010
6753
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
6011
6754
 
6012
- LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6755
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6013
6756
 
6014
6757
  // recreate allocator with exact memory requirements
6015
6758
  ggml_allocr_free(ctx->alloc);
@@ -6018,28 +6761,46 @@ struct llama_context * llama_new_context_with_model(
6018
6761
  ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
6019
6762
  #ifdef GGML_USE_METAL
6020
6763
  if (ctx->ctx_metal) {
6021
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6764
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6022
6765
  }
6023
6766
  #endif
6024
6767
  #ifdef GGML_USE_CUBLAS
6025
- if (params.low_vram) {
6026
- LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
6027
- ggml_cuda_set_scratch_size(0); // disable scratch
6028
- } else {
6029
- ggml_cuda_set_scratch_size(alloc_size);
6030
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
6768
+ ggml_cuda_set_scratch_size(alloc_size);
6769
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
6770
+
6771
+ // calculate total VRAM usage
6772
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
6773
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
6774
+ size += ggml_nbytes(t);
6775
+ }
6776
+ };
6777
+ size_t model_vram_size = 0;
6778
+ for (const auto & kv : model->tensors_by_name) {
6779
+ add_tensor(kv.second, model_vram_size);
6031
6780
  }
6781
+
6782
+ size_t kv_vram_size = 0;
6783
+ add_tensor(ctx->kv_self.k, kv_vram_size);
6784
+ add_tensor(ctx->kv_self.v, kv_vram_size);
6785
+
6786
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
6787
+ size_t total_vram_size = model_vram_size + ctx_vram_size;
6788
+
6789
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
6790
+ total_vram_size / 1024.0 / 1024.0,
6791
+ model_vram_size / 1024.0 / 1024.0,
6792
+ ctx_vram_size / 1024.0 / 1024.0);
6032
6793
  #endif
6033
6794
  }
6034
6795
 
6035
6796
  #ifdef GGML_USE_METAL
6036
- if (params.n_gpu_layers > 0) {
6797
+ if (model->n_gpu_layers > 0) {
6037
6798
  // this allocates all Metal resources and memory buffers
6038
6799
 
6039
6800
  void * data_ptr = NULL;
6040
6801
  size_t data_size = 0;
6041
6802
 
6042
- if (params.use_mmap) {
6803
+ if (ctx->model.mapping) {
6043
6804
  data_ptr = ctx->model.mapping->addr;
6044
6805
  data_size = ctx->model.mapping->size;
6045
6806
  } else {
@@ -6058,11 +6819,8 @@ struct llama_context * llama_new_context_with_model(
6058
6819
  return NULL; \
6059
6820
  }
6060
6821
 
6061
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
6062
-
6063
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
6064
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6065
-
6822
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
6823
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6066
6824
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
6067
6825
  #undef LLAMA_METAL_CHECK_BUF
6068
6826
  }
@@ -6074,8 +6832,10 @@ struct llama_context * llama_new_context_with_model(
6074
6832
 
6075
6833
  if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
6076
6834
  // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
6077
- const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
6078
- while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
6835
+ // TODO: needs fix after #3228
6836
+ GGML_ASSERT(false && "not implemented");
6837
+ //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
6838
+ //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
6079
6839
  llama_backend_free();
6080
6840
  exit(1);
6081
6841
  }
@@ -6084,63 +6844,37 @@ struct llama_context * llama_new_context_with_model(
6084
6844
  return ctx;
6085
6845
  }
6086
6846
 
6087
- struct llama_context * llama_init_from_file(
6088
- const char * path_model,
6089
- struct llama_context_params params) {
6090
- struct llama_model * model = llama_load_model_from_file(path_model, params);
6091
- if (!model) {
6092
- return nullptr;
6093
- }
6094
-
6095
- struct llama_context * ctx = llama_new_context_with_model(model, params);
6096
- ctx->model_owner = true;
6097
-
6098
- return ctx;
6099
- }
6100
-
6101
6847
  void llama_free(struct llama_context * ctx) {
6102
6848
  delete ctx;
6103
6849
  }
6104
6850
 
6105
- int llama_n_vocab(const struct llama_context * ctx) {
6106
- return llama_model_n_vocab(&ctx->model);
6851
+ const llama_model * llama_get_model(const struct llama_context * ctx) {
6852
+ return &ctx->model;
6107
6853
  }
6108
6854
 
6109
6855
  int llama_n_ctx(const struct llama_context * ctx) {
6110
- return llama_model_n_ctx(&ctx->model);
6111
- }
6112
-
6113
- int llama_n_ctx_train(const struct llama_context * ctx) {
6114
- return llama_model_n_ctx_train(&ctx->model);
6115
- }
6116
-
6117
- int llama_n_embd(const struct llama_context * ctx) {
6118
- return llama_model_n_embd(&ctx->model);
6856
+ return ctx->cparams.n_ctx;
6119
6857
  }
6120
6858
 
6121
- enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
6122
- return ctx->model.vocab.type;
6859
+ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
6860
+ return model->vocab.type;
6123
6861
  }
6124
6862
 
6125
- int llama_model_n_vocab(const struct llama_model * model) {
6863
+ int llama_n_vocab(const struct llama_model * model) {
6126
6864
  return model->vocab.id_to_token.size();
6127
6865
  }
6128
6866
 
6129
- int llama_model_n_ctx(const struct llama_model * model) {
6130
- return model->hparams.n_ctx;
6131
- }
6132
-
6133
- int llama_model_n_ctx_train(const struct llama_model * model) {
6867
+ int llama_n_ctx_train(const struct llama_model * model) {
6134
6868
  return model->hparams.n_ctx_train;
6135
6869
  }
6136
6870
 
6137
- int llama_model_n_embd(const struct llama_model * model) {
6871
+ int llama_n_embd(const struct llama_model * model) {
6138
6872
  return model->hparams.n_embd;
6139
6873
  }
6140
6874
 
6141
6875
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
6142
6876
  return snprintf(buf, buf_size, "%s %s %s",
6143
- model->name.c_str(),
6877
+ llama_model_arch_name(model->arch).c_str(),
6144
6878
  llama_model_type_name(model->type),
6145
6879
  llama_model_ftype_name(model->ftype).c_str());
6146
6880
  }
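
Most context-level getters become model-level, and callers reach the model through the new llama_get_model accessor. A sketch of the updated queries (the printing helper itself is illustrative):

    #include "llama.h"
    #include <cstdio>

    static void print_model_info(const llama_context * ctx) {
        const llama_model * model = llama_get_model(ctx);

        printf("n_vocab     = %d\n", llama_n_vocab(model));      // was llama_n_vocab(ctx)
        printf("n_ctx       = %d\n", llama_n_ctx(ctx));          // now reports this context's cparams.n_ctx
        printf("n_ctx_train = %d\n", llama_n_ctx_train(model));  // was llama_n_ctx_train(ctx)
        printf("n_embd      = %d\n", llama_n_embd(model));       // was llama_n_embd(ctx)
    }
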
@@ -6161,6 +6895,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
6161
6895
  return nparams;
6162
6896
  }
6163
6897
 
6898
+ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
6899
+ return ggml_get_tensor(model->ctx, name);
6900
+ }
6901
+
6164
6902
  int llama_model_quantize(
6165
6903
  const char * fname_inp,
6166
6904
  const char * fname_out,
@@ -6174,18 +6912,18 @@ int llama_model_quantize(
6174
6912
  }
6175
6913
  }
6176
6914
 
6177
- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
6915
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
6178
6916
  try {
6179
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
6917
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
6180
6918
  } catch (const std::exception & err) {
6181
6919
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
6182
6920
  return 1;
6183
6921
  }
6184
6922
  }
6185
6923
 
6186
- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
6924
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
6187
6925
  try {
6188
- return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
6926
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
6189
6927
  } catch (const std::exception & err) {
6190
6928
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
6191
6929
  return 1;
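
Both LoRA entry points gain a float scale argument that multiplies the adapter's own alpha/r scaling (see the internal hunk further up). A sketch of the updated call; the adapter path and the values are illustrative assumptions:

    #include "llama.h"

    static bool apply_adapter(const llama_model * model) {
        const int err = llama_model_apply_lora_from_file(
            model,
            "lora-adapter.bin",   // hypothetical ggml LoRA adapter file
            1.0f,                 // new scale argument; 1.0f reproduces the pre-0.6.0 behaviour
            nullptr,              // optional base model to apply the deltas against
            /*n_threads*/ 4);
        return err == 0;
    }
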
@@ -6193,16 +6931,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
6193
6931
  }
6194
6932
 
6195
6933
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
6196
- return ctx->kv_self.n;
6934
+ return ctx->kv_self.head;
6197
6935
  }
6198
6936
 
6199
- #define LLAMA_MAX_RNG_STATE (64*1024)
6937
+ void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
6938
+ llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
6939
+ }
6200
6940
 
6201
- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
6202
- if (seed == LLAMA_DEFAULT_SEED) {
6203
- seed = time(NULL);
6204
- }
6205
- ctx->rng.seed(seed);
6941
+ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
6942
+ llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
6943
+ }
6944
+
6945
+ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
6946
+ llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
6947
+ }
6948
+
6949
+ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
6950
+ llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
6951
+ }
6952
+
6953
+ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
6954
+ llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
6206
6955
  }
6207
6956
 
6208
6957
  // Returns the *maximum* size of the state
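
The single kv-cache token counter gives way to a small sequence-oriented API. A sketch of how these calls compose; the sequence ids and position ranges below are illustrative only:

    #include "llama.h"

    static void rewind_and_fork(llama_context * ctx, int n_past) {
        // forget what sequence 0 generated after position 32, e.g. to rewind a generation
        llama_kv_cache_seq_rm(ctx, /*seq_id*/ 0, /*p0*/ 32, /*p1*/ n_past);

        // let a new sequence 1 reuse the first 32 cached positions of sequence 0
        llama_kv_cache_seq_cp(ctx, /*seq_id_src*/ 0, /*seq_id_dst*/ 1, /*p0*/ 0, /*p1*/ 32);

        // drop every sequence except 1 from the cache
        llama_kv_cache_seq_keep(ctx, /*seq_id*/ 1);

        // shift the remaining positions of sequence 1 back by 16 (context-shifting style reuse)
        llama_kv_cache_seq_shift(ctx, /*seq_id*/ 1, /*p0*/ 16, /*p1*/ 32, /*delta*/ -16);
    }
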
@@ -6289,7 +7038,17 @@ struct llama_data_file_context : llama_data_context {
6289
7038
  * llama_copy_state_data(ctx, &data_ctx);
6290
7039
  *
6291
7040
  */
6292
- void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
7041
+ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
7042
+ // TODO: does not support multi-sequence states
7043
+ {
7044
+ const auto & kv_self = ctx->kv_self;
7045
+ for (uint32_t i = 0; i < kv_self.head; ++i) {
7046
+ GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
7047
+ GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
7048
+ GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
7049
+ }
7050
+ }
7051
+
6293
7052
  // copy rng
6294
7053
  {
6295
7054
  std::stringstream rng_ss;
@@ -6340,12 +7099,14 @@ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_conte
6340
7099
  {
6341
7100
  const auto & kv_self = ctx->kv_self;
6342
7101
  const auto & hparams = ctx->model.hparams;
7102
+ const auto & cparams = ctx->cparams;
7103
+
6343
7104
  const int n_layer = hparams.n_layer;
6344
7105
  const int n_embd = hparams.n_embd_gqa();
6345
- const int n_ctx = hparams.n_ctx;
7106
+ const int n_ctx = cparams.n_ctx;
6346
7107
 
6347
7108
  const size_t kv_size = kv_self.buf.size;
6348
- const int kv_ntok = llama_get_kv_cache_token_count(ctx);
7109
+ const int kv_ntok = kv_self.head;
6349
7110
 
6350
7111
  data_ctx->write(&kv_size, sizeof(kv_size));
6351
7112
  data_ctx->write(&kv_ntok, sizeof(kv_ntok));
@@ -6448,9 +7209,11 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
6448
7209
  {
6449
7210
  const auto & kv_self = ctx->kv_self;
6450
7211
  const auto & hparams = ctx->model.hparams;
7212
+ const auto & cparams = ctx->cparams;
7213
+
6451
7214
  const int n_layer = hparams.n_layer;
6452
7215
  const int n_embd = hparams.n_embd_gqa();
6453
- const int n_ctx = hparams.n_ctx;
7216
+ const int n_ctx = cparams.n_ctx;
6454
7217
 
6455
7218
  size_t kv_size;
6456
7219
  int kv_ntok;
@@ -6489,7 +7252,8 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
6489
7252
  ggml_free(cpy_ctx);
6490
7253
  }
6491
7254
 
6492
- ctx->kv_self.n = kv_ntok;
7255
+ ctx->kv_self.head = kv_ntok;
7256
+ ctx->kv_self.size = kv_size;
6493
7257
  }
6494
7258
 
6495
7259
  const size_t nread = inp - src;
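
The state helpers keep their public shape, but the kv portion now assumes a single sequence (see the asserts added in the copy path above). A sketch of snapshotting and restoring a context; the wrapper names are illustrative:

    #include "llama.h"
    #include <cstdint>
    #include <vector>

    static std::vector<uint8_t> save_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx));   // upper bound for the serialized state
        buf.resize(llama_copy_state_data(ctx, buf.data()));    // actual number of bytes written
        return buf;
    }

    static void load_state(llama_context * ctx, std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());                 // restores rng, logits, embeddings and kv cache
    }
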
@@ -6584,64 +7348,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
6584
7348
 
6585
7349
  int llama_eval(
6586
7350
  struct llama_context * ctx,
6587
- const llama_token * tokens,
6588
- int n_tokens,
6589
- int n_past,
6590
- int n_threads) {
6591
- if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
6592
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6593
- return 1;
6594
- }
7351
+ llama_token * tokens,
7352
+ int32_t n_tokens,
7353
+ int n_past) {
7354
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
6595
7355
 
6596
- // get a more accurate load time, upon first eval
6597
- // TODO: fix this
6598
- if (!ctx->has_evaluated_once) {
6599
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
6600
- ctx->has_evaluated_once = true;
7356
+ const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
7357
+ if (ret < 0) {
7358
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
6601
7359
  }
6602
7360
 
6603
- return 0;
7361
+ return ret;
6604
7362
  }
6605
7363
 
6606
7364
  int llama_eval_embd(
6607
7365
  struct llama_context * ctx,
6608
- const float * embd,
6609
- int n_tokens,
6610
- int n_past,
6611
- int n_threads) {
6612
- if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
6613
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6614
- return 1;
6615
- }
7366
+ float * embd,
7367
+ int32_t n_tokens,
7368
+ int n_past) {
7369
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
6616
7370
 
6617
- // get a more accurate load time, upon first eval
6618
- // TODO: fix this
6619
- if (!ctx->has_evaluated_once) {
6620
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
6621
- ctx->has_evaluated_once = true;
7371
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
7372
+
7373
+ const int ret = llama_decode_internal(*ctx, batch);
7374
+ if (ret < 0) {
7375
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
6622
7376
  }
6623
7377
 
6624
- return 0;
7378
+ return ret;
6625
7379
  }
6626
7380
 
6627
- int llama_eval_export(struct llama_context * ctx, const char * fname) {
6628
- const int n_batch = 1;
6629
- const int n_ctx = 512 - n_batch;
7381
+ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
7382
+ ctx->cparams.n_threads = n_threads;
7383
+ ctx->cparams.n_threads_batch = n_threads_batch;
7384
+ }
7385
+
7386
+ struct llama_batch llama_batch_get_one(
7387
+ llama_token * tokens,
7388
+ int32_t n_tokens,
7389
+ llama_pos pos_0,
7390
+ llama_seq_id seq_id) {
7391
+ return {
7392
+ /*n_tokens =*/ n_tokens,
7393
+ /*tokens =*/ tokens,
7394
+ /*embd =*/ nullptr,
7395
+ /*pos =*/ nullptr,
7396
+ /*seq_id =*/ nullptr,
7397
+ /*logits =*/ nullptr,
7398
+ /*all_pos_0 =*/ pos_0,
7399
+ /*all_pos_1 =*/ 1,
7400
+ /*all_seq_id =*/ seq_id,
7401
+ };
7402
+ }
6630
7403
 
6631
- const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
7404
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
7405
+ llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
6632
7406
 
6633
- if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
6634
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6635
- return 1;
7407
+ if (embd) {
7408
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
7409
+ } else {
7410
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
6636
7411
  }
6637
7412
 
6638
- return 0;
7413
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
7414
+ batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
7415
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
7416
+
7417
+ return batch;
7418
+ }
7419
+
7420
+ void llama_batch_free(struct llama_batch batch) {
7421
+ if (batch.token) free(batch.token);
7422
+ if (batch.embd) free(batch.embd);
7423
+ if (batch.pos) free(batch.pos);
7424
+ if (batch.seq_id) free(batch.seq_id);
7425
+ if (batch.logits) free(batch.logits);
7426
+ }
7427
+
7428
+ int llama_decode(
7429
+ struct llama_context * ctx,
7430
+ struct llama_batch batch) {
7431
+ const int ret = llama_decode_internal(*ctx, batch);
7432
+ if (ret < 0) {
7433
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
7434
+ }
7435
+
7436
+ return ret;
6639
7437
  }
6640
7438
 
6641
7439
  float * llama_get_logits(struct llama_context * ctx) {
6642
7440
  return ctx->logits.data();
6643
7441
  }
6644
7442
 
7443
+ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
7444
+ return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
7445
+ }
7446
+
6645
7447
  float * llama_get_embeddings(struct llama_context * ctx) {
6646
7448
  return ctx->embedding.data();
6647
7449
  }
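
Together these additions form the new decoding path: allocate a llama_batch, fill per-token positions, sequence ids and logits flags, call llama_decode, then read logits only for the rows that asked for them. A sketch assuming the prompt is already tokenized; the helper name and the "logits only for the last token" policy are illustrative:

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    static const float * decode_prompt(llama_context * ctx, const std::vector<llama_token> & toks) {
        if (toks.empty()) return nullptr;
        const int n = (int) toks.size();

        llama_batch batch = llama_batch_init(/*n_tokens*/ n, /*embd*/ 0);   // token batch, no embeddings
        batch.n_tokens = n;
        for (int i = 0; i < n; ++i) {
            batch.token [i] = toks[i];
            batch.pos   [i] = i;      // absolute position within the sequence
            batch.seq_id[i] = 0;      // everything belongs to sequence 0
            batch.logits[i] = 0;      // skip logits for prompt tokens ...
        }
        batch.logits[n - 1] = 1;      // ... except the last one

        const float * out = nullptr;
        if (llama_decode(ctx, batch) != 0) {
            fprintf(stderr, "llama_decode() failed\n");
        } else {
            out = llama_get_logits_ith(ctx, n - 1);   // logits row of the i-th batch entry
        }

        llama_batch_free(batch);
        return out;                   // points into the context's logits buffer
    }
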
@@ -6671,21 +7473,13 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
6671
7473
  }
6672
7474
 
6673
7475
  int llama_tokenize(
6674
- struct llama_context * ctx,
6675
- const char * text,
6676
- llama_token * tokens,
6677
- int n_max_tokens,
6678
- bool add_bos) {
6679
- return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
6680
- }
6681
-
6682
- int llama_tokenize_with_model(
6683
7476
  const struct llama_model * model,
6684
7477
  const char * text,
7478
+ int text_len,
6685
7479
  llama_token * tokens,
6686
7480
  int n_max_tokens,
6687
7481
  bool add_bos) {
6688
- auto res = llama_tokenize_internal(model->vocab, text, add_bos);
7482
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
6689
7483
 
6690
7484
  if (n_max_tokens < (int) res.size()) {
6691
7485
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
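
llama_tokenize now takes the model plus an explicit byte length for the input text (the context-based wrapper and llama_tokenize_with_model are gone). A sketch of the usual resize-and-retry wrapper; the initial buffer heuristic is an assumption, and a negative return is treated as "buffer too small, the required count is reported back":

    #include "llama.h"
    #include <string>
    #include <vector>

    static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text, bool add_bos) {
        // rough upper bound: one token per byte plus the optional BOS token
        std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));

        int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                               tokens.data(), (int) tokens.size(), add_bos);
        if (n < 0) {
            tokens.resize(-n);
            n = llama_tokenize(model, text.c_str(), (int) text.size(),
                               tokens.data(), (int) tokens.size(), add_bos);
        }
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }
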
@@ -6699,13 +7493,9 @@ int llama_tokenize_with_model(
6699
7493
  return res.size();
6700
7494
  }
6701
7495
 
6702
- int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
6703
- return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
6704
- }
6705
-
6706
7496
  // does not write null-terminator to buf
6707
- int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
6708
- if (0 <= token && token < llama_model_n_vocab(model)) {
7497
+ int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
7498
+ if (0 <= token && token < llama_n_vocab(model)) {
6709
7499
  if (llama_is_normal_token(model->vocab, token)) {
6710
7500
  std::string result = model->vocab.id_to_token[token].text;
6711
7501
  if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
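
llama_token_to_piece likewise becomes model-based. A sketch of a small wrapper around it (the fixed buffer size is an arbitrary assumption, and the function does not write a null terminator):

    #include "llama.h"
    #include <string>

    static std::string token_to_piece(const llama_model * model, llama_token token) {
        char buf[64];
        const int n = llama_token_to_piece(model, token, buf, (int) sizeof(buf));
        if (n <= 0) {
            return std::string();   // control tokens yield nothing; negative means the buffer was too small
        }
        return std::string(buf, n);
    }
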
@@ -6725,7 +7515,7 @@ int llama_token_to_piece_with_model(const struct llama_model * model, llama_toke
6725
7515
  buf[2] = '\x85';
6726
7516
  return 3;
6727
7517
  } else if (llama_is_control_token(model->vocab, token)) {
6728
- ;
7518
+ // do nothing
6729
7519
  } else if (llama_is_byte_token(model->vocab, token)) {
6730
7520
  if (length < 1) {
6731
7521
  return -1;
@@ -6827,16 +7617,18 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
6827
7617
  }
6828
7618
 
6829
7619
  // For internal test use
6830
- const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
7620
+ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
7621
+ struct llama_context * ctx
7622
+ ) {
6831
7623
  return ctx->model.tensors_by_name;
6832
7624
  }
6833
7625
 
6834
- void llama_log_set(llama_log_callback log_callback, void * user_data) {
7626
+ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
6835
7627
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
6836
7628
  g_state.log_callback_user_data = user_data;
6837
7629
  }
6838
7630
 
6839
- static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
7631
+ static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
6840
7632
  va_list args_copy;
6841
7633
  va_copy(args_copy, args);
6842
7634
  char buffer[128];
@@ -6853,14 +7645,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
6853
7645
  va_end(args_copy);
6854
7646
  }
6855
7647
 
6856
- static void llama_log_internal(llama_log_level level, const char * format, ...) {
7648
+ static void llama_log_internal(ggml_log_level level, const char * format, ...) {
6857
7649
  va_list args;
6858
7650
  va_start(args, format);
6859
7651
  llama_log_internal_v(level, format, args);
6860
7652
  va_end(args);
6861
7653
  }
6862
7654
 
6863
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
7655
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
6864
7656
  (void) level;
6865
7657
  (void) user_data;
6866
7658
  fputs(text, stderr);