llama_cpp 0.5.3 → 0.7.0: diff of the bundled llama.cpp sources

@@ -1,6 +1,8 @@
1
1
  #define LLAMA_API_INTERNAL
2
2
  #include "llama.h"
3
3
 
4
+ #include "unicode.h"
5
+
4
6
  #include "ggml.h"
5
7
 
6
8
  #include "ggml-alloc.h"
@@ -72,6 +74,7 @@
72
74
  #include <sstream>
73
75
  #include <thread>
74
76
  #include <unordered_map>
77
+ #include <set>
75
78
 
76
79
  #if defined(_MSC_VER)
77
80
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -92,12 +95,12 @@
92
95
  //
93
96
 
94
97
  LLAMA_ATTRIBUTE_FORMAT(2, 3)
95
- static void llama_log_internal (llama_log_level level, const char* format, ...);
96
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
98
+ static void llama_log_internal (ggml_log_level level, const char* format, ...);
99
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
97
100
 
98
- #define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
99
- #define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
100
- #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
101
+ #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
102
+ #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
103
+ #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
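
The logging changes above route llama.cpp's messages through ggml's shared enum (ggml_log_level / GGML_LOG_LEVEL_*), so one callback type now serves both libraries. A minimal sketch of installing a custom logger; it assumes the llama_log_set(ggml_log_callback, void *) entry point declared in the bundled llama.h, and my_logger is an illustrative name:

    #include <cstdio>
    #include "llama.h"   // declares ggml_log_level, ggml_log_callback, llama_log_set

    // Forward llama.cpp / ggml log lines to stderr, tagging the shared level enum.
    static void my_logger(ggml_log_level level, const char * text, void * /*user_data*/) {
        const char * tag = level == GGML_LOG_LEVEL_ERROR ? "E"
                         : level == GGML_LOG_LEVEL_WARN  ? "W"
                         : "I";
        fprintf(stderr, "[%s] %s", tag, text);   // log texts typically carry their own newline
    }

    // during start-up:
    //     llama_log_set(my_logger, nullptr);
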
101
104
 
102
105
  //
103
106
  // helpers
@@ -122,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
122
125
  }
123
126
  s = std::move(result);
124
127
  }
128
+
129
+ static bool is_float_close(float a, float b, float abs_tol) {
130
+ // Check for non-negative tolerance
131
+ if (abs_tol < 0.0) {
132
+ throw std::invalid_argument("Tolerance must be non-negative");
133
+ }
134
+
135
+ // Exact equality check
136
+ if (a == b) {
137
+ return true;
138
+ }
139
+
140
+ // Check for infinities
141
+ if (std::isinf(a) || std::isinf(b)) {
142
+ return false;
143
+ }
144
+
145
+ // Regular comparison using the provided absolute tolerance
146
+ return std::fabs(b - a) <= abs_tol;
147
+ }
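
is_float_close is an absolute-tolerance comparison used further down by llama_hparams::operator!=, so hyperparameters such as the RoPE frequency base or the norm epsilons do not register as "different" after a float round trip. A standalone sketch of the same idea; close_abs is an illustrative name, not part of llama.cpp:

    #include <cmath>
    #include <cstdio>

    // Absolute-tolerance float comparison, mirroring the helper added above.
    static bool close_abs(float a, float b, float abs_tol) {
        if (a == b)                         return true;   // also catches equal infinities
        if (std::isinf(a) || std::isinf(b)) return false;  // one finite, one infinite
        return std::fabs(b - a) <= abs_tol;
    }

    int main() {
        const float a = 1.0f;
        const float b = 1.0f + 1e-6f;                 // representable, tiny difference
        printf("%d\n", close_abs(a, b, 1e-5f));       // 1: within tolerance
        printf("%d\n", close_abs(a, b, 1e-8f));       // 0: outside tolerance
        return 0;
    }
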
148
+
125
149
  #ifdef GGML_USE_CPU_HBM
126
150
  #include <hbwmalloc.h>
127
151
  #endif
@@ -162,18 +186,20 @@ enum llm_arch {
162
186
  LLM_ARCH_GPTNEOX,
163
187
  LLM_ARCH_MPT,
164
188
  LLM_ARCH_STARCODER,
189
+ LLM_ARCH_REFACT,
165
190
  LLM_ARCH_UNKNOWN,
166
191
  };
167
192
 
168
193
  static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
169
- { LLM_ARCH_LLAMA, "llama" },
170
- { LLM_ARCH_FALCON, "falcon" },
171
- { LLM_ARCH_GPT2, "gpt2" },
172
- { LLM_ARCH_GPTJ, "gptj" },
173
- { LLM_ARCH_GPTNEOX, "gptneox" },
174
- { LLM_ARCH_MPT, "mpt" },
175
- { LLM_ARCH_BAICHUAN, "baichuan" },
194
+ { LLM_ARCH_LLAMA, "llama" },
195
+ { LLM_ARCH_FALCON, "falcon" },
196
+ { LLM_ARCH_GPT2, "gpt2" },
197
+ { LLM_ARCH_GPTJ, "gptj" },
198
+ { LLM_ARCH_GPTNEOX, "gptneox" },
199
+ { LLM_ARCH_MPT, "mpt" },
200
+ { LLM_ARCH_BAICHUAN, "baichuan" },
176
201
  { LLM_ARCH_STARCODER, "starcoder" },
202
+ { LLM_ARCH_REFACT, "refact" },
177
203
  };
178
204
 
179
205
  enum llm_kv {
@@ -221,16 +247,16 @@ enum llm_kv {
221
247
  };
222
248
 
223
249
  static std::map<llm_kv, std::string> LLM_KV_NAMES = {
224
- { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
225
- { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
226
- { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
227
- { LLM_KV_GENERAL_NAME, "general.name" },
228
- { LLM_KV_GENERAL_AUTHOR, "general.author" },
229
- { LLM_KV_GENERAL_URL, "general.url" },
230
- { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
231
- { LLM_KV_GENERAL_LICENSE, "general.license" },
232
- { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" },
233
- { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" },
250
+ { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
251
+ { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
252
+ { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
253
+ { LLM_KV_GENERAL_NAME, "general.name" },
254
+ { LLM_KV_GENERAL_AUTHOR, "general.author" },
255
+ { LLM_KV_GENERAL_URL, "general.url" },
256
+ { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
257
+ { LLM_KV_GENERAL_LICENSE, "general.license" },
258
+ { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
259
+ { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
234
260
 
235
261
  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
236
262
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -394,6 +420,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
394
420
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
395
421
  },
396
422
  },
423
+ {
424
+ LLM_ARCH_REFACT,
425
+ {
426
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
427
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
428
+ { LLM_TENSOR_OUTPUT, "output" },
429
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
430
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
431
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
432
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
433
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
434
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
435
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
436
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
437
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
438
+ },
439
+ },
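
The block above registers Refact as a new architecture: an LLM_ARCH_REFACT enum value, a "refact" entry in LLM_ARCH_NAMES, and this per-layer tensor-name table, which follows the same "blk.%d.<name>" GGUF convention as the other architectures (the loader later reuses the LLaMA tensor layout for it). A hedged sketch of how such a "%d" template becomes a concrete GGUF tensor name; tensor_name is an illustrative helper, the real code goes through the LLM_TN wrapper defined below in this file:

    #include <cstdio>
    #include <string>

    // Expand a per-layer template such as "blk.%d.attn_q" into the full
    // GGUF tensor name, e.g. "blk.7.attn_q.weight".
    static std::string tensor_name(const char * tmpl, int layer, const char * suffix) {
        char buf[128];
        snprintf(buf, sizeof(buf), tmpl, layer);
        return std::string(buf) + "." + suffix;
    }

    int main() {
        printf("%s\n", tensor_name("blk.%d.attn_q", 7, "weight").c_str()); // blk.7.attn_q.weight
        return 0;
    }
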
397
440
  {
398
441
  LLM_ARCH_UNKNOWN,
399
442
  {
@@ -448,7 +491,7 @@ struct LLM_TN {
448
491
  //
449
492
 
450
493
  #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
451
- { \
494
+ do { \
452
495
  const std::string skey(key); \
453
496
  const int kid = gguf_find_key(ctx, skey.c_str()); \
454
497
  if (kid >= 0) { \
@@ -460,7 +503,7 @@ struct LLM_TN {
460
503
  } else if (req) { \
461
504
  throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
462
505
  } \
463
- }
506
+ } while (0)
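
Wrapping GGUF_GET_KEY in do { ... } while (0) makes the macro expand to a single statement, so a trailing semicolon is harmless and the macro nests safely inside an unbraced if/else. A minimal illustration of the failure mode this avoids; READ_BAD, READ_GOOD and the variables are made up for the example:

    #include <cstdio>

    // Plain-block macro: the ';' after READ_BAD(x) ends the if statement,
    // so the following 'else' no longer has an 'if' to attach to.
    #define READ_BAD(x)  { (x) = 42; }

    // Statement-like macro: expands to exactly one statement, else-safe.
    #define READ_GOOD(x) do { (x) = 42; } while (0)

    int main() {
        int v = 0;
        const bool have_key = true;

        // if (have_key) READ_BAD(v);  else printf("missing\n");   // does not compile
        if    (have_key) READ_GOOD(v); else printf("missing\n");   // fine
        printf("%d\n", v);                                         // 42
        return 0;
    }
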
464
507
 
465
508
  //
466
509
  // ggml helpers
@@ -881,10 +924,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
881
924
 
882
925
  static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
883
926
  std::vector<char> result(8, 0);
884
- const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
927
+ const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
885
928
  if (n_tokens < 0) {
886
929
  result.resize(-n_tokens);
887
- int check = llama_token_to_piece(ctx, token, result.data(), result.size());
930
+ int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
888
931
  GGML_ASSERT(check == -n_tokens);
889
932
  } else {
890
933
  result.resize(n_tokens);
@@ -899,7 +942,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
899
942
 
900
943
  struct llama_state {
901
944
  // We save the log callback globally
902
- llama_log_callback log_callback = llama_log_callback_default;
945
+ ggml_log_callback log_callback = llama_log_callback_default;
903
946
  void * log_callback_user_data = nullptr;
904
947
  };
905
948
 
@@ -925,9 +968,9 @@ static const size_t MB = kB*kB;
925
968
  static const size_t GB = kB*kB*kB;
926
969
 
927
970
  struct llama_hparams {
971
+ bool vocab_only;
928
972
  uint32_t n_vocab;
929
973
  uint32_t n_ctx_train; // context size the model was trained on
930
- uint32_t n_ctx; // context size used during inference
931
974
  uint32_t n_embd;
932
975
  uint32_t n_head;
933
976
  uint32_t n_head_kv;
@@ -938,11 +981,28 @@ struct llama_hparams {
938
981
  float f_norm_eps;
939
982
  float f_norm_rms_eps;
940
983
 
941
- float rope_freq_base;
942
- float rope_freq_scale;
984
+ float rope_freq_base_train;
985
+ float rope_freq_scale_train;
943
986
 
944
987
  bool operator!=(const llama_hparams & other) const {
945
- return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
988
+ if (this->vocab_only != other.vocab_only) return true;
989
+ if (this->n_vocab != other.n_vocab) return true;
990
+ if (this->n_ctx_train != other.n_ctx_train) return true;
991
+ if (this->n_embd != other.n_embd) return true;
992
+ if (this->n_head != other.n_head) return true;
993
+ if (this->n_head_kv != other.n_head_kv) return true;
994
+ if (this->n_layer != other.n_layer) return true;
995
+ if (this->n_rot != other.n_rot) return true;
996
+ if (this->n_ff != other.n_ff) return true;
997
+
998
+ const float EPSILON = 1e-9;
999
+
1000
+ if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
1001
+ if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
1002
+ if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
1003
+ if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
1004
+
1005
+ return false;
946
1006
  }
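
llama_hparams::operator!= used to memcmp the whole struct; that is fragile once the struct mixes integers, bools and floats, because padding bytes take part in the comparison and numerically equal floats can differ bit-for-bit. The new version compares each field explicitly and runs the float members through is_float_close. A tiny sketch of the pitfall; the params struct is illustrative, not llama_hparams:

    #include <cstdio>
    #include <cstring>

    struct params {
        bool  flag;   // compilers typically insert padding bytes after this member
        float scale;
    };

    int main() {
        params a{}, b{};
        a.flag  = b.flag  = true;
        a.scale = 0.0f;
        b.scale = -0.0f;   // numerically equal to 0.0f, different bit pattern

        printf("memcmp differs: %d\n", memcmp(&a, &b, sizeof(params)) != 0); // 1
        printf("values equal:   %d\n", a.scale == b.scale);                  // 1
        return 0;
    }
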
947
1007
 
948
1008
  uint32_t n_gqa() const {
@@ -956,15 +1016,18 @@ struct llama_hparams {
956
1016
  uint32_t n_embd_gqa() const {
957
1017
  return n_embd/n_gqa();
958
1018
  }
1019
+ };
959
1020
 
960
- size_t kv_size() const {
961
- size_t result = 2ull;
962
- result *= (size_t) n_embd_gqa();
963
- result *= (size_t) n_ctx;
964
- result *= (size_t) n_layer;
965
- result *= sizeof(ggml_fp16_t);
966
- return result;
967
- }
1021
+ struct llama_cparams {
1022
+ uint32_t n_ctx; // context size used during inference
1023
+ uint32_t n_batch;
1024
+ uint32_t n_threads; // number of threads to use for generation
1025
+ uint32_t n_threads_batch; // number of threads to use for batch processing
1026
+
1027
+ float rope_freq_base;
1028
+ float rope_freq_scale;
1029
+
1030
+ bool mul_mat_q;
968
1031
  };
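
The struct split above separates what the GGUF file says about training time (llama_hparams, including the new *_train RoPE values and vocab_only) from what the caller picks per context at run time (llama_cparams: n_ctx, batch size, thread counts, RoPE overrides, mul_mat_q). A hedged usage sketch of the corresponding public API; llama_model_params, llama_context_params and the functions below are taken from the llama.h bundled with this release, but treat the snippet as illustrative rather than a drop-in:

    #include "llama.h"

    // Load the model once (training-time properties), then create a context
    // that carries the run-time choices.
    static llama_context * make_ctx(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0;                        // placement is a run-time choice

        llama_model * model = llama_load_model_from_file(path, mparams);
        if (model == nullptr) return nullptr;

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 4096;                  // inference context, independent of n_ctx_train
        cparams.n_threads       = 8;
        cparams.n_threads_batch = 8;
        cparams.rope_freq_base  = 0.0f;                  // 0 = keep the value stored in the GGUF file

        return llama_new_context_with_model(model, cparams);  // free later with llama_free + llama_free_model
    }
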
969
1032
 
970
1033
  struct llama_layer {
@@ -999,7 +1062,29 @@ struct llama_layer {
999
1062
  struct ggml_tensor * b3; // ffn_up
1000
1063
  };
1001
1064
 
1065
+ struct llama_kv_cell {
1066
+ llama_pos pos = -1;
1067
+ llama_pos delta = 0;
1068
+
1069
+ std::set<llama_seq_id> seq_id;
1070
+
1071
+ bool has_seq_id(const llama_seq_id & id) const {
1072
+ return seq_id.find(id) != seq_id.end();
1073
+ }
1074
+ };
1075
+
1076
+ // ring-buffer of cached KV data
1002
1077
  struct llama_kv_cache {
1078
+ bool has_shift = false;
1079
+
1080
+ uint32_t head = 0;
1081
+ uint32_t size = 0;
1082
+
1083
+ // computed before each graph build
1084
+ uint32_t n = 0;
1085
+
1086
+ std::vector<llama_kv_cell> cells;
1087
+
1003
1088
  struct ggml_tensor * k = NULL;
1004
1089
  struct ggml_tensor * v = NULL;
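
The cache itself becomes a ring of cells: each cell remembers the position it holds and the set of sequence ids that still reference it, head is where the next batch will try to land, n is the upper bound actually attended over in the current graph, and has_shift marks that cached K values need a RoPE correction after positions were moved. This is what allows several sequences to share one cache and old tokens to be dropped without re-evaluating the prompt. A compact sketch of the bookkeeping in isolation; the cell struct below is a simplification, not the llama.cpp type:

    #include <cstdio>
    #include <set>
    #include <vector>

    struct cell {
        int pos = -1;                 // -1 = free slot
        std::set<int> seq_id;         // sequences that still reference this K/V entry
    };

    int main() {
        std::vector<cell> cells(8);   // a tiny "context" of 8 slots

        // sequence 0 occupies slots 0..3 with positions 0..3
        for (int i = 0; i < 4; ++i) { cells[i].pos = i; cells[i].seq_id.insert(0); }

        // sequence 1 shares the same prompt cells, no K/V data is copied
        for (int i = 0; i < 4; ++i) { cells[i].seq_id.insert(1); }

        // removing sequence 0 frees a cell only once nobody references it
        for (auto & c : cells) {
            c.seq_id.erase(0);
            if (c.seq_id.empty()) c.pos = -1;
        }

        printf("slot 0 in use: %d (pos %d)\n", (int) !cells[0].seq_id.empty(), cells[0].pos); // 1 (pos 0)
        return 0;
    }
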
1005
1090
 
@@ -1007,8 +1092,6 @@ struct llama_kv_cache {
1007
1092
 
1008
1093
  llama_buffer buf;
1009
1094
 
1010
- int n; // number of tokens currently in the cache
1011
-
1012
1095
  ~llama_kv_cache() {
1013
1096
  if (ctx) {
1014
1097
  ggml_free(ctx);
@@ -1047,6 +1130,10 @@ struct llama_vocab {
1047
1130
  id special_pad_id = -1;
1048
1131
 
1049
1132
  id linefeed_id = 13;
1133
+ id special_prefix_id = 32007;
1134
+ id special_middle_id = 32009;
1135
+ id special_suffix_id = 32008;
1136
+ id special_eot_id = 32010;
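
The four new ids are the fill-in-the-middle special tokens of Code Llama style vocabularies (prefix, middle, suffix, end-of-infill); 32007-32010 are the defaults for that tokenizer. A hedged sketch of how an infill prompt is usually assembled from them, assuming the llama_token_prefix/middle/suffix/eot accessors exposed by the bundled llama.h; the exact prompt layout is a property of the model, so this is an illustration rather than the library's prescribed method:

    #include <vector>
    #include "llama.h"

    // "Prefix-suffix-middle" infill prompt: ask the model to produce the code that
    // belongs between prefix_toks and suffix_toks, stopping at the EOT token.
    static std::vector<llama_token> build_infill_prompt(
            const llama_context * ctx,
            const std::vector<llama_token> & prefix_toks,
            const std::vector<llama_token> & suffix_toks) {
        std::vector<llama_token> out;
        out.push_back(llama_token_prefix(ctx));                         // e.g. id 32007
        out.insert(out.end(), prefix_toks.begin(), prefix_toks.end());
        out.push_back(llama_token_suffix(ctx));                         // e.g. id 32008
        out.insert(out.end(), suffix_toks.begin(), suffix_toks.end());
        out.push_back(llama_token_middle(ctx));                         // e.g. id 32009
        return out;                                                     // generate until llama_token_eot(ctx)
    }
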
1050
1137
 
1051
1138
  int find_bpe_rank(std::string token_left, std::string token_right) const {
1052
1139
  replace_all(token_left, " ", "\u0120");
@@ -1122,11 +1209,8 @@ struct llama_model {
1122
1209
  };
1123
1210
 
1124
1211
  struct llama_context {
1125
- llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
1212
+ llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
1126
1213
  ~llama_context() {
1127
- if (model_owner) {
1128
- delete &model;
1129
- }
1130
1214
  #ifdef GGML_USE_METAL
1131
1215
  if (ctx_metal) {
1132
1216
  ggml_metal_free(ctx_metal);
@@ -1137,27 +1221,26 @@ struct llama_context {
1137
1221
  }
1138
1222
  }
1139
1223
 
1224
+ llama_cparams cparams;
1225
+
1226
+ const llama_model & model;
1227
+
1228
+ // key + value cache for the self attention
1229
+ struct llama_kv_cache kv_self;
1230
+
1140
1231
  std::mt19937 rng;
1141
1232
 
1142
1233
  bool has_evaluated_once = false;
1143
1234
 
1235
+ int64_t t_start_us;
1236
+ int64_t t_load_us;
1144
1237
  int64_t t_sample_us = 0;
1145
- int64_t t_eval_us = 0;
1146
1238
  int64_t t_p_eval_us = 0;
1239
+ int64_t t_eval_us = 0;
1147
1240
 
1148
1241
  int32_t n_sample = 0; // number of tokens sampled
1149
- int32_t n_eval = 0; // number of eval calls
1150
1242
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1151
-
1152
- const llama_model & model;
1153
-
1154
- bool model_owner = false;
1155
-
1156
- int64_t t_load_us;
1157
- int64_t t_start_us;
1158
-
1159
- // key + value cache for the self attention
1160
- struct llama_kv_cache kv_self;
1243
+ int32_t n_eval = 0; // number of eval calls
1161
1244
 
1162
1245
  // decode output (2-dimensional array: [n_tokens][n_vocab])
1163
1246
  std::vector<float> logits;
@@ -1192,16 +1275,23 @@ static bool llama_kv_cache_init(
1192
1275
  const struct llama_hparams & hparams,
1193
1276
  struct llama_kv_cache & cache,
1194
1277
  ggml_type wtype,
1195
- int n_ctx,
1278
+ uint32_t n_ctx,
1196
1279
  int n_gpu_layers) {
1197
- const int n_embd = hparams.n_embd_gqa();
1198
- const int n_layer = hparams.n_layer;
1280
+ const uint32_t n_embd = hparams.n_embd_gqa();
1281
+ const uint32_t n_layer = hparams.n_layer;
1199
1282
 
1200
1283
  const int64_t n_mem = n_layer*n_ctx;
1201
1284
  const int64_t n_elements = n_embd*n_mem;
1202
1285
 
1286
+ cache.has_shift = false;
1287
+
1288
+ cache.head = 0;
1289
+ cache.size = n_ctx;
1290
+
1291
+ cache.cells.clear();
1292
+ cache.cells.resize(n_ctx);
1293
+
1203
1294
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1204
- cache.n = 0;
1205
1295
 
1206
1296
  struct ggml_init_params params;
1207
1297
  params.mem_size = cache.buf.size;
@@ -1222,17 +1312,163 @@ static bool llama_kv_cache_init(
1222
1312
 
1223
1313
  (void) n_gpu_layers;
1224
1314
  #ifdef GGML_USE_CUBLAS
1225
- if (n_gpu_layers > n_layer + 1) {
1315
+ size_t vram_kv_cache = 0;
1316
+
1317
+ if (n_gpu_layers > (int)n_layer + 1) {
1226
1318
  ggml_cuda_assign_buffers_no_scratch(cache.v);
1319
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1320
+ vram_kv_cache += ggml_nbytes(cache.v);
1227
1321
  }
1228
- if (n_gpu_layers > n_layer + 2) {
1322
+ if (n_gpu_layers > (int)n_layer + 2) {
1229
1323
  ggml_cuda_assign_buffers_no_scratch(cache.k);
1324
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1325
+ vram_kv_cache += ggml_nbytes(cache.k);
1326
+ }
1327
+ if (vram_kv_cache > 0) {
1328
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1230
1329
  }
1231
1330
  #endif // GGML_USE_CUBLAS
1232
1331
 
1233
1332
  return true;
1234
1333
  }
1235
1334
 
1335
+ // find an empty slot of size "n_tokens" in the cache
1336
+ // updates the cache head
1337
+ static bool llama_kv_cache_find_slot(
1338
+ struct llama_kv_cache & cache,
1339
+ const struct llama_batch & batch) {
1340
+ const uint32_t n_ctx = cache.size;
1341
+ const uint32_t n_tokens = batch.n_tokens;
1342
+
1343
+ if (n_tokens > n_ctx) {
1344
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
1345
+ return false;
1346
+ }
1347
+
1348
+ uint32_t n_tested = 0;
1349
+
1350
+ while (true) {
1351
+ if (cache.head + n_tokens > n_ctx) {
1352
+ cache.head = 0;
1353
+ n_tested += n_ctx - cache.head;
1354
+ continue;
1355
+ }
1356
+
1357
+ bool found = true;
1358
+ for (uint32_t i = 0; i < n_tokens; i++) {
1359
+ if (cache.cells[cache.head + i].pos >= 0) {
1360
+ found = false;
1361
+ cache.head += i + 1;
1362
+ n_tested += i + 1;
1363
+ break;
1364
+ }
1365
+ }
1366
+
1367
+ if (found) {
1368
+ break;
1369
+ }
1370
+
1371
+ if (n_tested >= n_ctx) {
1372
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
1373
+ return false;
1374
+ }
1375
+ }
1376
+
1377
+ for (uint32_t i = 0; i < n_tokens; i++) {
1378
+ cache.cells[cache.head + i].pos = batch.pos[i];
1379
+ cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
1380
+ }
1381
+
1382
+ return true;
1383
+ }
1384
+
1385
+ // find how many cells are currently in use
1386
+ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
1387
+ for (uint32_t i = cache.size - 1; i > 0; --i) {
1388
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
1389
+ return i + 1;
1390
+ }
1391
+ }
1392
+
1393
+ return 0;
1394
+ }
1395
+
1396
+ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
1397
+ if (c0 < 0) c0 = 0;
1398
+ if (c1 < 0) c1 = cache.size;
1399
+
1400
+ for (int32_t i = c0; i < c1; ++i) {
1401
+ cache.cells[i].pos = -1;
1402
+ cache.cells[i].seq_id.clear();
1403
+ }
1404
+ }
1405
+
1406
+ static void llama_kv_cache_seq_rm(
1407
+ struct llama_kv_cache & cache,
1408
+ llama_seq_id seq_id,
1409
+ llama_pos p0,
1410
+ llama_pos p1) {
1411
+ if (p0 < 0) p0 = 0;
1412
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1413
+
1414
+ for (uint32_t i = 0; i < cache.size; ++i) {
1415
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1416
+ cache.cells[i].seq_id.erase(seq_id);
1417
+ if (cache.cells[i].seq_id.empty()) {
1418
+ cache.cells[i].pos = -1;
1419
+ }
1420
+ }
1421
+ }
1422
+ }
1423
+
1424
+ static void llama_kv_cache_seq_cp(
1425
+ struct llama_kv_cache & cache,
1426
+ llama_seq_id seq_id_src,
1427
+ llama_seq_id seq_id_dst,
1428
+ llama_pos p0,
1429
+ llama_pos p1) {
1430
+ if (p0 < 0) p0 = 0;
1431
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1432
+
1433
+ for (uint32_t i = 0; i < cache.size; ++i) {
1434
+ if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1435
+ cache.cells[i].seq_id.insert(seq_id_dst);
1436
+ }
1437
+ }
1438
+ }
1439
+
1440
+ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
1441
+ for (uint32_t i = 0; i < cache.size; ++i) {
1442
+ if (!cache.cells[i].has_seq_id(seq_id)) {
1443
+ cache.cells[i].pos = -1;
1444
+ cache.cells[i].seq_id.clear();
1445
+ }
1446
+ }
1447
+ }
1448
+
1449
+ static void llama_kv_cache_seq_shift(
1450
+ struct llama_kv_cache & cache,
1451
+ llama_seq_id seq_id,
1452
+ llama_pos p0,
1453
+ llama_pos p1,
1454
+ llama_pos delta) {
1455
+ if (p0 < 0) p0 = 0;
1456
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1457
+
1458
+ for (uint32_t i = 0; i < cache.size; ++i) {
1459
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1460
+ cache.cells[i].pos += delta;
1461
+ if (cache.cells[i].pos < 0) {
1462
+ cache.cells[i].pos = -1;
1463
+ cache.cells[i].seq_id.clear();
1464
+ } else {
1465
+ cache.has_shift = true;
1466
+ cache.cells[i].delta = delta;
1467
+ }
1468
+ }
1469
+ }
1470
+ }
1471
+
1236
1472
  //
1237
1473
  // model loading and saving
1238
1474
  //
@@ -1554,7 +1790,7 @@ struct llama_model_loader {
1554
1790
  lmlock->grow_to(size_lock);
1555
1791
  }
1556
1792
  break;
1557
- #if defined(GGML_USE_CUBLAS)
1793
+ #ifdef GGML_USE_CUBLAS
1558
1794
  case GGML_BACKEND_GPU:
1559
1795
  case GGML_BACKEND_GPU_SPLIT:
1560
1796
  // old code:
@@ -1587,7 +1823,15 @@ struct llama_model_loader {
1587
1823
  // load LLaMA models
1588
1824
  //
1589
1825
 
1590
- static std::string llama_model_ftype_name(enum llama_ftype ftype) {
1826
+ static std::string llama_model_arch_name(llm_arch arch) {
1827
+ auto it = LLM_ARCH_NAMES.find(arch);
1828
+ if (it == LLM_ARCH_NAMES.end()) {
1829
+ return "unknown";
1830
+ }
1831
+ return it->second;
1832
+ }
1833
+
1834
+ static std::string llama_model_ftype_name(llama_ftype ftype) {
1591
1835
  if (ftype & LLAMA_FTYPE_GUESSED) {
1592
1836
  return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
1593
1837
  }
@@ -1643,10 +1887,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
1643
1887
 
1644
1888
  static void llm_load_hparams(
1645
1889
  llama_model_loader & ml,
1646
- llama_model & model,
1647
- int n_ctx,
1648
- float rope_freq_base,
1649
- float rope_freq_scale) {
1890
+ llama_model & model) {
1650
1891
  struct gguf_context * ctx = ml.ctx_gguf;
1651
1892
 
1652
1893
  const auto kv = LLM_KV(model.arch);
@@ -1657,29 +1898,25 @@ static void llm_load_hparams(
1657
1898
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
1658
1899
 
1659
1900
  // get hparams kv
1660
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
1661
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
1662
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
1663
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
1664
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
1665
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
1901
+ GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
1902
+ GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
1903
+ GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
1904
+ GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
1905
+ GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
1906
+ GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
1666
1907
 
1667
1908
  // n_head_kv is optional, default to n_head
1668
1909
  hparams.n_head_kv = hparams.n_head;
1669
1910
  GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
1670
1911
 
1671
1912
  // rope_freq_base (optional)
1672
- if (rope_freq_base == 0.0f) {
1673
- rope_freq_base = 10000.0f;
1674
- GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1675
- }
1913
+ hparams.rope_freq_base_train = 10000.0f;
1914
+ GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1676
1915
 
1677
1916
  // rope_freq_scale (inverse of the kv) is optional
1678
- if (rope_freq_scale == 0.0f) {
1679
- float ropescale = 1.0f;
1680
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1681
- rope_freq_scale = 1.0f/ropescale;
1682
- }
1917
+ float ropescale = 1.0f;
1918
+ GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1919
+ hparams.rope_freq_scale_train = 1.0f/ropescale;
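
rope_freq_base/scale are no longer patched inside the loader: the GGUF values are kept on the model as *_train, and a user override is applied only when the context is created, with 0 meaning "use the trained value". A small hedged sketch of that precedence rule (the helper name is illustrative):

    // A 0.0f run-time setting falls back to the value stored in the GGUF file.
    static float pick_rope_freq_base(float requested, float trained) {
        return requested == 0.0f ? trained : requested;
    }
    // pick_rope_freq_base(0.0f,       10000.0f) -> 10000.0f   (model default)
    // pick_rope_freq_base(1000000.0f, 10000.0f) -> 1000000.0f (explicit override)
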
1683
1920
 
1684
1921
  // sanity check for n_rot (optional)
1685
1922
  {
@@ -1742,14 +1979,18 @@ static void llm_load_hparams(
1742
1979
  default: model.type = e_model::MODEL_UNKNOWN;
1743
1980
  }
1744
1981
  } break;
1982
+ case LLM_ARCH_REFACT:
1983
+ {
1984
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
1985
+ switch (hparams.n_layer) {
1986
+ case 32: model.type = e_model::MODEL_1B; break;
1987
+ default: model.type = e_model::MODEL_UNKNOWN;
1988
+ }
1989
+ } break;
1745
1990
  default: (void)0;
1746
- };
1991
+ }
1747
1992
 
1748
1993
  model.ftype = ml.ftype;
1749
-
1750
- hparams.n_ctx = n_ctx;
1751
- hparams.rope_freq_base = rope_freq_base;
1752
- hparams.rope_freq_scale = rope_freq_scale;
1753
1994
  }
1754
1995
 
1755
1996
  // TODO: This should probably be in llama.h
@@ -1770,20 +2011,18 @@ static void llm_load_vocab(
1770
2011
  throw std::runtime_error("cannot find tokenizer vocab in model file\n");
1771
2012
  }
1772
2013
 
2014
+ const float * scores = nullptr;
1773
2015
  const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
1774
- if (score_idx == -1) {
1775
- throw std::runtime_error("cannot find tokenizer scores in model file\n");
2016
+ if (score_idx != -1) {
2017
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1776
2018
  }
1777
2019
 
1778
- const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1779
-
2020
+ const int * toktypes = nullptr;
1780
2021
  const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
1781
- if (toktype_idx == -1) {
1782
- throw std::runtime_error("cannot find token type list in GGUF file\n");
2022
+ if (toktype_idx != -1) {
2023
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1783
2024
  }
1784
2025
 
1785
- const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1786
-
1787
2026
  // determine vocab type
1788
2027
  {
1789
2028
  std::string tokenizer_name;
@@ -1812,6 +2051,7 @@ static void llm_load_vocab(
1812
2051
 
1813
2052
  for (int i = 0; i < n_merges; i++) {
1814
2053
  const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
2054
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
1815
2055
 
1816
2056
  std::string first;
1817
2057
  std::string second;
@@ -1846,20 +2086,22 @@ static void llm_load_vocab(
1846
2086
 
1847
2087
  for (uint32_t i = 0; i < n_vocab; i++) {
1848
2088
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
2089
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
1849
2090
 
1850
2091
  vocab.token_to_id[word] = i;
1851
2092
 
1852
2093
  auto & token_data = vocab.id_to_token[i];
1853
2094
  token_data.text = std::move(word);
1854
- token_data.score = scores[i];
1855
- token_data.type = (llama_token_type) toktypes[i];
2095
+ token_data.score = scores ? scores[i] : 0.0f;
2096
+ token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
1856
2097
  }
2098
+ GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
1857
2099
 
1858
2100
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
1859
2101
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
1860
2102
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
1861
2103
  } else {
1862
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
2104
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
1863
2105
  }
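
Two robustness tweaks in vocab loading: tokenizer scores and token types become optional GGUF arrays (missing entries fall back to score 0.0f and LLAMA_TOKEN_TYPE_NORMAL), and for BPE vocabularies the linefeed token is found by tokenizing "\u010A", the byte-level BPE spelling of '\n' (byte 0x0A remapped into the printable range), instead of a raw newline. The optional-array fallback in isolation, as a plain sketch:

    #include <cstdio>
    #include <vector>

    int main() {
        const float * scores = nullptr;              // as if LLM_KV_TOKENIZER_SCORES were absent
        std::vector<float> loaded(4);
        for (size_t i = 0; i < loaded.size(); ++i) {
            loaded[i] = scores ? scores[i] : 0.0f;   // per-token default instead of throwing
        }
        printf("%.1f\n", loaded[3]);                 // 0.0
        return 0;
    }
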
1864
2106
 
1865
2107
  // special tokens
@@ -1875,31 +2117,30 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1875
2117
  const auto & vocab = model.vocab;
1876
2118
 
1877
2119
  // hparams
1878
- LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
1879
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
1880
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
1881
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1882
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
1883
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
1884
- LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1885
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
1886
- LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
1887
- LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1888
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
1889
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1890
- LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1891
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
1892
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
1893
- LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
1894
- LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1895
- LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1896
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
1897
- LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
1898
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
2120
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
2121
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
2122
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
2123
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
2124
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
2125
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
2126
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
2127
+ LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
2128
+ LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
2129
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
2130
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
2131
+ LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2132
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2133
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2134
+ LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2135
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2136
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
2137
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
2138
+ LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
2139
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
1899
2140
  if (ml.n_bytes < GB) {
1900
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2141
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1901
2142
  } else {
1902
- LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2143
+ LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1903
2144
  }
1904
2145
 
1905
2146
  // general kv
@@ -1917,13 +2158,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1917
2158
  static void llm_load_tensors(
1918
2159
  llama_model_loader & ml,
1919
2160
  llama_model & model,
1920
- int n_batch,
1921
2161
  int n_gpu_layers,
1922
2162
  int main_gpu,
1923
2163
  const float * tensor_split,
1924
- const bool mul_mat_q,
1925
- bool low_vram,
1926
- ggml_type memory_type,
1927
2164
  bool use_mlock,
1928
2165
  llama_progress_callback progress_callback,
1929
2166
  void * progress_callback_user_data) {
@@ -1962,11 +2199,9 @@ static void llm_load_tensors(
1962
2199
  }
1963
2200
 
1964
2201
  (void) main_gpu;
1965
- (void) mul_mat_q;
1966
- #if defined(GGML_USE_CUBLAS)
2202
+ #ifdef GGML_USE_CUBLAS
1967
2203
  LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
1968
2204
  ggml_cuda_set_main_device(main_gpu);
1969
- ggml_cuda_set_mul_mat_q(mul_mat_q);
1970
2205
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1971
2206
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1972
2207
  #elif defined(GGML_USE_CLBLAST)
@@ -1989,6 +2224,7 @@ static void llm_load_tensors(
1989
2224
  const auto tn = LLM_TN(model.arch);
1990
2225
  switch (model.arch) {
1991
2226
  case LLM_ARCH_LLAMA:
2227
+ case LLM_ARCH_REFACT:
1992
2228
  {
1993
2229
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
1994
2230
 
@@ -2001,9 +2237,9 @@ static void llm_load_tensors(
2001
2237
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2002
2238
  // on Windows however this is detrimental unless everything is on the GPU
2003
2239
  #ifndef _WIN32
2004
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2240
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2005
2241
  #else
2006
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2242
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2007
2243
  #endif // _WIN32
2008
2244
 
2009
2245
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2067,9 +2303,9 @@ static void llm_load_tensors(
2067
2303
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2068
2304
  // on Windows however this is detrimental unless everything is on the GPU
2069
2305
  #ifndef _WIN32
2070
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2306
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2071
2307
  #else
2072
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2308
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2073
2309
  #endif // _WIN32
2074
2310
 
2075
2311
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2137,9 +2373,9 @@ static void llm_load_tensors(
2137
2373
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2138
2374
  // on Windows however this is detrimental unless everything is on the GPU
2139
2375
  #ifndef _WIN32
2140
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2376
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2141
2377
  #else
2142
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2378
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2143
2379
  #endif // _WIN32
2144
2380
 
2145
2381
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2214,9 +2450,9 @@ static void llm_load_tensors(
2214
2450
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2215
2451
  // on Windows however this is detrimental unless everything is on the GPU
2216
2452
  #ifndef _WIN32
2217
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2453
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2218
2454
  #else
2219
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2455
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2220
2456
  #endif // _WIN32
2221
2457
 
2222
2458
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2281,27 +2517,19 @@ static void llm_load_tensors(
2281
2517
  } break;
2282
2518
  default:
2283
2519
  throw std::runtime_error("unknown architecture");
2284
- };
2520
+ }
2285
2521
  }
2286
2522
 
2287
2523
  ml.done_getting_tensors();
2288
2524
 
2289
2525
  // print memory requirements
2290
2526
  {
2291
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
2292
-
2293
2527
  // this is the total memory required to run the inference
2294
2528
  size_t mem_required =
2295
2529
  ctx_size +
2296
2530
  mmapped_size - vram_weights; // weights in VRAM not in memory
2297
2531
 
2298
- // this is the memory required by one llama_state
2299
- const size_t mem_required_state = scale*hparams.kv_size();
2300
-
2301
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
2302
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
2303
-
2304
- (void) n_batch;
2532
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2305
2533
 
2306
2534
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2307
2535
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2310,36 +2538,17 @@ static void llm_load_tensors(
2310
2538
  if (n_gpu_layers > (int) hparams.n_layer) {
2311
2539
  LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2312
2540
  }
2313
- size_t vram_kv_cache = 0;
2314
2541
 
2315
2542
  #ifdef GGML_USE_CUBLAS
2316
2543
  const int max_backend_supported_layers = hparams.n_layer + 3;
2317
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
2318
- if (n_gpu_layers > (int) hparams.n_layer + 1) {
2319
- if (low_vram) {
2320
- LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
2321
- } else {
2322
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
2323
- vram_kv_cache += hparams.kv_size() / 2;
2324
- }
2325
- }
2326
- if (n_gpu_layers > (int) hparams.n_layer + 2) {
2327
- if (low_vram) {
2328
- LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
2329
- } else {
2330
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
2331
- vram_kv_cache += hparams.kv_size() / 2;
2332
- }
2333
- }
2544
+ const int max_offloadable_layers = hparams.n_layer + 3;
2334
2545
  #elif defined(GGML_USE_CLBLAST)
2335
2546
  const int max_backend_supported_layers = hparams.n_layer + 1;
2336
2547
  const int max_offloadable_layers = hparams.n_layer + 1;
2337
2548
  #endif // GGML_USE_CUBLAS
2338
2549
 
2339
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
2340
- __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2341
- LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
2342
- __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
2550
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2551
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2343
2552
  #else
2344
2553
  (void) n_gpu_layers;
2345
2554
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2352,7 +2561,7 @@ static void llm_load_tensors(
2352
2561
  }
2353
2562
 
2354
2563
  (void) tensor_split;
2355
- #if defined(GGML_USE_CUBLAS)
2564
+ #ifdef GGML_USE_CUBLAS
2356
2565
  {
2357
2566
  ggml_cuda_set_tensor_split(tensor_split);
2358
2567
  }
@@ -2374,29 +2583,24 @@ static void llm_load_tensors(
2374
2583
  static bool llama_model_load(
2375
2584
  const std::string & fname,
2376
2585
  llama_model & model,
2377
- int n_ctx,
2378
- int n_batch,
2379
2586
  int n_gpu_layers,
2380
2587
  int main_gpu,
2381
2588
  const float * tensor_split,
2382
- const bool mul_mat_q,
2383
- float rope_freq_base,
2384
- float rope_freq_scale,
2385
- bool low_vram,
2386
- ggml_type memory_type,
2387
2589
  bool use_mmap,
2388
2590
  bool use_mlock,
2389
2591
  bool vocab_only,
2390
2592
  llama_progress_callback progress_callback,
2391
2593
  void *progress_callback_user_data) {
2392
2594
  try {
2393
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
2595
+ llama_model_loader ml(fname, use_mmap);
2596
+
2597
+ model.hparams.vocab_only = vocab_only;
2394
2598
 
2395
- llm_load_arch (*ml, model);
2396
- llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
2397
- llm_load_vocab (*ml, model);
2599
+ llm_load_arch (ml, model);
2600
+ llm_load_hparams(ml, model);
2601
+ llm_load_vocab (ml, model);
2398
2602
 
2399
- llm_load_print_meta(*ml, model);
2603
+ llm_load_print_meta(ml, model);
2400
2604
 
2401
2605
  if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2402
2606
  throw std::runtime_error("vocab size mismatch");
@@ -2408,8 +2612,8 @@ static bool llama_model_load(
2408
2612
  }
2409
2613
 
2410
2614
  llm_load_tensors(
2411
- *ml, model, n_batch, n_gpu_layers,
2412
- main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
2615
+ ml, model, n_gpu_layers,
2616
+ main_gpu, tensor_split,
2413
2617
  use_mlock, progress_callback, progress_callback_user_data);
2414
2618
  } catch (const std::exception & err) {
2415
2619
  LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2421,17 +2625,10 @@ static bool llama_model_load(
2421
2625
 
2422
2626
  static struct ggml_cgraph * llm_build_llama(
2423
2627
  llama_context & lctx,
2424
- const llama_token * tokens,
2425
- const float * embd,
2426
- int n_tokens,
2427
- int n_past) {
2428
-
2429
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2430
-
2431
- const int N = n_tokens;
2432
-
2628
+ const llama_batch & batch) {
2433
2629
  const auto & model = lctx.model;
2434
2630
  const auto & hparams = model.hparams;
2631
+ const auto & cparams = lctx.cparams;
2435
2632
 
2436
2633
  const auto & kv_self = lctx.kv_self;
2437
2634
 
@@ -2439,7 +2636,7 @@ static struct ggml_cgraph * llm_build_llama(
2439
2636
 
2440
2637
  const int64_t n_embd = hparams.n_embd;
2441
2638
  const int64_t n_layer = hparams.n_layer;
2442
- const int64_t n_ctx = hparams.n_ctx;
2639
+ const int64_t n_ctx = cparams.n_ctx;
2443
2640
  const int64_t n_head = hparams.n_head;
2444
2641
  const int64_t n_head_kv = hparams.n_head_kv;
2445
2642
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2447,12 +2644,20 @@ static struct ggml_cgraph * llm_build_llama(
2447
2644
 
2448
2645
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2449
2646
 
2450
- const float freq_base = hparams.rope_freq_base;
2451
- const float freq_scale = hparams.rope_freq_scale;
2647
+ const float freq_base = cparams.rope_freq_base;
2648
+ const float freq_scale = cparams.rope_freq_scale;
2452
2649
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2453
2650
 
2454
2651
  const int n_gpu_layers = model.n_gpu_layers;
2455
2652
 
2653
+ const int32_t n_tokens = batch.n_tokens;
2654
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
2655
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
2656
+
2657
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
2658
+
2659
+ //printf("n_kv = %d\n", n_kv);
2660
+
2456
2661
  auto & buf_compute = lctx.buf_compute;
2457
2662
 
2458
2663
  struct ggml_init_params params = {
@@ -2470,12 +2675,12 @@ static struct ggml_cgraph * llm_build_llama(
2470
2675
  struct ggml_tensor * cur;
2471
2676
  struct ggml_tensor * inpL;
2472
2677
 
2473
- if (tokens) {
2474
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2678
+ if (batch.token) {
2679
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2475
2680
 
2476
2681
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2477
2682
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2478
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2683
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2479
2684
  }
2480
2685
  ggml_set_name(inp_tokens, "inp_tokens");
2481
2686
 
@@ -2485,11 +2690,11 @@ static struct ggml_cgraph * llm_build_llama(
2485
2690
  GGML_ASSERT(false && "not implemented");
2486
2691
  #endif
2487
2692
 
2488
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2693
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2489
2694
 
2490
2695
  ggml_allocr_alloc(lctx.alloc, inpL);
2491
2696
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2492
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
2697
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2493
2698
  }
2494
2699
  }
2495
2700
 
@@ -2498,9 +2703,6 @@ static struct ggml_cgraph * llm_build_llama(
2498
2703
 
2499
2704
  // offload functions set the tensor output backend to GPU
2500
2705
  // tensors are GPU-accelerated if any input or the output has been offloaded
2501
- //
2502
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2503
- // in that case ggml_cuda_assign_buffers has no effect
2504
2706
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2505
2707
  offload_func_t offload_func_kq = llama_nop;
2506
2708
  offload_func_t offload_func_v = llama_nop;
@@ -2517,12 +2719,75 @@ static struct ggml_cgraph * llm_build_llama(
2517
2719
  }
2518
2720
  #endif // GGML_USE_CUBLAS
2519
2721
 
2722
+ // KQ_scale
2520
2723
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2724
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2521
2725
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2522
2726
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2523
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2727
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
2728
+ }
2729
+
2730
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2731
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2732
+ offload_func_kq(KQ_mask);
2733
+ ggml_set_name(KQ_mask, "KQ_mask");
2734
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
2735
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2736
+ float * data = (float *) KQ_mask->data;
2737
+ memset(data, 0, ggml_nbytes(KQ_mask));
2738
+
2739
+ for (int h = 0; h < 1; ++h) {
2740
+ for (int j = 0; j < n_tokens; ++j) {
2741
+ const llama_pos pos = batch.pos[j];
2742
+ const llama_seq_id seq_id = batch.seq_id[j];
2743
+
2744
+ for (int i = 0; i < n_kv; ++i) {
2745
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
2746
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
2747
+ }
2748
+ }
2749
+ }
2750
+ }
2751
+ }
2752
+
2753
+ // KQ_pos - contains the positions
2754
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2755
+ offload_func_kq(KQ_pos);
2756
+ ggml_set_name(KQ_pos, "KQ_pos");
2757
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
2758
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2759
+ int * data = (int *) KQ_pos->data;
2760
+ for (int i = 0; i < n_tokens; ++i) {
2761
+ data[i] = batch.pos[i];
2762
+ }
2763
+ }
2764
+
2765
+ // shift the entire K-cache if needed
2766
+ if (do_rope_shift) {
2767
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
2768
+ offload_func_kq(K_shift);
2769
+ ggml_set_name(K_shift, "K_shift");
2770
+ ggml_allocr_alloc(lctx.alloc, K_shift);
2771
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2772
+ int * data = (int *) K_shift->data;
2773
+ for (int i = 0; i < n_ctx; ++i) {
2774
+ data[i] = kv_self.cells[i].delta;
2775
+ }
2776
+ }
2777
+
2778
+ for (int il = 0; il < n_layer; ++il) {
2779
+ struct ggml_tensor * tmp =
2780
+ ggml_rope_custom_inplace(ctx0,
2781
+ ggml_view_3d(ctx0, kv_self.k,
2782
+ n_embd_head, n_head_kv, n_ctx,
2783
+ ggml_element_size(kv_self.k)*n_embd_head,
2784
+ ggml_element_size(kv_self.k)*n_embd_gqa,
2785
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
2786
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
2787
+ offload_func_kq(tmp);
2788
+ ggml_build_forward_expand(gf, tmp);
2789
+ }
2524
2790
  }
2525
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
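
Masking switches from ggml_diag_mask_inf with n_past to an explicit KQ_mask tensor that is added to the scaled attention scores before soft_max (see the KQ_masked / KQ_soft_max changes further down): entry (j, i) stays 0 when cached cell i belongs to query token j's sequence and is not in its future, and becomes -INF otherwise, which is what makes mixed-sequence batches and non-contiguous cache slots work. KQ_pos feeds the true per-token positions to RoPE, and the K_shift pass re-rotates cached keys after a sequence shift. A standalone sketch of the mask layout on a plain float buffer, mirroring the fill loop above outside of ggml:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // cached cells: two for sequence 0 (positions 0,1) and one for sequence 1 (position 0)
        const int n_kv = 3;
        const int cell_seq[n_kv] = { 0, 0, 1 };
        const int cell_pos[n_kv] = { 0, 1, 0 };

        // current batch: one new token for each sequence
        const int n_tokens = 2;
        const int tok_seq[n_tokens] = { 0, 1 };
        const int tok_pos[n_tokens] = { 2, 1 };

        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                if (cell_seq[i] != tok_seq[j] || cell_pos[i] > tok_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;      // this key is invisible to this query
                }
            }
        }

        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                printf("%5s ", std::isinf(mask[j*n_kv + i]) ? "-inf" : "0");
            }
            printf("\n");   // row 0:    0    0 -inf    row 1: -inf -inf    0
        }
        return 0;
    }
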
2526
2791
 
2527
2792
  for (int il = 0; il < n_layer; ++il) {
2528
2793
  ggml_format_name(inpL, "layer_inp_%d", il);
@@ -2560,33 +2825,33 @@ static struct ggml_cgraph * llm_build_llama(
2560
2825
  offload_func_kq(tmpq);
2561
2826
  ggml_set_name(tmpq, "tmpq");
2562
2827
 
2563
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2828
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2564
2829
  offload_func_kq(Kcur);
2565
2830
  ggml_set_name(Kcur, "Kcur");
2566
2831
 
2567
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2832
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2568
2833
  offload_func_kq(Qcur);
2569
2834
  ggml_set_name(Qcur, "Qcur");
2570
2835
 
2571
2836
  // store key and value to memory
2572
2837
  {
2573
- // compute the transposed [N, n_embd] V matrix
2838
+ // compute the transposed [n_tokens, n_embd] V matrix
2574
2839
 
2575
2840
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2576
2841
  offload_func_v(tmpv);
2577
2842
  ggml_set_name(tmpv, "tmpv");
2578
2843
 
2579
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2844
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2580
2845
  offload_func_v(Vcur);
2581
2846
  ggml_set_name(Vcur, "Vcur");
2582
2847
 
2583
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2848
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2584
2849
  offload_func_kq(k);
2585
2850
  ggml_set_name(k, "k");
2586
2851
 
2587
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2852
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2588
2853
  ( n_ctx)*ggml_element_size(kv_self.v),
2589
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2854
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2590
2855
  offload_func_v(v);
2591
2856
  ggml_set_name(v, "v");
2592
2857
 
@@ -2601,7 +2866,7 @@ static struct ggml_cgraph * llm_build_llama(
2601
2866
 
2602
2867
  struct ggml_tensor * K =
2603
2868
  ggml_view_3d(ctx0, kv_self.k,
2604
- n_embd_head, n_past + N, n_head_kv,
2869
+ n_embd_head, n_kv, n_head_kv,
2605
2870
  ggml_element_size(kv_self.k)*n_embd_gqa,
2606
2871
  ggml_element_size(kv_self.k)*n_embd_head,
2607
2872
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2614,25 +2879,25 @@ static struct ggml_cgraph * llm_build_llama(
2614
2879
  ggml_set_name(KQ, "KQ");
2615
2880
 
2616
2881
  // KQ_scaled = KQ / sqrt(n_embd_head)
2617
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2618
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2882
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
2883
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2619
2884
  offload_func_kq(KQ_scaled);
2620
2885
  ggml_set_name(KQ_scaled, "KQ_scaled");
2621
2886
 
2622
2887
  // KQ_masked = mask_past(KQ_scaled)
2623
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2888
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2624
2889
  offload_func_kq(KQ_masked);
2625
2890
  ggml_set_name(KQ_masked, "KQ_masked");
2626
2891
 
2627
2892
  // KQ = soft_max(KQ_masked)
2628
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2893
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2629
2894
  offload_func_v(KQ_soft_max);
2630
2895
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2631
2896
 
2632
2897
  // split cached V into n_head heads
2633
2898
  struct ggml_tensor * V =
2634
2899
  ggml_view_3d(ctx0, kv_self.v,
2635
- n_past + N, n_embd_head, n_head_kv,
2900
+ n_kv, n_embd_head, n_head_kv,
2636
2901
  ggml_element_size(kv_self.v)*n_ctx,
2637
2902
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2638
2903
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -2647,7 +2912,7 @@ static struct ggml_cgraph * llm_build_llama(
2647
2912
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2648
2913
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2649
2914
  // is there a better way?
2650
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2915
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
2651
2916
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2652
2917
  #endif
2653
2918
 
@@ -2656,10 +2921,8 @@ static struct ggml_cgraph * llm_build_llama(
2656
2921
  offload_func_v(KQV_merged);
2657
2922
  ggml_set_name(KQV_merged, "KQV_merged");
2658
2923
 
2659
- // cur = KQV_merged.contiguous().view(n_embd, N)
2660
- cur = ggml_cpy(ctx0,
2661
- KQV_merged,
2662
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2924
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
2925
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
2663
2926
  offload_func_v(cur);
2664
2927
  ggml_set_name(cur, "KQV_merged_contiguous");
2665
2928
 
@@ -2750,20 +3013,12 @@ static struct ggml_cgraph * llm_build_llama(
2750
3013
  return gf;
2751
3014
  }
2752
3015
 
2753
-
2754
3016
  static struct ggml_cgraph * llm_build_baichaun(
2755
3017
  llama_context & lctx,
2756
- const llama_token * tokens,
2757
- const float * embd,
2758
- int n_tokens,
2759
- int n_past) {
2760
-
2761
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2762
-
2763
- const int N = n_tokens;
2764
-
3018
+ const llama_batch & batch) {
2765
3019
  const auto & model = lctx.model;
2766
3020
  const auto & hparams = model.hparams;
3021
+ const auto & cparams = lctx.cparams;
2767
3022
 
2768
3023
  const auto & kv_self = lctx.kv_self;
2769
3024
 
@@ -2771,7 +3026,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2771
3026
 
2772
3027
  const int64_t n_embd = hparams.n_embd;
2773
3028
  const int64_t n_layer = hparams.n_layer;
2774
- const int64_t n_ctx = hparams.n_ctx;
3029
+ const int64_t n_ctx = cparams.n_ctx;
2775
3030
  const int64_t n_head = hparams.n_head;
2776
3031
  const int64_t n_head_kv = hparams.n_head_kv;
2777
3032
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2779,12 +3034,18 @@ static struct ggml_cgraph * llm_build_baichaun(
2779
3034
 
2780
3035
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2781
3036
 
2782
- const float freq_base = hparams.rope_freq_base;
2783
- const float freq_scale = hparams.rope_freq_scale;
3037
+ const float freq_base = cparams.rope_freq_base;
3038
+ const float freq_scale = cparams.rope_freq_scale;
2784
3039
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2785
3040
 
2786
3041
  const int n_gpu_layers = model.n_gpu_layers;
2787
3042
 
3043
+ const int32_t n_tokens = batch.n_tokens;
3044
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3045
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3046
+
3047
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3048
+
2788
3049
  auto & buf_compute = lctx.buf_compute;
2789
3050
 
2790
3051
  struct ggml_init_params params = {
@@ -2802,12 +3063,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2802
3063
  struct ggml_tensor * cur;
2803
3064
  struct ggml_tensor * inpL;
2804
3065
 
2805
- if (tokens) {
2806
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3066
+ if (batch.token) {
3067
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2807
3068
 
2808
3069
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2809
3070
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2810
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3071
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2811
3072
  }
2812
3073
  ggml_set_name(inp_tokens, "inp_tokens");
2813
3074
 
@@ -2817,11 +3078,11 @@ static struct ggml_cgraph * llm_build_baichaun(
2817
3078
  GGML_ASSERT(false && "not implemented");
2818
3079
  #endif
2819
3080
 
2820
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3081
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2821
3082
 
2822
3083
  ggml_allocr_alloc(lctx.alloc, inpL);
2823
3084
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2824
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3085
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2825
3086
  }
2826
3087
  }
2827
3088
 
@@ -2830,9 +3091,6 @@ static struct ggml_cgraph * llm_build_baichaun(
2830
3091
 
2831
3092
  // offload functions set the tensor output backend to GPU
2832
3093
  // tensors are GPU-accelerated if any input or the output has been offloaded
2833
- //
2834
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2835
- // in that case ggml_cuda_assign_buffers has no effect
2836
3094
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2837
3095
  offload_func_t offload_func_kq = llama_nop;
2838
3096
  offload_func_t offload_func_v = llama_nop;
@@ -2849,12 +3107,75 @@ static struct ggml_cgraph * llm_build_baichaun(
2849
3107
  }
2850
3108
  #endif // GGML_USE_CUBLAS
2851
3109
 
3110
+ // KQ_scale
2852
3111
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3112
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2853
3113
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2854
3114
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2855
3115
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2856
3116
  }
2857
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3117
+
3118
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3119
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3120
+ offload_func_kq(KQ_mask);
3121
+ ggml_set_name(KQ_mask, "KQ_mask");
3122
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3123
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3124
+ float * data = (float *) KQ_mask->data;
3125
+ memset(data, 0, ggml_nbytes(KQ_mask));
3126
+
3127
+ for (int h = 0; h < 1; ++h) {
3128
+ for (int j = 0; j < n_tokens; ++j) {
3129
+ const llama_pos pos = batch.pos[j];
3130
+ const llama_seq_id seq_id = batch.seq_id[j];
3131
+
3132
+ for (int i = 0; i < n_kv; ++i) {
3133
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3134
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3135
+ }
3136
+ }
3137
+ }
3138
+ }
3139
+ }
3140
+
3141
+ // KQ_pos - contains the positions
3142
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3143
+ offload_func_kq(KQ_pos);
3144
+ ggml_set_name(KQ_pos, "KQ_pos");
3145
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3146
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3147
+ int * data = (int *) KQ_pos->data;
3148
+ for (int i = 0; i < n_tokens; ++i) {
3149
+ data[i] = batch.pos[i];
3150
+ }
3151
+ }
3152
+
3153
+ // shift the entire K-cache if needed
3154
+ if (do_rope_shift) {
3155
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3156
+ offload_func_kq(K_shift);
3157
+ ggml_set_name(K_shift, "K_shift");
3158
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3159
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3160
+ int * data = (int *) K_shift->data;
3161
+ for (int i = 0; i < n_ctx; ++i) {
3162
+ data[i] = kv_self.cells[i].delta;
3163
+ }
3164
+ }
3165
+
3166
+ for (int il = 0; il < n_layer; ++il) {
3167
+ struct ggml_tensor * tmp =
3168
+ ggml_rope_custom_inplace(ctx0,
3169
+ ggml_view_3d(ctx0, kv_self.k,
3170
+ n_embd_head, n_head_kv, n_ctx,
3171
+ ggml_element_size(kv_self.k)*n_embd_head,
3172
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3173
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3174
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
3175
+ offload_func_kq(tmp);
3176
+ ggml_build_forward_expand(gf, tmp);
3177
+ }
3178
+ }
2858
3179
 
2859
3180
  for (int il = 0; il < n_layer; ++il) {
2860
3181
  ggml_format_name(inpL, "layer_inp_%d", il);
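The block added above replaces n_past-based causal masking with two new graph inputs: KQ_pos holds each batch token's absolute position, and KQ_mask is an additive [n_kv, n_tokens] bias derived from the KV-cache cells. A minimal CPU sketch of how that mask is filled, with a simplified stand-in for llama.cpp's cache cell type:

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <set>
#include <vector>

struct kv_cell {                                   // simplified stand-in for the cache cell
    int32_t pos = -1;
    std::set<int32_t> seq_ids;
    bool has_seq_id(int32_t id) const { return seq_ids.count(id) > 0; }
};

// mask is row-major [n_tokens x n_kv]; 0.0f keeps a key, -INFINITY removes it
// from the softmax once the graph does KQ_masked = KQ_scaled + KQ_mask.
static void fill_kq_mask(std::vector<float> & mask,
                         const std::vector<kv_cell> & cells,   // first n_kv cells of the cache
                         const std::vector<int32_t> & pos,     // batch.pos
                         const std::vector<int32_t> & seq_id,  // batch.seq_id
                         int n_kv, int n_tokens) {
    mask.assign((size_t) n_kv * n_tokens, 0.0f);
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            // token j may only attend to cells of its own sequence whose
            // position is not later than its own position
            if (!cells[i].has_seq_id(seq_id[j]) || cells[i].pos > pos[j]) {
                mask[(size_t) j*n_kv + i] = -INFINITY;
            }
        }
    }
}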
@@ -2896,12 +3217,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2896
3217
  struct ggml_tensor * Qcur;
2897
3218
  switch (model.type) {
2898
3219
  case MODEL_7B:
2899
- Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2900
- Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
3220
+ Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
3221
+ Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2901
3222
  break;
2902
3223
  case MODEL_13B:
2903
- Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2904
- Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
3224
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
3225
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
2905
3226
  break;
2906
3227
  default:
2907
3228
  GGML_ASSERT(false);
@@ -2915,23 +3236,23 @@ static struct ggml_cgraph * llm_build_baichaun(
2915
3236
 
2916
3237
  // store key and value to memory
2917
3238
  {
2918
- // compute the transposed [N, n_embd] V matrix
3239
+ // compute the transposed [n_tokens, n_embd] V matrix
2919
3240
 
2920
3241
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2921
3242
  offload_func_v(tmpv);
2922
3243
  ggml_set_name(tmpv, "tmpv");
2923
3244
 
2924
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
3245
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2925
3246
  offload_func_v(Vcur);
2926
3247
  ggml_set_name(Vcur, "Vcur");
2927
3248
 
2928
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3249
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2929
3250
  offload_func_kq(k);
2930
3251
  ggml_set_name(k, "k");
2931
3252
 
2932
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3253
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2933
3254
  ( n_ctx)*ggml_element_size(kv_self.v),
2934
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3255
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2935
3256
  offload_func_v(v);
2936
3257
  ggml_set_name(v, "v");
2937
3258
 
@@ -2946,7 +3267,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2946
3267
 
2947
3268
  struct ggml_tensor * K =
2948
3269
  ggml_view_3d(ctx0, kv_self.k,
2949
- n_embd_head, n_past + N, n_head_kv,
3270
+ n_embd_head, n_kv, n_head_kv,
2950
3271
  ggml_element_size(kv_self.k)*n_embd_gqa,
2951
3272
  ggml_element_size(kv_self.k)*n_embd_head,
2952
3273
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2959,8 +3280,8 @@ static struct ggml_cgraph * llm_build_baichaun(
2959
3280
  ggml_set_name(KQ, "KQ");
2960
3281
 
2961
3282
  // KQ_scaled = KQ / sqrt(n_embd_head)
2962
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2963
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3283
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3284
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2964
3285
  offload_func_kq(KQ_scaled);
2965
3286
  ggml_set_name(KQ_scaled, "KQ_scaled");
2966
3287
 
@@ -2969,58 +3290,44 @@ static struct ggml_cgraph * llm_build_baichaun(
2969
3290
 
2970
3291
  switch (model.type) {
2971
3292
  case MODEL_7B:
2972
- KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3293
+ KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2973
3294
  break;
2974
3295
  case MODEL_13B:
2975
- KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
3296
+ // TODO: replace with ggml_add()
3297
+ KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
2976
3298
  ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2977
- KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
3299
+ KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
2978
3300
  break;
2979
3301
  default:
2980
3302
  GGML_ASSERT(false);
2981
3303
  }
2982
- // KQ_masked = mask_past(KQ_scaled)
2983
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2984
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2985
- // offload_func_kq(KQ_masked);
2986
- // ggml_set_name(KQ_masked, "KQ_masked");
2987
3304
 
2988
3305
  // KQ = soft_max(KQ_masked)
2989
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3306
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2990
3307
  offload_func_v(KQ_soft_max);
2991
3308
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2992
3309
 
2993
3310
  // split cached V into n_head heads
2994
3311
  struct ggml_tensor * V =
2995
3312
  ggml_view_3d(ctx0, kv_self.v,
2996
- n_past + N, n_embd_head, n_head_kv,
3313
+ n_kv, n_embd_head, n_head_kv,
2997
3314
  ggml_element_size(kv_self.v)*n_ctx,
2998
3315
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2999
3316
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3000
3317
  offload_func_v(V);
3001
3318
  ggml_set_name(V, "V");
3002
3319
 
3003
- #if 1
3004
3320
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3005
3321
  offload_func_v(KQV);
3006
3322
  ggml_set_name(KQV, "KQV");
3007
- #else
3008
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3009
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3010
- // is there a better way?
3011
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
3012
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
3013
- #endif
3014
3323
 
3015
3324
  // KQV_merged = KQV.permute(0, 2, 1, 3)
3016
3325
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3017
3326
  offload_func_v(KQV_merged);
3018
3327
  ggml_set_name(KQV_merged, "KQV_merged");
3019
3328
 
3020
- // cur = KQV_merged.contiguous().view(n_embd, N)
3021
- cur = ggml_cpy(ctx0,
3022
- KQV_merged,
3023
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3329
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3330
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3024
3331
  offload_func_v(cur);
3025
3332
  ggml_set_name(cur, "KQV_merged_contiguous");
3026
3333
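With the mask expressed as an additive bias, both Baichuan attention paths above reduce to the same shape: scale, add a bias, softmax. The 7B path adds only KQ_mask; the 13B path first applies an ALiBi bias via ggml_alibi (max bias 8) and then adds KQ_mask on top. Schematically (my notation, not taken from the diff):

\[
\mathrm{Attn}(Q,K,V) \;=\; \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_{\mathrm{head}}}} + B + M\right) V,
\qquad
M_{ji} \;=\;
\begin{cases}
0 & \text{cell } i \text{ is in token } j\text{'s sequence and } \mathrm{pos}_i \le \mathrm{pos}_j,\\
-\infty & \text{otherwise,}
\end{cases}
\]

where \(B = 0\) for the 7B path and \(B\) is the ALiBi distance bias for the 13B path.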
 
@@ -3111,19 +3418,12 @@ static struct ggml_cgraph * llm_build_baichaun(
3111
3418
  return gf;
3112
3419
  }
3113
3420
 
3114
- static struct ggml_cgraph * llm_build_falcon(
3421
+ static struct ggml_cgraph * llm_build_refact(
3115
3422
  llama_context & lctx,
3116
- const llama_token * tokens,
3117
- const float * embd,
3118
- int n_tokens,
3119
- int n_past) {
3120
-
3121
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3122
-
3123
- const int N = n_tokens;
3124
-
3423
+ const llama_batch & batch) {
3125
3424
  const auto & model = lctx.model;
3126
3425
  const auto & hparams = model.hparams;
3426
+ const auto & cparams = lctx.cparams;
3127
3427
 
3128
3428
  const auto & kv_self = lctx.kv_self;
3129
3429
 
@@ -3131,20 +3431,22 @@ static struct ggml_cgraph * llm_build_falcon(
3131
3431
 
3132
3432
  const int64_t n_embd = hparams.n_embd;
3133
3433
  const int64_t n_layer = hparams.n_layer;
3134
- const int64_t n_ctx = hparams.n_ctx;
3434
+ const int64_t n_ctx = cparams.n_ctx;
3135
3435
  const int64_t n_head = hparams.n_head;
3136
3436
  const int64_t n_head_kv = hparams.n_head_kv;
3137
3437
  const int64_t n_embd_head = hparams.n_embd_head();
3138
3438
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
3139
3439
 
3140
- GGML_ASSERT(n_embd_head == hparams.n_rot);
3141
-
3142
- const float freq_base = hparams.rope_freq_base;
3143
- const float freq_scale = hparams.rope_freq_scale;
3144
- const float norm_eps = hparams.f_norm_eps;
3440
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
3145
3441
 
3146
3442
  const int n_gpu_layers = model.n_gpu_layers;
3147
3443
 
3444
+ const int32_t n_tokens = batch.n_tokens;
3445
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3446
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3447
+
3448
+ // printf("n_kv = %d\n", n_kv);
3449
+
3148
3450
  auto & buf_compute = lctx.buf_compute;
3149
3451
 
3150
3452
  struct ggml_init_params params = {
@@ -3162,12 +3464,12 @@ static struct ggml_cgraph * llm_build_falcon(
3162
3464
  struct ggml_tensor * cur;
3163
3465
  struct ggml_tensor * inpL;
3164
3466
 
3165
- if (tokens) {
3166
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3467
+ if (batch.token) {
3468
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3167
3469
 
3168
3470
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3169
3471
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3170
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3472
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3171
3473
  }
3172
3474
  ggml_set_name(inp_tokens, "inp_tokens");
3173
3475
 
@@ -3177,11 +3479,11 @@ static struct ggml_cgraph * llm_build_falcon(
3177
3479
  GGML_ASSERT(false && "not implemented");
3178
3480
  #endif
3179
3481
 
3180
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3482
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3181
3483
 
3182
3484
  ggml_allocr_alloc(lctx.alloc, inpL);
3183
3485
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3184
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3486
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3185
3487
  }
3186
3488
  }
3187
3489
 
@@ -3190,9 +3492,6 @@ static struct ggml_cgraph * llm_build_falcon(
3190
3492
 
3191
3493
  // offload functions set the tensor output backend to GPU
3192
3494
  // tensors are GPU-accelerated if any input or the output has been offloaded
3193
- //
3194
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
3195
- // in that case ggml_cuda_assign_buffers has no effect
3196
3495
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3197
3496
  offload_func_t offload_func_kq = llama_nop;
3198
3497
  offload_func_t offload_func_v = llama_nop;
@@ -3209,15 +3508,432 @@ static struct ggml_cgraph * llm_build_falcon(
3209
3508
  }
3210
3509
  #endif // GGML_USE_CUBLAS
3211
3510
 
3511
+ // KQ_scale
3212
3512
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3513
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3213
3514
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3214
3515
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3215
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3516
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
3216
3517
  }
3217
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3218
3518
 
3219
- for (int il = 0; il < n_layer; ++il) {
3220
- struct ggml_tensor * attn_norm;
3519
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3520
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3521
+ offload_func_kq(KQ_mask);
3522
+ ggml_set_name(KQ_mask, "KQ_mask");
3523
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3524
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3525
+ float * data = (float *) KQ_mask->data;
3526
+ memset(data, 0, ggml_nbytes(KQ_mask));
3527
+
3528
+ for (int h = 0; h < 1; ++h) {
3529
+ for (int j = 0; j < n_tokens; ++j) {
3530
+ const llama_pos pos = batch.pos[j];
3531
+ const llama_seq_id seq_id = batch.seq_id[j];
3532
+
3533
+ for (int i = 0; i < n_kv; ++i) {
3534
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3535
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3536
+ }
3537
+ }
3538
+ }
3539
+ }
3540
+ }
3541
+
3542
+ for (int il = 0; il < n_layer; ++il) {
3543
+ ggml_format_name(inpL, "layer_inp_%d", il);
3544
+
3545
+ offload_func_t offload_func = llama_nop;
3546
+
3547
+ #ifdef GGML_USE_CUBLAS
3548
+ if (il >= i_gpu_start) {
3549
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
3550
+ }
3551
+ #endif // GGML_USE_CUBLAS
3552
+
3553
+ struct ggml_tensor * inpSA = inpL;
3554
+
3555
+ // norm
3556
+ {
3557
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
3558
+ offload_func(cur);
3559
+ ggml_set_name(cur, "rms_norm_0");
3560
+
3561
+ // cur = cur*attn_norm(broadcasted)
3562
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
3563
+ offload_func(cur);
3564
+ ggml_set_name(cur, "attention_norm_0");
3565
+ }
3566
+
3567
+ // self-attention
3568
+ {
3569
+ // compute Q and K
3570
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3571
+ offload_func_kq(tmpk);
3572
+ ggml_set_name(tmpk, "tmpk");
3573
+
3574
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3575
+ offload_func_kq(tmpq);
3576
+ ggml_set_name(tmpq, "tmpq");
3577
+
3578
+ struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
3579
+ offload_func_kq(Kcur);
3580
+ ggml_set_name(Kcur, "Kcur");
3581
+
3582
+ struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
3583
+ offload_func_kq(Qcur);
3584
+ ggml_set_name(Qcur, "Qcur");
3585
+
3586
+ // store key and value to memory
3587
+ {
3588
+ // compute the transposed [n_tokens, n_embd] V matrix
3589
+
3590
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3591
+ offload_func_v(tmpv);
3592
+ ggml_set_name(tmpv, "tmpv");
3593
+
3594
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
3595
+ offload_func_v(Vcur);
3596
+ ggml_set_name(Vcur, "Vcur");
3597
+
3598
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3599
+ offload_func_kq(k);
3600
+ ggml_set_name(k, "k");
3601
+
3602
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3603
+ ( n_ctx)*ggml_element_size(kv_self.v),
3604
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3605
+ offload_func_v(v);
3606
+ ggml_set_name(v, "v");
3607
+
3608
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3609
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3610
+ }
3611
+
3612
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
3613
+ offload_func_kq(Q);
3614
+ ggml_set_name(Q, "Q");
3615
+
3616
+ struct ggml_tensor * K =
3617
+ ggml_view_3d(ctx0, kv_self.k,
3618
+ n_embd_head, n_kv, n_head_kv,
3619
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3620
+ ggml_element_size(kv_self.k)*n_embd_head,
3621
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3622
+ offload_func_kq(K);
3623
+ ggml_set_name(K, "K");
3624
+
3625
+ // K * Q
3626
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3627
+ offload_func_kq(KQ);
3628
+ ggml_set_name(KQ, "KQ");
3629
+
3630
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3631
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
3632
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3633
+ offload_func_kq(KQ_scaled);
3634
+ ggml_set_name(KQ_scaled, "KQ_scaled");
3635
+
3636
+ // KQ_masked = mask_past(KQ_scaled)
3637
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
3638
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
3639
+
3640
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
3641
+ offload_func_kq(KQ_masked);
3642
+ ggml_set_name(KQ_masked, "KQ_masked");
3643
+
3644
+ // KQ = soft_max(KQ_masked)
3645
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3646
+ offload_func_v(KQ_soft_max);
3647
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
3648
+
3649
+ // split cached V into n_head heads
3650
+ struct ggml_tensor * V =
3651
+ ggml_view_3d(ctx0, kv_self.v,
3652
+ n_kv, n_embd_head, n_head_kv,
3653
+ ggml_element_size(kv_self.v)*n_ctx,
3654
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3655
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3656
+ offload_func_v(V);
3657
+ ggml_set_name(V, "V");
3658
+
3659
+ #if 1
3660
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3661
+ offload_func_v(KQV);
3662
+ ggml_set_name(KQV, "KQV");
3663
+ #else
3664
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3665
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3666
+ // is there a better way?
3667
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
3668
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
3669
+ #endif
3670
+
3671
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3672
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3673
+ offload_func_v(KQV_merged);
3674
+ ggml_set_name(KQV_merged, "KQV_merged");
3675
+
3676
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3677
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3678
+ offload_func_v(cur);
3679
+ ggml_set_name(cur, "KQV_merged_contiguous");
3680
+
3681
+ // projection (no bias)
3682
+ cur = ggml_mul_mat(ctx0,
3683
+ model.layers[il].wo,
3684
+ cur);
3685
+ offload_func(cur);
3686
+ ggml_set_name(cur, "result_wo");
3687
+ }
3688
+
3689
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
3690
+ offload_func(inpFF);
3691
+ ggml_set_name(inpFF, "inpFF");
3692
+
3693
+ // feed-forward network
3694
+ {
3695
+ // norm
3696
+ {
3697
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
3698
+ offload_func(cur);
3699
+ ggml_set_name(cur, "rms_norm_1");
3700
+
3701
+ // cur = cur*ffn_norm(broadcasted)
3702
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
3703
+ offload_func(cur);
3704
+ ggml_set_name(cur, "ffn_norm");
3705
+ }
3706
+
3707
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
3708
+ model.layers[il].w3,
3709
+ cur);
3710
+ offload_func(tmp);
3711
+ ggml_set_name(tmp, "result_w3");
3712
+
3713
+ cur = ggml_mul_mat(ctx0,
3714
+ model.layers[il].w1,
3715
+ cur);
3716
+ offload_func(cur);
3717
+ ggml_set_name(cur, "result_w1");
3718
+
3719
+ // SILU activation
3720
+ cur = ggml_silu(ctx0, cur);
3721
+ offload_func(cur);
3722
+ ggml_set_name(cur, "silu");
3723
+
3724
+ cur = ggml_mul(ctx0, cur, tmp);
3725
+ offload_func(cur);
3726
+ ggml_set_name(cur, "silu_x_result_w3");
3727
+
3728
+ cur = ggml_mul_mat(ctx0,
3729
+ model.layers[il].w2,
3730
+ cur);
3731
+ offload_func(cur);
3732
+ ggml_set_name(cur, "result_w2");
3733
+ }
3734
+
3735
+ cur = ggml_add(ctx0, cur, inpFF);
3736
+ offload_func(cur);
3737
+ ggml_set_name(cur, "inpFF_+_result_w2");
3738
+
3739
+ // input for next layer
3740
+ inpL = cur;
3741
+ }
3742
+
3743
+ cur = inpL;
3744
+
3745
+ // norm
3746
+ {
3747
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
3748
+ offload_func_nr(cur);
3749
+ ggml_set_name(cur, "rms_norm_2");
3750
+
3751
+ // cur = cur*norm(broadcasted)
3752
+ cur = ggml_mul(ctx0, cur, model.output_norm);
3753
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
3754
+ ggml_set_name(cur, "result_norm");
3755
+ }
3756
+
3757
+ // lm_head
3758
+ cur = ggml_mul_mat(ctx0, model.output, cur);
3759
+ ggml_set_name(cur, "result_output");
3760
+
3761
+ ggml_build_forward_expand(gf, cur);
3762
+
3763
+ ggml_free(ctx0);
3764
+
3765
+ return gf;
3766
+ }
3767
+
3768
+ static struct ggml_cgraph * llm_build_falcon(
3769
+ llama_context & lctx,
3770
+ const llama_batch & batch) {
3771
+ const auto & model = lctx.model;
3772
+ const auto & hparams = model.hparams;
3773
+ const auto & cparams = lctx.cparams;
3774
+
3775
+ const auto & kv_self = lctx.kv_self;
3776
+
3777
+ GGML_ASSERT(!!kv_self.ctx);
3778
+
3779
+ const int64_t n_embd = hparams.n_embd;
3780
+ const int64_t n_layer = hparams.n_layer;
3781
+ const int64_t n_ctx = cparams.n_ctx;
3782
+ const int64_t n_head = hparams.n_head;
3783
+ const int64_t n_head_kv = hparams.n_head_kv;
3784
+ const int64_t n_embd_head = hparams.n_embd_head();
3785
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
3786
+
3787
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
3788
+
3789
+ const float freq_base = cparams.rope_freq_base;
3790
+ const float freq_scale = cparams.rope_freq_scale;
3791
+ const float norm_eps = hparams.f_norm_eps;
3792
+
3793
+ const int n_gpu_layers = model.n_gpu_layers;
3794
+
3795
+ const int32_t n_tokens = batch.n_tokens;
3796
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3797
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3798
+
3799
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3800
+
3801
+ //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
3802
+ // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
3803
+
3804
+ auto & buf_compute = lctx.buf_compute;
3805
+
3806
+ struct ggml_init_params params = {
3807
+ /*.mem_size =*/ buf_compute.size,
3808
+ /*.mem_buffer =*/ buf_compute.data,
3809
+ /*.no_alloc =*/ false,
3810
+ };
3811
+
3812
+ params.no_alloc = true;
3813
+
3814
+ struct ggml_context * ctx0 = ggml_init(params);
3815
+
3816
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
3817
+
3818
+ struct ggml_tensor * cur;
3819
+ struct ggml_tensor * inpL;
3820
+
3821
+ if (batch.token) {
3822
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3823
+
3824
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
3825
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3826
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3827
+ }
3828
+ ggml_set_name(inp_tokens, "inp_tokens");
3829
+
3830
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3831
+ } else {
3832
+ #ifdef GGML_USE_MPI
3833
+ GGML_ASSERT(false && "not implemented");
3834
+ #endif
3835
+
3836
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3837
+
3838
+ ggml_allocr_alloc(lctx.alloc, inpL);
3839
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3840
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3841
+ }
3842
+ }
3843
+
3844
+ const int i_gpu_start = n_layer - n_gpu_layers;
3845
+ (void) i_gpu_start;
3846
+
3847
+ // offload functions set the tensor output backend to GPU
3848
+ // tensors are GPU-accelerated if any input or the output has been offloaded
3849
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3850
+ offload_func_t offload_func_kq = llama_nop;
3851
+ offload_func_t offload_func_v = llama_nop;
3852
+
3853
+ #ifdef GGML_USE_CUBLAS
3854
+ if (n_gpu_layers > n_layer) {
3855
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
3856
+ }
3857
+ if (n_gpu_layers > n_layer + 1) {
3858
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
3859
+ }
3860
+ if (n_gpu_layers > n_layer + 2) {
3861
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
3862
+ }
3863
+ #endif // GGML_USE_CUBLAS
3864
+
3865
+ // KQ_scale
3866
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3867
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3868
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
3869
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3870
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3871
+ }
3872
+
3873
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3874
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3875
+ offload_func_kq(KQ_mask);
3876
+ ggml_set_name(KQ_mask, "KQ_mask");
3877
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3878
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3879
+ float * data = (float *) KQ_mask->data;
3880
+ memset(data, 0, ggml_nbytes(KQ_mask));
3881
+
3882
+ for (int h = 0; h < 1; ++h) {
3883
+ for (int j = 0; j < n_tokens; ++j) {
3884
+ const llama_pos pos = batch.pos[j];
3885
+ const llama_seq_id seq_id = batch.seq_id[j];
3886
+
3887
+ for (int i = 0; i < n_kv; ++i) {
3888
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3889
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3890
+ }
3891
+ }
3892
+ }
3893
+ }
3894
+ }
3895
+
3896
+ // KQ_pos - contains the positions
3897
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3898
+ offload_func_kq(KQ_pos);
3899
+ ggml_set_name(KQ_pos, "KQ_pos");
3900
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3901
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3902
+ int * data = (int *) KQ_pos->data;
3903
+ for (int i = 0; i < n_tokens; ++i) {
3904
+ data[i] = batch.pos[i];
3905
+ }
3906
+ }
3907
+
3908
+ // shift the entire K-cache if needed
3909
+ if (do_rope_shift) {
3910
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3911
+ offload_func_kq(K_shift);
3912
+ ggml_set_name(K_shift, "K_shift");
3913
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3914
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3915
+ int * data = (int *) K_shift->data;
3916
+ for (int i = 0; i < n_ctx; ++i) {
3917
+ data[i] = kv_self.cells[i].delta;
3918
+ }
3919
+ }
3920
+
3921
+ for (int il = 0; il < n_layer; ++il) {
3922
+ struct ggml_tensor * tmp =
3923
+ ggml_rope_custom_inplace(ctx0,
3924
+ ggml_view_3d(ctx0, kv_self.k,
3925
+ n_embd_head, n_head_kv, n_ctx,
3926
+ ggml_element_size(kv_self.k)*n_embd_head,
3927
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3928
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3929
+ K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
3930
+ offload_func_kq(tmp);
3931
+ ggml_build_forward_expand(gf, tmp);
3932
+ }
3933
+ }
3934
+
3935
+ for (int il = 0; il < n_layer; ++il) {
3936
+ struct ggml_tensor * attn_norm;
3221
3937
 
3222
3938
  offload_func_t offload_func = llama_nop;
3223
3939
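Both the Baichuan and Falcon builders now carry the do_rope_shift block shown above: when cells have been moved (kv_self.has_shift), each layer's cached keys are re-rotated by the per-cell delta collected in K_shift, via ggml_rope_custom_inplace (rope mode 0 for the LLaMA-style models, mode 2, i.e. neox, for Falcon). Since RoPE is a rotation, shifting a cached key by delta positions is one extra rotation per (even, odd) pair; the per-pair frequency below is an assumed simplification of ggml's rope, so treat this as a sketch rather than the kernel:

#include <cmath>
#include <utility>

// Rotate one (x0, x1) pair of an already-RoPE'd key by the position delta of
// its cache cell. theta_k is the per-pair angular frequency, roughly
// freq_scale * powf(freq_base, -2.0f*k/n_embd_head) for pair index k (assumed form).
static std::pair<float, float> rope_shift_pair(float x0, float x1, int delta, float theta_k) {
    const float angle = (float) delta * theta_k;
    const float c = std::cos(angle);
    const float s = std::sin(angle);
    return { x0*c - x1*s, x0*s + x1*c };
}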
 
@@ -3271,45 +3987,45 @@ static struct ggml_cgraph * llm_build_falcon(
3271
3987
  // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
3272
3988
  // non-contiguous views is added for the rope operator
3273
3989
  struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
3274
- ctx0, cur, n_embd_head, n_head, N,
3990
+ ctx0, cur, n_embd_head, n_head, n_tokens,
3275
3991
  wsize * n_embd_head,
3276
3992
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3277
3993
  0));
3278
3994
  offload_func_kq(tmpq);
3279
3995
 
3280
3996
  struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3281
- ctx0, cur, n_embd_head, n_head_kv, N,
3997
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3282
3998
  wsize * n_embd_head,
3283
3999
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3284
4000
  wsize * n_embd_head * n_head));
3285
4001
  offload_func_kq(tmpk);
3286
4002
 
3287
4003
  struct ggml_tensor * tmpv = ggml_view_3d(
3288
- ctx0, cur, n_embd_head, n_head_kv, N,
4004
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3289
4005
  wsize * n_embd_head,
3290
4006
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3291
4007
  wsize * n_embd_head * (n_head + n_head_kv));
3292
4008
  offload_func_v(tmpv);
3293
4009
 
3294
4010
  // using mode = 2 for neox mode
3295
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
4011
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3296
4012
  offload_func_kq(Qcur);
3297
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
4013
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3298
4014
  offload_func_kq(Kcur);
3299
4015
 
3300
4016
  {
3301
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
4017
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3302
4018
  offload_func_v(Vcur);
3303
4019
  offload_func_v(Vcur->src[0]->src[0]);
3304
4020
  ggml_set_name(Vcur, "Vcur");
3305
4021
 
3306
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
4022
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3307
4023
  offload_func_kq(k);
3308
4024
  ggml_set_name(k, "k");
3309
4025
 
3310
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
4026
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3311
4027
  ( n_ctx)*ggml_element_size(kv_self.v),
3312
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
4028
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3313
4029
  offload_func_v(v);
3314
4030
 
3315
4031
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -3322,7 +4038,7 @@ static struct ggml_cgraph * llm_build_falcon(
3322
4038
 
3323
4039
  struct ggml_tensor * K =
3324
4040
  ggml_view_3d(ctx0, kv_self.k,
3325
- n_embd_head, n_past + N, n_head_kv,
4041
+ n_embd_head, n_kv, n_head_kv,
3326
4042
  ggml_element_size(kv_self.k)*n_embd_gqa,
3327
4043
  ggml_element_size(kv_self.k)*n_embd_head,
3328
4044
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3333,21 +4049,21 @@ static struct ggml_cgraph * llm_build_falcon(
3333
4049
  offload_func_kq(KQ);
3334
4050
  ggml_set_name(KQ, "KQ");
3335
4051
 
3336
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
4052
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3337
4053
  offload_func_kq(KQ_scaled);
3338
4054
  ggml_set_name(KQ_scaled, "KQ_scaled");
3339
4055
 
3340
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
4056
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3341
4057
  offload_func_kq(KQ_masked);
3342
4058
  ggml_set_name(KQ_masked, "KQ_masked");
3343
4059
 
3344
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
4060
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3345
4061
  offload_func_v(KQ_soft_max);
3346
4062
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3347
4063
 
3348
4064
  struct ggml_tensor * V =
3349
4065
  ggml_view_3d(ctx0, kv_self.v,
3350
- n_past + N, n_embd_head, n_head_kv,
4066
+ n_kv, n_embd_head, n_head_kv,
3351
4067
  ggml_element_size(kv_self.v)*n_ctx,
3352
4068
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3353
4069
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3362,7 +4078,7 @@ static struct ggml_cgraph * llm_build_falcon(
3362
4078
  offload_func_v(KQV_merged);
3363
4079
  ggml_set_name(KQV_merged, "KQV_merged");
3364
4080
 
3365
- cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
4081
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3366
4082
  offload_func_v(cur);
3367
4083
  ggml_set_name(cur, "KQV_merged_contiguous");
3368
4084
 
@@ -3420,17 +4136,10 @@ static struct ggml_cgraph * llm_build_falcon(
3420
4136
 
3421
4137
  static struct ggml_cgraph * llm_build_starcoder(
3422
4138
  llama_context & lctx,
3423
- const llama_token * tokens,
3424
- const float * embd,
3425
- int n_tokens,
3426
- int n_past) {
3427
-
3428
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3429
-
3430
- const int N = n_tokens;
3431
-
4139
+ const llama_batch & batch) {
3432
4140
  const auto & model = lctx.model;
3433
4141
  const auto & hparams = model.hparams;
4142
+ const auto & cparams = lctx.cparams;
3434
4143
 
3435
4144
  const auto & kv_self = lctx.kv_self;
3436
4145
 
@@ -3438,7 +4147,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3438
4147
 
3439
4148
  const int64_t n_embd = hparams.n_embd;
3440
4149
  const int64_t n_layer = hparams.n_layer;
3441
- const int64_t n_ctx = hparams.n_ctx;
4150
+ const int64_t n_ctx = cparams.n_ctx;
3442
4151
  const int64_t n_head = hparams.n_head;
3443
4152
  const int64_t n_head_kv = hparams.n_head_kv;
3444
4153
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -3446,7 +4155,11 @@ static struct ggml_cgraph * llm_build_starcoder(
3446
4155
 
3447
4156
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3448
4157
 
3449
- const float norm_eps = hparams.f_norm_eps;
4158
+ const float norm_eps = hparams.f_norm_eps;
4159
+
4160
+ const int32_t n_tokens = batch.n_tokens;
4161
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4162
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3450
4163
 
3451
4164
  auto & buf_compute = lctx.buf_compute;
3452
4165
 
@@ -3467,12 +4180,12 @@ static struct ggml_cgraph * llm_build_starcoder(
3467
4180
  struct ggml_tensor * position;
3468
4181
  struct ggml_tensor * inpL;
3469
4182
 
3470
- if (tokens) {
3471
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
4183
+ if (batch.token) {
4184
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3472
4185
 
3473
4186
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3474
4187
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3475
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
4188
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3476
4189
  }
3477
4190
  ggml_set_name(inp_tokens, "inp_tokens");
3478
4191
 
@@ -3482,21 +4195,21 @@ static struct ggml_cgraph * llm_build_starcoder(
3482
4195
  GGML_ASSERT(false && "not implemented");
3483
4196
  #endif
3484
4197
 
3485
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
4198
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3486
4199
 
3487
4200
  ggml_allocr_alloc(lctx.alloc, token);
3488
4201
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3489
- memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
4202
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
3490
4203
  }
3491
4204
  }
3492
4205
 
3493
4206
  {
3494
4207
  // Compute position embeddings.
3495
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
4208
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3496
4209
  ggml_allocr_alloc(lctx.alloc, inp_positions);
3497
4210
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3498
- for (int i = 0; i < N; ++i) {
3499
- ((int32_t *) inp_positions->data)[i] = n_past + i;
4211
+ for (int i = 0; i < n_tokens; ++i) {
4212
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
3500
4213
  }
3501
4214
  }
3502
4215
  ggml_set_name(inp_positions, "inp_positions");
@@ -3504,12 +4217,35 @@ static struct ggml_cgraph * llm_build_starcoder(
3504
4217
  position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3505
4218
  }
3506
4219
 
4220
+ // KQ_scale
3507
4221
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4222
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3508
4223
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3509
4224
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3510
4225
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3511
4226
  }
3512
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4227
+
4228
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4229
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4230
+ ggml_set_name(KQ_mask, "KQ_mask");
4231
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4232
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4233
+ float * data = (float *) KQ_mask->data;
4234
+ memset(data, 0, ggml_nbytes(KQ_mask));
4235
+
4236
+ for (int h = 0; h < 1; ++h) {
4237
+ for (int j = 0; j < n_tokens; ++j) {
4238
+ const llama_pos pos = batch.pos[j];
4239
+ const llama_seq_id seq_id = batch.seq_id[j];
4240
+
4241
+ for (int i = 0; i < n_kv; ++i) {
4242
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4243
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4244
+ }
4245
+ }
4246
+ }
4247
+ }
4248
+ }
3513
4249
 
3514
4250
  inpL = ggml_add(ctx0, token, position);
3515
4251
  ggml_set_name(inpL, "inpL");
@@ -3525,23 +4261,23 @@ static struct ggml_cgraph * llm_build_starcoder(
3525
4261
  // Self Attention
3526
4262
  cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
3527
4263
 
3528
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
3529
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
3530
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
4264
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
4265
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
4266
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3531
4267
 
3532
4268
  struct ggml_tensor * Qcur = tmpq;
3533
4269
  struct ggml_tensor * Kcur = tmpk;
3534
4270
 
3535
4271
  {
3536
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
4272
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3537
4273
  ggml_set_name(Vcur, "Vcur");
3538
4274
 
3539
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
4275
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3540
4276
  ggml_set_name(k, "k");
3541
4277
 
3542
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
4278
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3543
4279
  ( n_ctx)*ggml_element_size(kv_self.v),
3544
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
4280
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3545
4281
 
3546
4282
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3547
4283
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3551,13 +4287,13 @@ static struct ggml_cgraph * llm_build_starcoder(
3551
4287
  ggml_permute(ctx0,
3552
4288
  ggml_cpy(ctx0,
3553
4289
  Qcur,
3554
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
4290
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
3555
4291
  0, 2, 1, 3);
3556
4292
  ggml_set_name(Q, "Q");
3557
4293
 
3558
4294
  struct ggml_tensor * K =
3559
4295
  ggml_view_3d(ctx0, kv_self.k,
3560
- n_embd_head, n_past + N, n_head_kv,
4296
+ n_embd_head, n_kv, n_head_kv,
3561
4297
  ggml_element_size(kv_self.k)*n_embd_gqa,
3562
4298
  ggml_element_size(kv_self.k)*n_embd_head,
3563
4299
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3568,12 +4304,12 @@ static struct ggml_cgraph * llm_build_starcoder(
3568
4304
  ggml_set_name(KQ, "KQ");
3569
4305
 
3570
4306
  // KQ_scaled = KQ / sqrt(n_embd_head)
3571
- // KQ_scaled shape [n_past + N, N, n_head, 1]
4307
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3572
4308
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3573
4309
  ggml_set_name(KQ_scaled, "KQ_scaled");
3574
4310
 
3575
4311
  // KQ_masked = mask_past(KQ_scaled)
3576
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
4312
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3577
4313
  ggml_set_name(KQ_masked, "KQ_masked");
3578
4314
 
3579
4315
  // KQ = soft_max(KQ_masked)
@@ -3583,7 +4319,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3583
4319
  // split cached V into n_head heads
3584
4320
  struct ggml_tensor * V =
3585
4321
  ggml_view_3d(ctx0, kv_self.v,
3586
- n_past + N, n_embd_head, n_head_kv,
4322
+ n_kv, n_embd_head, n_head_kv,
3587
4323
  ggml_element_size(kv_self.v)*n_ctx,
3588
4324
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3589
4325
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3596,10 +4332,8 @@ static struct ggml_cgraph * llm_build_starcoder(
3596
4332
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3597
4333
  ggml_set_name(KQV_merged, "KQV_merged");
3598
4334
 
3599
- // cur = KQV_merged.contiguous().view(n_embd, N)
3600
- cur = ggml_cpy(ctx0,
3601
- KQV_merged,
3602
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
4335
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
4336
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3603
4337
  ggml_set_name(cur, "KQV_merged_contiguous");
3604
4338
  }
3605
4339
 
@@ -3649,10 +4383,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3649
4383
 
3650
4384
  static struct ggml_cgraph * llama_build_graph(
3651
4385
  llama_context & lctx,
3652
- const llama_token * tokens,
3653
- const float * embd,
3654
- int n_tokens,
3655
- int n_past) {
4386
+ const llama_batch & batch) {
3656
4387
  const auto & model = lctx.model;
3657
4388
 
3658
4389
  struct ggml_cgraph * result = NULL;
@@ -3660,76 +4391,121 @@ static struct ggml_cgraph * llama_build_graph(
3660
4391
  switch (model.arch) {
3661
4392
  case LLM_ARCH_LLAMA:
3662
4393
  {
3663
- result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
4394
+ result = llm_build_llama(lctx, batch);
3664
4395
  } break;
3665
4396
  case LLM_ARCH_BAICHUAN:
3666
4397
  {
3667
- result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
4398
+ result = llm_build_baichaun(lctx, batch);
3668
4399
  } break;
3669
4400
  case LLM_ARCH_FALCON:
3670
4401
  {
3671
- result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
4402
+ result = llm_build_falcon(lctx, batch);
3672
4403
  } break;
3673
4404
  case LLM_ARCH_STARCODER:
3674
4405
  {
3675
- result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
4406
+ result = llm_build_starcoder(lctx, batch);
4407
+ } break;
4408
+ case LLM_ARCH_REFACT:
4409
+ {
4410
+ result = llm_build_refact(lctx, batch);
3676
4411
  } break;
3677
4412
  default:
3678
4413
  GGML_ASSERT(false);
3679
- };
4414
+ }
3680
4415
 
3681
4416
  return result;
3682
4417
  }
3683
4418
 
3684
- // evaluate the transformer
4419
+ // decode a batch of tokens by evaluating the transformer
3685
4420
  //
3686
4421
  // - lctx: llama context
3687
- // - tokens: new batch of tokens to process
3688
- // - embd embeddings input
3689
- // - n_tokens number of tokens
3690
- // - n_past: the context size so far
4422
+ // - batch: batch to evaluate
3691
4423
  // - n_threads: number of threads to use
3692
4424
  //
3693
- static bool llama_eval_internal(
4425
+ // return 0 on success
4426
+ // return positive int on warning
4427
+ // return negative int on error
4428
+ //
4429
+ static int llama_decode_internal(
3694
4430
  llama_context & lctx,
3695
- const llama_token * tokens,
3696
- const float * embd,
3697
- int n_tokens,
3698
- int n_past,
3699
- int n_threads,
3700
- const char * cgraph_fname) {
4431
+ llama_batch batch) {
4432
+ const uint32_t n_tokens = batch.n_tokens;
4433
+
4434
+ if (n_tokens == 0) {
4435
+ LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
4436
+ return -1;
4437
+ }
4438
+
4439
+ const auto & model = lctx.model;
4440
+ const auto & hparams = model.hparams;
4441
+ const auto & cparams = lctx.cparams;
3701
4442
 
3702
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
4443
+ const auto n_batch = cparams.n_batch;
3703
4444
 
3704
- GGML_ASSERT(n_tokens > 0);
3705
- GGML_ASSERT(n_past >= 0);
3706
- // TODO: keep the values of n_batch and n_ctx
3707
- // GGML_ASSERT(n_tokens <= n_batch);
3708
- // GGML_ASSERT(n_past + n_tokens <= n_ctx);
4445
+ GGML_ASSERT(n_tokens <= n_batch);
4446
+
4447
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
4448
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
3709
4449
 
3710
4450
  const int64_t t_start_us = ggml_time_us();
3711
4451
 
3712
4452
  #ifdef GGML_USE_MPI
3713
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
4453
+ // TODO: needs fix after #3228
4454
+ GGML_ASSERT(false && "not implemented");
4455
+ //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
3714
4456
  #endif
3715
4457
 
3716
4458
  GGML_ASSERT(n_threads > 0);
3717
4459
 
3718
- const int N = n_tokens;
3719
-
3720
- const auto & model = lctx.model;
3721
- const auto & hparams = model.hparams;
3722
-
3723
- const auto & kv_self = lctx.kv_self;
4460
+ auto & kv_self = lctx.kv_self;
3724
4461
 
3725
4462
  GGML_ASSERT(!!kv_self.ctx);
3726
4463
 
3727
4464
  const int64_t n_embd = hparams.n_embd;
3728
4465
  const int64_t n_vocab = hparams.n_vocab;
3729
4466
 
4467
+ // helpers for smoother batch API transition
4468
+ // after deprecating the llama_eval calls, these will be removed
4469
+ std::vector<llama_pos> pos;
4470
+ std::vector<llama_seq_id> seq_id;
4471
+
4472
+ if (batch.pos == nullptr) {
4473
+ pos.resize(n_tokens);
4474
+ for (uint32_t i = 0; i < n_tokens; i++) {
4475
+ pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
4476
+ }
4477
+
4478
+ batch.pos = pos.data();
4479
+ }
4480
+
4481
+ if (batch.seq_id == nullptr) {
4482
+ seq_id.resize(n_tokens);
4483
+ for (uint32_t i = 0; i < n_tokens; i++) {
4484
+ seq_id[i] = batch.all_seq_id;
4485
+ }
4486
+
4487
+ batch.seq_id = seq_id.data();
4488
+ }
4489
+
4490
+ // we always start to search for a free slot from the start of the cache
4491
+ // TODO: better strategies can be implemented
4492
+ kv_self.head = 0;
4493
+
4494
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
4495
+ return 1;
4496
+ }
4497
+
4498
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
4499
+ // after enough generations, the benefit from this heuristic disappears
4500
+ // if we start defragmenting the cache, the benefit from this will be more important
4501
+ //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
4502
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
4503
+
4504
+ //printf("kv_self.n = %d\n", kv_self.n);
4505
+
3730
4506
  ggml_allocr_reset(lctx.alloc);
3731
4507
 
3732
- ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
4508
+ ggml_cgraph * gf = llama_build_graph(lctx, batch);
3733
4509
 
3734
4510
  ggml_allocr_alloc_graph(lctx.alloc, gf);
3735
4511
 
@@ -3738,6 +4514,7 @@ static bool llama_eval_internal(
3738
4514
  ggml_tensor * node = gf->leafs[i];
3739
4515
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
3740
4516
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
4517
+ ggml_cuda_copy_to_device(node);
3741
4518
  }
3742
4519
  }
3743
4520
 
@@ -3747,6 +4524,8 @@ static bool llama_eval_internal(
3747
4524
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
3748
4525
  }
3749
4526
  }
4527
+
4528
+ ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
3750
4529
  #endif
3751
4530
 
3752
4531
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3756,14 +4535,15 @@ static bool llama_eval_internal(
3756
4535
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
3757
4536
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
3758
4537
  // with the BLAS calls. need a better solution
3759
- if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
4538
+ if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
3760
4539
  n_threads = std::min(4, n_threads);
3761
4540
  }
3762
4541
 
3763
4542
  // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
3764
4543
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
3765
4544
  model.arch == LLM_ARCH_BAICHUAN ||
3766
- model.arch == LLM_ARCH_FALCON;
4545
+ model.arch == LLM_ARCH_FALCON ||
4546
+ model.arch == LLM_ARCH_REFACT;
3767
4547
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
3768
4548
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
3769
4549
  n_threads = 1;
@@ -3795,12 +4575,9 @@ static bool llama_eval_internal(
3795
4575
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
3796
4576
  #endif
3797
4577
 
3798
- // update kv token count
3799
- lctx.kv_self.n = n_past + N;
3800
-
3801
- if (cgraph_fname) {
3802
- ggml_graph_export(gf, cgraph_fname);
3803
- }
4578
+ // update the kv ring buffer
4579
+ lctx.kv_self.head += n_tokens;
4580
+ lctx.kv_self.has_shift = false;
3804
4581
 
3805
4582
  #ifdef GGML_PERF
3806
4583
  // print timing information per ggml operation (for debugging purposes)
@@ -3817,13 +4594,20 @@ static bool llama_eval_internal(
3817
4594
  {
3818
4595
  auto & logits_out = lctx.logits;
3819
4596
 
3820
- if (lctx.logits_all) {
3821
- logits_out.resize(n_vocab * N);
3822
- memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
4597
+ if (batch.logits) {
4598
+ logits_out.resize(n_vocab * n_tokens);
4599
+ for (uint32_t i = 0; i < n_tokens; i++) {
4600
+ if (batch.logits[i] == 0) {
4601
+ continue;
4602
+ }
4603
+ memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
4604
+ }
4605
+ } else if (lctx.logits_all) {
4606
+ logits_out.resize(n_vocab * n_tokens);
4607
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
3823
4608
  } else {
3824
- // return result for just the last token
3825
4609
  logits_out.resize(n_vocab);
3826
- memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
4610
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
3827
4611
  }
3828
4612
  }
3829
4613
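The hunk above makes logits output selective: when batch.logits is provided, only the rows flagged non-zero are copied out of the graph result. A standalone sketch of that gather (placeholder names, not llama.cpp API):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// res points at the [n_vocab x n_tokens] float output of the graph;
// want_logits mirrors batch.logits. Unrequested rows stay zero here.
static std::vector<float> gather_logits(const float * res,
                                        const int8_t * want_logits,
                                        int n_vocab, int n_tokens) {
    std::vector<float> out((size_t) n_vocab * n_tokens, 0.0f);
    for (int i = 0; i < n_tokens; ++i) {
        if (want_logits[i] == 0) {
            continue; // logits for this token were not requested
        }
        std::memcpy(out.data() + (size_t) n_vocab*i,
                    res + (size_t) n_vocab*i,
                    sizeof(float)*n_vocab);
    }
    return out;
}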
 
@@ -3832,20 +4616,27 @@ static bool llama_eval_internal(
3832
4616
  auto & embedding_out = lctx.embedding;
3833
4617
 
3834
4618
  embedding_out.resize(n_embd);
3835
- memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
4619
+ memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
3836
4620
  }
3837
4621
 
3838
4622
  // measure the performance only for the single-token evals
3839
- if (N == 1) {
4623
+ if (n_tokens == 1) {
3840
4624
  lctx.t_eval_us += ggml_time_us() - t_start_us;
3841
4625
  lctx.n_eval++;
3842
4626
  }
3843
- else if (N > 1) {
4627
+ else if (n_tokens > 1) {
3844
4628
  lctx.t_p_eval_us += ggml_time_us() - t_start_us;
3845
- lctx.n_p_eval += N;
4629
+ lctx.n_p_eval += n_tokens;
3846
4630
  }
3847
4631
 
3848
- return true;
4632
+ // get a more accurate load time, upon first eval
4633
+ // TODO: fix this
4634
+ if (!lctx.has_evaluated_once) {
4635
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
4636
+ lctx.has_evaluated_once = true;
4637
+ }
4638
+
4639
+ return 0;
3849
4640
  }
3850
4641
 
3851
4642
  //
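llama_decode_internal above also establishes the new result convention (0 on success, a positive value for recoverable conditions such as no free KV slot, a negative value for hard errors like an empty batch) and, to ease migration from the eval-style calls, synthesizes default positions and sequence ids when batch.pos or batch.seq_id is null. A standalone sketch of that default-filling step, mirroring the hunk:

#include <cstdint>
#include <vector>

// Positions default to all_pos_0 + i*all_pos_1, sequence ids to all_seq_id,
// exactly as in the "helpers for smoother batch API transition" block above.
static void fill_batch_defaults(std::vector<int32_t> & pos,
                                std::vector<int32_t> & seq_id,
                                uint32_t n_tokens,
                                int32_t all_pos_0, int32_t all_pos_1, int32_t all_seq_id) {
    pos.resize(n_tokens);
    seq_id.resize(n_tokens);
    for (uint32_t i = 0; i < n_tokens; i++) {
        pos[i]    = all_pos_0 + (int32_t) i * all_pos_1;
        seq_id[i] = all_seq_id;
    }
}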
@@ -3872,18 +4663,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
3872
4663
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
3873
4664
  }
3874
4665
 
3875
- static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
4666
+ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
4667
+ return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
4668
+ }
4669
+
4670
+ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
3876
4671
  GGML_ASSERT(llama_is_byte_token(vocab, id));
3877
4672
  const auto& token_data = vocab.id_to_token.at(id);
3878
- auto buf = token_data.text.substr(3, 2);
3879
- return strtol(buf.c_str(), NULL, 16);
4673
+ switch (llama_vocab_get_type(vocab)) {
4674
+ case LLAMA_VOCAB_TYPE_SPM: {
4675
+ auto buf = token_data.text.substr(3, 2);
4676
+ return strtol(buf.c_str(), NULL, 16);
4677
+ }
4678
+ case LLAMA_VOCAB_TYPE_BPE: {
4679
+ GGML_ASSERT(false);
4680
+ return unicode_to_bytes_bpe(token_data.text);
4681
+ }
4682
+ default:
4683
+ GGML_ASSERT(false);
4684
+ }
3880
4685
  }
3881
4686
 
3882
4687
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
3883
- char buf[7];
3884
- int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
3885
- GGML_ASSERT(0 <= result && result < 7);
3886
- return vocab.token_to_id.at(buf);
4688
+ switch (llama_vocab_get_type(vocab)) {
4689
+ case LLAMA_VOCAB_TYPE_SPM: {
4690
+ char buf[7];
4691
+ int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
4692
+ GGML_ASSERT(0 <= result && result < 7);
4693
+ return vocab.token_to_id.at(buf);
4694
+ }
4695
+ case LLAMA_VOCAB_TYPE_BPE: {
4696
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
4697
+ }
4698
+ default:
4699
+ GGML_ASSERT(false);
4700
+ }
3887
4701
  }
3888
4702
 
3889
4703
  static void llama_escape_whitespace(std::string & text) {
@@ -4163,15 +4977,9 @@ struct llm_tokenizer_bpe {
4163
4977
  std::string byte_str(1, *j);
4164
4978
  auto token_multibyte = vocab.token_to_id.find(byte_str);
4165
4979
  if (token_multibyte == vocab.token_to_id.end()) {
4166
- try {
4167
- llama_token token_byte = llama_byte_to_token(vocab, *j);
4168
- output.push_back(token_byte);
4169
- } catch (const std::out_of_range & err) {
4170
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
4171
- }
4172
- } else {
4173
- output.push_back((*token_multibyte).second);
4980
+ throw std::runtime_error("ERROR: byte not found in vocab");
4174
4981
  }
4982
+ output.push_back((*token_multibyte).second);
4175
4983
  }
4176
4984
  } else {
4177
4985
  output.push_back((*token).second);
@@ -4208,23 +5016,144 @@ private:
4208
5016
  work_queue.push(bigram);
4209
5017
  }
4210
5018
 
4211
- // probably not 100% correct
4212
- static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
4213
- std::vector<std::string> words;
5019
+ std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
5020
+ std::vector<std::string> bpe_words;
5021
+ std::vector<std::string> bpe_encoded_words;
5022
+
5023
+ std::string token = "";
5024
+ // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
5025
+ bool collecting_numeric = false;
5026
+ bool collecting_letter = false;
5027
+ bool collecting_special = false;
5028
+ bool collecting_whitespace_lookahead = false;
5029
+ bool collecting = false;
5030
+
5031
+ std::vector<std::string> text_utf;
5032
+ text_utf.reserve(text.size());
5033
+ bpe_words.reserve(text.size());
5034
+ bpe_encoded_words.reserve(text.size());
5035
+
5036
+ auto cps = codepoints_from_utf8(text);
5037
+ for (size_t i = 0; i < cps.size(); ++i)
5038
+ text_utf.emplace_back(codepoint_to_utf8(cps[i]));
5039
+
5040
+ for (int i = 0; i < (int)text_utf.size(); i++) {
5041
+ const std::string & utf_char = text_utf[i];
5042
+ bool split_condition = false;
5043
+ // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5044
+ int bytes_remain = text_utf.size() - i;
5045
+ // forward backward lookups
5046
+ const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
5047
+ const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
5048
+
5049
+ // handling contractions
5050
+ if (!split_condition && bytes_remain >= 2) {
5051
+ // 's|'t|'m|'d
5052
+ if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
5053
+ split_condition = true;
5054
+ }
5055
+ if (split_condition) {
5056
+ if (token.size()) {
5057
+ bpe_words.emplace_back(token); // push previous content as token
5058
+ }
5059
+ token = utf_char + utf_char_next;
5060
+ bpe_words.emplace_back(token);
5061
+ token = "";
5062
+ i++;
5063
+ continue;
5064
+ }
5065
+ }
5066
+ if (!split_condition && bytes_remain >= 3) {
5067
+ // 're|'ve|'ll
5068
+ if (utf_char == "\'" && (
5069
+ (utf_char_next == "r" || utf_char_next_next == "e") ||
5070
+ (utf_char_next == "v" || utf_char_next_next == "e") ||
5071
+ (utf_char_next == "l" || utf_char_next_next == "l"))
5072
+ ) {
5073
+ split_condition = true;
5074
+ }
5075
+ if (split_condition) {
5076
+ // current token + next token can be defined
5077
+ if (token.size()) {
5078
+ bpe_words.emplace_back(token); // push previous content as token
5079
+ }
5080
+ token = utf_char + utf_char_next + utf_char_next_next;
5081
+ bpe_words.emplace_back(token); // the contraction
5082
+ token = "";
5083
+ i += 2;
5084
+ continue;
5085
+ }
5086
+ }
5087
+
5088
+ if (!split_condition && !collecting) {
5089
+ if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
5090
+ collecting_letter = true;
5091
+ collecting = true;
5092
+ }
5093
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
5094
+ collecting_numeric = true;
5095
+ collecting = true;
5096
+ }
5097
+ else if (
5098
+ ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
5099
+ (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
5100
+ ) {
5101
+ collecting_special = true;
5102
+ collecting = true;
5103
+ }
5104
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
5105
+ collecting_whitespace_lookahead = true;
5106
+ collecting = true;
5107
+ }
5108
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
5109
+ split_condition = true;
5110
+ }
5111
+ }
5112
+ else if (!split_condition && collecting) {
5113
+ if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
5114
+ split_condition = true;
5115
+ }
5116
+ else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
5117
+ split_condition = true;
5118
+ }
5119
+ else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5120
+ split_condition = true;
5121
+ }
5122
+ else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
5123
+ split_condition = true;
5124
+ }
5125
+ }
5126
+
5127
+ if (utf_char_next == "") {
5128
+ split_condition = true; // final
5129
+ token += utf_char;
5130
+ }
4214
5131
 
4215
- // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
4216
- const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
4217
- const std::regex re(pattern);
5132
+ if (split_condition) {
5133
+ if (token.size()) {
5134
+ bpe_words.emplace_back(token);
5135
+ }
5136
+ token = utf_char;
5137
+ collecting = false;
5138
+ collecting_letter = false;
5139
+ collecting_numeric = false;
5140
+ collecting_special = false;
5141
+ collecting_whitespace_lookahead = false;
5142
+ }
5143
+ else {
5144
+ token += utf_char;
5145
+ }
5146
+ }
4218
5147
 
4219
- auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
4220
- auto words_end = std::sregex_iterator();
4221
- auto n_words = std::distance(words_begin, words_end);
4222
- words.reserve(n_words);
4223
- for (auto it = words_begin; it != words_end; ++it) {
4224
- words.push_back(it->str());
5148
+ for (std::string & word : bpe_words) {
5149
+ std::string encoded_token = "";
5150
+ for (char & c : word) {
5151
+ encoded_token += bytes_to_unicode_bpe(c);
5152
+ }
5153
+ bpe_encoded_words.emplace_back(encoded_token);
4225
5154
  }
4226
- return words;
4227
5155
 
5156
+ return bpe_encoded_words;
4228
5157
  }
4229
5158
 
4230
5159
  const llama_vocab & vocab;
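For illustration only (not a diff line), these are the splits the rewritten pre-tokenizer is meant to reproduce, mirroring the GPT2 regex quoted in the comment above; the exact output is an assumption about intent, not a guarantee of this implementation:

    // bpe_gpt2_preprocess("I'm testing  123!")  -- intended splits, roughly:
    //   { "I", "'m", " testing", " ", " 123", "!" }
    // each piece is then byte-mapped with bytes_to_unicode_bpe() before the BPE merges run
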
@@ -4266,7 +5195,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
4266
5195
  llm_tokenizer_bpe tokenizer(vocab);
4267
5196
  tokenizer.tokenize(raw_text, output);
4268
5197
  } break;
4269
- };
5198
+ }
4270
5199
 
4271
5200
  return output;
4272
5201
  }
@@ -4670,6 +5599,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
4670
5599
  // sampling
4671
5600
  //
4672
5601
 
5602
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
5603
+ if (seed == LLAMA_DEFAULT_SEED) {
5604
+ seed = time(NULL);
5605
+ }
5606
+ ctx->rng.seed(seed);
5607
+ }
5608
+
4673
5609
  void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
4674
5610
  GGML_ASSERT(candidates->size > 0);
4675
5611
 
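Usage of the relocated seeding helper is unchanged; a one-line sketch, assuming an existing `ctx` (not part of the diff):

    llama_set_rng_seed(ctx, 1234);               // fixed seed for reproducible sampling
    llama_set_rng_seed(ctx, LLAMA_DEFAULT_SEED); // falls back to a time-based seed
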
@@ -4878,7 +5814,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
4878
5814
  }
4879
5815
  }
4880
5816
 
4881
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5817
+ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
4882
5818
  const int64_t t_start_sample_us = ggml_time_us();
4883
5819
 
4884
5820
  for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4890,6 +5826,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
4890
5826
  }
4891
5827
  }
4892
5828
 
5829
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5830
+ llama_sample_temp(ctx, candidates_p, temp);
5831
+ }
5832
+
4893
5833
  void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
4894
5834
  if (last_tokens_size == 0 || penalty == 1.0f) {
4895
5835
  return;
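A short sketch of the renamed sampler call (not part of the diff); `candidates` is an assumed, already-filled llama_token_data_array, and the old name remains available as a wrapper:

    llama_sample_temp(ctx, &candidates, 0.8f);        // new name
    llama_sample_temperature(ctx, &candidates, 0.8f); // kept as an alias with the same effect
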
@@ -5013,7 +5953,7 @@ void llama_sample_classifier_free_guidance(
5013
5953
 
5014
5954
  GGML_ASSERT(ctx);
5015
5955
 
5016
- auto n_vocab = llama_n_vocab(ctx);
5956
+ auto n_vocab = llama_n_vocab(llama_get_model(ctx));
5017
5957
 
5018
5958
  GGML_ASSERT(n_vocab == (int)candidates->size);
5019
5959
  GGML_ASSERT(!candidates->sorted);
@@ -5042,7 +5982,7 @@ void llama_sample_classifier_free_guidance(
5042
5982
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
5043
5983
  GGML_ASSERT(ctx);
5044
5984
 
5045
- auto N = float(llama_n_vocab(ctx));
5985
+ auto N = float(llama_n_vocab(llama_get_model(ctx)));
5046
5986
  int64_t t_start_sample_us;
5047
5987
  t_start_sample_us = ggml_time_us();
5048
5988
 
@@ -5229,7 +6169,7 @@ struct llama_logit_info {
5229
6169
  };
5230
6170
  llama_logit_info(llama_context * ctx)
5231
6171
  : logits(llama_get_logits(ctx))
5232
- , n_vocab(llama_n_vocab(ctx))
6172
+ , n_vocab(llama_n_vocab(llama_get_model(ctx)))
5233
6173
  , max_l(*std::max_element(logits, logits + n_vocab))
5234
6174
  , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
5235
6175
  { }
@@ -5267,7 +6207,6 @@ struct llama_beam_search_data {
5267
6207
  size_t n_beams;
5268
6208
  int n_past;
5269
6209
  int n_predict;
5270
- int n_threads;
5271
6210
  std::vector<llama_beam> beams;
5272
6211
  std::vector<llama_beam> next_beams;
5273
6212
 
@@ -5277,12 +6216,11 @@ struct llama_beam_search_data {
5277
6216
  // Used to communicate to/from callback on beams state.
5278
6217
  std::vector<llama_beam_view> beam_views;
5279
6218
 
5280
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
6219
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
5281
6220
  : ctx(ctx)
5282
6221
  , n_beams(n_beams)
5283
6222
  , n_past(n_past)
5284
6223
  , n_predict(n_predict)
5285
- , n_threads(n_threads)
5286
6224
  , beam_views(n_beams) {
5287
6225
  beams.reserve(n_beams);
5288
6226
  next_beams.reserve(n_beams);
@@ -5319,7 +6257,7 @@ struct llama_beam_search_data {
5319
6257
  } else {
5320
6258
  // beam is not at end-of-sentence, so branch with next top_k tokens.
5321
6259
  if (!beam.tokens.empty()) {
5322
- llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
6260
+ llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
5323
6261
  }
5324
6262
  llama_logit_info logit_info(ctx);
5325
6263
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5393,7 +6331,7 @@ struct llama_beam_search_data {
5393
6331
  callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
5394
6332
  update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
5395
6333
  if (common_prefix_length) {
5396
- llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
6334
+ llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
5397
6335
  n_past += common_prefix_length;
5398
6336
  }
5399
6337
  // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5434,11 +6372,11 @@ struct llama_beam_search_data {
5434
6372
 
5435
6373
  void llama_beam_search(llama_context * ctx,
5436
6374
  llama_beam_search_callback_fn_t callback, void * callback_data,
5437
- size_t n_beams, int n_past, int n_predict, int n_threads) {
6375
+ size_t n_beams, int n_past, int n_predict) {
5438
6376
  assert(ctx);
5439
6377
  const int64_t t_start_sample_us = ggml_time_us();
5440
6378
 
5441
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
6379
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
5442
6380
 
5443
6381
  beam_search_data.loop(callback, callback_data);
5444
6382
 
@@ -5658,11 +6596,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5658
6596
  nthread = std::thread::hardware_concurrency();
5659
6597
  }
5660
6598
 
5661
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
6599
+ // mmap consistently increases speed on Linux, and also increases speed on Windows with
6600
+ // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
6601
+ #if defined(__linux__) || defined(_WIN32)
6602
+ constexpr bool use_mmap = true;
6603
+ #else
6604
+ constexpr bool use_mmap = false;
6605
+ #endif
6606
+
6607
+ llama_model_loader ml(fname_inp, use_mmap);
6608
+ if (ml.use_mmap) {
6609
+ ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
6610
+ }
5662
6611
 
5663
6612
  llama_model model;
5664
- llm_load_arch(*ml, model);
5665
- llm_load_hparams(*ml, model, 0, 0, 0);
6613
+ llm_load_arch(ml, model);
6614
+ llm_load_hparams(ml, model);
5666
6615
 
5667
6616
  if (params->only_copy) {
5668
6617
  ftype = model.ftype;
@@ -5672,7 +6621,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5672
6621
  struct gguf_context * ctx_out = gguf_init_empty();
5673
6622
 
5674
6623
  // copy the KV pairs from the input file
5675
- gguf_set_kv (ctx_out, ml->ctx_gguf);
6624
+ gguf_set_kv (ctx_out, ml.ctx_gguf);
5676
6625
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
5677
6626
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
5678
6627
 
@@ -5680,8 +6629,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5680
6629
  int n_attention_wv = 0;
5681
6630
  int n_feed_forward_w2 = 0;
5682
6631
 
5683
- for (int i = 0; i < ml->n_tensors; ++i) {
5684
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6632
+ for (int i = 0; i < ml.n_tensors; ++i) {
6633
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5685
6634
 
5686
6635
  const std::string name = ggml_get_name(meta);
5687
6636
 
@@ -5717,8 +6666,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5717
6666
  std::vector<no_init<float>> f32_conv_buf;
5718
6667
 
5719
6668
  // populate the original tensors so we get an initial meta data
5720
- for (int i = 0; i < ml->n_tensors; ++i) {
5721
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6669
+ for (int i = 0; i < ml.n_tensors; ++i) {
6670
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5722
6671
  gguf_add_tensor(ctx_out, meta);
5723
6672
  }
5724
6673
 
@@ -5731,19 +6680,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5731
6680
  // placeholder for the meta data
5732
6681
  ::zeros(fout, meta_size);
5733
6682
 
5734
- for (int i = 0; i < ml->n_tensors; ++i) {
5735
- struct ggml_tensor * tensor = ml->get_tensor_meta(i);
6683
+ for (int i = 0; i < ml.n_tensors; ++i) {
6684
+ struct ggml_tensor * tensor = ml.get_tensor_meta(i);
5736
6685
 
5737
6686
  const std::string name = ggml_get_name(tensor);
5738
6687
 
5739
- if (read_data.size() < ggml_nbytes(tensor)) {
5740
- read_data.resize(ggml_nbytes(tensor));
6688
+ if (!ml.use_mmap) {
6689
+ if (read_data.size() < ggml_nbytes(tensor)) {
6690
+ read_data.resize(ggml_nbytes(tensor));
6691
+ }
6692
+ tensor->data = read_data.data();
5741
6693
  }
5742
- tensor->data = read_data.data();
5743
- ml->load_data_for(tensor);
6694
+ ml.load_data_for(tensor);
5744
6695
 
5745
6696
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
5746
- ++idx, ml->n_tensors,
6697
+ ++idx, ml.n_tensors,
5747
6698
  ggml_get_name(tensor),
5748
6699
  llama_format_tensor_shape(tensor).c_str(),
5749
6700
  ggml_type_name(tensor->type));
@@ -5893,9 +6844,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5893
6844
  }
5894
6845
  }
5895
6846
 
5896
- // TODO: after the GGUF PR, this likely won't work and needs to be updated
5897
6847
  static int llama_apply_lora_from_file_internal(
5898
- const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
6848
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
5899
6849
  ) {
5900
6850
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
5901
6851
 
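Callers of the LoRA loader now supply a scale factor as well; a sketch with a placeholder adapter path ("adapter.bin" is illustrative, not from the diff):

    // scale multiplies the adapter's built-in alpha/r factor; 1.0f matches the previous behaviour
    llama_model_apply_lora_from_file(model, "adapter.bin", 1.0f, /*path_base_model=*/NULL, /*n_threads=*/4);
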
@@ -5924,7 +6874,7 @@ static int llama_apply_lora_from_file_internal(
5924
6874
  int32_t lora_alpha;
5925
6875
  fin.read((char *) &lora_r, sizeof(lora_r));
5926
6876
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
5927
- float scaling = (float)lora_alpha / (float)lora_r;
6877
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
5928
6878
 
5929
6879
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
5930
6880
 
@@ -6140,9 +7090,10 @@ static int llama_apply_lora_from_file_internal(
6140
7090
  ggml_set_name(r, "r_cpy");
6141
7091
  }
6142
7092
 
6143
- struct ggml_cgraph gf = ggml_build_forward(r);
7093
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
7094
+ ggml_build_forward_expand(gf, r);
6144
7095
 
6145
- ggml_graph_compute_helper(work_buffer, &gf, n_threads);
7096
+ ggml_graph_compute_helper(work_buffer, gf, n_threads);
6146
7097
 
6147
7098
  // we won't need these tensors again, reset the context to save memory
6148
7099
  ggml_free(lora_ctx);
@@ -6171,27 +7122,16 @@ static int llama_apply_lora_from_file_internal(
6171
7122
  //
6172
7123
  // interface implementation
6173
7124
  //
6174
-
6175
- struct llama_context_params llama_context_default_params() {
6176
- struct llama_context_params result = {
6177
- /*.seed =*/ LLAMA_DEFAULT_SEED,
6178
- /*.n_ctx =*/ 512,
6179
- /*.n_batch =*/ 512,
7125
+ struct llama_model_params llama_model_default_params() {
7126
+ struct llama_model_params result = {
6180
7127
  /*.n_gpu_layers =*/ 0,
6181
7128
  /*.main_gpu =*/ 0,
6182
7129
  /*.tensor_split =*/ nullptr,
6183
- /*.rope_freq_base =*/ 0.0f,
6184
- /*.rope_freq_scale =*/ 0.0f,
6185
7130
  /*.progress_callback =*/ nullptr,
6186
7131
  /*.progress_callback_user_data =*/ nullptr,
6187
- /*.low_vram =*/ false,
6188
- /*.mul_mat_q =*/ true,
6189
- /*.f16_kv =*/ true,
6190
- /*.logits_all =*/ false,
6191
7132
  /*.vocab_only =*/ false,
6192
7133
  /*.use_mmap =*/ true,
6193
7134
  /*.use_mlock =*/ false,
6194
- /*.embedding =*/ false,
6195
7135
  };
6196
7136
 
6197
7137
  #ifdef GGML_USE_METAL
@@ -6201,6 +7141,24 @@ struct llama_context_params llama_context_default_params() {
6201
7141
  return result;
6202
7142
  }
6203
7143
 
7144
+ struct llama_context_params llama_context_default_params() {
7145
+ struct llama_context_params result = {
7146
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
7147
+ /*.n_ctx =*/ 512,
7148
+ /*.n_batch =*/ 512,
7149
+ /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
7150
+ /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
7151
+ /*.rope_freq_base =*/ 0.0f,
7152
+ /*.rope_freq_scale =*/ 0.0f,
7153
+ /*.mul_mat_q =*/ true,
7154
+ /*.f16_kv =*/ true,
7155
+ /*.logits_all =*/ false,
7156
+ /*.embedding =*/ false,
7157
+ };
7158
+
7159
+ return result;
7160
+ }
7161
+
6204
7162
  struct llama_model_quantize_params llama_model_quantize_default_params() {
6205
7163
  struct llama_model_quantize_params result = {
6206
7164
  /*.nthread =*/ 0,
@@ -6256,13 +7214,11 @@ int64_t llama_time_us(void) {
6256
7214
 
6257
7215
  struct llama_model * llama_load_model_from_file(
6258
7216
  const char * path_model,
6259
- struct llama_context_params params) {
7217
+ struct llama_model_params params) {
6260
7218
  ggml_time_init();
6261
7219
 
6262
7220
  llama_model * model = new llama_model;
6263
7221
 
6264
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
6265
-
6266
7222
  unsigned cur_percentage = 0;
6267
7223
  if (params.progress_callback == NULL) {
6268
7224
  params.progress_callback_user_data = &cur_percentage;
@@ -6279,9 +7235,9 @@ struct llama_model * llama_load_model_from_file(
6279
7235
  };
6280
7236
  }
6281
7237
 
6282
- if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
6283
- params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
6284
- params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
7238
+ if (!llama_model_load(path_model, *model, params.n_gpu_layers,
7239
+ params.main_gpu, params.tensor_split,
7240
+ params.use_mmap, params.use_mlock, params.vocab_only,
6285
7241
  params.progress_callback, params.progress_callback_user_data)) {
6286
7242
  LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
6287
7243
  delete model;
@@ -6305,18 +7261,33 @@ struct llama_context * llama_new_context_with_model(
6305
7261
 
6306
7262
  llama_context * ctx = new llama_context(*model);
6307
7263
 
7264
+ const auto & hparams = model->hparams;
7265
+ auto & cparams = ctx->cparams;
7266
+
7267
+ cparams.n_batch = params.n_batch;
7268
+ cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
7269
+ cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
7270
+ cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
7271
+ cparams.n_threads = params.n_threads;
7272
+ cparams.n_threads_batch = params.n_threads_batch;
7273
+ cparams.mul_mat_q = params.mul_mat_q;
7274
+
6308
7275
  if (params.seed == LLAMA_DEFAULT_SEED) {
6309
7276
  params.seed = time(NULL);
6310
7277
  }
6311
7278
 
7279
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
7280
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
7281
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
7282
+
6312
7283
  ctx->rng = std::mt19937(params.seed);
6313
7284
  ctx->logits_all = params.logits_all;
6314
7285
 
6315
7286
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
6316
7287
 
6317
7288
  // reserve memory for context buffers
6318
- if (!params.vocab_only) {
6319
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
7289
+ if (!hparams.vocab_only) {
7290
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
6320
7291
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
6321
7292
  llama_free(ctx);
6322
7293
  return nullptr;
@@ -6327,11 +7298,9 @@ struct llama_context * llama_new_context_with_model(
6327
7298
  LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
6328
7299
  }
6329
7300
 
6330
- const auto & hparams = ctx->model.hparams;
6331
-
6332
7301
  // resized during inference
6333
7302
  if (params.logits_all) {
6334
- ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
7303
+ ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
6335
7304
  } else {
6336
7305
  ctx->logits.reserve(hparams.n_vocab);
6337
7306
  }
@@ -6349,26 +7318,29 @@ struct llama_context * llama_new_context_with_model(
6349
7318
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
6350
7319
 
6351
7320
  // build worst-case graph
6352
- int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
6353
- int n_past = hparams.n_ctx - n_tokens;
7321
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
7322
+ int n_past = cparams.n_ctx - n_tokens;
6354
7323
  llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
6355
- ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
7324
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
7325
+
6356
7326
  #ifdef GGML_USE_METAL
6357
- if (params.n_gpu_layers > 0) {
7327
+ if (model->n_gpu_layers > 0) {
7328
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
7329
+
6358
7330
  ctx->ctx_metal = ggml_metal_init(1);
6359
7331
  if (!ctx->ctx_metal) {
6360
7332
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6361
7333
  llama_free(ctx);
6362
7334
  return NULL;
6363
7335
  }
6364
- ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6365
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
7336
+ //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
7337
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6366
7338
  }
6367
7339
  #endif
6368
7340
  // measure memory requirements for the graph
6369
7341
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
6370
7342
 
6371
- LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
7343
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6372
7344
 
6373
7345
  // recreate allocator with exact memory requirements
6374
7346
  ggml_allocr_free(ctx->alloc);
@@ -6377,28 +7349,46 @@ struct llama_context * llama_new_context_with_model(
6377
7349
  ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
6378
7350
  #ifdef GGML_USE_METAL
6379
7351
  if (ctx->ctx_metal) {
6380
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
7352
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6381
7353
  }
6382
7354
  #endif
6383
7355
  #ifdef GGML_USE_CUBLAS
6384
- if (params.low_vram) {
6385
- LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
6386
- ggml_cuda_set_scratch_size(0); // disable scratch
6387
- } else {
6388
- ggml_cuda_set_scratch_size(alloc_size);
6389
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
7356
+ ggml_cuda_set_scratch_size(alloc_size);
7357
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
7358
+
7359
+ // calculate total VRAM usage
7360
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
7361
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
7362
+ size += ggml_nbytes(t);
7363
+ }
7364
+ };
7365
+ size_t model_vram_size = 0;
7366
+ for (const auto & kv : model->tensors_by_name) {
7367
+ add_tensor(kv.second, model_vram_size);
6390
7368
  }
7369
+
7370
+ size_t kv_vram_size = 0;
7371
+ add_tensor(ctx->kv_self.k, kv_vram_size);
7372
+ add_tensor(ctx->kv_self.v, kv_vram_size);
7373
+
7374
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
7375
+ size_t total_vram_size = model_vram_size + ctx_vram_size;
7376
+
7377
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
7378
+ total_vram_size / 1024.0 / 1024.0,
7379
+ model_vram_size / 1024.0 / 1024.0,
7380
+ ctx_vram_size / 1024.0 / 1024.0);
6391
7381
  #endif
6392
7382
  }
6393
7383
 
6394
7384
  #ifdef GGML_USE_METAL
6395
- if (params.n_gpu_layers > 0) {
7385
+ if (model->n_gpu_layers > 0) {
6396
7386
  // this allocates all Metal resources and memory buffers
6397
7387
 
6398
7388
  void * data_ptr = NULL;
6399
7389
  size_t data_size = 0;
6400
7390
 
6401
- if (params.use_mmap) {
7391
+ if (ctx->model.mapping) {
6402
7392
  data_ptr = ctx->model.mapping->addr;
6403
7393
  data_size = ctx->model.mapping->size;
6404
7394
  } else {
@@ -6417,11 +7407,8 @@ struct llama_context * llama_new_context_with_model(
6417
7407
  return NULL; \
6418
7408
  }
6419
7409
 
6420
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
6421
-
6422
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
6423
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6424
-
7410
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
7411
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6425
7412
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
6426
7413
  #undef LLAMA_METAL_CHECK_BUF
6427
7414
  }
@@ -6433,8 +7420,10 @@ struct llama_context * llama_new_context_with_model(
6433
7420
 
6434
7421
  if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
6435
7422
  // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
6436
- const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
6437
- while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
7423
+ // TODO: needs fix after #3228
7424
+ GGML_ASSERT(false && "not implemented");
7425
+ //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
7426
+ //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
6438
7427
  llama_backend_free();
6439
7428
  exit(1);
6440
7429
  }
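With the parameter split above, loading now goes through two structs; a minimal sketch of the new flow, with "model.gguf" as a placeholder path:

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0;                        // layers to offload when built with GPU support

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 2048;                  // 0 falls back to the model's training context length
    cparams.n_threads       = 8;
    cparams.n_threads_batch = 8;

    llama_context * ctx = llama_new_context_with_model(model, cparams);
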
@@ -6443,63 +7432,41 @@ struct llama_context * llama_new_context_with_model(
6443
7432
  return ctx;
6444
7433
  }
6445
7434
 
6446
- static struct llama_context * llama_init_from_file(
6447
- const char * path_model,
6448
- struct llama_context_params params) {
6449
- struct llama_model * model = llama_load_model_from_file(path_model, params);
6450
- if (!model) {
6451
- return nullptr;
6452
- }
6453
-
6454
- struct llama_context * ctx = llama_new_context_with_model(model, params);
6455
- ctx->model_owner = true;
6456
-
6457
- return ctx;
6458
- }
6459
-
6460
7435
  void llama_free(struct llama_context * ctx) {
6461
7436
  delete ctx;
6462
7437
  }
6463
7438
 
6464
- int llama_n_vocab(const struct llama_context * ctx) {
6465
- return llama_model_n_vocab(&ctx->model);
7439
+ const llama_model * llama_get_model(const struct llama_context * ctx) {
7440
+ return &ctx->model;
6466
7441
  }
6467
7442
 
6468
7443
  int llama_n_ctx(const struct llama_context * ctx) {
6469
- return llama_model_n_ctx(&ctx->model);
6470
- }
6471
-
6472
- int llama_n_ctx_train(const struct llama_context * ctx) {
6473
- return llama_model_n_ctx_train(&ctx->model);
7444
+ return ctx->cparams.n_ctx;
6474
7445
  }
6475
7446
 
6476
- int llama_n_embd(const struct llama_context * ctx) {
6477
- return llama_model_n_embd(&ctx->model);
7447
+ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
7448
+ return model->vocab.type;
6478
7449
  }
6479
7450
 
6480
- enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
6481
- return ctx->model.vocab.type;
6482
- }
6483
-
6484
- int llama_model_n_vocab(const struct llama_model * model) {
7451
+ int llama_n_vocab(const struct llama_model * model) {
6485
7452
  return model->vocab.id_to_token.size();
6486
7453
  }
6487
7454
 
6488
- int llama_model_n_ctx(const struct llama_model * model) {
6489
- return model->hparams.n_ctx;
6490
- }
6491
-
6492
- int llama_model_n_ctx_train(const struct llama_model * model) {
7455
+ int llama_n_ctx_train(const struct llama_model * model) {
6493
7456
  return model->hparams.n_ctx_train;
6494
7457
  }
6495
7458
 
6496
- int llama_model_n_embd(const struct llama_model * model) {
7459
+ int llama_n_embd(const struct llama_model * model) {
6497
7460
  return model->hparams.n_embd;
6498
7461
  }
6499
7462
 
7463
+ float llama_rope_freq_scale_train(const struct llama_model * model) {
7464
+ return model->hparams.rope_freq_scale_train;
7465
+ }
7466
+
6500
7467
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
6501
7468
  return snprintf(buf, buf_size, "%s %s %s",
6502
- model->name.c_str(),
7469
+ llama_model_arch_name(model->arch).c_str(),
6503
7470
  llama_model_type_name(model->type),
6504
7471
  llama_model_ftype_name(model->ftype).c_str());
6505
7472
  }
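Per-model properties are now queried from the model object; a short sketch, assuming `ctx` was created as in the flow above (not part of the diff):

    const llama_model * mdl = llama_get_model(ctx);
    const int n_vocab   = llama_n_vocab(mdl);
    const int n_embd    = llama_n_embd(mdl);
    const int n_train   = llama_n_ctx_train(mdl);
    const int n_ctx_cur = llama_n_ctx(ctx);          // the context length stays a per-context value
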
@@ -6520,6 +7487,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
6520
7487
  return nparams;
6521
7488
  }
6522
7489
 
7490
+ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
7491
+ return ggml_get_tensor(model->ctx, name);
7492
+ }
7493
+
6523
7494
  int llama_model_quantize(
6524
7495
  const char * fname_inp,
6525
7496
  const char * fname_out,
@@ -6533,18 +7504,18 @@ int llama_model_quantize(
6533
7504
  }
6534
7505
  }
6535
7506
 
6536
- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
7507
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
6537
7508
  try {
6538
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
7509
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
6539
7510
  } catch (const std::exception & err) {
6540
7511
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
6541
7512
  return 1;
6542
7513
  }
6543
7514
  }
6544
7515
 
6545
- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
7516
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
6546
7517
  try {
6547
- return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
7518
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
6548
7519
  } catch (const std::exception & err) {
6549
7520
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
6550
7521
  return 1;
@@ -6552,16 +7523,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
6552
7523
  }
6553
7524
 
6554
7525
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
6555
- return ctx->kv_self.n;
7526
+ return ctx->kv_self.head;
6556
7527
  }
6557
7528
 
6558
- #define LLAMA_MAX_RNG_STATE (64*1024)
7529
+ void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
7530
+ llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
7531
+ }
6559
7532
 
6560
- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
6561
- if (seed == LLAMA_DEFAULT_SEED) {
6562
- seed = time(NULL);
6563
- }
6564
- ctx->rng.seed(seed);
7533
+ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
7534
+ llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
7535
+ }
7536
+
7537
+ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
7538
+ llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
7539
+ }
7540
+
7541
+ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
7542
+ llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
7543
+ }
7544
+
7545
+ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
7546
+ llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
6565
7547
  }
6566
7548
 
6567
7549
  // Returns the *maximum* size of the state
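A hedged sketch of the new sequence-level cache helpers, following the usual context-shift pattern; `n_keep`, `n_discard` and `n_past` are assumed caller-maintained positions (not part of the diff):

    // drop n_discard tokens after the keep window of sequence 0, then slide the remainder back
    llama_kv_cache_seq_rm   (ctx, 0, n_keep, n_keep + n_discard);
    llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_past, -n_discard);

    // other helpers: copy a cached prefix to another sequence, or keep only one sequence
    llama_kv_cache_seq_cp  (ctx, 0, 1, 0, n_keep);
    llama_kv_cache_seq_keep(ctx, 0);
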
@@ -6699,36 +7681,40 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
6699
7681
  {
6700
7682
  const auto & kv_self = ctx->kv_self;
6701
7683
  const auto & hparams = ctx->model.hparams;
6702
- const int n_layer = hparams.n_layer;
6703
- const int n_embd = hparams.n_embd_gqa();
6704
- const int n_ctx = hparams.n_ctx;
7684
+ const auto & cparams = ctx->cparams;
6705
7685
 
6706
- const size_t kv_size = kv_self.buf.size;
6707
- const int kv_ntok = llama_get_kv_cache_token_count(ctx);
7686
+ const auto n_layer = hparams.n_layer;
7687
+ const auto n_embd = hparams.n_embd_gqa();
7688
+ const auto n_ctx = cparams.n_ctx;
6708
7689
 
6709
- data_ctx->write(&kv_size, sizeof(kv_size));
6710
- data_ctx->write(&kv_ntok, sizeof(kv_ntok));
7690
+ const size_t kv_buf_size = kv_self.buf.size;
7691
+ const uint32_t kv_head = kv_self.head;
7692
+ const uint32_t kv_size = kv_self.size;
6711
7693
 
6712
- if (kv_size) {
7694
+ data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
7695
+ data_ctx->write(&kv_head, sizeof(kv_head));
7696
+ data_ctx->write(&kv_size, sizeof(kv_size));
7697
+
7698
+ if (kv_buf_size) {
6713
7699
  const size_t elt_size = ggml_element_size(kv_self.k);
6714
7700
 
6715
7701
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
6716
7702
  ggml_cgraph gf{};
6717
7703
 
6718
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
7704
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
6719
7705
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
6720
7706
  kout3d->data = kout3d_data.data();
6721
7707
 
6722
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
7708
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
6723
7709
  std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
6724
7710
  vout3d->data = vout3d_data.data();
6725
7711
 
6726
7712
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
6727
- n_embd, kv_ntok, n_layer,
7713
+ n_embd, kv_head, n_layer,
6728
7714
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
6729
7715
 
6730
7716
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
6731
- kv_ntok, n_embd, n_layer,
7717
+ kv_head, n_embd, n_layer,
6732
7718
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
6733
7719
 
6734
7720
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -6742,6 +7728,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
6742
7728
  data_ctx->write(kout3d_data.data(), kout3d_data.size());
6743
7729
  data_ctx->write(vout3d_data.data(), vout3d_data.size());
6744
7730
  }
7731
+
7732
+ for (uint32_t i = 0; i < kv_size; ++i) {
7733
+ const auto & cell = kv_self.cells[i];
7734
+
7735
+ const llama_pos pos = cell.pos;
7736
+ const size_t seq_id_size = cell.seq_id.size();
7737
+
7738
+ data_ctx->write(&pos, sizeof(pos));
7739
+ data_ctx->write(&seq_id_size, sizeof(seq_id_size));
7740
+
7741
+ for (auto seq_id : cell.seq_id) {
7742
+ data_ctx->write(&seq_id, sizeof(seq_id));
7743
+ }
7744
+ }
6745
7745
  }
6746
7746
  }
6747
7747
 
@@ -6807,38 +7807,42 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
6807
7807
  {
6808
7808
  const auto & kv_self = ctx->kv_self;
6809
7809
  const auto & hparams = ctx->model.hparams;
7810
+ const auto & cparams = ctx->cparams;
7811
+
6810
7812
  const int n_layer = hparams.n_layer;
6811
7813
  const int n_embd = hparams.n_embd_gqa();
6812
- const int n_ctx = hparams.n_ctx;
7814
+ const int n_ctx = cparams.n_ctx;
6813
7815
 
6814
- size_t kv_size;
6815
- int kv_ntok;
7816
+ size_t kv_buf_size;
7817
+ uint32_t kv_head;
7818
+ uint32_t kv_size;
6816
7819
 
6817
- memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
6818
- memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
7820
+ memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
7821
+ memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
7822
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
6819
7823
 
6820
- if (kv_size) {
6821
- GGML_ASSERT(kv_self.buf.size == kv_size);
7824
+ if (kv_buf_size) {
7825
+ GGML_ASSERT(kv_self.buf.size == kv_buf_size);
6822
7826
 
6823
7827
  const size_t elt_size = ggml_element_size(kv_self.k);
6824
7828
 
6825
7829
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
6826
7830
  ggml_cgraph gf{};
6827
7831
 
6828
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
7832
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
6829
7833
  kin3d->data = (void *) inp;
6830
7834
  inp += ggml_nbytes(kin3d);
6831
7835
 
6832
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
7836
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
6833
7837
  vin3d->data = (void *) inp;
6834
7838
  inp += ggml_nbytes(vin3d);
6835
7839
 
6836
7840
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
6837
- n_embd, kv_ntok, n_layer,
7841
+ n_embd, kv_head, n_layer,
6838
7842
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
6839
7843
 
6840
7844
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
6841
- kv_ntok, n_embd, n_layer,
7845
+ kv_head, n_embd, n_layer,
6842
7846
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
6843
7847
 
6844
7848
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -6848,7 +7852,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
6848
7852
  ggml_free(cpy_ctx);
6849
7853
  }
6850
7854
 
6851
- ctx->kv_self.n = kv_ntok;
7855
+ ctx->kv_self.head = kv_head;
7856
+ ctx->kv_self.size = kv_size;
7857
+
7858
+ ctx->kv_self.cells.resize(kv_size);
7859
+
7860
+ for (uint32_t i = 0; i < kv_size; ++i) {
7861
+ llama_pos pos;
7862
+ size_t seq_id_size;
7863
+
7864
+ memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
7865
+ memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
7866
+
7867
+ ctx->kv_self.cells[i].pos = pos;
7868
+
7869
+ llama_seq_id seq_id;
7870
+
7871
+ for (size_t j = 0; j < seq_id_size; ++j) {
7872
+ memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
7873
+ ctx->kv_self.cells[i].seq_id.insert(seq_id);
7874
+ }
7875
+ }
6852
7876
  }
6853
7877
 
6854
7878
  const size_t nread = inp - src;
@@ -6943,64 +7967,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
6943
7967
 
6944
7968
  int llama_eval(
6945
7969
  struct llama_context * ctx,
6946
- const llama_token * tokens,
6947
- int n_tokens,
6948
- int n_past,
6949
- int n_threads) {
6950
- if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
6951
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6952
- return 1;
6953
- }
7970
+ llama_token * tokens,
7971
+ int32_t n_tokens,
7972
+ int n_past) {
7973
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
6954
7974
 
6955
- // get a more accurate load time, upon first eval
6956
- // TODO: fix this
6957
- if (!ctx->has_evaluated_once) {
6958
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
6959
- ctx->has_evaluated_once = true;
7975
+ const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
7976
+ if (ret < 0) {
7977
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
6960
7978
  }
6961
7979
 
6962
- return 0;
7980
+ return ret;
6963
7981
  }
6964
7982
 
6965
7983
  int llama_eval_embd(
6966
7984
  struct llama_context * ctx,
6967
- const float * embd,
6968
- int n_tokens,
6969
- int n_past,
6970
- int n_threads) {
6971
- if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
6972
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6973
- return 1;
6974
- }
7985
+ float * embd,
7986
+ int32_t n_tokens,
7987
+ int n_past) {
7988
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
6975
7989
 
6976
- // get a more accurate load time, upon first eval
6977
- // TODO: fix this
6978
- if (!ctx->has_evaluated_once) {
6979
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
6980
- ctx->has_evaluated_once = true;
7990
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
7991
+
7992
+ const int ret = llama_decode_internal(*ctx, batch);
7993
+ if (ret < 0) {
7994
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
6981
7995
  }
6982
7996
 
6983
- return 0;
7997
+ return ret;
6984
7998
  }
6985
7999
 
6986
- int llama_eval_export(struct llama_context * ctx, const char * fname) {
6987
- const int n_batch = 1;
6988
- const int n_ctx = 512 - n_batch;
8000
+ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
8001
+ ctx->cparams.n_threads = n_threads;
8002
+ ctx->cparams.n_threads_batch = n_threads_batch;
8003
+ }
8004
+
8005
+ struct llama_batch llama_batch_get_one(
8006
+ llama_token * tokens,
8007
+ int32_t n_tokens,
8008
+ llama_pos pos_0,
8009
+ llama_seq_id seq_id) {
8010
+ return {
8011
+ /*n_tokens =*/ n_tokens,
8012
+ /*tokens =*/ tokens,
8013
+ /*embd =*/ nullptr,
8014
+ /*pos =*/ nullptr,
8015
+ /*seq_id =*/ nullptr,
8016
+ /*logits =*/ nullptr,
8017
+ /*all_pos_0 =*/ pos_0,
8018
+ /*all_pos_1 =*/ 1,
8019
+ /*all_seq_id =*/ seq_id,
8020
+ };
8021
+ }
6989
8022
 
6990
- const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
8023
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
8024
+ llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
6991
8025
 
6992
- if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
6993
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6994
- return 1;
8026
+ if (embd) {
8027
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
8028
+ } else {
8029
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
6995
8030
  }
6996
8031
 
6997
- return 0;
8032
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
8033
+ batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
8034
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
8035
+
8036
+ return batch;
8037
+ }
8038
+
8039
+ void llama_batch_free(struct llama_batch batch) {
8040
+ if (batch.token) free(batch.token);
8041
+ if (batch.embd) free(batch.embd);
8042
+ if (batch.pos) free(batch.pos);
8043
+ if (batch.seq_id) free(batch.seq_id);
8044
+ if (batch.logits) free(batch.logits);
8045
+ }
8046
+
8047
+ int llama_decode(
8048
+ struct llama_context * ctx,
8049
+ struct llama_batch batch) {
8050
+ const int ret = llama_decode_internal(*ctx, batch);
8051
+ if (ret < 0) {
8052
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
8053
+ }
8054
+
8055
+ return ret;
6998
8056
  }
6999
8057
 
7000
8058
  float * llama_get_logits(struct llama_context * ctx) {
7001
8059
  return ctx->logits.data();
7002
8060
  }
7003
8061
 
8062
+ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
8063
+ return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
8064
+ }
8065
+
7004
8066
  float * llama_get_embeddings(struct llama_context * ctx) {
7005
8067
  return ctx->embedding.data();
7006
8068
  }
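A minimal sketch of the llama_decode() path that replaces llama_eval() (not part of the diff); `prompt` is an assumed std::vector<llama_token> and `new_token` an assumed sampled token:

    // feed the whole prompt as one batch; a negative return is an error (see the logging above)
    if (llama_decode(ctx, llama_batch_get_one(prompt.data(), (int32_t) prompt.size(), 0, 0)) < 0) {
        fprintf(stderr, "llama_decode() failed\n");
    }
    float * logits = llama_get_logits(ctx);          // last prompt position when logits_all is false

    // for finer control, build a batch explicitly and free it afterwards
    llama_batch batch = llama_batch_init(/*n_tokens=*/1, /*embd=*/0);
    batch.n_tokens  = 1;
    batch.token [0] = new_token;                     // assumed: sampled from `logits`
    batch.pos   [0] = (llama_pos) prompt.size();
    batch.seq_id[0] = 0;
    batch.logits[0] = 1;
    llama_decode(ctx, batch);
    llama_batch_free(batch);
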
@@ -7028,18 +8090,24 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
7028
8090
  llama_token llama_token_nl(const struct llama_context * ctx) {
7029
8091
  return ctx->model.vocab.linefeed_id;
7030
8092
  }
8093
+ llama_token llama_token_prefix(const struct llama_context * ctx) {
8094
+ return ctx->model.vocab.special_prefix_id;
8095
+ }
7031
8096
 
7032
- int llama_tokenize(
7033
- struct llama_context * ctx,
7034
- const char * text,
7035
- int text_len,
7036
- llama_token * tokens,
7037
- int n_max_tokens,
7038
- bool add_bos) {
7039
- return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
8097
+ llama_token llama_token_middle(const struct llama_context * ctx) {
8098
+ return ctx->model.vocab.special_middle_id;
7040
8099
  }
7041
8100
 
7042
- int llama_tokenize_with_model(
8101
+ llama_token llama_token_suffix(const struct llama_context * ctx) {
8102
+ return ctx->model.vocab.special_suffix_id;
8103
+ }
8104
+
8105
+ llama_token llama_token_eot(const struct llama_context * ctx) {
8106
+ return ctx->model.vocab.special_eot_id;
8107
+ }
8108
+
8109
+
8110
+ int llama_tokenize(
7043
8111
  const struct llama_model * model,
7044
8112
  const char * text,
7045
8113
  int text_len,
@@ -7060,39 +8128,66 @@ int llama_tokenize_with_model(
7060
8128
  return res.size();
7061
8129
  }
7062
8130
 
7063
- int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
7064
- return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
8131
+ static std::string llama_decode_text(const std::string & text) {
8132
+ std::string decoded_text;
8133
+ auto unicode_sequences = codepoints_from_utf8(text);
8134
+ for (auto& unicode_sequence : unicode_sequences) {
8135
+ decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
8136
+ }
8137
+
8138
+ return decoded_text;
7065
8139
  }
7066
8140
 
7067
8141
  // does not write null-terminator to buf
7068
- int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
7069
- if (0 <= token && token < llama_model_n_vocab(model)) {
7070
- if (llama_is_normal_token(model->vocab, token)) {
7071
- std::string result = model->vocab.id_to_token[token].text;
7072
- if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
8142
+ int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
8143
+ if (0 <= token && token < llama_n_vocab(model)) {
8144
+ switch (llama_vocab_get_type(model->vocab)) {
8145
+ case LLAMA_VOCAB_TYPE_SPM: {
8146
+ if (llama_is_normal_token(model->vocab, token)) {
8147
+ std::string result = model->vocab.id_to_token[token].text;
7073
8148
  llama_unescape_whitespace(result);
8149
+ if (length < (int) result.length()) {
8150
+ return -result.length();
8151
+ }
8152
+ memcpy(buf, result.c_str(), result.length());
8153
+ return result.length();
8154
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
8155
+ if (length < 3) {
8156
+ return -3;
8157
+ }
8158
+ memcpy(buf, "\xe2\x96\x85", 3);
8159
+ return 3;
8160
+ } else if (llama_is_control_token(model->vocab, token)) {
8161
+ ;
8162
+ } else if (llama_is_byte_token(model->vocab, token)) {
8163
+ if (length < 1) {
8164
+ return -1;
8165
+ }
8166
+ buf[0] = llama_token_to_byte(model->vocab, token);
8167
+ return 1;
8168
+ } else {
8169
+ GGML_ASSERT(false);
7074
8170
  }
7075
- if (length < (int) result.length()) {
7076
- return -result.length();
7077
- }
7078
- memcpy(buf, result.c_str(), result.length());
7079
- return result.length();
7080
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
7081
- if (length < 3) {
7082
- return -3;
7083
- }
7084
- buf[0] = '\xe2';
7085
- buf[1] = '\x96';
7086
- buf[2] = '\x85';
7087
- return 3;
7088
- } else if (llama_is_control_token(model->vocab, token)) {
7089
- ;
7090
- } else if (llama_is_byte_token(model->vocab, token)) {
7091
- if (length < 1) {
7092
- return -1;
8171
+ break;
8172
+ }
8173
+ case LLAMA_VOCAB_TYPE_BPE: {
8174
+ if (llama_is_normal_token(model->vocab, token)) {
8175
+ std::string result = model->vocab.id_to_token[token].text;
8176
+ result = llama_decode_text(result);
8177
+ if (length < (int) result.length()) {
8178
+ return -result.length();
8179
+ }
8180
+ memcpy(buf, result.c_str(), result.length());
8181
+ return result.length();
8182
+ } else if (llama_is_control_token(model->vocab, token)) {
8183
+ ;
8184
+ } else {
8185
+ GGML_ASSERT(false);
7093
8186
  }
7094
- buf[0] = llama_token_to_byte(model->vocab, token);
7095
- return 1;
8187
+ break;
8188
+ }
8189
+ default:
8190
+ GGML_ASSERT(false);
7096
8191
  }
7097
8192
  }
7098
8193
  return 0;
@@ -7119,14 +8214,14 @@ void llama_print_timings(struct llama_context * ctx) {
7119
8214
  const llama_timings timings = llama_get_timings(ctx);
7120
8215
 
7121
8216
  LLAMA_LOG_INFO("\n");
7122
- LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
7123
- LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
8217
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
8218
+ LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7124
8219
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
7125
- LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
8220
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
7126
8221
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
7127
- LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
8222
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7128
8223
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
7129
- LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
8224
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
7130
8225
  }
7131
8226
 
7132
8227
  void llama_reset_timings(struct llama_context * ctx) {
@@ -7194,12 +8289,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
7194
8289
  return ctx->model.tensors_by_name;
7195
8290
  }
7196
8291
 
7197
- void llama_log_set(llama_log_callback log_callback, void * user_data) {
8292
+ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
7198
8293
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
7199
8294
  g_state.log_callback_user_data = user_data;
7200
8295
  }
7201
8296
 
7202
- static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
8297
+ static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
7203
8298
  va_list args_copy;
7204
8299
  va_copy(args_copy, args);
7205
8300
  char buffer[128];
@@ -7216,14 +8311,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
7216
8311
  va_end(args_copy);
7217
8312
  }
7218
8313
 
7219
- static void llama_log_internal(llama_log_level level, const char * format, ...) {
8314
+ static void llama_log_internal(ggml_log_level level, const char * format, ...) {
7220
8315
  va_list args;
7221
8316
  va_start(args, format);
7222
8317
  llama_log_internal_v(level, format, args);
7223
8318
  va_end(args);
7224
8319
  }
7225
8320
 
7226
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
8321
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
7227
8322
  (void) level;
7228
8323
  (void) user_data;
7229
8324
  fputs(text, stderr);