llama_cpp 0.5.3 → 0.7.0

This diff covers the publicly released contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -1,6 +1,8 @@
 #define LLAMA_API_INTERNAL
 #include "llama.h"
 
+#include "unicode.h"
+
 #include "ggml.h"
 
 #include "ggml-alloc.h"
@@ -72,6 +74,7 @@
 #include <sstream>
 #include <thread>
 #include <unordered_map>
+#include <set>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -92,12 +95,12 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (llama_log_level level, const char* format, ...);
-static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+static void llama_log_internal        (ggml_log_level level, const char* format, ...);
+static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
-#define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
-#define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
-#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 //
 // helpers
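The logging hooks are retyped from llama-specific enums to the shared ggml ones (ggml_log_level, GGML_LOG_LEVEL_*), so one callback can serve both libraries. A minimal sketch of a caller-side callback under the new types; the llama_log_set registration call is assumed from llama.h rather than shown in this hunk.

#include "llama.h"

#include <cstdio>

// Route llama.cpp log output; the signature mirrors llama_log_callback_default above.
static void my_log_callback(ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_ERROR) {
        fputs(text, stderr);   // errors to stderr
    } else {
        fputs(text, stdout);   // info and warnings to stdout
    }
}

// during startup (llama_log_set is assumed from llama.h):
//     llama_log_set(my_log_callback, nullptr);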
@@ -122,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }
+
+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
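The helper gives the loader a tolerant float comparison, which the rewritten llama_hparams::operator!= further down uses instead of a raw memcmp over the struct (memcmp also compares padding bytes and treats harmless rounding noise as a difference). A small sketch with hypothetical values:

// Sketch (hypothetical values); assumes <cmath> for INFINITY.
const float a = 1e-5f;
const float b = 1e-5f + 1e-12f;                              // differs only by rounding noise

bool close   = is_float_close(a, b, 1e-9f);                  // true: |b - a| <= abs_tol
bool inf_inf = is_float_close(INFINITY, INFINITY, 1e-9f);    // true: exact equality short-circuits
bool inf_fin = is_float_close(INFINITY, 1e30f, 1e-9f);       // false: infinities are never "close" to finite values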
@@ -162,18 +186,20 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_REFACT,
     LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama" },
-    { LLM_ARCH_FALCON, "falcon" },
-    { LLM_ARCH_GPT2, "gpt2" },
-    { LLM_ARCH_GPTJ, "gptj" },
-    { LLM_ARCH_GPTNEOX, "gptneox" },
-    { LLM_ARCH_MPT, "mpt" },
-    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
     { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_REFACT, "refact" },
 };
 
 enum llm_kv {
@@ -221,16 +247,16 @@ enum llm_kv {
 };
 
 static std::map<llm_kv, std::string> LLM_KV_NAMES = {
-    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
-    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
-    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
-    { LLM_KV_GENERAL_NAME, "general.name" },
-    { LLM_KV_GENERAL_AUTHOR, "general.author" },
-    { LLM_KV_GENERAL_URL, "general.url" },
-    { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
-    { LLM_KV_GENERAL_LICENSE, "general.license" },
-    { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" },
-    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" },
+    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+    { LLM_KV_GENERAL_NAME, "general.name" },
+    { LLM_KV_GENERAL_AUTHOR, "general.author" },
+    { LLM_KV_GENERAL_URL, "general.url" },
+    { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+    { LLM_KV_GENERAL_LICENSE, "general.license" },
+    { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
+    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
@@ -394,6 +420,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_REFACT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -448,7 +491,7 @@ struct LLM_TN {
 //
 
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
     const std::string skey(key); \
     const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
@@ -460,7 +503,7 @@ struct LLM_TN {
     } else if (req) { \
         throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
     } \
-}
+} while (0)
 
 //
 // ggml helpers
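Wrapping the macro body in do { ... } while (0) makes the multi-statement macro expand to a single statement, so it composes safely with if/else and a trailing semicolon. A sketch of the failure mode the change avoids, using a hypothetical macro rather than GGUF_GET_KEY itself:

void step_one();
void step_two();
void other_branch();

#define FOO_BRACES()   { step_one(); step_two(); }
#define FOO_DO_WHILE() do { step_one(); step_two(); } while (0)

void caller(bool cond) {
    if (cond)
        FOO_DO_WHILE();      // expands to one statement; the trailing ';' is harmless
    else
        other_branch();

    // With FOO_BRACES(); instead, the ';' after the expanded block ends the if,
    // leaving the following 'else' without a matching if (a compile error).
}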
@@ -881,10 +924,10 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
 
 static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -899,7 +942,7 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
 
 struct llama_state {
     // We save the log callback globally
-    llama_log_callback log_callback = llama_log_callback_default;
+    ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };
 
@@ -925,9 +968,9 @@ static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;
 
 struct llama_hparams {
+    bool vocab_only;
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_ctx; // context size used during inference
     uint32_t n_embd;
     uint32_t n_head;
     uint32_t n_head_kv;
@@ -938,11 +981,28 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
-    float rope_freq_base;
-    float rope_freq_scale;
+    float rope_freq_base_train;
+    float rope_freq_scale_train;
 
     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }
 
     uint32_t n_gqa() const {
@@ -956,15 +1016,18 @@ struct llama_hparams {
     uint32_t n_embd_gqa() const {
         return n_embd/n_gqa();
     }
+};
 
-    size_t kv_size() const {
-        size_t result = 2ull;
-        result *= (size_t) n_embd_gqa();
-        result *= (size_t) n_ctx;
-        result *= (size_t) n_layer;
-        result *= sizeof(ggml_fp16_t);
-        return result;
-    }
+struct llama_cparams {
+    uint32_t n_ctx;           // context size used during inference
+    uint32_t n_batch;
+    uint32_t n_threads;       // number of threads to use for generation
+    uint32_t n_threads_batch; // number of threads to use for batch processing
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    bool mul_mat_q;
 };
 
 struct llama_layer {
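The split separates what the model file says (llama_hparams, with the RoPE values renamed to *_train) from what the caller picks per context (llama_cparams: context size, batch size, thread counts, RoPE overrides). A hedged sketch of how the effective RoPE parameters are presumably resolved at context creation; the 0-means-use-model-default fallback is an assumption carried over from the rope_freq_base == 0.0f handling that this version removes from llm_load_hparams:

// Sketch (assumption): user-supplied context params fall back to the
// train-time values stored in the model when left at 0.
static void resolve_rope_params(const llama_hparams & hparams,
                                float user_freq_base, float user_freq_scale,
                                llama_cparams & cparams) {
    cparams.rope_freq_base  = user_freq_base  == 0.0f ? hparams.rope_freq_base_train  : user_freq_base;
    cparams.rope_freq_scale = user_freq_scale == 0.0f ? hparams.rope_freq_scale_train : user_freq_scale;
}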
@@ -999,7 +1062,29 @@ struct llama_layer {
     struct ggml_tensor * b3; // ffn_up
 };
 
+struct llama_kv_cell {
+    llama_pos pos = -1;
+    llama_pos delta = 0;
+
+    std::set<llama_seq_id> seq_id;
+
+    bool has_seq_id(const llama_seq_id & id) const {
+        return seq_id.find(id) != seq_id.end();
+    }
+};
+
+// ring-buffer of cached KV data
 struct llama_kv_cache {
+    bool has_shift = false;
+
+    uint32_t head = 0;
+    uint32_t size = 0;
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<llama_kv_cell> cells;
+
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;
 
@@ -1007,8 +1092,6 @@ struct llama_kv_cache {
 
     llama_buffer buf;
 
-    int n; // number of tokens currently in the cache
-
     ~llama_kv_cache() {
         if (ctx) {
             ggml_free(ctx);
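A cell is no longer just "slot i of a dense prefix": it records the absolute position it holds and the set of sequences that reference it, which is what enables per-sequence removal, cheap sequence copies, and the position shifts added later in this diff. A small sketch of the visibility rule a populated cell implies (the same rule the KQ_mask construction in llm_build_llama applies):

// Sketch: a query token of sequence `seq` at position `pos` may attend to a cell
// only if the cell is occupied, belongs to that sequence, and is not in the future.
static bool cell_visible(const llama_kv_cell & cell, llama_seq_id seq, llama_pos pos) {
    return cell.pos >= 0            // occupied (empty cells carry pos == -1)
        && cell.has_seq_id(seq)     // referenced by this sequence
        && cell.pos <= pos;         // causal: no attending to future positions
}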
@@ -1047,6 +1130,10 @@ struct llama_vocab {
1047
1130
  id special_pad_id = -1;
1048
1131
 
1049
1132
  id linefeed_id = 13;
1133
+ id special_prefix_id = 32007;
1134
+ id special_middle_id = 32009;
1135
+ id special_suffix_id = 32008;
1136
+ id special_eot_id = 32010;
1050
1137
 
1051
1138
  int find_bpe_rank(std::string token_left, std::string token_right) const {
1052
1139
  replace_all(token_left, " ", "\u0120");
@@ -1122,11 +1209,8 @@ struct llama_model {
1122
1209
  };
1123
1210
 
1124
1211
  struct llama_context {
1125
- llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
1212
+ llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
1126
1213
  ~llama_context() {
1127
- if (model_owner) {
1128
- delete &model;
1129
- }
1130
1214
  #ifdef GGML_USE_METAL
1131
1215
  if (ctx_metal) {
1132
1216
  ggml_metal_free(ctx_metal);
@@ -1137,27 +1221,26 @@ struct llama_context {
1137
1221
  }
1138
1222
  }
1139
1223
 
1224
+ llama_cparams cparams;
1225
+
1226
+ const llama_model & model;
1227
+
1228
+ // key + value cache for the self attention
1229
+ struct llama_kv_cache kv_self;
1230
+
1140
1231
  std::mt19937 rng;
1141
1232
 
1142
1233
  bool has_evaluated_once = false;
1143
1234
 
1235
+ int64_t t_start_us;
1236
+ int64_t t_load_us;
1144
1237
  int64_t t_sample_us = 0;
1145
- int64_t t_eval_us = 0;
1146
1238
  int64_t t_p_eval_us = 0;
1239
+ int64_t t_eval_us = 0;
1147
1240
 
1148
1241
  int32_t n_sample = 0; // number of tokens sampled
1149
- int32_t n_eval = 0; // number of eval calls
1150
1242
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1151
-
1152
- const llama_model & model;
1153
-
1154
- bool model_owner = false;
1155
-
1156
- int64_t t_load_us;
1157
- int64_t t_start_us;
1158
-
1159
- // key + value cache for the self attention
1160
- struct llama_kv_cache kv_self;
1243
+ int32_t n_eval = 0; // number of eval calls
1161
1244
 
1162
1245
  // decode output (2-dimensional array: [n_tokens][n_vocab])
1163
1246
  std::vector<float> logits;
@@ -1192,16 +1275,23 @@ static bool llama_kv_cache_init(
1192
1275
  const struct llama_hparams & hparams,
1193
1276
  struct llama_kv_cache & cache,
1194
1277
  ggml_type wtype,
1195
- int n_ctx,
1278
+ uint32_t n_ctx,
1196
1279
  int n_gpu_layers) {
1197
- const int n_embd = hparams.n_embd_gqa();
1198
- const int n_layer = hparams.n_layer;
1280
+ const uint32_t n_embd = hparams.n_embd_gqa();
1281
+ const uint32_t n_layer = hparams.n_layer;
1199
1282
 
1200
1283
  const int64_t n_mem = n_layer*n_ctx;
1201
1284
  const int64_t n_elements = n_embd*n_mem;
1202
1285
 
1286
+ cache.has_shift = false;
1287
+
1288
+ cache.head = 0;
1289
+ cache.size = n_ctx;
1290
+
1291
+ cache.cells.clear();
1292
+ cache.cells.resize(n_ctx);
1293
+
1203
1294
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1204
- cache.n = 0;
1205
1295
 
1206
1296
  struct ggml_init_params params;
1207
1297
  params.mem_size = cache.buf.size;
@@ -1222,17 +1312,163 @@ static bool llama_kv_cache_init(
1222
1312
 
1223
1313
  (void) n_gpu_layers;
1224
1314
  #ifdef GGML_USE_CUBLAS
1225
- if (n_gpu_layers > n_layer + 1) {
1315
+ size_t vram_kv_cache = 0;
1316
+
1317
+ if (n_gpu_layers > (int)n_layer + 1) {
1226
1318
  ggml_cuda_assign_buffers_no_scratch(cache.v);
1319
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
1320
+ vram_kv_cache += ggml_nbytes(cache.v);
1227
1321
  }
1228
- if (n_gpu_layers > n_layer + 2) {
1322
+ if (n_gpu_layers > (int)n_layer + 2) {
1229
1323
  ggml_cuda_assign_buffers_no_scratch(cache.k);
1324
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
1325
+ vram_kv_cache += ggml_nbytes(cache.k);
1326
+ }
1327
+ if (vram_kv_cache > 0) {
1328
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1230
1329
  }
1231
1330
  #endif // GGML_USE_CUBLAS
1232
1331
 
1233
1332
  return true;
1234
1333
  }
1235
1334
 
1335
+ // find an empty slot of size "n_tokens" in the cache
1336
+ // updates the cache head
1337
+ static bool llama_kv_cache_find_slot(
1338
+ struct llama_kv_cache & cache,
1339
+ const struct llama_batch & batch) {
1340
+ const uint32_t n_ctx = cache.size;
1341
+ const uint32_t n_tokens = batch.n_tokens;
1342
+
1343
+ if (n_tokens > n_ctx) {
1344
+ LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
1345
+ return false;
1346
+ }
1347
+
1348
+ uint32_t n_tested = 0;
1349
+
1350
+ while (true) {
1351
+ if (cache.head + n_tokens > n_ctx) {
1352
+ cache.head = 0;
1353
+ n_tested += n_ctx - cache.head;
1354
+ continue;
1355
+ }
1356
+
1357
+ bool found = true;
1358
+ for (uint32_t i = 0; i < n_tokens; i++) {
1359
+ if (cache.cells[cache.head + i].pos >= 0) {
1360
+ found = false;
1361
+ cache.head += i + 1;
1362
+ n_tested += i + 1;
1363
+ break;
1364
+ }
1365
+ }
1366
+
1367
+ if (found) {
1368
+ break;
1369
+ }
1370
+
1371
+ if (n_tested >= n_ctx) {
1372
+ //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
1373
+ return false;
1374
+ }
1375
+ }
1376
+
1377
+ for (uint32_t i = 0; i < n_tokens; i++) {
1378
+ cache.cells[cache.head + i].pos = batch.pos[i];
1379
+ cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
1380
+ }
1381
+
1382
+ return true;
1383
+ }
1384
+
1385
+ // find how many cells are currently in use
1386
+ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
1387
+ for (uint32_t i = cache.size - 1; i > 0; --i) {
1388
+ if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
1389
+ return i + 1;
1390
+ }
1391
+ }
1392
+
1393
+ return 0;
1394
+ }
1395
+
1396
+ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
1397
+ if (c0 < 0) c0 = 0;
1398
+ if (c1 < 0) c1 = cache.size;
1399
+
1400
+ for (int32_t i = c0; i < c1; ++i) {
1401
+ cache.cells[i].pos = -1;
1402
+ cache.cells[i].seq_id.clear();
1403
+ }
1404
+ }
1405
+
1406
+ static void llama_kv_cache_seq_rm(
1407
+ struct llama_kv_cache & cache,
1408
+ llama_seq_id seq_id,
1409
+ llama_pos p0,
1410
+ llama_pos p1) {
1411
+ if (p0 < 0) p0 = 0;
1412
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1413
+
1414
+ for (uint32_t i = 0; i < cache.size; ++i) {
1415
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1416
+ cache.cells[i].seq_id.erase(seq_id);
1417
+ if (cache.cells[i].seq_id.empty()) {
1418
+ cache.cells[i].pos = -1;
1419
+ }
1420
+ }
1421
+ }
1422
+ }
1423
+
1424
+ static void llama_kv_cache_seq_cp(
1425
+ struct llama_kv_cache & cache,
1426
+ llama_seq_id seq_id_src,
1427
+ llama_seq_id seq_id_dst,
1428
+ llama_pos p0,
1429
+ llama_pos p1) {
1430
+ if (p0 < 0) p0 = 0;
1431
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1432
+
1433
+ for (uint32_t i = 0; i < cache.size; ++i) {
1434
+ if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1435
+ cache.cells[i].seq_id.insert(seq_id_dst);
1436
+ }
1437
+ }
1438
+ }
1439
+
1440
+ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
1441
+ for (uint32_t i = 0; i < cache.size; ++i) {
1442
+ if (!cache.cells[i].has_seq_id(seq_id)) {
1443
+ cache.cells[i].pos = -1;
1444
+ cache.cells[i].seq_id.clear();
1445
+ }
1446
+ }
1447
+ }
1448
+
1449
+ static void llama_kv_cache_seq_shift(
1450
+ struct llama_kv_cache & cache,
1451
+ llama_seq_id seq_id,
1452
+ llama_pos p0,
1453
+ llama_pos p1,
1454
+ llama_pos delta) {
1455
+ if (p0 < 0) p0 = 0;
1456
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1457
+
1458
+ for (uint32_t i = 0; i < cache.size; ++i) {
1459
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1460
+ cache.cells[i].pos += delta;
1461
+ if (cache.cells[i].pos < 0) {
1462
+ cache.cells[i].pos = -1;
1463
+ cache.cells[i].seq_id.clear();
1464
+ } else {
1465
+ cache.has_shift = true;
1466
+ cache.cells[i].delta = delta;
1467
+ }
1468
+ }
1469
+ }
1470
+ }
1471
+
1236
1472
  //
1237
1473
  // model loading and saving
1238
1474
  //
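These static helpers back the new public llama_kv_cache_* sequence calls. A hedged sketch of the "context shift" pattern they make possible: drop the oldest positions of a sequence and slide the remainder back, so generation can continue past the context size without re-evaluating everything (the matching RoPE correction for the shifted K entries is applied by the K_shift pass during the next graph build):

// Sketch: make room in sequence 0 by forgetting its first n_discard positions
// and shifting the surviving positions left by the same amount.
static void shift_context(struct llama_kv_cache & cache, llama_pos n_past, llama_pos n_discard) {
    llama_kv_cache_seq_rm   (cache, /*seq_id=*/0, /*p0=*/0,         /*p1=*/n_discard);
    llama_kv_cache_seq_shift(cache, /*seq_id=*/0, /*p0=*/n_discard, /*p1=*/n_past, /*delta=*/-n_discard);
}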
@@ -1554,7 +1790,7 @@ struct llama_model_loader {
1554
1790
  lmlock->grow_to(size_lock);
1555
1791
  }
1556
1792
  break;
1557
- #if defined(GGML_USE_CUBLAS)
1793
+ #ifdef GGML_USE_CUBLAS
1558
1794
  case GGML_BACKEND_GPU:
1559
1795
  case GGML_BACKEND_GPU_SPLIT:
1560
1796
  // old code:
@@ -1587,7 +1823,15 @@ struct llama_model_loader {
1587
1823
  // load LLaMA models
1588
1824
  //
1589
1825
 
1590
- static std::string llama_model_ftype_name(enum llama_ftype ftype) {
1826
+ static std::string llama_model_arch_name(llm_arch arch) {
1827
+ auto it = LLM_ARCH_NAMES.find(arch);
1828
+ if (it == LLM_ARCH_NAMES.end()) {
1829
+ return "unknown";
1830
+ }
1831
+ return it->second;
1832
+ }
1833
+
1834
+ static std::string llama_model_ftype_name(llama_ftype ftype) {
1591
1835
  if (ftype & LLAMA_FTYPE_GUESSED) {
1592
1836
  return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
1593
1837
  }
@@ -1643,10 +1887,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
1643
1887
 
1644
1888
  static void llm_load_hparams(
1645
1889
  llama_model_loader & ml,
1646
- llama_model & model,
1647
- int n_ctx,
1648
- float rope_freq_base,
1649
- float rope_freq_scale) {
1890
+ llama_model & model) {
1650
1891
  struct gguf_context * ctx = ml.ctx_gguf;
1651
1892
 
1652
1893
  const auto kv = LLM_KV(model.arch);
@@ -1657,29 +1898,25 @@ static void llm_load_hparams(
1657
1898
  GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
1658
1899
 
1659
1900
  // get hparams kv
1660
- GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
1661
- GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
1662
- GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
1663
- GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
1664
- GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
1665
- GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
1901
+ GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
1902
+ GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
1903
+ GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
1904
+ GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
1905
+ GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
1906
+ GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
1666
1907
 
1667
1908
  // n_head_kv is optional, default to n_head
1668
1909
  hparams.n_head_kv = hparams.n_head;
1669
1910
  GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
1670
1911
 
1671
1912
  // rope_freq_base (optional)
1672
- if (rope_freq_base == 0.0f) {
1673
- rope_freq_base = 10000.0f;
1674
- GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1675
- }
1913
+ hparams.rope_freq_base_train = 10000.0f;
1914
+ GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
1676
1915
 
1677
1916
  // rope_freq_scale (inverse of the kv) is optional
1678
- if (rope_freq_scale == 0.0f) {
1679
- float ropescale = 1.0f;
1680
- GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1681
- rope_freq_scale = 1.0f/ropescale;
1682
- }
1917
+ float ropescale = 1.0f;
1918
+ GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
1919
+ hparams.rope_freq_scale_train = 1.0f/ropescale;
1683
1920
 
1684
1921
  // sanity check for n_rot (optional)
1685
1922
  {
@@ -1742,14 +1979,18 @@ static void llm_load_hparams(
1742
1979
  default: model.type = e_model::MODEL_UNKNOWN;
1743
1980
  }
1744
1981
  } break;
1982
+ case LLM_ARCH_REFACT:
1983
+ {
1984
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
1985
+ switch (hparams.n_layer) {
1986
+ case 32: model.type = e_model::MODEL_1B; break;
1987
+ default: model.type = e_model::MODEL_UNKNOWN;
1988
+ }
1989
+ } break;
1745
1990
  default: (void)0;
1746
- };
1991
+ }
1747
1992
 
1748
1993
  model.ftype = ml.ftype;
1749
-
1750
- hparams.n_ctx = n_ctx;
1751
- hparams.rope_freq_base = rope_freq_base;
1752
- hparams.rope_freq_scale = rope_freq_scale;
1753
1994
  }
1754
1995
 
1755
1996
  // TODO: This should probably be in llama.h
@@ -1770,20 +2011,18 @@ static void llm_load_vocab(
1770
2011
  throw std::runtime_error("cannot find tokenizer vocab in model file\n");
1771
2012
  }
1772
2013
 
2014
+ const float * scores = nullptr;
1773
2015
  const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
1774
- if (score_idx == -1) {
1775
- throw std::runtime_error("cannot find tokenizer scores in model file\n");
2016
+ if (score_idx != -1) {
2017
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1776
2018
  }
1777
2019
 
1778
- const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1779
-
2020
+ const int * toktypes = nullptr;
1780
2021
  const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
1781
- if (toktype_idx == -1) {
1782
- throw std::runtime_error("cannot find token type list in GGUF file\n");
2022
+ if (toktype_idx != -1) {
2023
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1783
2024
  }
1784
2025
 
1785
- const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1786
-
1787
2026
  // determine vocab type
1788
2027
  {
1789
2028
  std::string tokenizer_name;
@@ -1812,6 +2051,7 @@ static void llm_load_vocab(
1812
2051
 
1813
2052
  for (int i = 0; i < n_merges; i++) {
1814
2053
  const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
2054
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
1815
2055
 
1816
2056
  std::string first;
1817
2057
  std::string second;
@@ -1846,20 +2086,22 @@ static void llm_load_vocab(
1846
2086
 
1847
2087
  for (uint32_t i = 0; i < n_vocab; i++) {
1848
2088
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
2089
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
1849
2090
 
1850
2091
  vocab.token_to_id[word] = i;
1851
2092
 
1852
2093
  auto & token_data = vocab.id_to_token[i];
1853
2094
  token_data.text = std::move(word);
1854
- token_data.score = scores[i];
1855
- token_data.type = (llama_token_type) toktypes[i];
2095
+ token_data.score = scores ? scores[i] : 0.0f;
2096
+ token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
1856
2097
  }
2098
+ GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
1857
2099
 
1858
2100
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
1859
2101
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
1860
2102
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
1861
2103
  } else {
1862
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
2104
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
1863
2105
  }
1864
2106
 
1865
2107
  // special tokens
@@ -1875,31 +2117,30 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1875
2117
  const auto & vocab = model.vocab;
1876
2118
 
1877
2119
  // hparams
1878
- LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
1879
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
1880
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
1881
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
1882
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
1883
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
1884
- LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
1885
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
1886
- LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
1887
- LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
1888
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
1889
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
1890
- LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
1891
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
1892
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
1893
- LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
1894
- LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
1895
- LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
1896
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
1897
- LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
1898
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
2120
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
2121
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
2122
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
2123
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
2124
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
2125
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
2126
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
2127
+ LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
2128
+ LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
2129
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
2130
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
2131
+ LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2132
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2133
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2134
+ LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2135
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2136
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
2137
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
2138
+ LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
2139
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
1899
2140
  if (ml.n_bytes < GB) {
1900
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2141
+ LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1901
2142
  } else {
1902
- LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
2143
+ LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
1903
2144
  }
1904
2145
 
1905
2146
  // general kv
@@ -1917,13 +2158,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1917
2158
  static void llm_load_tensors(
1918
2159
  llama_model_loader & ml,
1919
2160
  llama_model & model,
1920
- int n_batch,
1921
2161
  int n_gpu_layers,
1922
2162
  int main_gpu,
1923
2163
  const float * tensor_split,
1924
- const bool mul_mat_q,
1925
- bool low_vram,
1926
- ggml_type memory_type,
1927
2164
  bool use_mlock,
1928
2165
  llama_progress_callback progress_callback,
1929
2166
  void * progress_callback_user_data) {
@@ -1962,11 +2199,9 @@ static void llm_load_tensors(
1962
2199
  }
1963
2200
 
1964
2201
  (void) main_gpu;
1965
- (void) mul_mat_q;
1966
- #if defined(GGML_USE_CUBLAS)
2202
+ #ifdef GGML_USE_CUBLAS
1967
2203
  LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
1968
2204
  ggml_cuda_set_main_device(main_gpu);
1969
- ggml_cuda_set_mul_mat_q(mul_mat_q);
1970
2205
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1971
2206
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1972
2207
  #elif defined(GGML_USE_CLBLAST)
@@ -1989,6 +2224,7 @@ static void llm_load_tensors(
1989
2224
  const auto tn = LLM_TN(model.arch);
1990
2225
  switch (model.arch) {
1991
2226
  case LLM_ARCH_LLAMA:
2227
+ case LLM_ARCH_REFACT:
1992
2228
  {
1993
2229
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
1994
2230
 
@@ -2001,9 +2237,9 @@ static void llm_load_tensors(
2001
2237
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2002
2238
  // on Windows however this is detrimental unless everything is on the GPU
2003
2239
  #ifndef _WIN32
2004
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2240
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2005
2241
  #else
2006
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2242
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2007
2243
  #endif // _WIN32
2008
2244
 
2009
2245
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2067,9 +2303,9 @@ static void llm_load_tensors(
2067
2303
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2068
2304
  // on Windows however this is detrimental unless everything is on the GPU
2069
2305
  #ifndef _WIN32
2070
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2306
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2071
2307
  #else
2072
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2308
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2073
2309
  #endif // _WIN32
2074
2310
 
2075
2311
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2137,9 +2373,9 @@ static void llm_load_tensors(
2137
2373
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2138
2374
  // on Windows however this is detrimental unless everything is on the GPU
2139
2375
  #ifndef _WIN32
2140
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2376
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2141
2377
  #else
2142
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2378
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2143
2379
  #endif // _WIN32
2144
2380
 
2145
2381
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2214,9 +2450,9 @@ static void llm_load_tensors(
2214
2450
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2215
2451
  // on Windows however this is detrimental unless everything is on the GPU
2216
2452
  #ifndef _WIN32
2217
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2453
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2218
2454
  #else
2219
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2455
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2220
2456
  #endif // _WIN32
2221
2457
 
2222
2458
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
@@ -2281,27 +2517,19 @@ static void llm_load_tensors(
2281
2517
  } break;
2282
2518
  default:
2283
2519
  throw std::runtime_error("unknown architecture");
2284
- };
2520
+ }
2285
2521
  }
2286
2522
 
2287
2523
  ml.done_getting_tensors();
2288
2524
 
2289
2525
  // print memory requirements
2290
2526
  {
2291
- const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
2292
-
2293
2527
  // this is the total memory required to run the inference
2294
2528
  size_t mem_required =
2295
2529
  ctx_size +
2296
2530
  mmapped_size - vram_weights; // weights in VRAM not in memory
2297
2531
 
2298
- // this is the memory required by one llama_state
2299
- const size_t mem_required_state = scale*hparams.kv_size();
2300
-
2301
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
2302
- mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
2303
-
2304
- (void) n_batch;
2532
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2305
2533
 
2306
2534
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2307
2535
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -2310,36 +2538,17 @@ static void llm_load_tensors(
2310
2538
  if (n_gpu_layers > (int) hparams.n_layer) {
2311
2539
  LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2312
2540
  }
2313
- size_t vram_kv_cache = 0;
2314
2541
 
2315
2542
  #ifdef GGML_USE_CUBLAS
2316
2543
  const int max_backend_supported_layers = hparams.n_layer + 3;
2317
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
2318
- if (n_gpu_layers > (int) hparams.n_layer + 1) {
2319
- if (low_vram) {
2320
- LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
2321
- } else {
2322
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
2323
- vram_kv_cache += hparams.kv_size() / 2;
2324
- }
2325
- }
2326
- if (n_gpu_layers > (int) hparams.n_layer + 2) {
2327
- if (low_vram) {
2328
- LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
2329
- } else {
2330
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
2331
- vram_kv_cache += hparams.kv_size() / 2;
2332
- }
2333
- }
2544
+ const int max_offloadable_layers = hparams.n_layer + 3;
2334
2545
  #elif defined(GGML_USE_CLBLAST)
2335
2546
  const int max_backend_supported_layers = hparams.n_layer + 1;
2336
2547
  const int max_offloadable_layers = hparams.n_layer + 1;
2337
2548
  #endif // GGML_USE_CUBLAS
2338
2549
 
2339
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
2340
- __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2341
- LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
2342
- __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
2550
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2551
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2343
2552
  #else
2344
2553
  (void) n_gpu_layers;
2345
2554
  #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -2352,7 +2561,7 @@ static void llm_load_tensors(
2352
2561
  }
2353
2562
 
2354
2563
  (void) tensor_split;
2355
- #if defined(GGML_USE_CUBLAS)
2564
+ #ifdef GGML_USE_CUBLAS
2356
2565
  {
2357
2566
  ggml_cuda_set_tensor_split(tensor_split);
2358
2567
  }
@@ -2374,29 +2583,24 @@ static void llm_load_tensors(
2374
2583
  static bool llama_model_load(
2375
2584
  const std::string & fname,
2376
2585
  llama_model & model,
2377
- int n_ctx,
2378
- int n_batch,
2379
2586
  int n_gpu_layers,
2380
2587
  int main_gpu,
2381
2588
  const float * tensor_split,
2382
- const bool mul_mat_q,
2383
- float rope_freq_base,
2384
- float rope_freq_scale,
2385
- bool low_vram,
2386
- ggml_type memory_type,
2387
2589
  bool use_mmap,
2388
2590
  bool use_mlock,
2389
2591
  bool vocab_only,
2390
2592
  llama_progress_callback progress_callback,
2391
2593
  void *progress_callback_user_data) {
2392
2594
  try {
2393
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
2595
+ llama_model_loader ml(fname, use_mmap);
2596
+
2597
+ model.hparams.vocab_only = vocab_only;
2394
2598
 
2395
- llm_load_arch (*ml, model);
2396
- llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
2397
- llm_load_vocab (*ml, model);
2599
+ llm_load_arch (ml, model);
2600
+ llm_load_hparams(ml, model);
2601
+ llm_load_vocab (ml, model);
2398
2602
 
2399
- llm_load_print_meta(*ml, model);
2603
+ llm_load_print_meta(ml, model);
2400
2604
 
2401
2605
  if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2402
2606
  throw std::runtime_error("vocab size mismatch");
@@ -2408,8 +2612,8 @@ static bool llama_model_load(
2408
2612
  }
2409
2613
 
2410
2614
  llm_load_tensors(
2411
- *ml, model, n_batch, n_gpu_layers,
2412
- main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
2615
+ ml, model, n_gpu_layers,
2616
+ main_gpu, tensor_split,
2413
2617
  use_mlock, progress_callback, progress_callback_user_data);
2414
2618
  } catch (const std::exception & err) {
2415
2619
  LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -2421,17 +2625,10 @@ static bool llama_model_load(
2421
2625
 
2422
2626
  static struct ggml_cgraph * llm_build_llama(
2423
2627
  llama_context & lctx,
2424
- const llama_token * tokens,
2425
- const float * embd,
2426
- int n_tokens,
2427
- int n_past) {
2428
-
2429
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2430
-
2431
- const int N = n_tokens;
2432
-
2628
+ const llama_batch & batch) {
2433
2629
  const auto & model = lctx.model;
2434
2630
  const auto & hparams = model.hparams;
2631
+ const auto & cparams = lctx.cparams;
2435
2632
 
2436
2633
  const auto & kv_self = lctx.kv_self;
2437
2634
 
@@ -2439,7 +2636,7 @@ static struct ggml_cgraph * llm_build_llama(
2439
2636
 
2440
2637
  const int64_t n_embd = hparams.n_embd;
2441
2638
  const int64_t n_layer = hparams.n_layer;
2442
- const int64_t n_ctx = hparams.n_ctx;
2639
+ const int64_t n_ctx = cparams.n_ctx;
2443
2640
  const int64_t n_head = hparams.n_head;
2444
2641
  const int64_t n_head_kv = hparams.n_head_kv;
2445
2642
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2447,12 +2644,20 @@ static struct ggml_cgraph * llm_build_llama(
2447
2644
 
2448
2645
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2449
2646
 
2450
- const float freq_base = hparams.rope_freq_base;
2451
- const float freq_scale = hparams.rope_freq_scale;
2647
+ const float freq_base = cparams.rope_freq_base;
2648
+ const float freq_scale = cparams.rope_freq_scale;
2452
2649
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2453
2650
 
2454
2651
  const int n_gpu_layers = model.n_gpu_layers;
2455
2652
 
2653
+ const int32_t n_tokens = batch.n_tokens;
2654
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
2655
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
2656
+
2657
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
2658
+
2659
+ //printf("n_kv = %d\n", n_kv);
2660
+
2456
2661
  auto & buf_compute = lctx.buf_compute;
2457
2662
 
2458
2663
  struct ggml_init_params params = {
@@ -2470,12 +2675,12 @@ static struct ggml_cgraph * llm_build_llama(
2470
2675
  struct ggml_tensor * cur;
2471
2676
  struct ggml_tensor * inpL;
2472
2677
 
2473
- if (tokens) {
2474
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2678
+ if (batch.token) {
2679
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2475
2680
 
2476
2681
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2477
2682
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2478
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2683
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2479
2684
  }
2480
2685
  ggml_set_name(inp_tokens, "inp_tokens");
2481
2686
 
@@ -2485,11 +2690,11 @@ static struct ggml_cgraph * llm_build_llama(
2485
2690
  GGML_ASSERT(false && "not implemented");
2486
2691
  #endif
2487
2692
 
2488
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2693
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2489
2694
 
2490
2695
  ggml_allocr_alloc(lctx.alloc, inpL);
2491
2696
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2492
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
2697
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2493
2698
  }
2494
2699
  }
2495
2700
 
@@ -2498,9 +2703,6 @@ static struct ggml_cgraph * llm_build_llama(
2498
2703
 
2499
2704
  // offload functions set the tensor output backend to GPU
2500
2705
  // tensors are GPU-accelerated if any input or the output has been offloaded
2501
- //
2502
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2503
- // in that case ggml_cuda_assign_buffers has no effect
2504
2706
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2505
2707
  offload_func_t offload_func_kq = llama_nop;
2506
2708
  offload_func_t offload_func_v = llama_nop;
@@ -2517,12 +2719,75 @@ static struct ggml_cgraph * llm_build_llama(
2517
2719
  }
2518
2720
  #endif // GGML_USE_CUBLAS
2519
2721
 
2722
+ // KQ_scale
2520
2723
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2724
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2521
2725
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2522
2726
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2523
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2727
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
2728
+ }
2729
+
2730
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
2731
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
2732
+ offload_func_kq(KQ_mask);
2733
+ ggml_set_name(KQ_mask, "KQ_mask");
2734
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
2735
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2736
+ float * data = (float *) KQ_mask->data;
2737
+ memset(data, 0, ggml_nbytes(KQ_mask));
2738
+
2739
+ for (int h = 0; h < 1; ++h) {
2740
+ for (int j = 0; j < n_tokens; ++j) {
2741
+ const llama_pos pos = batch.pos[j];
2742
+ const llama_seq_id seq_id = batch.seq_id[j];
2743
+
2744
+ for (int i = 0; i < n_kv; ++i) {
2745
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
2746
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
2747
+ }
2748
+ }
2749
+ }
2750
+ }
2751
+ }
2752
+
2753
+ // KQ_pos - contains the positions
2754
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2755
+ offload_func_kq(KQ_pos);
2756
+ ggml_set_name(KQ_pos, "KQ_pos");
2757
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
2758
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2759
+ int * data = (int *) KQ_pos->data;
2760
+ for (int i = 0; i < n_tokens; ++i) {
2761
+ data[i] = batch.pos[i];
2762
+ }
2763
+ }
2764
+
2765
+ // shift the entire K-cache if needed
2766
+ if (do_rope_shift) {
2767
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
2768
+ offload_func_kq(K_shift);
2769
+ ggml_set_name(K_shift, "K_shift");
2770
+ ggml_allocr_alloc(lctx.alloc, K_shift);
2771
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2772
+ int * data = (int *) K_shift->data;
2773
+ for (int i = 0; i < n_ctx; ++i) {
2774
+ data[i] = kv_self.cells[i].delta;
2775
+ }
2776
+ }
2777
+
2778
+ for (int il = 0; il < n_layer; ++il) {
2779
+ struct ggml_tensor * tmp =
2780
+ ggml_rope_custom_inplace(ctx0,
2781
+ ggml_view_3d(ctx0, kv_self.k,
2782
+ n_embd_head, n_head_kv, n_ctx,
2783
+ ggml_element_size(kv_self.k)*n_embd_head,
2784
+ ggml_element_size(kv_self.k)*n_embd_gqa,
2785
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
2786
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
2787
+ offload_func_kq(tmp);
2788
+ ggml_build_forward_expand(gf, tmp);
2789
+ }
2524
2790
  }
2525
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2526
2791
 
2527
2792
  for (int il = 0; il < n_layer; ++il) {
2528
2793
  ggml_format_name(inpL, "layer_inp_%d", il);
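Because a batch can now carry tokens from several sequences scattered over the cache, the causal mask is materialized as an explicit KQ_mask tensor added to the scaled scores, replacing the old ggml_diag_mask_inf over a contiguous n_past prefix. A plain-C++ sketch of the same rule, using the cell and batch fields introduced earlier in this diff:

#include <cmath>
#include <vector>

// Sketch: mask[j*n_kv + i] is added to the attention score of query token j
// against cache cell i; -INFINITY removes the pair after the soft-max.
static void build_kq_mask(const std::vector<llama_kv_cell> & cells,
                          const llama_pos * pos, const llama_seq_id * seq_id,
                          int n_tokens, int n_kv, std::vector<float> & mask) {
    mask.assign((size_t) n_tokens * n_kv, 0.0f);
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            // block cells owned by other sequences and cells holding future positions
            if (!cells[i].has_seq_id(seq_id[j]) || cells[i].pos > pos[j]) {
                mask[(size_t) j * n_kv + i] = -INFINITY;
            }
        }
    }
}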
@@ -2560,33 +2825,33 @@ static struct ggml_cgraph * llm_build_llama(
2560
2825
  offload_func_kq(tmpq);
2561
2826
  ggml_set_name(tmpq, "tmpq");
2562
2827
 
2563
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2828
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2564
2829
  offload_func_kq(Kcur);
2565
2830
  ggml_set_name(Kcur, "Kcur");
2566
2831
 
2567
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2832
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2568
2833
  offload_func_kq(Qcur);
2569
2834
  ggml_set_name(Qcur, "Qcur");
2570
2835
 
2571
2836
  // store key and value to memory
2572
2837
  {
2573
- // compute the transposed [N, n_embd] V matrix
2838
+ // compute the transposed [n_tokens, n_embd] V matrix
2574
2839
 
2575
2840
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2576
2841
  offload_func_v(tmpv);
2577
2842
  ggml_set_name(tmpv, "tmpv");
2578
2843
 
2579
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2844
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2580
2845
  offload_func_v(Vcur);
2581
2846
  ggml_set_name(Vcur, "Vcur");
2582
2847
 
2583
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2848
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2584
2849
  offload_func_kq(k);
2585
2850
  ggml_set_name(k, "k");
2586
2851
 
2587
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2852
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2588
2853
  ( n_ctx)*ggml_element_size(kv_self.v),
2589
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2854
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2590
2855
  offload_func_v(v);
2591
2856
  ggml_set_name(v, "v");
2592
2857
 
@@ -2601,7 +2866,7 @@ static struct ggml_cgraph * llm_build_llama(
2601
2866
 
2602
2867
  struct ggml_tensor * K =
2603
2868
  ggml_view_3d(ctx0, kv_self.k,
2604
- n_embd_head, n_past + N, n_head_kv,
2869
+ n_embd_head, n_kv, n_head_kv,
2605
2870
  ggml_element_size(kv_self.k)*n_embd_gqa,
2606
2871
  ggml_element_size(kv_self.k)*n_embd_head,
2607
2872
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2614,25 +2879,25 @@ static struct ggml_cgraph * llm_build_llama(
2614
2879
  ggml_set_name(KQ, "KQ");
2615
2880
 
2616
2881
  // KQ_scaled = KQ / sqrt(n_embd_head)
2617
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2618
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2882
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
2883
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2619
2884
  offload_func_kq(KQ_scaled);
2620
2885
  ggml_set_name(KQ_scaled, "KQ_scaled");
2621
2886
 
2622
2887
  // KQ_masked = mask_past(KQ_scaled)
2623
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2888
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2624
2889
  offload_func_kq(KQ_masked);
2625
2890
  ggml_set_name(KQ_masked, "KQ_masked");
2626
2891
 
2627
2892
  // KQ = soft_max(KQ_masked)
2628
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2893
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2629
2894
  offload_func_v(KQ_soft_max);
2630
2895
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2631
2896
 
2632
2897
  // split cached V into n_head heads
2633
2898
  struct ggml_tensor * V =
2634
2899
  ggml_view_3d(ctx0, kv_self.v,
2635
- n_past + N, n_embd_head, n_head_kv,
2900
+ n_kv, n_embd_head, n_head_kv,
2636
2901
  ggml_element_size(kv_self.v)*n_ctx,
2637
2902
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2638
2903
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -2647,7 +2912,7 @@ static struct ggml_cgraph * llm_build_llama(
2647
2912
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2648
2913
  // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2649
2914
  // is there a better way?
2650
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2915
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
2651
2916
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2652
2917
  #endif
2653
2918
 
@@ -2656,10 +2921,8 @@ static struct ggml_cgraph * llm_build_llama(
2656
2921
  offload_func_v(KQV_merged);
2657
2922
  ggml_set_name(KQV_merged, "KQV_merged");
2658
2923
 
2659
- // cur = KQV_merged.contiguous().view(n_embd, N)
2660
- cur = ggml_cpy(ctx0,
2661
- KQV_merged,
2662
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2924
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
2925
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
2663
2926
  offload_func_v(cur);
2664
2927
  ggml_set_name(cur, "KQV_merged_contiguous");
2665
2928
 
@@ -2750,20 +3013,12 @@ static struct ggml_cgraph * llm_build_llama(
2750
3013
  return gf;
2751
3014
  }
2752
3015
 
2753
-
2754
3016
  static struct ggml_cgraph * llm_build_baichaun(
2755
3017
  llama_context & lctx,
2756
- const llama_token * tokens,
2757
- const float * embd,
2758
- int n_tokens,
2759
- int n_past) {
2760
-
2761
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2762
-
2763
- const int N = n_tokens;
2764
-
3018
+ const llama_batch & batch) {
2765
3019
  const auto & model = lctx.model;
2766
3020
  const auto & hparams = model.hparams;
3021
+ const auto & cparams = lctx.cparams;
2767
3022
 
2768
3023
  const auto & kv_self = lctx.kv_self;
2769
3024
 
@@ -2771,7 +3026,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2771
3026
 
2772
3027
  const int64_t n_embd = hparams.n_embd;
2773
3028
  const int64_t n_layer = hparams.n_layer;
2774
- const int64_t n_ctx = hparams.n_ctx;
3029
+ const int64_t n_ctx = cparams.n_ctx;
2775
3030
  const int64_t n_head = hparams.n_head;
2776
3031
  const int64_t n_head_kv = hparams.n_head_kv;
2777
3032
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -2779,12 +3034,18 @@ static struct ggml_cgraph * llm_build_baichaun(
2779
3034
 
2780
3035
  GGML_ASSERT(n_embd_head == hparams.n_rot);
2781
3036
 
2782
- const float freq_base = hparams.rope_freq_base;
2783
- const float freq_scale = hparams.rope_freq_scale;
3037
+ const float freq_base = cparams.rope_freq_base;
3038
+ const float freq_scale = cparams.rope_freq_scale;
2784
3039
  const float norm_rms_eps = hparams.f_norm_rms_eps;
2785
3040
 
2786
3041
  const int n_gpu_layers = model.n_gpu_layers;
2787
3042
 
3043
+ const int32_t n_tokens = batch.n_tokens;
3044
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3045
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3046
+
3047
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3048
+
2788
3049
  auto & buf_compute = lctx.buf_compute;
2789
3050
 
2790
3051
  struct ggml_init_params params = {
@@ -2802,12 +3063,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2802
3063
  struct ggml_tensor * cur;
2803
3064
  struct ggml_tensor * inpL;
2804
3065
 
2805
- if (tokens) {
2806
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3066
+ if (batch.token) {
3067
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
2807
3068
 
2808
3069
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
2809
3070
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2810
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3071
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
2811
3072
  }
2812
3073
  ggml_set_name(inp_tokens, "inp_tokens");
2813
3074
 
@@ -2817,11 +3078,11 @@ static struct ggml_cgraph * llm_build_baichaun(
2817
3078
  GGML_ASSERT(false && "not implemented");
2818
3079
  #endif
2819
3080
 
2820
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3081
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
2821
3082
 
2822
3083
  ggml_allocr_alloc(lctx.alloc, inpL);
2823
3084
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2824
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3085
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
2825
3086
  }
2826
3087
  }
2827
3088
 
@@ -2830,9 +3091,6 @@ static struct ggml_cgraph * llm_build_baichaun(
2830
3091
 
2831
3092
  // offload functions set the tensor output backend to GPU
2832
3093
  // tensors are GPU-accelerated if any input or the output has been offloaded
2833
- //
2834
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2835
- // in that case ggml_cuda_assign_buffers has no effect
2836
3094
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2837
3095
  offload_func_t offload_func_kq = llama_nop;
2838
3096
  offload_func_t offload_func_v = llama_nop;
@@ -2849,12 +3107,75 @@ static struct ggml_cgraph * llm_build_baichaun(
2849
3107
  }
2850
3108
  #endif // GGML_USE_CUBLAS
2851
3109
 
3110
+ // KQ_scale
2852
3111
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3112
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2853
3113
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
2854
3114
  if (!ggml_allocr_is_measure(lctx.alloc)) {
2855
3115
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2856
3116
  }
2857
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3117
+
3118
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3119
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3120
+ offload_func_kq(KQ_mask);
3121
+ ggml_set_name(KQ_mask, "KQ_mask");
3122
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3123
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3124
+ float * data = (float *) KQ_mask->data;
3125
+ memset(data, 0, ggml_nbytes(KQ_mask));
3126
+
3127
+ for (int h = 0; h < 1; ++h) {
3128
+ for (int j = 0; j < n_tokens; ++j) {
3129
+ const llama_pos pos = batch.pos[j];
3130
+ const llama_seq_id seq_id = batch.seq_id[j];
3131
+
3132
+ for (int i = 0; i < n_kv; ++i) {
3133
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3134
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3135
+ }
3136
+ }
3137
+ }
3138
+ }
3139
+ }
3140
+
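
The block above replaces the old ggml_diag_mask_inf causal mask with an explicit additive mask over (query token j, cache cell i) pairs: an entry stays 0 when cell i belongs to the same sequence as token j and is not in its future, and becomes -INFINITY otherwise, so adding it to the scaled scores zeroes those attention weights after the softmax. Below is a self-contained illustration of the same loop on plain arrays, simplified to one sequence id per cell (the real cells hold a set).

    // Self-contained illustration of the additive attention mask built above:
    // 0.0f where query token j may attend cache cell i, -INFINITY otherwise.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    struct cell { int pos; int seq_id; };

    int main() {
        // a tiny cache holding two sequences
        std::vector<cell> cells = { {0,0}, {1,0}, {0,1}, {1,1} };
        // the new batch: one token per sequence, both at position 2
        std::vector<int> batch_pos    = { 2, 2 };
        std::vector<int> batch_seq_id = { 0, 1 };

        const int n_kv     = (int) cells.size();
        const int n_tokens = (int) batch_pos.size();
        std::vector<float> mask(n_kv*n_tokens, 0.0f);

        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                if (cells[i].seq_id != batch_seq_id[j] || cells[i].pos > batch_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY; // masked out before the softmax
                }
            }
        }

        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) printf("%5.0f ", mask[j*n_kv + i]);
            printf("\n");
        }
    }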
3141
+ // KQ_pos - contains the positions
3142
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3143
+ offload_func_kq(KQ_pos);
3144
+ ggml_set_name(KQ_pos, "KQ_pos");
3145
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3146
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3147
+ int * data = (int *) KQ_pos->data;
3148
+ for (int i = 0; i < n_tokens; ++i) {
3149
+ data[i] = batch.pos[i];
3150
+ }
3151
+ }
3152
+
3153
+ // shift the entire K-cache if needed
3154
+ if (do_rope_shift) {
3155
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3156
+ offload_func_kq(K_shift);
3157
+ ggml_set_name(K_shift, "K_shift");
3158
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3159
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3160
+ int * data = (int *) K_shift->data;
3161
+ for (int i = 0; i < n_ctx; ++i) {
3162
+ data[i] = kv_self.cells[i].delta;
3163
+ }
3164
+ }
3165
+
3166
+ for (int il = 0; il < n_layer; ++il) {
3167
+ struct ggml_tensor * tmp =
3168
+ ggml_rope_custom_inplace(ctx0,
3169
+ ggml_view_3d(ctx0, kv_self.k,
3170
+ n_embd_head, n_head_kv, n_ctx,
3171
+ ggml_element_size(kv_self.k)*n_embd_head,
3172
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3173
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3174
+ K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
3175
+ offload_func_kq(tmp);
3176
+ ggml_build_forward_expand(gf, tmp);
3177
+ }
3178
+ }
2858
3179
 
2859
3180
  for (int il = 0; il < n_layer; ++il) {
2860
3181
  ggml_format_name(inpL, "layer_inp_%d", il);
@@ -2896,12 +3217,12 @@ static struct ggml_cgraph * llm_build_baichaun(
2896
3217
  struct ggml_tensor * Qcur;
2897
3218
  switch (model.type) {
2898
3219
  case MODEL_7B:
2899
- Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2900
- Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
3220
+ Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
3221
+ Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
2901
3222
  break;
2902
3223
  case MODEL_13B:
2903
- Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2904
- Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
3224
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens);
3225
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens);
2905
3226
  break;
2906
3227
  default:
2907
3228
  GGML_ASSERT(false);
@@ -2915,23 +3236,23 @@ static struct ggml_cgraph * llm_build_baichaun(
2915
3236
 
2916
3237
  // store key and value to memory
2917
3238
  {
2918
- // compute the transposed [N, n_embd] V matrix
3239
+ // compute the transposed [n_tokens, n_embd] V matrix
2919
3240
 
2920
3241
  struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2921
3242
  offload_func_v(tmpv);
2922
3243
  ggml_set_name(tmpv, "tmpv");
2923
3244
 
2924
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
3245
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
2925
3246
  offload_func_v(Vcur);
2926
3247
  ggml_set_name(Vcur, "Vcur");
2927
3248
 
2928
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
3249
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
2929
3250
  offload_func_kq(k);
2930
3251
  ggml_set_name(k, "k");
2931
3252
 
2932
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
3253
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
2933
3254
  ( n_ctx)*ggml_element_size(kv_self.v),
2934
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
3255
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
2935
3256
  offload_func_v(v);
2936
3257
  ggml_set_name(v, "v");
2937
3258
 
@@ -2946,7 +3267,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2946
3267
 
2947
3268
  struct ggml_tensor * K =
2948
3269
  ggml_view_3d(ctx0, kv_self.k,
2949
- n_embd_head, n_past + N, n_head_kv,
3270
+ n_embd_head, n_kv, n_head_kv,
2950
3271
  ggml_element_size(kv_self.k)*n_embd_gqa,
2951
3272
  ggml_element_size(kv_self.k)*n_embd_head,
2952
3273
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2959,8 +3280,8 @@ static struct ggml_cgraph * llm_build_baichaun(
2959
3280
  ggml_set_name(KQ, "KQ");
2960
3281
 
2961
3282
  // KQ_scaled = KQ / sqrt(n_embd_head)
2962
- // KQ_scaled shape [n_past + N, N, n_head, 1]
2963
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3283
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3284
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
2964
3285
  offload_func_kq(KQ_scaled);
2965
3286
  ggml_set_name(KQ_scaled, "KQ_scaled");
2966
3287
 
@@ -2969,58 +3290,44 @@ static struct ggml_cgraph * llm_build_baichaun(
2969
3290
 
2970
3291
  switch (model.type) {
2971
3292
  case MODEL_7B:
2972
- KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
3293
+ KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
2973
3294
  break;
2974
3295
  case MODEL_13B:
2975
- KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
3296
+ // TODO: replace with ggml_add()
3297
+ KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
2976
3298
  ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2977
- KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
3299
+ KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
2978
3300
  break;
2979
3301
  default:
2980
3302
  GGML_ASSERT(false);
2981
3303
  }
2982
- // KQ_masked = mask_past(KQ_scaled)
2983
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2984
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2985
- // offload_func_kq(KQ_masked);
2986
- // ggml_set_name(KQ_masked, "KQ_masked");
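
For MODEL_13B the Baichuan builder keeps ALiBi instead of RoPE: the raw scores are biased by a per-head slope times the key distance (ggml_alibi with a max bias of 8), and the causal/sequence masking now comes from adding the same KQ_mask as above rather than from ggml_diag_mask_inf. Below is a minimal sketch of the standard ALiBi slope schedule for power-of-two head counts; ggml's exact schedule for non-power-of-two n_head may differ slightly.

    // Standard ALiBi slopes for n_head heads with a maximum bias of 8
    // (per the original ALiBi formulation); head h penalizes distant keys by
    // slope[h] * (key_pos - query_pos).
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 8;
        const float max_bias = 8.0f;

        const float m0 = powf(2.0f, -max_bias / n_head);
        for (int h = 0; h < n_head; ++h) {
            const float slope = powf(m0, (float) (h + 1));
            printf("head %d: slope = %g\n", h, slope);
        }
    }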
2987
3304
 
2988
3305
  // KQ = soft_max(KQ_masked)
2989
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
3306
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
2990
3307
  offload_func_v(KQ_soft_max);
2991
3308
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
2992
3309
 
2993
3310
  // split cached V into n_head heads
2994
3311
  struct ggml_tensor * V =
2995
3312
  ggml_view_3d(ctx0, kv_self.v,
2996
- n_past + N, n_embd_head, n_head_kv,
3313
+ n_kv, n_embd_head, n_head_kv,
2997
3314
  ggml_element_size(kv_self.v)*n_ctx,
2998
3315
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2999
3316
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3000
3317
  offload_func_v(V);
3001
3318
  ggml_set_name(V, "V");
3002
3319
 
3003
- #if 1
3004
3320
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3005
3321
  offload_func_v(KQV);
3006
3322
  ggml_set_name(KQV, "KQV");
3007
- #else
3008
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3009
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3010
- // is there a better way?
3011
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
3012
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
3013
- #endif
3014
3323
 
3015
3324
  // KQV_merged = KQV.permute(0, 2, 1, 3)
3016
3325
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3017
3326
  offload_func_v(KQV_merged);
3018
3327
  ggml_set_name(KQV_merged, "KQV_merged");
3019
3328
 
3020
- // cur = KQV_merged.contiguous().view(n_embd, N)
3021
- cur = ggml_cpy(ctx0,
3022
- KQV_merged,
3023
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
3329
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3330
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3024
3331
  offload_func_v(cur);
3025
3332
  ggml_set_name(cur, "KQV_merged_contiguous");
3026
3333
 
@@ -3111,19 +3418,12 @@ static struct ggml_cgraph * llm_build_baichaun(
3111
3418
  return gf;
3112
3419
  }
3113
3420
 
3114
- static struct ggml_cgraph * llm_build_falcon(
3421
+ static struct ggml_cgraph * llm_build_refact(
3115
3422
  llama_context & lctx,
3116
- const llama_token * tokens,
3117
- const float * embd,
3118
- int n_tokens,
3119
- int n_past) {
3120
-
3121
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3122
-
3123
- const int N = n_tokens;
3124
-
3423
+ const llama_batch & batch) {
3125
3424
  const auto & model = lctx.model;
3126
3425
  const auto & hparams = model.hparams;
3426
+ const auto & cparams = lctx.cparams;
3127
3427
 
3128
3428
  const auto & kv_self = lctx.kv_self;
3129
3429
 
@@ -3131,20 +3431,22 @@ static struct ggml_cgraph * llm_build_falcon(
3131
3431
 
3132
3432
  const int64_t n_embd = hparams.n_embd;
3133
3433
  const int64_t n_layer = hparams.n_layer;
3134
- const int64_t n_ctx = hparams.n_ctx;
3434
+ const int64_t n_ctx = cparams.n_ctx;
3135
3435
  const int64_t n_head = hparams.n_head;
3136
3436
  const int64_t n_head_kv = hparams.n_head_kv;
3137
3437
  const int64_t n_embd_head = hparams.n_embd_head();
3138
3438
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
3139
3439
 
3140
- GGML_ASSERT(n_embd_head == hparams.n_rot);
3141
-
3142
- const float freq_base = hparams.rope_freq_base;
3143
- const float freq_scale = hparams.rope_freq_scale;
3144
- const float norm_eps = hparams.f_norm_eps;
3440
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
3145
3441
 
3146
3442
  const int n_gpu_layers = model.n_gpu_layers;
3147
3443
 
3444
+ const int32_t n_tokens = batch.n_tokens;
3445
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3446
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3447
+
3448
+ // printf("n_kv = %d\n", n_kv);
3449
+
3148
3450
  auto & buf_compute = lctx.buf_compute;
3149
3451
 
3150
3452
  struct ggml_init_params params = {
@@ -3162,12 +3464,12 @@ static struct ggml_cgraph * llm_build_falcon(
3162
3464
  struct ggml_tensor * cur;
3163
3465
  struct ggml_tensor * inpL;
3164
3466
 
3165
- if (tokens) {
3166
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
3467
+ if (batch.token) {
3468
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3167
3469
 
3168
3470
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3169
3471
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3170
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
3472
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3171
3473
  }
3172
3474
  ggml_set_name(inp_tokens, "inp_tokens");
3173
3475
 
@@ -3177,11 +3479,11 @@ static struct ggml_cgraph * llm_build_falcon(
3177
3479
  GGML_ASSERT(false && "not implemented");
3178
3480
  #endif
3179
3481
 
3180
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
3482
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3181
3483
 
3182
3484
  ggml_allocr_alloc(lctx.alloc, inpL);
3183
3485
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3184
- memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
3486
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3185
3487
  }
3186
3488
  }
3187
3489
 
@@ -3190,9 +3492,6 @@ static struct ggml_cgraph * llm_build_falcon(
3190
3492
 
3191
3493
  // offload functions set the tensor output backend to GPU
3192
3494
  // tensors are GPU-accelerated if any input or the output has been offloaded
3193
- //
3194
- // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
3195
- // in that case ggml_cuda_assign_buffers has no effect
3196
3495
  offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3197
3496
  offload_func_t offload_func_kq = llama_nop;
3198
3497
  offload_func_t offload_func_v = llama_nop;
@@ -3209,15 +3508,432 @@ static struct ggml_cgraph * llm_build_falcon(
3209
3508
  }
3210
3509
  #endif // GGML_USE_CUBLAS
3211
3510
 
3511
+ // KQ_scale
3212
3512
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3513
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3213
3514
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3214
3515
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3215
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3516
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
3216
3517
  }
3217
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3218
3518
 
3219
- for (int il = 0; il < n_layer; ++il) {
3220
- struct ggml_tensor * attn_norm;
3519
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3520
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3521
+ offload_func_kq(KQ_mask);
3522
+ ggml_set_name(KQ_mask, "KQ_mask");
3523
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3524
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3525
+ float * data = (float *) KQ_mask->data;
3526
+ memset(data, 0, ggml_nbytes(KQ_mask));
3527
+
3528
+ for (int h = 0; h < 1; ++h) {
3529
+ for (int j = 0; j < n_tokens; ++j) {
3530
+ const llama_pos pos = batch.pos[j];
3531
+ const llama_seq_id seq_id = batch.seq_id[j];
3532
+
3533
+ for (int i = 0; i < n_kv; ++i) {
3534
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3535
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3536
+ }
3537
+ }
3538
+ }
3539
+ }
3540
+ }
3541
+
3542
+ for (int il = 0; il < n_layer; ++il) {
3543
+ ggml_format_name(inpL, "layer_inp_%d", il);
3544
+
3545
+ offload_func_t offload_func = llama_nop;
3546
+
3547
+ #ifdef GGML_USE_CUBLAS
3548
+ if (il >= i_gpu_start) {
3549
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
3550
+ }
3551
+ #endif // GGML_USE_CUBLAS
3552
+
3553
+ struct ggml_tensor * inpSA = inpL;
3554
+
3555
+ // norm
3556
+ {
3557
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
3558
+ offload_func(cur);
3559
+ ggml_set_name(cur, "rms_norm_0");
3560
+
3561
+ // cur = cur*attn_norm(broadcasted)
3562
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
3563
+ offload_func(cur);
3564
+ ggml_set_name(cur, "attention_norm_0");
3565
+ }
3566
+
3567
+ // self-attention
3568
+ {
3569
+ // compute Q and K
3570
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3571
+ offload_func_kq(tmpk);
3572
+ ggml_set_name(tmpk, "tmpk");
3573
+
3574
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3575
+ offload_func_kq(tmpq);
3576
+ ggml_set_name(tmpq, "tmpq");
3577
+
3578
+ struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
3579
+ offload_func_kq(Kcur);
3580
+ ggml_set_name(Kcur, "Kcur");
3581
+
3582
+ struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
3583
+ offload_func_kq(Qcur);
3584
+ ggml_set_name(Qcur, "Qcur");
3585
+
3586
+ // store key and value to memory
3587
+ {
3588
+ // compute the transposed [n_tokens, n_embd] V matrix
3589
+
3590
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3591
+ offload_func_v(tmpv);
3592
+ ggml_set_name(tmpv, "tmpv");
3593
+
3594
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
3595
+ offload_func_v(Vcur);
3596
+ ggml_set_name(Vcur, "Vcur");
3597
+
3598
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3599
+ offload_func_kq(k);
3600
+ ggml_set_name(k, "k");
3601
+
3602
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3603
+ ( n_ctx)*ggml_element_size(kv_self.v),
3604
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3605
+ offload_func_v(v);
3606
+ ggml_set_name(v, "v");
3607
+
3608
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3609
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3610
+ }
3611
+
3612
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
3613
+ offload_func_kq(Q);
3614
+ ggml_set_name(Q, "Q");
3615
+
3616
+ struct ggml_tensor * K =
3617
+ ggml_view_3d(ctx0, kv_self.k,
3618
+ n_embd_head, n_kv, n_head_kv,
3619
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3620
+ ggml_element_size(kv_self.k)*n_embd_head,
3621
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3622
+ offload_func_kq(K);
3623
+ ggml_set_name(K, "K");
3624
+
3625
+ // K * Q
3626
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3627
+ offload_func_kq(KQ);
3628
+ ggml_set_name(KQ, "KQ");
3629
+
3630
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3631
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
3632
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3633
+ offload_func_kq(KQ_scaled);
3634
+ ggml_set_name(KQ_scaled, "KQ_scaled");
3635
+
3636
+ // KQ_masked = mask_past(KQ_scaled)
3637
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
3638
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
3639
+
3640
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
3641
+ offload_func_kq(KQ_masked);
3642
+ ggml_set_name(KQ_masked, "KQ_masked");
3643
+
3644
+ // KQ = soft_max(KQ_masked)
3645
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3646
+ offload_func_v(KQ_soft_max);
3647
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
3648
+
3649
+ // split cached V into n_head heads
3650
+ struct ggml_tensor * V =
3651
+ ggml_view_3d(ctx0, kv_self.v,
3652
+ n_kv, n_embd_head, n_head_kv,
3653
+ ggml_element_size(kv_self.v)*n_ctx,
3654
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3655
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3656
+ offload_func_v(V);
3657
+ ggml_set_name(V, "V");
3658
+
3659
+ #if 1
3660
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3661
+ offload_func_v(KQV);
3662
+ ggml_set_name(KQV, "KQV");
3663
+ #else
3664
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3665
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3666
+ // is there a better way?
3667
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
3668
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
3669
+ #endif
3670
+
3671
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3672
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3673
+ offload_func_v(KQV_merged);
3674
+ ggml_set_name(KQV_merged, "KQV_merged");
3675
+
3676
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3677
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3678
+ offload_func_v(cur);
3679
+ ggml_set_name(cur, "KQV_merged_contiguous");
3680
+
3681
+ // projection (no bias)
3682
+ cur = ggml_mul_mat(ctx0,
3683
+ model.layers[il].wo,
3684
+ cur);
3685
+ offload_func(cur);
3686
+ ggml_set_name(cur, "result_wo");
3687
+ }
3688
+
3689
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
3690
+ offload_func(inpFF);
3691
+ ggml_set_name(inpFF, "inpFF");
3692
+
3693
+ // feed-forward network
3694
+ {
3695
+ // norm
3696
+ {
3697
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
3698
+ offload_func(cur);
3699
+ ggml_set_name(cur, "rms_norm_1");
3700
+
3701
+ // cur = cur*ffn_norm(broadcasted)
3702
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
3703
+ offload_func(cur);
3704
+ ggml_set_name(cur, "ffn_norm");
3705
+ }
3706
+
3707
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
3708
+ model.layers[il].w3,
3709
+ cur);
3710
+ offload_func(tmp);
3711
+ ggml_set_name(tmp, "result_w3");
3712
+
3713
+ cur = ggml_mul_mat(ctx0,
3714
+ model.layers[il].w1,
3715
+ cur);
3716
+ offload_func(cur);
3717
+ ggml_set_name(cur, "result_w1");
3718
+
3719
+ // SILU activation
3720
+ cur = ggml_silu(ctx0, cur);
3721
+ offload_func(cur);
3722
+ ggml_set_name(cur, "silu");
3723
+
3724
+ cur = ggml_mul(ctx0, cur, tmp);
3725
+ offload_func(cur);
3726
+ ggml_set_name(cur, "silu_x_result_w3");
3727
+
3728
+ cur = ggml_mul_mat(ctx0,
3729
+ model.layers[il].w2,
3730
+ cur);
3731
+ offload_func(cur);
3732
+ ggml_set_name(cur, "result_w2");
3733
+ }
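
The feed-forward block above is the usual SwiGLU/LLaMA-style MLP: out = W2 * (silu(W1 x) * (W3 x)), with w1 as the gate projection, w3 as the up projection and w2 as the down projection (the multiply between the two branches is elementwise). A self-contained numeric sketch of the same computation on small vectors:

    // Self-contained sketch of the SwiGLU feed-forward computed by the block above:
    //   out = W2 * ( silu(W1 * x) elementwise* (W3 * x) )
    #include <cmath>
    #include <cstdio>
    #include <vector>

    using vec = std::vector<float>;
    using mat = std::vector<vec>; // row-major: mat[row][col]

    static vec matvec(const mat & W, const vec & x) {
        vec y(W.size(), 0.0f);
        for (size_t r = 0; r < W.size(); ++r)
            for (size_t c = 0; c < x.size(); ++c)
                y[r] += W[r][c]*x[c];
        return y;
    }

    static float silu(float v) { return v / (1.0f + expf(-v)); }

    int main() {
        const vec x  = {1.0f, -2.0f};
        const mat W1 = {{0.5f, 0.1f}, {0.2f, 0.3f}, {-0.1f, 0.4f}}; // gate proj (n_ff x n_embd)
        const mat W3 = {{0.3f, 0.2f}, {0.1f, -0.2f}, {0.4f, 0.1f}}; // up proj   (n_ff x n_embd)
        const mat W2 = {{0.2f, -0.3f, 0.1f}, {0.5f, 0.2f, -0.4f}};  // down proj (n_embd x n_ff)

        vec g = matvec(W1, x), u = matvec(W3, x);
        for (size_t i = 0; i < g.size(); ++i) g[i] = silu(g[i])*u[i];

        const vec out = matvec(W2, g);
        for (float v : out) printf("%f ", v);
        printf("\n");
    }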
3734
+
3735
+ cur = ggml_add(ctx0, cur, inpFF);
3736
+ offload_func(cur);
3737
+ ggml_set_name(cur, "inpFF_+_result_w2");
3738
+
3739
+ // input for next layer
3740
+ inpL = cur;
3741
+ }
3742
+
3743
+ cur = inpL;
3744
+
3745
+ // norm
3746
+ {
3747
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
3748
+ offload_func_nr(cur);
3749
+ ggml_set_name(cur, "rms_norm_2");
3750
+
3751
+ // cur = cur*norm(broadcasted)
3752
+ cur = ggml_mul(ctx0, cur, model.output_norm);
3753
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
3754
+ ggml_set_name(cur, "result_norm");
3755
+ }
3756
+
3757
+ // lm_head
3758
+ cur = ggml_mul_mat(ctx0, model.output, cur);
3759
+ ggml_set_name(cur, "result_output");
3760
+
3761
+ ggml_build_forward_expand(gf, cur);
3762
+
3763
+ ggml_free(ctx0);
3764
+
3765
+ return gf;
3766
+ }
3767
+
3768
+ static struct ggml_cgraph * llm_build_falcon(
3769
+ llama_context & lctx,
3770
+ const llama_batch & batch) {
3771
+ const auto & model = lctx.model;
3772
+ const auto & hparams = model.hparams;
3773
+ const auto & cparams = lctx.cparams;
3774
+
3775
+ const auto & kv_self = lctx.kv_self;
3776
+
3777
+ GGML_ASSERT(!!kv_self.ctx);
3778
+
3779
+ const int64_t n_embd = hparams.n_embd;
3780
+ const int64_t n_layer = hparams.n_layer;
3781
+ const int64_t n_ctx = cparams.n_ctx;
3782
+ const int64_t n_head = hparams.n_head;
3783
+ const int64_t n_head_kv = hparams.n_head_kv;
3784
+ const int64_t n_embd_head = hparams.n_embd_head();
3785
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
3786
+
3787
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
3788
+
3789
+ const float freq_base = cparams.rope_freq_base;
3790
+ const float freq_scale = cparams.rope_freq_scale;
3791
+ const float norm_eps = hparams.f_norm_eps;
3792
+
3793
+ const int n_gpu_layers = model.n_gpu_layers;
3794
+
3795
+ const int32_t n_tokens = batch.n_tokens;
3796
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3797
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3798
+
3799
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3800
+
3801
+ //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
3802
+ // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
3803
+
3804
+ auto & buf_compute = lctx.buf_compute;
3805
+
3806
+ struct ggml_init_params params = {
3807
+ /*.mem_size =*/ buf_compute.size,
3808
+ /*.mem_buffer =*/ buf_compute.data,
3809
+ /*.no_alloc =*/ false,
3810
+ };
3811
+
3812
+ params.no_alloc = true;
3813
+
3814
+ struct ggml_context * ctx0 = ggml_init(params);
3815
+
3816
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
3817
+
3818
+ struct ggml_tensor * cur;
3819
+ struct ggml_tensor * inpL;
3820
+
3821
+ if (batch.token) {
3822
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3823
+
3824
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
3825
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3826
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3827
+ }
3828
+ ggml_set_name(inp_tokens, "inp_tokens");
3829
+
3830
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3831
+ } else {
3832
+ #ifdef GGML_USE_MPI
3833
+ GGML_ASSERT(false && "not implemented");
3834
+ #endif
3835
+
3836
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3837
+
3838
+ ggml_allocr_alloc(lctx.alloc, inpL);
3839
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3840
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3841
+ }
3842
+ }
3843
+
3844
+ const int i_gpu_start = n_layer - n_gpu_layers;
3845
+ (void) i_gpu_start;
3846
+
3847
+ // offload functions set the tensor output backend to GPU
3848
+ // tensors are GPU-accelerated if any input or the output has been offloaded
3849
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3850
+ offload_func_t offload_func_kq = llama_nop;
3851
+ offload_func_t offload_func_v = llama_nop;
3852
+
3853
+ #ifdef GGML_USE_CUBLAS
3854
+ if (n_gpu_layers > n_layer) {
3855
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
3856
+ }
3857
+ if (n_gpu_layers > n_layer + 1) {
3858
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
3859
+ }
3860
+ if (n_gpu_layers > n_layer + 2) {
3861
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
3862
+ }
3863
+ #endif // GGML_USE_CUBLAS
3864
+
3865
+ // KQ_scale
3866
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3867
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3868
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
3869
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3870
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3871
+ }
3872
+
3873
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3874
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3875
+ offload_func_kq(KQ_mask);
3876
+ ggml_set_name(KQ_mask, "KQ_mask");
3877
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3878
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3879
+ float * data = (float *) KQ_mask->data;
3880
+ memset(data, 0, ggml_nbytes(KQ_mask));
3881
+
3882
+ for (int h = 0; h < 1; ++h) {
3883
+ for (int j = 0; j < n_tokens; ++j) {
3884
+ const llama_pos pos = batch.pos[j];
3885
+ const llama_seq_id seq_id = batch.seq_id[j];
3886
+
3887
+ for (int i = 0; i < n_kv; ++i) {
3888
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3889
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3890
+ }
3891
+ }
3892
+ }
3893
+ }
3894
+ }
3895
+
3896
+ // KQ_pos - contains the positions
3897
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3898
+ offload_func_kq(KQ_pos);
3899
+ ggml_set_name(KQ_pos, "KQ_pos");
3900
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
3901
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3902
+ int * data = (int *) KQ_pos->data;
3903
+ for (int i = 0; i < n_tokens; ++i) {
3904
+ data[i] = batch.pos[i];
3905
+ }
3906
+ }
3907
+
3908
+ // shift the entire K-cache if needed
3909
+ if (do_rope_shift) {
3910
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3911
+ offload_func_kq(K_shift);
3912
+ ggml_set_name(K_shift, "K_shift");
3913
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3914
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3915
+ int * data = (int *) K_shift->data;
3916
+ for (int i = 0; i < n_ctx; ++i) {
3917
+ data[i] = kv_self.cells[i].delta;
3918
+ }
3919
+ }
3920
+
3921
+ for (int il = 0; il < n_layer; ++il) {
3922
+ struct ggml_tensor * tmp =
3923
+ ggml_rope_custom_inplace(ctx0,
3924
+ ggml_view_3d(ctx0, kv_self.k,
3925
+ n_embd_head, n_head_kv, n_ctx,
3926
+ ggml_element_size(kv_self.k)*n_embd_head,
3927
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3928
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3929
+ K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
3930
+ offload_func_kq(tmp);
3931
+ ggml_build_forward_expand(gf, tmp);
3932
+ }
3933
+ }
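
This is the same K-cache shift as in the LLaMA/Baichuan builders, but the Falcon builder passes rope mode 2 ("NeoX" style) rather than mode 0. The difference is only in how dimensions are paired for rotation; the sketch below shows the two pairing conventions as commonly described for GPT-J versus GPT-NeoX RoPE (the authoritative definition is in ggml's rope implementation).

    // Rough sketch of the two RoPE pairing conventions used in this file:
    // mode 0 (GPT-J/LLaMA style) rotates adjacent pairs, while mode 2 ("NeoX"
    // style, used by Falcon above) pairs dimension i with dimension i + n_rot/2.
    #include <cstdio>

    int main() {
        const int n_rot = 8;

        printf("mode 0 pairs: ");
        for (int i = 0; i < n_rot; i += 2) printf("(%d,%d) ", i, i + 1);

        printf("\nmode 2 pairs: ");
        for (int i = 0; i < n_rot/2; ++i) printf("(%d,%d) ", i, i + n_rot/2);
        printf("\n");
    }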
3934
+
3935
+ for (int il = 0; il < n_layer; ++il) {
3936
+ struct ggml_tensor * attn_norm;
3221
3937
 
3222
3938
  offload_func_t offload_func = llama_nop;
3223
3939
 
@@ -3271,45 +3987,45 @@ static struct ggml_cgraph * llm_build_falcon(
3271
3987
  // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
3272
3988
  // non-contiguous views is added for the rope operator
3273
3989
  struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
3274
- ctx0, cur, n_embd_head, n_head, N,
3990
+ ctx0, cur, n_embd_head, n_head, n_tokens,
3275
3991
  wsize * n_embd_head,
3276
3992
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3277
3993
  0));
3278
3994
  offload_func_kq(tmpq);
3279
3995
 
3280
3996
  struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3281
- ctx0, cur, n_embd_head, n_head_kv, N,
3997
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3282
3998
  wsize * n_embd_head,
3283
3999
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3284
4000
  wsize * n_embd_head * n_head));
3285
4001
  offload_func_kq(tmpk);
3286
4002
 
3287
4003
  struct ggml_tensor * tmpv = ggml_view_3d(
3288
- ctx0, cur, n_embd_head, n_head_kv, N,
4004
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3289
4005
  wsize * n_embd_head,
3290
4006
  wsize * n_embd_head * (n_head + 2 * n_head_kv),
3291
4007
  wsize * n_embd_head * (n_head + n_head_kv));
3292
4008
  offload_func_v(tmpv);
3293
4009
 
3294
4010
  // using mode = 2 for neox mode
3295
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
4011
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3296
4012
  offload_func_kq(Qcur);
3297
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
4013
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3298
4014
  offload_func_kq(Kcur);
3299
4015
 
3300
4016
  {
3301
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
4017
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3302
4018
  offload_func_v(Vcur);
3303
4019
  offload_func_v(Vcur->src[0]->src[0]);
3304
4020
  ggml_set_name(Vcur, "Vcur");
3305
4021
 
3306
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
4022
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3307
4023
  offload_func_kq(k);
3308
4024
  ggml_set_name(k, "k");
3309
4025
 
3310
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
4026
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3311
4027
  ( n_ctx)*ggml_element_size(kv_self.v),
3312
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
4028
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3313
4029
  offload_func_v(v);
3314
4030
 
3315
4031
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -3322,7 +4038,7 @@ static struct ggml_cgraph * llm_build_falcon(
3322
4038
 
3323
4039
  struct ggml_tensor * K =
3324
4040
  ggml_view_3d(ctx0, kv_self.k,
3325
- n_embd_head, n_past + N, n_head_kv,
4041
+ n_embd_head, n_kv, n_head_kv,
3326
4042
  ggml_element_size(kv_self.k)*n_embd_gqa,
3327
4043
  ggml_element_size(kv_self.k)*n_embd_head,
3328
4044
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3333,21 +4049,21 @@ static struct ggml_cgraph * llm_build_falcon(
3333
4049
  offload_func_kq(KQ);
3334
4050
  ggml_set_name(KQ, "KQ");
3335
4051
 
3336
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
4052
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3337
4053
  offload_func_kq(KQ_scaled);
3338
4054
  ggml_set_name(KQ_scaled, "KQ_scaled");
3339
4055
 
3340
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
4056
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3341
4057
  offload_func_kq(KQ_masked);
3342
4058
  ggml_set_name(KQ_masked, "KQ_masked");
3343
4059
 
3344
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
4060
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3345
4061
  offload_func_v(KQ_soft_max);
3346
4062
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3347
4063
 
3348
4064
  struct ggml_tensor * V =
3349
4065
  ggml_view_3d(ctx0, kv_self.v,
3350
- n_past + N, n_embd_head, n_head_kv,
4066
+ n_kv, n_embd_head, n_head_kv,
3351
4067
  ggml_element_size(kv_self.v)*n_ctx,
3352
4068
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3353
4069
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3362,7 +4078,7 @@ static struct ggml_cgraph * llm_build_falcon(
3362
4078
  offload_func_v(KQV_merged);
3363
4079
  ggml_set_name(KQV_merged, "KQV_merged");
3364
4080
 
3365
- cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
4081
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3366
4082
  offload_func_v(cur);
3367
4083
  ggml_set_name(cur, "KQV_merged_contiguous");
3368
4084
 
@@ -3420,17 +4136,10 @@ static struct ggml_cgraph * llm_build_falcon(
3420
4136
 
3421
4137
  static struct ggml_cgraph * llm_build_starcoder(
3422
4138
  llama_context & lctx,
3423
- const llama_token * tokens,
3424
- const float * embd,
3425
- int n_tokens,
3426
- int n_past) {
3427
-
3428
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
3429
-
3430
- const int N = n_tokens;
3431
-
4139
+ const llama_batch & batch) {
3432
4140
  const auto & model = lctx.model;
3433
4141
  const auto & hparams = model.hparams;
4142
+ const auto & cparams = lctx.cparams;
3434
4143
 
3435
4144
  const auto & kv_self = lctx.kv_self;
3436
4145
 
@@ -3438,7 +4147,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3438
4147
 
3439
4148
  const int64_t n_embd = hparams.n_embd;
3440
4149
  const int64_t n_layer = hparams.n_layer;
3441
- const int64_t n_ctx = hparams.n_ctx;
4150
+ const int64_t n_ctx = cparams.n_ctx;
3442
4151
  const int64_t n_head = hparams.n_head;
3443
4152
  const int64_t n_head_kv = hparams.n_head_kv;
3444
4153
  const int64_t n_embd_head = hparams.n_embd_head();
@@ -3446,7 +4155,11 @@ static struct ggml_cgraph * llm_build_starcoder(
3446
4155
 
3447
4156
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3448
4157
 
3449
- const float norm_eps = hparams.f_norm_eps;
4158
+ const float norm_eps = hparams.f_norm_eps;
4159
+
4160
+ const int32_t n_tokens = batch.n_tokens;
4161
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4162
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3450
4163
 
3451
4164
  auto & buf_compute = lctx.buf_compute;
3452
4165
 
@@ -3467,12 +4180,12 @@ static struct ggml_cgraph * llm_build_starcoder(
3467
4180
  struct ggml_tensor * position;
3468
4181
  struct ggml_tensor * inpL;
3469
4182
 
3470
- if (tokens) {
3471
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
4183
+ if (batch.token) {
4184
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3472
4185
 
3473
4186
  ggml_allocr_alloc(lctx.alloc, inp_tokens);
3474
4187
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3475
- memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
4188
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3476
4189
  }
3477
4190
  ggml_set_name(inp_tokens, "inp_tokens");
3478
4191
 
@@ -3482,21 +4195,21 @@ static struct ggml_cgraph * llm_build_starcoder(
3482
4195
  GGML_ASSERT(false && "not implemented");
3483
4196
  #endif
3484
4197
 
3485
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
4198
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3486
4199
 
3487
4200
  ggml_allocr_alloc(lctx.alloc, token);
3488
4201
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3489
- memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
4202
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
3490
4203
  }
3491
4204
  }
3492
4205
 
3493
4206
  {
3494
4207
  // Compute position embeddings.
3495
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
4208
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3496
4209
  ggml_allocr_alloc(lctx.alloc, inp_positions);
3497
4210
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3498
- for (int i = 0; i < N; ++i) {
3499
- ((int32_t *) inp_positions->data)[i] = n_past + i;
4211
+ for (int i = 0; i < n_tokens; ++i) {
4212
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
3500
4213
  }
3501
4214
  }
3502
4215
  ggml_set_name(inp_positions, "inp_positions");
@@ -3504,12 +4217,35 @@ static struct ggml_cgraph * llm_build_starcoder(
3504
4217
  position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3505
4218
  }
3506
4219
 
4220
+ // KQ_scale
3507
4221
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4222
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3508
4223
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3509
4224
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3510
4225
  ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3511
4226
  }
3512
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4227
+
4228
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4229
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4230
+ ggml_set_name(KQ_mask, "KQ_mask");
4231
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4232
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4233
+ float * data = (float *) KQ_mask->data;
4234
+ memset(data, 0, ggml_nbytes(KQ_mask));
4235
+
4236
+ for (int h = 0; h < 1; ++h) {
4237
+ for (int j = 0; j < n_tokens; ++j) {
4238
+ const llama_pos pos = batch.pos[j];
4239
+ const llama_seq_id seq_id = batch.seq_id[j];
4240
+
4241
+ for (int i = 0; i < n_kv; ++i) {
4242
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4243
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4244
+ }
4245
+ }
4246
+ }
4247
+ }
4248
+ }
3513
4249
 
3514
4250
  inpL = ggml_add(ctx0, token, position);
3515
4251
  ggml_set_name(inpL, "inpL");
@@ -3525,23 +4261,23 @@ static struct ggml_cgraph * llm_build_starcoder(
3525
4261
  // Self Attention
3526
4262
  cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
3527
4263
 
3528
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
3529
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
3530
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
4264
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
4265
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
4266
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
3531
4267
 
3532
4268
  struct ggml_tensor * Qcur = tmpq;
3533
4269
  struct ggml_tensor * Kcur = tmpk;
3534
4270
 
3535
4271
  {
3536
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
4272
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3537
4273
  ggml_set_name(Vcur, "Vcur");
3538
4274
 
3539
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
4275
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3540
4276
  ggml_set_name(k, "k");
3541
4277
 
3542
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
4278
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3543
4279
  ( n_ctx)*ggml_element_size(kv_self.v),
3544
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
4280
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3545
4281
 
3546
4282
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3547
4283
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3551,13 +4287,13 @@ static struct ggml_cgraph * llm_build_starcoder(
3551
4287
  ggml_permute(ctx0,
3552
4288
  ggml_cpy(ctx0,
3553
4289
  Qcur,
3554
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
4290
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
3555
4291
  0, 2, 1, 3);
3556
4292
  ggml_set_name(Q, "Q");
3557
4293
 
3558
4294
  struct ggml_tensor * K =
3559
4295
  ggml_view_3d(ctx0, kv_self.k,
3560
- n_embd_head, n_past + N, n_head_kv,
4296
+ n_embd_head, n_kv, n_head_kv,
3561
4297
  ggml_element_size(kv_self.k)*n_embd_gqa,
3562
4298
  ggml_element_size(kv_self.k)*n_embd_head,
3563
4299
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3568,12 +4304,12 @@ static struct ggml_cgraph * llm_build_starcoder(
3568
4304
  ggml_set_name(KQ, "KQ");
3569
4305
 
3570
4306
  // KQ_scaled = KQ / sqrt(n_embd_head)
3571
- // KQ_scaled shape [n_past + N, N, n_head, 1]
4307
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3572
4308
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
3573
4309
  ggml_set_name(KQ_scaled, "KQ_scaled");
3574
4310
 
3575
4311
  // KQ_masked = mask_past(KQ_scaled)
3576
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
4312
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3577
4313
  ggml_set_name(KQ_masked, "KQ_masked");
3578
4314
 
3579
4315
  // KQ = soft_max(KQ_masked)
@@ -3583,7 +4319,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3583
4319
  // split cached V into n_head heads
3584
4320
  struct ggml_tensor * V =
3585
4321
  ggml_view_3d(ctx0, kv_self.v,
3586
- n_past + N, n_embd_head, n_head_kv,
4322
+ n_kv, n_embd_head, n_head_kv,
3587
4323
  ggml_element_size(kv_self.v)*n_ctx,
3588
4324
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3589
4325
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3596,10 +4332,8 @@ static struct ggml_cgraph * llm_build_starcoder(
3596
4332
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3597
4333
  ggml_set_name(KQV_merged, "KQV_merged");
3598
4334
 
3599
- // cur = KQV_merged.contiguous().view(n_embd, N)
3600
- cur = ggml_cpy(ctx0,
3601
- KQV_merged,
3602
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
4335
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
4336
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3603
4337
  ggml_set_name(cur, "KQV_merged_contiguous");
3604
4338
  }
3605
4339
 
@@ -3649,10 +4383,7 @@ static struct ggml_cgraph * llm_build_starcoder(
3649
4383
 
3650
4384
  static struct ggml_cgraph * llama_build_graph(
3651
4385
  llama_context & lctx,
3652
- const llama_token * tokens,
3653
- const float * embd,
3654
- int n_tokens,
3655
- int n_past) {
4386
+ const llama_batch & batch) {
3656
4387
  const auto & model = lctx.model;
3657
4388
 
3658
4389
  struct ggml_cgraph * result = NULL;
@@ -3660,76 +4391,121 @@ static struct ggml_cgraph * llama_build_graph(
3660
4391
  switch (model.arch) {
3661
4392
  case LLM_ARCH_LLAMA:
3662
4393
  {
3663
- result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
4394
+ result = llm_build_llama(lctx, batch);
3664
4395
  } break;
3665
4396
  case LLM_ARCH_BAICHUAN:
3666
4397
  {
3667
- result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
4398
+ result = llm_build_baichaun(lctx, batch);
3668
4399
  } break;
3669
4400
  case LLM_ARCH_FALCON:
3670
4401
  {
3671
- result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
4402
+ result = llm_build_falcon(lctx, batch);
3672
4403
  } break;
3673
4404
  case LLM_ARCH_STARCODER:
3674
4405
  {
3675
- result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
4406
+ result = llm_build_starcoder(lctx, batch);
4407
+ } break;
4408
+ case LLM_ARCH_REFACT:
4409
+ {
4410
+ result = llm_build_refact(lctx, batch);
3676
4411
  } break;
3677
4412
  default:
3678
4413
  GGML_ASSERT(false);
3679
- };
4414
+ }
3680
4415
 
3681
4416
  return result;
3682
4417
  }
3683
4418
 
3684
- // evaluate the transformer
4419
+ // decode a batch of tokens by evaluating the transformer
3685
4420
  //
3686
4421
  // - lctx: llama context
3687
- // - tokens: new batch of tokens to process
3688
- // - embd embeddings input
3689
- // - n_tokens number of tokens
3690
- // - n_past: the context size so far
4422
+ // - batch: batch to evaluate
3691
4423
  // - n_threads: number of threads to use
3692
4424
  //
3693
- static bool llama_eval_internal(
4425
+ // return 0 on success
4426
+ // return positive int on warning
4427
+ // return negative int on error
4428
+ //
4429
+ static int llama_decode_internal(
3694
4430
  llama_context & lctx,
3695
- const llama_token * tokens,
3696
- const float * embd,
3697
- int n_tokens,
3698
- int n_past,
3699
- int n_threads,
3700
- const char * cgraph_fname) {
4431
+ llama_batch batch) {
4432
+ const uint32_t n_tokens = batch.n_tokens;
4433
+
4434
+ if (n_tokens == 0) {
4435
+ LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
4436
+ return -1;
4437
+ }
4438
+
4439
+ const auto & model = lctx.model;
4440
+ const auto & hparams = model.hparams;
4441
+ const auto & cparams = lctx.cparams;
3701
4442
 
3702
- GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
4443
+ const auto n_batch = cparams.n_batch;
3703
4444
 
3704
- GGML_ASSERT(n_tokens > 0);
3705
- GGML_ASSERT(n_past >= 0);
3706
- // TODO: keep the values of n_batch and n_ctx
3707
- // GGML_ASSERT(n_tokens <= n_batch);
3708
- // GGML_ASSERT(n_past + n_tokens <= n_ctx);
4445
+ GGML_ASSERT(n_tokens <= n_batch);
4446
+
4447
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
4448
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
3709
4449
 
3710
4450
  const int64_t t_start_us = ggml_time_us();
3711
4451
 
3712
4452
  #ifdef GGML_USE_MPI
3713
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
4453
+ // TODO: needs fix after #3228
4454
+ GGML_ASSERT(false && "not implemented");
4455
+ //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
3714
4456
  #endif
3715
4457
 
3716
4458
  GGML_ASSERT(n_threads > 0);
3717
4459
 
3718
- const int N = n_tokens;
3719
-
3720
- const auto & model = lctx.model;
3721
- const auto & hparams = model.hparams;
3722
-
3723
- const auto & kv_self = lctx.kv_self;
4460
+ auto & kv_self = lctx.kv_self;
3724
4461
 
3725
4462
  GGML_ASSERT(!!kv_self.ctx);
3726
4463
 
3727
4464
  const int64_t n_embd = hparams.n_embd;
3728
4465
  const int64_t n_vocab = hparams.n_vocab;
3729
4466
 
4467
+ // helpers for smoother batch API transition
4468
+ // after deprecating the llama_eval calls, these will be removed
4469
+ std::vector<llama_pos> pos;
4470
+ std::vector<llama_seq_id> seq_id;
4471
+
4472
+ if (batch.pos == nullptr) {
4473
+ pos.resize(n_tokens);
4474
+ for (uint32_t i = 0; i < n_tokens; i++) {
4475
+ pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
4476
+ }
4477
+
4478
+ batch.pos = pos.data();
4479
+ }
4480
+
4481
+ if (batch.seq_id == nullptr) {
4482
+ seq_id.resize(n_tokens);
4483
+ for (uint32_t i = 0; i < n_tokens; i++) {
4484
+ seq_id[i] = batch.all_seq_id;
4485
+ }
4486
+
4487
+ batch.seq_id = seq_id.data();
4488
+ }
4489
+
4490
+ // we always start to search for a free slot from the start of the cache
4491
+ // TODO: better strategies can be implemented
4492
+ kv_self.head = 0;
4493
+
4494
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
4495
+ return 1;
4496
+ }
4497
+
4498
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
4499
+ // after enough generations, the benefit from this heuristic disappears
4500
+ // if we start defragmenting the cache, the benefit from this will be more important
4501
+ //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
4502
+ kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
4503
+
4504
+ //printf("kv_self.n = %d\n", kv_self.n);
4505
+
3730
4506
  ggml_allocr_reset(lctx.alloc);
3731
4507
 
3732
- ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
4508
+ ggml_cgraph * gf = llama_build_graph(lctx, batch);
3733
4509
 
3734
4510
  ggml_allocr_alloc_graph(lctx.alloc, gf);
3735
4511
 
@@ -3738,6 +4514,7 @@ static bool llama_eval_internal(
3738
4514
  ggml_tensor * node = gf->leafs[i];
3739
4515
  if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
3740
4516
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
4517
+ ggml_cuda_copy_to_device(node);
3741
4518
  }
3742
4519
  }
3743
4520
 
@@ -3747,6 +4524,8 @@ static bool llama_eval_internal(
3747
4524
  ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
3748
4525
  }
3749
4526
  }
4527
+
4528
+ ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
3750
4529
  #endif
3751
4530
 
3752
4531
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3756,14 +4535,15 @@ static bool llama_eval_internal(
3756
4535
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
3757
4536
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
3758
4537
  // with the BLAS calls. need a better solution
3759
- if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
4538
+ if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
3760
4539
  n_threads = std::min(4, n_threads);
3761
4540
  }
3762
4541
 
3763
4542
  // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
3764
4543
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
3765
4544
  model.arch == LLM_ARCH_BAICHUAN ||
3766
- model.arch == LLM_ARCH_FALCON;
4545
+ model.arch == LLM_ARCH_FALCON ||
4546
+ model.arch == LLM_ARCH_REFACT;
3767
4547
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
3768
4548
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
3769
4549
  n_threads = 1;
@@ -3795,12 +4575,9 @@ static bool llama_eval_internal(
3795
4575
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
3796
4576
  #endif
3797
4577
 
3798
- // update kv token count
3799
- lctx.kv_self.n = n_past + N;
3800
-
3801
- if (cgraph_fname) {
3802
- ggml_graph_export(gf, cgraph_fname);
3803
- }
4578
+ // update the kv ring buffer
4579
+ lctx.kv_self.head += n_tokens;
4580
+ lctx.kv_self.has_shift = false;
3804
4581
 
3805
4582
  #ifdef GGML_PERF
3806
4583
  // print timing information per ggml operation (for debugging purposes)
@@ -3817,13 +4594,20 @@ static bool llama_eval_internal(
3817
4594
  {
3818
4595
  auto & logits_out = lctx.logits;
3819
4596
 
3820
- if (lctx.logits_all) {
3821
- logits_out.resize(n_vocab * N);
3822
- memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
4597
+ if (batch.logits) {
4598
+ logits_out.resize(n_vocab * n_tokens);
4599
+ for (uint32_t i = 0; i < n_tokens; i++) {
4600
+ if (batch.logits[i] == 0) {
4601
+ continue;
4602
+ }
4603
+ memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
4604
+ }
4605
+ } else if (lctx.logits_all) {
4606
+ logits_out.resize(n_vocab * n_tokens);
4607
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
3823
4608
  } else {
3824
- // return result for just the last token
3825
4609
  logits_out.resize(n_vocab);
3826
- memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
4610
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
3827
4611
  }
3828
4612
  }
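
Logit extraction is now selective: when batch.logits is provided, only the rows whose flag is non-zero are copied out, which lets a caller skip the large n_vocab copies for prompt tokens it does not need. Below is a hedged caller-side sketch that uses only the llama_batch fields appearing in this diff; the field types are assumed from their usage above, and llama_get_logits is assumed unchanged.

    // Hedged sketch: request logits only for the final token of the batch, via the
    // batch.logits flags consumed by the loop above. Building the batch manually
    // like this assumes the caller owns the arrays for the lifetime of the call.
    #include "llama.h"
    #include <cstdint>
    #include <vector>

    float * logits_for_last_token(llama_context * ctx, std::vector<llama_token> & toks) {
        const int n = (int) toks.size();

        std::vector<llama_pos>    pos(n);
        std::vector<llama_seq_id> seq(n, 0);
        std::vector<int8_t>       want_logits(n, 0);
        for (int i = 0; i < n; ++i) pos[i] = i;
        want_logits[n - 1] = 1; // only the last token produces logits

        llama_batch batch = {};
        batch.n_tokens = n;
        batch.token    = toks.data();
        batch.pos      = pos.data();
        batch.seq_id   = seq.data();
        batch.logits   = want_logits.data();

        if (llama_decode(ctx, batch) != 0) {
            return nullptr;
        }
        return llama_get_logits(ctx); // only the flagged rows were written
    }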
3829
4613
 
@@ -3832,20 +4616,27 @@ static bool llama_eval_internal(
3832
4616
  auto & embedding_out = lctx.embedding;
3833
4617
 
3834
4618
  embedding_out.resize(n_embd);
3835
- memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
4619
+ memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd);
3836
4620
  }
3837
4621
 
3838
4622
  // measure the performance only for the single-token evals
3839
- if (N == 1) {
4623
+ if (n_tokens == 1) {
3840
4624
  lctx.t_eval_us += ggml_time_us() - t_start_us;
3841
4625
  lctx.n_eval++;
3842
4626
  }
3843
- else if (N > 1) {
4627
+ else if (n_tokens > 1) {
3844
4628
  lctx.t_p_eval_us += ggml_time_us() - t_start_us;
3845
- lctx.n_p_eval += N;
4629
+ lctx.n_p_eval += n_tokens;
3846
4630
  }
3847
4631
 
3848
- return true;
4632
+ // get a more accurate load time, upon first eval
4633
+ // TODO: fix this
4634
+ if (!lctx.has_evaluated_once) {
4635
+ lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
4636
+ lctx.has_evaluated_once = true;
4637
+ }
4638
+
4639
+ return 0;
3849
4640
  }
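
Callers no longer pass tokens, n_past and n_threads directly; they submit a llama_batch, and the code above derives per-token pos and seq_id from all_pos_0/all_pos_1/all_seq_id whenever those arrays are left null. A hedged usage sketch against the public wrappers assumed to accompany this internal change (llama_batch_get_one and llama_decode from llama.h):

    // Hedged usage sketch: llama_batch_get_one() is assumed to fill only
    // token/n_tokens/all_pos_0/all_seq_id, so llama_decode_internal() derives
    // the per-token positions and sequence ids exactly as shown above.
    #include "llama.h"
    #include <vector>

    int decode_prompt(llama_context * ctx, std::vector<llama_token> & prompt) {
        // one sequence (seq_id 0), positions 0..n-1 generated from all_pos_0/all_pos_1
        llama_batch batch = llama_batch_get_one(prompt.data(), (int) prompt.size(), 0, 0);

        // 0 on success, positive on warning (e.g. no free kv slot), negative on error
        return llama_decode(ctx, batch);
    }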
3850
4641
 
3851
4642
  //
@@ -3872,18 +4663,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
3872
4663
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
3873
4664
  }
3874
4665
 
3875
- static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
4666
+ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
4667
+ return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
4668
+ }
4669
+
4670
+ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
3876
4671
  GGML_ASSERT(llama_is_byte_token(vocab, id));
3877
4672
  const auto& token_data = vocab.id_to_token.at(id);
3878
- auto buf = token_data.text.substr(3, 2);
3879
- return strtol(buf.c_str(), NULL, 16);
4673
+ switch (llama_vocab_get_type(vocab)) {
4674
+ case LLAMA_VOCAB_TYPE_SPM: {
4675
+ auto buf = token_data.text.substr(3, 2);
4676
+ return strtol(buf.c_str(), NULL, 16);
4677
+ }
4678
+ case LLAMA_VOCAB_TYPE_BPE: {
4679
+ GGML_ASSERT(false);
4680
+ return unicode_to_bytes_bpe(token_data.text);
4681
+ }
4682
+ default:
4683
+ GGML_ASSERT(false);
4684
+ }
3880
4685
  }
3881
4686
 
3882
4687
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
3883
- char buf[7];
3884
- int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
3885
- GGML_ASSERT(0 <= result && result < 7);
3886
- return vocab.token_to_id.at(buf);
4688
+ switch (llama_vocab_get_type(vocab)) {
4689
+ case LLAMA_VOCAB_TYPE_SPM: {
4690
+ char buf[7];
4691
+ int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
4692
+ GGML_ASSERT(0 <= result && result < 7);
4693
+ return vocab.token_to_id.at(buf);
4694
+ }
4695
+ case LLAMA_VOCAB_TYPE_BPE: {
4696
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
4697
+ }
4698
+ default:
4699
+ GGML_ASSERT(false);
4700
+ }
3887
4701
  }
3888
4702
 
3889
4703
  static void llama_escape_whitespace(std::string & text) {
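llama_token_to_byte and llama_byte_to_token now dispatch on the vocabulary type: SPM vocabularies store raw bytes as literal "<0xXX>" tokens, while BPE vocabularies route through the new unicode helpers (bytes_to_unicode_bpe / unicode_to_bytes_bpe). A small standalone illustration of the SPM convention, not code from the package, just the format the two helpers agree on:

    // SPM byte fallback: the byte 0x41 ('A') is represented by the literal token "<0x41>"
    char buf[7];
    snprintf(buf, sizeof(buf), "<0x%02X>", (uint8_t) 'A');   // -> "<0x41>", looked up in token_to_id
    // the reverse direction parses the two hex digits back out with strtol(..., 16), as shown above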
@@ -4163,15 +4977,9 @@ struct llm_tokenizer_bpe {
4163
4977
  std::string byte_str(1, *j);
4164
4978
  auto token_multibyte = vocab.token_to_id.find(byte_str);
4165
4979
  if (token_multibyte == vocab.token_to_id.end()) {
4166
- try {
4167
- llama_token token_byte = llama_byte_to_token(vocab, *j);
4168
- output.push_back(token_byte);
4169
- } catch (const std::out_of_range & err) {
4170
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
4171
- }
4172
- } else {
4173
- output.push_back((*token_multibyte).second);
4980
+ throw std::runtime_error("ERROR: byte not found in vocab");
4174
4981
  }
4982
+ output.push_back((*token_multibyte).second);
4175
4983
  }
4176
4984
  } else {
4177
4985
  output.push_back((*token).second);
@@ -4208,23 +5016,144 @@ private:
4208
5016
  work_queue.push(bigram);
4209
5017
  }
4210
5018
 
4211
- // probably not 100% correct
4212
- static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
4213
- std::vector<std::string> words;
5019
+ std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
5020
+ std::vector<std::string> bpe_words;
5021
+ std::vector<std::string> bpe_encoded_words;
5022
+
5023
+ std::string token = "";
5024
+ // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
5025
+ bool collecting_numeric = false;
5026
+ bool collecting_letter = false;
5027
+ bool collecting_special = false;
5028
+ bool collecting_whitespace_lookahead = false;
5029
+ bool collecting = false;
5030
+
5031
+ std::vector<std::string> text_utf;
5032
+ text_utf.reserve(text.size());
5033
+ bpe_words.reserve(text.size());
5034
+ bpe_encoded_words.reserve(text.size());
5035
+
5036
+ auto cps = codepoints_from_utf8(text);
5037
+ for (size_t i = 0; i < cps.size(); ++i)
5038
+ text_utf.emplace_back(codepoint_to_utf8(cps[i]));
5039
+
5040
+ for (int i = 0; i < (int)text_utf.size(); i++) {
5041
+ const std::string & utf_char = text_utf[i];
5042
+ bool split_condition = false;
5043
+ // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5044
+ int bytes_remain = text_utf.size() - i;
5045
+ // forward backward lookups
5046
+ const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
5047
+ const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
5048
+
5049
+ // handling contractions
5050
+ if (!split_condition && bytes_remain >= 2) {
5051
+ // 's|'t|'m|'d
5052
+ if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
5053
+ split_condition = true;
5054
+ }
5055
+ if (split_condition) {
5056
+ if (token.size()) {
5057
+ bpe_words.emplace_back(token); // push previous content as token
5058
+ }
5059
+ token = utf_char + utf_char_next;
5060
+ bpe_words.emplace_back(token);
5061
+ token = "";
5062
+ i++;
5063
+ continue;
5064
+ }
5065
+ }
5066
+ if (!split_condition && bytes_remain >= 3) {
5067
+ // 're|'ve|'ll
5068
+ if (utf_char == "\'" && (
5069
+ (utf_char_next == "r" || utf_char_next_next == "e") ||
5070
+ (utf_char_next == "v" || utf_char_next_next == "e") ||
5071
+ (utf_char_next == "l" || utf_char_next_next == "l"))
5072
+ ) {
5073
+ split_condition = true;
5074
+ }
5075
+ if (split_condition) {
5076
+ // current token + next token can be defined
5077
+ if (token.size()) {
5078
+ bpe_words.emplace_back(token); // push previous content as token
5079
+ }
5080
+ token = utf_char + utf_char_next + utf_char_next_next;
5081
+ bpe_words.emplace_back(token); // the contraction
5082
+ token = "";
5083
+ i += 2;
5084
+ continue;
5085
+ }
5086
+ }
5087
+
5088
+ if (!split_condition && !collecting) {
5089
+ if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
5090
+ collecting_letter = true;
5091
+ collecting = true;
5092
+ }
5093
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
5094
+ collecting_numeric = true;
5095
+ collecting = true;
5096
+ }
5097
+ else if (
5098
+ ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
5099
+ (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
5100
+ ) {
5101
+ collecting_special = true;
5102
+ collecting = true;
5103
+ }
5104
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
5105
+ collecting_whitespace_lookahead = true;
5106
+ collecting = true;
5107
+ }
5108
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
5109
+ split_condition = true;
5110
+ }
5111
+ }
5112
+ else if (!split_condition && collecting) {
5113
+ if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
5114
+ split_condition = true;
5115
+ }
5116
+ else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
5117
+ split_condition = true;
5118
+ }
5119
+ else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5120
+ split_condition = true;
5121
+ }
5122
+ else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
5123
+ split_condition = true;
5124
+ }
5125
+ }
5126
+
5127
+ if (utf_char_next == "") {
5128
+ split_condition = true; // final
5129
+ token += utf_char;
5130
+ }
4214
5131
 
4215
- // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
4216
- const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
4217
- const std::regex re(pattern);
5132
+ if (split_condition) {
5133
+ if (token.size()) {
5134
+ bpe_words.emplace_back(token);
5135
+ }
5136
+ token = utf_char;
5137
+ collecting = false;
5138
+ collecting_letter = false;
5139
+ collecting_numeric = false;
5140
+ collecting_special = false;
5141
+ collecting_whitespace_lookahead = false;
5142
+ }
5143
+ else {
5144
+ token += utf_char;
5145
+ }
5146
+ }
4218
5147
 
4219
- auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
4220
- auto words_end = std::sregex_iterator();
4221
- auto n_words = std::distance(words_begin, words_end);
4222
- words.reserve(n_words);
4223
- for (auto it = words_begin; it != words_end; ++it) {
4224
- words.push_back(it->str());
5148
+ for (std::string & word : bpe_words) {
5149
+ std::string encoded_token = "";
5150
+ for (char & c : word) {
5151
+ encoded_token += bytes_to_unicode_bpe(c);
5152
+ }
5153
+ bpe_encoded_words.emplace_back(encoded_token);
4225
5154
  }
4226
- return words;
4227
5155
 
5156
+ return bpe_encoded_words;
4228
5157
  }
4229
5158
 
4230
5159
  const llama_vocab & vocab;
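The hand-rolled scanner above replaces the std::regex splitter because std::regex cannot express the \p{L}/\p{N} classes of the GPT-2 pattern; it walks UTF-8 codepoints, emits contraction, letter, digit, punctuation and whitespace runs, and then re-encodes every byte of each piece through bytes_to_unicode_bpe. Illustrative expected behaviour (the sample strings are not from the package):

    // GPT-2 pattern being emulated: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
    //
    //   "Hello world!"  ->  "Hello", " world", "!"
    //   "I'm here"      ->  "I", "'m", " here"
    //
    // after splitting, every byte of each piece is remapped, e.g. the leading space
    // of " world" becomes the codepoint U+0120 under the GPT-2 byte-to-unicode table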
@@ -4266,7 +5195,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
4266
5195
  llm_tokenizer_bpe tokenizer(vocab);
4267
5196
  tokenizer.tokenize(raw_text, output);
4268
5197
  } break;
4269
- };
5198
+ }
4270
5199
 
4271
5200
  return output;
4272
5201
  }
@@ -4670,6 +5599,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar)
4670
5599
  // sampling
4671
5600
  //
4672
5601
 
5602
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
5603
+ if (seed == LLAMA_DEFAULT_SEED) {
5604
+ seed = time(NULL);
5605
+ }
5606
+ ctx->rng.seed(seed);
5607
+ }
5608
+
4673
5609
  void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
4674
5610
  GGML_ASSERT(candidates->size > 0);
4675
5611
 
@@ -4878,7 +5814,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
4878
5814
  }
4879
5815
  }
4880
5816
 
4881
- void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5817
+ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
4882
5818
  const int64_t t_start_sample_us = ggml_time_us();
4883
5819
 
4884
5820
  for (size_t i = 0; i < candidates_p->size; ++i) {
@@ -4890,6 +5826,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
4890
5826
  }
4891
5827
  }
4892
5828
 
5829
+ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
5830
+ llama_sample_temp(ctx, candidates_p, temp);
5831
+ }
5832
+
4893
5833
  void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
4894
5834
  if (last_tokens_size == 0 || penalty == 1.0f) {
4895
5835
  return;
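llama_sample_temperature is renamed to llama_sample_temp, and the old symbol is kept as a thin wrapper so existing callers keep compiling. Call-site sketch (candidates is a placeholder llama_token_data_array):

    llama_sample_temp(ctx, &candidates, 0.8f);         // new name
    llama_sample_temperature(ctx, &candidates, 0.8f);  // old name, forwards to llama_sample_temp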
@@ -5013,7 +5953,7 @@ void llama_sample_classifier_free_guidance(
5013
5953
 
5014
5954
  GGML_ASSERT(ctx);
5015
5955
 
5016
- auto n_vocab = llama_n_vocab(ctx);
5956
+ auto n_vocab = llama_n_vocab(llama_get_model(ctx));
5017
5957
 
5018
5958
  GGML_ASSERT(n_vocab == (int)candidates->size);
5019
5959
  GGML_ASSERT(!candidates->sorted);
@@ -5042,7 +5982,7 @@ void llama_sample_classifier_free_guidance(
5042
5982
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
5043
5983
  GGML_ASSERT(ctx);
5044
5984
 
5045
- auto N = float(llama_n_vocab(ctx));
5985
+ auto N = float(llama_n_vocab(llama_get_model(ctx)));
5046
5986
  int64_t t_start_sample_us;
5047
5987
  t_start_sample_us = ggml_time_us();
5048
5988
 
@@ -5229,7 +6169,7 @@ struct llama_logit_info {
5229
6169
  };
5230
6170
  llama_logit_info(llama_context * ctx)
5231
6171
  : logits(llama_get_logits(ctx))
5232
- , n_vocab(llama_n_vocab(ctx))
6172
+ , n_vocab(llama_n_vocab(llama_get_model(ctx)))
5233
6173
  , max_l(*std::max_element(logits, logits + n_vocab))
5234
6174
  , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
5235
6175
  { }
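As in the sampling hunks above, anything that needs the vocabulary size now asks the model rather than the context; llama_get_model(ctx) bridges the two. Sketch:

    const llama_model * model   = llama_get_model(ctx);
    const int           n_vocab = llama_n_vocab(model);   // was llama_n_vocab(ctx) in 0.5.3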
@@ -5267,7 +6207,6 @@ struct llama_beam_search_data {
5267
6207
  size_t n_beams;
5268
6208
  int n_past;
5269
6209
  int n_predict;
5270
- int n_threads;
5271
6210
  std::vector<llama_beam> beams;
5272
6211
  std::vector<llama_beam> next_beams;
5273
6212
 
@@ -5277,12 +6216,11 @@ struct llama_beam_search_data {
5277
6216
  // Used to communicate to/from callback on beams state.
5278
6217
  std::vector<llama_beam_view> beam_views;
5279
6218
 
5280
- llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
6219
+ llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
5281
6220
  : ctx(ctx)
5282
6221
  , n_beams(n_beams)
5283
6222
  , n_past(n_past)
5284
6223
  , n_predict(n_predict)
5285
- , n_threads(n_threads)
5286
6224
  , beam_views(n_beams) {
5287
6225
  beams.reserve(n_beams);
5288
6226
  next_beams.reserve(n_beams);
@@ -5319,7 +6257,7 @@ struct llama_beam_search_data {
5319
6257
  } else {
5320
6258
  // beam is not at end-of-sentence, so branch with next top_k tokens.
5321
6259
  if (!beam.tokens.empty()) {
5322
- llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
6260
+ llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
5323
6261
  }
5324
6262
  llama_logit_info logit_info(ctx);
5325
6263
  std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
@@ -5393,7 +6331,7 @@ struct llama_beam_search_data {
5393
6331
  callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
5394
6332
  update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
5395
6333
  if (common_prefix_length) {
5396
- llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
6334
+ llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
5397
6335
  n_past += common_prefix_length;
5398
6336
  }
5399
6337
  // Zero-out next_beam probabilities to place them last in following min-heap.
@@ -5434,11 +6372,11 @@ struct llama_beam_search_data {
5434
6372
 
5435
6373
  void llama_beam_search(llama_context * ctx,
5436
6374
  llama_beam_search_callback_fn_t callback, void * callback_data,
5437
- size_t n_beams, int n_past, int n_predict, int n_threads) {
6375
+ size_t n_beams, int n_past, int n_predict) {
5438
6376
  assert(ctx);
5439
6377
  const int64_t t_start_sample_us = ggml_time_us();
5440
6378
 
5441
- llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
6379
+ llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
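llama_beam_search no longer takes a thread count; threads are a property of the context (set through llama_context_params or llama_set_n_threads), and the evaluation inside the search goes through llama_decode with llama_batch_get_one. A hedged call-site sketch (beam_cb and cb_data are placeholder names):

    llama_set_n_threads(ctx, /*n_threads=*/ 8, /*n_threads_batch=*/ 8);
    llama_beam_search(ctx, beam_cb, cb_data, /*n_beams=*/ 4, /*n_past=*/ n_past, /*n_predict=*/ 64);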
5442
6380
 
5443
6381
  beam_search_data.loop(callback, callback_data);
5444
6382
 
@@ -5658,11 +6596,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5658
6596
  nthread = std::thread::hardware_concurrency();
5659
6597
  }
5660
6598
 
5661
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
6599
+ // mmap consistently increases speed Linux, and also increases speed on Windows with
+ // mmap consistently increases speed on Linux, and also increases speed on Windows with
6600
+ // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
6601
+ #if defined(__linux__) || defined(_WIN32)
6602
+ constexpr bool use_mmap = true;
6603
+ #else
6604
+ constexpr bool use_mmap = false;
6605
+ #endif
6606
+
6607
+ llama_model_loader ml(fname_inp, use_mmap);
6608
+ if (ml.use_mmap) {
6609
+ ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
6610
+ }
5662
6611
 
5663
6612
  llama_model model;
5664
- llm_load_arch(*ml, model);
5665
- llm_load_hparams(*ml, model, 0, 0, 0);
6613
+ llm_load_arch(ml, model);
6614
+ llm_load_hparams(ml, model);
5666
6615
 
5667
6616
  if (params->only_copy) {
5668
6617
  ftype = model.ftype;
@@ -5672,7 +6621,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5672
6621
  struct gguf_context * ctx_out = gguf_init_empty();
5673
6622
 
5674
6623
  // copy the KV pairs from the input file
5675
- gguf_set_kv (ctx_out, ml->ctx_gguf);
6624
+ gguf_set_kv (ctx_out, ml.ctx_gguf);
5676
6625
  gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
5677
6626
  gguf_set_val_u32(ctx_out, "general.file_type", ftype);
5678
6627
 
@@ -5680,8 +6629,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5680
6629
  int n_attention_wv = 0;
5681
6630
  int n_feed_forward_w2 = 0;
5682
6631
 
5683
- for (int i = 0; i < ml->n_tensors; ++i) {
5684
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6632
+ for (int i = 0; i < ml.n_tensors; ++i) {
6633
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5685
6634
 
5686
6635
  const std::string name = ggml_get_name(meta);
5687
6636
 
@@ -5717,8 +6666,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5717
6666
  std::vector<no_init<float>> f32_conv_buf;
5718
6667
 
5719
6668
  // populate the original tensors so we get an initial meta data
5720
- for (int i = 0; i < ml->n_tensors; ++i) {
5721
- struct ggml_tensor * meta = ml->get_tensor_meta(i);
6669
+ for (int i = 0; i < ml.n_tensors; ++i) {
6670
+ struct ggml_tensor * meta = ml.get_tensor_meta(i);
5722
6671
  gguf_add_tensor(ctx_out, meta);
5723
6672
  }
5724
6673
 
@@ -5731,19 +6680,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5731
6680
  // placeholder for the meta data
5732
6681
  ::zeros(fout, meta_size);
5733
6682
 
5734
- for (int i = 0; i < ml->n_tensors; ++i) {
5735
- struct ggml_tensor * tensor = ml->get_tensor_meta(i);
6683
+ for (int i = 0; i < ml.n_tensors; ++i) {
6684
+ struct ggml_tensor * tensor = ml.get_tensor_meta(i);
5736
6685
 
5737
6686
  const std::string name = ggml_get_name(tensor);
5738
6687
 
5739
- if (read_data.size() < ggml_nbytes(tensor)) {
5740
- read_data.resize(ggml_nbytes(tensor));
6688
+ if (!ml.use_mmap) {
6689
+ if (read_data.size() < ggml_nbytes(tensor)) {
6690
+ read_data.resize(ggml_nbytes(tensor));
6691
+ }
6692
+ tensor->data = read_data.data();
5741
6693
  }
5742
- tensor->data = read_data.data();
5743
- ml->load_data_for(tensor);
6694
+ ml.load_data_for(tensor);
5744
6695
 
5745
6696
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
5746
- ++idx, ml->n_tensors,
6697
+ ++idx, ml.n_tensors,
5747
6698
  ggml_get_name(tensor),
5748
6699
  llama_format_tensor_shape(tensor).c_str(),
5749
6700
  ggml_type_name(tensor->type));
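On the quantization path the loader now mmaps the input on Linux and Windows and only falls back to a staging read buffer elsewhere. The public entry point is unchanged; a hedged usage sketch (file names are placeholders, and the ftype field and its Q4_K_M value are assumptions based on the default-params struct shown later in this diff rather than spelled out here):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.nthread = 8;                          // 0 means "use std::thread::hardware_concurrency()"
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // assumed target quantization type

    if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams) != 0) {
        // quantization failed
    }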
@@ -5893,9 +6844,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
5893
6844
  }
5894
6845
  }
5895
6846
 
5896
- // TODO: after the GGUF PR, this likely won't work and needs to be updated
5897
6847
  static int llama_apply_lora_from_file_internal(
5898
- const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
6848
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
5899
6849
  ) {
5900
6850
  LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
5901
6851
 
@@ -5924,7 +6874,7 @@ static int llama_apply_lora_from_file_internal(
5924
6874
  int32_t lora_alpha;
5925
6875
  fin.read((char *) &lora_r, sizeof(lora_r));
5926
6876
  fin.read((char *) &lora_alpha, sizeof(lora_alpha));
5927
- float scaling = (float)lora_alpha / (float)lora_r;
6877
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
5928
6878
 
5929
6879
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
5930
6880
 
@@ -6140,9 +7090,10 @@ static int llama_apply_lora_from_file_internal(
6140
7090
  ggml_set_name(r, "r_cpy");
6141
7091
  }
6142
7092
 
6143
- struct ggml_cgraph gf = ggml_build_forward(r);
7093
+ struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
7094
+ ggml_build_forward_expand(gf, r);
6144
7095
 
6145
- ggml_graph_compute_helper(work_buffer, &gf, n_threads);
7096
+ ggml_graph_compute_helper(work_buffer, gf, n_threads);
6146
7097
 
6147
7098
  // we won't need these tensors again, reset the context to save memory
6148
7099
  ggml_free(lora_ctx);
@@ -6171,27 +7122,16 @@ static int llama_apply_lora_from_file_internal(
6171
7122
  //
6172
7123
  // interface implementation
6173
7124
  //
6174
-
6175
- struct llama_context_params llama_context_default_params() {
6176
- struct llama_context_params result = {
6177
- /*.seed =*/ LLAMA_DEFAULT_SEED,
6178
- /*.n_ctx =*/ 512,
6179
- /*.n_batch =*/ 512,
7125
+ struct llama_model_params llama_model_default_params() {
7126
+ struct llama_model_params result = {
6180
7127
  /*.n_gpu_layers =*/ 0,
6181
7128
  /*.main_gpu =*/ 0,
6182
7129
  /*.tensor_split =*/ nullptr,
6183
- /*.rope_freq_base =*/ 0.0f,
6184
- /*.rope_freq_scale =*/ 0.0f,
6185
7130
  /*.progress_callback =*/ nullptr,
6186
7131
  /*.progress_callback_user_data =*/ nullptr,
6187
- /*.low_vram =*/ false,
6188
- /*.mul_mat_q =*/ true,
6189
- /*.f16_kv =*/ true,
6190
- /*.logits_all =*/ false,
6191
7132
  /*.vocab_only =*/ false,
6192
7133
  /*.use_mmap =*/ true,
6193
7134
  /*.use_mlock =*/ false,
6194
- /*.embedding =*/ false,
6195
7135
  };
6196
7136
 
6197
7137
  #ifdef GGML_USE_METAL
@@ -6201,6 +7141,24 @@ struct llama_context_params llama_context_default_params() {
6201
7141
  return result;
6202
7142
  }
6203
7143
 
7144
+ struct llama_context_params llama_context_default_params() {
7145
+ struct llama_context_params result = {
7146
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
7147
+ /*.n_ctx =*/ 512,
7148
+ /*.n_batch =*/ 512,
7149
+ /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
7150
+ /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
7151
+ /*.rope_freq_base =*/ 0.0f,
7152
+ /*.rope_freq_scale =*/ 0.0f,
7153
+ /*.mul_mat_q =*/ true,
7154
+ /*.f16_kv =*/ true,
7155
+ /*.logits_all =*/ false,
7156
+ /*.embedding =*/ false,
7157
+ };
7158
+
7159
+ return result;
7160
+ }
7161
+
6204
7162
  struct llama_model_quantize_params llama_model_quantize_default_params() {
6205
7163
  struct llama_model_quantize_params result = {
6206
7164
  /*.nthread =*/ 0,
@@ -6256,13 +7214,11 @@ int64_t llama_time_us(void) {
6256
7214
 
6257
7215
  struct llama_model * llama_load_model_from_file(
6258
7216
  const char * path_model,
6259
- struct llama_context_params params) {
7217
+ struct llama_model_params params) {
6260
7218
  ggml_time_init();
6261
7219
 
6262
7220
  llama_model * model = new llama_model;
6263
7221
 
6264
- ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
6265
-
6266
7222
  unsigned cur_percentage = 0;
6267
7223
  if (params.progress_callback == NULL) {
6268
7224
  params.progress_callback_user_data = &cur_percentage;
@@ -6279,9 +7235,9 @@ struct llama_model * llama_load_model_from_file(
6279
7235
  };
6280
7236
  }
6281
7237
 
6282
- if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
6283
- params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
6284
- params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
7238
+ if (!llama_model_load(path_model, *model, params.n_gpu_layers,
7239
+ params.main_gpu, params.tensor_split,
7240
+ params.use_mmap, params.use_mlock, params.vocab_only,
6285
7241
  params.progress_callback, params.progress_callback_user_data)) {
6286
7242
  LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
6287
7243
  delete model;
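llama_load_model_from_file now takes the new llama_model_params, so model-level options (GPU offload, mmap/mlock, progress callback) are configured separately from per-context options (n_ctx, batch size, threads, RoPE overrides), which move to llama_context_params. A hedged end-to-end sketch (the model path is a placeholder):

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 35;                                  // model-level option

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) { /* load failed */ }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 4096;                                   // context-level options
    cparams.n_threads = 8;

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... use ctx ...

    llama_free(ctx);
    llama_free_model(model);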
@@ -6305,18 +7261,33 @@ struct llama_context * llama_new_context_with_model(
6305
7261
 
6306
7262
  llama_context * ctx = new llama_context(*model);
6307
7263
 
7264
+ const auto & hparams = model->hparams;
7265
+ auto & cparams = ctx->cparams;
7266
+
7267
+ cparams.n_batch = params.n_batch;
7268
+ cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
7269
+ cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
7270
+ cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
7271
+ cparams.n_threads = params.n_threads;
7272
+ cparams.n_threads_batch = params.n_threads_batch;
7273
+ cparams.mul_mat_q = params.mul_mat_q;
7274
+
6308
7275
  if (params.seed == LLAMA_DEFAULT_SEED) {
6309
7276
  params.seed = time(NULL);
6310
7277
  }
6311
7278
 
7279
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
7280
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
7281
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
7282
+
6312
7283
  ctx->rng = std::mt19937(params.seed);
6313
7284
  ctx->logits_all = params.logits_all;
6314
7285
 
6315
7286
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
6316
7287
 
6317
7288
  // reserve memory for context buffers
6318
- if (!params.vocab_only) {
6319
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
7289
+ if (!hparams.vocab_only) {
7290
+ if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
6320
7291
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
6321
7292
  llama_free(ctx);
6322
7293
  return nullptr;
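Context creation now resolves zero-valued parameters against the values stored in the model: n_ctx == 0 falls back to n_ctx_train, and rope_freq_base/rope_freq_scale == 0 fall back to the trained RoPE settings (logged a few lines above). Sketch:

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 0;     // use the model's training context length
    cparams.rope_freq_base  = 0.0f;  // use the RoPE base frequency the model was trained with
    cparams.rope_freq_scale = 0.0f;  // likewise for the linear scaling factor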
@@ -6327,11 +7298,9 @@ struct llama_context * llama_new_context_with_model(
6327
7298
  LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
6328
7299
  }
6329
7300
 
6330
- const auto & hparams = ctx->model.hparams;
6331
-
6332
7301
  // resized during inference
6333
7302
  if (params.logits_all) {
6334
- ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
7303
+ ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
6335
7304
  } else {
6336
7305
  ctx->logits.reserve(hparams.n_vocab);
6337
7306
  }
@@ -6349,26 +7318,29 @@ struct llama_context * llama_new_context_with_model(
6349
7318
  ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
6350
7319
 
6351
7320
  // build worst-case graph
6352
- int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
6353
- int n_past = hparams.n_ctx - n_tokens;
7321
+ int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
7322
+ int n_past = cparams.n_ctx - n_tokens;
6354
7323
  llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
6355
- ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
7324
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
7325
+
6356
7326
  #ifdef GGML_USE_METAL
6357
- if (params.n_gpu_layers > 0) {
7327
+ if (model->n_gpu_layers > 0) {
7328
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
7329
+
6358
7330
  ctx->ctx_metal = ggml_metal_init(1);
6359
7331
  if (!ctx->ctx_metal) {
6360
7332
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6361
7333
  llama_free(ctx);
6362
7334
  return NULL;
6363
7335
  }
6364
- ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6365
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
7336
+ //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
7337
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6366
7338
  }
6367
7339
  #endif
6368
7340
  // measure memory requirements for the graph
6369
7341
  size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
6370
7342
 
6371
- LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
7343
+ LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
6372
7344
 
6373
7345
  // recreate allocator with exact memory requirements
6374
7346
  ggml_allocr_free(ctx->alloc);
@@ -6377,28 +7349,46 @@ struct llama_context * llama_new_context_with_model(
6377
7349
  ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
6378
7350
  #ifdef GGML_USE_METAL
6379
7351
  if (ctx->ctx_metal) {
6380
- ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
7352
+ //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6381
7353
  }
6382
7354
  #endif
6383
7355
  #ifdef GGML_USE_CUBLAS
6384
- if (params.low_vram) {
6385
- LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
6386
- ggml_cuda_set_scratch_size(0); // disable scratch
6387
- } else {
6388
- ggml_cuda_set_scratch_size(alloc_size);
6389
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
7356
+ ggml_cuda_set_scratch_size(alloc_size);
7357
+ LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
7358
+
7359
+ // calculate total VRAM usage
7360
+ auto add_tensor = [](const ggml_tensor * t, size_t & size) {
7361
+ if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
7362
+ size += ggml_nbytes(t);
7363
+ }
7364
+ };
7365
+ size_t model_vram_size = 0;
7366
+ for (const auto & kv : model->tensors_by_name) {
7367
+ add_tensor(kv.second, model_vram_size);
6390
7368
  }
7369
+
7370
+ size_t kv_vram_size = 0;
7371
+ add_tensor(ctx->kv_self.k, kv_vram_size);
7372
+ add_tensor(ctx->kv_self.v, kv_vram_size);
7373
+
7374
+ size_t ctx_vram_size = alloc_size + kv_vram_size;
7375
+ size_t total_vram_size = model_vram_size + ctx_vram_size;
7376
+
7377
+ LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
7378
+ total_vram_size / 1024.0 / 1024.0,
7379
+ model_vram_size / 1024.0 / 1024.0,
7380
+ ctx_vram_size / 1024.0 / 1024.0);
6391
7381
  #endif
6392
7382
  }
6393
7383
 
6394
7384
  #ifdef GGML_USE_METAL
6395
- if (params.n_gpu_layers > 0) {
7385
+ if (model->n_gpu_layers > 0) {
6396
7386
  // this allocates all Metal resources and memory buffers
6397
7387
 
6398
7388
  void * data_ptr = NULL;
6399
7389
  size_t data_size = 0;
6400
7390
 
6401
- if (params.use_mmap) {
7391
+ if (ctx->model.mapping) {
6402
7392
  data_ptr = ctx->model.mapping->addr;
6403
7393
  data_size = ctx->model.mapping->size;
6404
7394
  } else {
@@ -6417,11 +7407,8 @@ struct llama_context * llama_new_context_with_model(
6417
7407
  return NULL; \
6418
7408
  }
6419
7409
 
6420
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
6421
-
6422
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
6423
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6424
-
7410
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
7411
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6425
7412
  LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
6426
7413
  #undef LLAMA_METAL_CHECK_BUF
6427
7414
  }
@@ -6433,8 +7420,10 @@ struct llama_context * llama_new_context_with_model(
6433
7420
 
6434
7421
  if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
6435
7422
  // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
6436
- const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
6437
- while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
7423
+ // TODO: needs fix after #3228
7424
+ GGML_ASSERT(false && "not implemented");
7425
+ //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
7426
+ //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
6438
7427
  llama_backend_free();
6439
7428
  exit(1);
6440
7429
  }
@@ -6443,63 +7432,41 @@ struct llama_context * llama_new_context_with_model(
6443
7432
  return ctx;
6444
7433
  }
6445
7434
 
6446
- static struct llama_context * llama_init_from_file(
6447
- const char * path_model,
6448
- struct llama_context_params params) {
6449
- struct llama_model * model = llama_load_model_from_file(path_model, params);
6450
- if (!model) {
6451
- return nullptr;
6452
- }
6453
-
6454
- struct llama_context * ctx = llama_new_context_with_model(model, params);
6455
- ctx->model_owner = true;
6456
-
6457
- return ctx;
6458
- }
6459
-
6460
7435
  void llama_free(struct llama_context * ctx) {
6461
7436
  delete ctx;
6462
7437
  }
6463
7438
 
6464
- int llama_n_vocab(const struct llama_context * ctx) {
6465
- return llama_model_n_vocab(&ctx->model);
7439
+ const llama_model * llama_get_model(const struct llama_context * ctx) {
7440
+ return &ctx->model;
6466
7441
  }
6467
7442
 
6468
7443
  int llama_n_ctx(const struct llama_context * ctx) {
6469
- return llama_model_n_ctx(&ctx->model);
6470
- }
6471
-
6472
- int llama_n_ctx_train(const struct llama_context * ctx) {
6473
- return llama_model_n_ctx_train(&ctx->model);
7444
+ return ctx->cparams.n_ctx;
6474
7445
  }
6475
7446
 
6476
- int llama_n_embd(const struct llama_context * ctx) {
6477
- return llama_model_n_embd(&ctx->model);
7447
+ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
7448
+ return model->vocab.type;
6478
7449
  }
6479
7450
 
6480
- enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
6481
- return ctx->model.vocab.type;
6482
- }
6483
-
6484
- int llama_model_n_vocab(const struct llama_model * model) {
7451
+ int llama_n_vocab(const struct llama_model * model) {
6485
7452
  return model->vocab.id_to_token.size();
6486
7453
  }
6487
7454
 
6488
- int llama_model_n_ctx(const struct llama_model * model) {
6489
- return model->hparams.n_ctx;
6490
- }
6491
-
6492
- int llama_model_n_ctx_train(const struct llama_model * model) {
7455
+ int llama_n_ctx_train(const struct llama_model * model) {
6493
7456
  return model->hparams.n_ctx_train;
6494
7457
  }
6495
7458
 
6496
- int llama_model_n_embd(const struct llama_model * model) {
7459
+ int llama_n_embd(const struct llama_model * model) {
6497
7460
  return model->hparams.n_embd;
6498
7461
  }
6499
7462
 
7463
+ float llama_rope_freq_scale_train(const struct llama_model * model) {
7464
+ return model->hparams.rope_freq_scale_train;
7465
+ }
7466
+
6500
7467
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
6501
7468
  return snprintf(buf, buf_size, "%s %s %s",
6502
- model->name.c_str(),
7469
+ llama_model_arch_name(model->arch).c_str(),
6503
7470
  llama_model_type_name(model->type),
6504
7471
  llama_model_ftype_name(model->ftype).c_str());
6505
7472
  }
@@ -6520,6 +7487,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
6520
7487
  return nparams;
6521
7488
  }
6522
7489
 
7490
+ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
7491
+ return ggml_get_tensor(model->ctx, name);
7492
+ }
7493
+
6523
7494
  int llama_model_quantize(
6524
7495
  const char * fname_inp,
6525
7496
  const char * fname_out,
@@ -6533,18 +7504,18 @@ int llama_model_quantize(
6533
7504
  }
6534
7505
  }
6535
7506
 
6536
- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
7507
+ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
6537
7508
  try {
6538
- return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
7509
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
6539
7510
  } catch (const std::exception & err) {
6540
7511
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
6541
7512
  return 1;
6542
7513
  }
6543
7514
  }
6544
7515
 
6545
- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
7516
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
6546
7517
  try {
6547
- return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
7518
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
6548
7519
  } catch (const std::exception & err) {
6549
7520
  LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
6550
7521
  return 1;
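Both LoRA entry points gain a scale argument that multiplies the adapter's own ratio, so the effective factor is scaling = scale * lora_alpha / lora_r; with alpha = 32, r = 16 and scale = 0.5 that works out to 1.0. Hedged usage sketch (the adapter path is a placeholder):

    // apply an adapter at half strength, without a separate f16 base model
    const int err = llama_model_apply_lora_from_file(model, "adapter.bin", /*scale=*/ 0.5f,
                                                     /*path_base_model=*/ NULL, /*n_threads=*/ 8);
    if (err != 0) { /* the adapter could not be applied */ }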
@@ -6552,16 +7523,27 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
6552
7523
  }
6553
7524
 
6554
7525
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
6555
- return ctx->kv_self.n;
7526
+ return ctx->kv_self.head;
6556
7527
  }
6557
7528
 
6558
- #define LLAMA_MAX_RNG_STATE (64*1024)
7529
+ void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
7530
+ llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
7531
+ }
6559
7532
 
6560
- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
6561
- if (seed == LLAMA_DEFAULT_SEED) {
6562
- seed = time(NULL);
6563
- }
6564
- ctx->rng.seed(seed);
7533
+ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
7534
+ llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
7535
+ }
7536
+
7537
+ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
7538
+ llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
7539
+ }
7540
+
7541
+ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
7542
+ llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
7543
+ }
7544
+
7545
+ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
7546
+ llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
6565
7547
  }
6566
7548
 
6567
7549
  // Returns the *maximum* size of the state
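The old single token count gives way to sequence-aware operations on the new KV ring buffer: cells can be removed, copied to another sequence id, filtered down to one sequence, or shifted along the position axis, which is what makes context shifting possible. A hedged sketch that discards the oldest n_discard positions of sequence 0 and slides the remainder back (n_keep and n_past are placeholders):

    const int n_discard = 256;

    // forget positions [n_keep, n_keep + n_discard) of sequence 0 ...
    llama_kv_cache_seq_rm   (ctx, 0, n_keep, n_keep + n_discard);
    // ... then shift the surviving cells left so the positions stay contiguous
    llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_past, -n_discard);

    n_past -= n_discard;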
@@ -6699,36 +7681,40 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
6699
7681
  {
6700
7682
  const auto & kv_self = ctx->kv_self;
6701
7683
  const auto & hparams = ctx->model.hparams;
6702
- const int n_layer = hparams.n_layer;
6703
- const int n_embd = hparams.n_embd_gqa();
6704
- const int n_ctx = hparams.n_ctx;
7684
+ const auto & cparams = ctx->cparams;
6705
7685
 
6706
- const size_t kv_size = kv_self.buf.size;
6707
- const int kv_ntok = llama_get_kv_cache_token_count(ctx);
7686
+ const auto n_layer = hparams.n_layer;
7687
+ const auto n_embd = hparams.n_embd_gqa();
7688
+ const auto n_ctx = cparams.n_ctx;
6708
7689
 
6709
- data_ctx->write(&kv_size, sizeof(kv_size));
6710
- data_ctx->write(&kv_ntok, sizeof(kv_ntok));
7690
+ const size_t kv_buf_size = kv_self.buf.size;
7691
+ const uint32_t kv_head = kv_self.head;
7692
+ const uint32_t kv_size = kv_self.size;
6711
7693
 
6712
- if (kv_size) {
7694
+ data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
7695
+ data_ctx->write(&kv_head, sizeof(kv_head));
7696
+ data_ctx->write(&kv_size, sizeof(kv_size));
7697
+
7698
+ if (kv_buf_size) {
6713
7699
  const size_t elt_size = ggml_element_size(kv_self.k);
6714
7700
 
6715
7701
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
6716
7702
  ggml_cgraph gf{};
6717
7703
 
6718
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
7704
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
6719
7705
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
6720
7706
  kout3d->data = kout3d_data.data();
6721
7707
 
6722
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
7708
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
6723
7709
  std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
6724
7710
  vout3d->data = vout3d_data.data();
6725
7711
 
6726
7712
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
6727
- n_embd, kv_ntok, n_layer,
7713
+ n_embd, kv_head, n_layer,
6728
7714
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
6729
7715
 
6730
7716
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
6731
- kv_ntok, n_embd, n_layer,
7717
+ kv_head, n_embd, n_layer,
6732
7718
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
6733
7719
 
6734
7720
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -6742,6 +7728,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
6742
7728
  data_ctx->write(kout3d_data.data(), kout3d_data.size());
6743
7729
  data_ctx->write(vout3d_data.data(), vout3d_data.size());
6744
7730
  }
7731
+
7732
+ for (uint32_t i = 0; i < kv_size; ++i) {
7733
+ const auto & cell = kv_self.cells[i];
7734
+
7735
+ const llama_pos pos = cell.pos;
7736
+ const size_t seq_id_size = cell.seq_id.size();
7737
+
7738
+ data_ctx->write(&pos, sizeof(pos));
7739
+ data_ctx->write(&seq_id_size, sizeof(seq_id_size));
7740
+
7741
+ for (auto seq_id : cell.seq_id) {
7742
+ data_ctx->write(&seq_id, sizeof(seq_id));
7743
+ }
7744
+ }
6745
7745
  }
6746
7746
  }
6747
7747
 
@@ -6807,38 +7807,42 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
6807
7807
  {
6808
7808
  const auto & kv_self = ctx->kv_self;
6809
7809
  const auto & hparams = ctx->model.hparams;
7810
+ const auto & cparams = ctx->cparams;
7811
+
6810
7812
  const int n_layer = hparams.n_layer;
6811
7813
  const int n_embd = hparams.n_embd_gqa();
6812
- const int n_ctx = hparams.n_ctx;
7814
+ const int n_ctx = cparams.n_ctx;
6813
7815
 
6814
- size_t kv_size;
6815
- int kv_ntok;
7816
+ size_t kv_buf_size;
7817
+ uint32_t kv_head;
7818
+ uint32_t kv_size;
6816
7819
 
6817
- memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
6818
- memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
7820
+ memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
7821
+ memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
7822
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
6819
7823
 
6820
- if (kv_size) {
6821
- GGML_ASSERT(kv_self.buf.size == kv_size);
7824
+ if (kv_buf_size) {
7825
+ GGML_ASSERT(kv_self.buf.size == kv_buf_size);
6822
7826
 
6823
7827
  const size_t elt_size = ggml_element_size(kv_self.k);
6824
7828
 
6825
7829
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
6826
7830
  ggml_cgraph gf{};
6827
7831
 
6828
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
7832
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
6829
7833
  kin3d->data = (void *) inp;
6830
7834
  inp += ggml_nbytes(kin3d);
6831
7835
 
6832
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
7836
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
6833
7837
  vin3d->data = (void *) inp;
6834
7838
  inp += ggml_nbytes(vin3d);
6835
7839
 
6836
7840
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
6837
- n_embd, kv_ntok, n_layer,
7841
+ n_embd, kv_head, n_layer,
6838
7842
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
6839
7843
 
6840
7844
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
6841
- kv_ntok, n_embd, n_layer,
7845
+ kv_head, n_embd, n_layer,
6842
7846
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
6843
7847
 
6844
7848
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -6848,7 +7852,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
6848
7852
  ggml_free(cpy_ctx);
6849
7853
  }
6850
7854
 
6851
- ctx->kv_self.n = kv_ntok;
7855
+ ctx->kv_self.head = kv_head;
7856
+ ctx->kv_self.size = kv_size;
7857
+
7858
+ ctx->kv_self.cells.resize(kv_size);
7859
+
7860
+ for (uint32_t i = 0; i < kv_size; ++i) {
7861
+ llama_pos pos;
7862
+ size_t seq_id_size;
7863
+
7864
+ memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
7865
+ memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
7866
+
7867
+ ctx->kv_self.cells[i].pos = pos;
7868
+
7869
+ llama_seq_id seq_id;
7870
+
7871
+ for (size_t j = 0; j < seq_id_size; ++j) {
7872
+ memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
7873
+ ctx->kv_self.cells[i].seq_id.insert(seq_id);
7874
+ }
7875
+ }
6852
7876
  }
6853
7877
 
6854
7878
  const size_t nread = inp - src;
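The serialized state now records the KV buffer size, the ring-buffer head and size, and per-cell positions with their sequence-id sets, so a restored context reproduces the cache layout exactly. The entry points themselves are unchanged; a hedged round-trip sketch:

    std::vector<uint8_t> state(llama_get_state_size(ctx));
    const size_t n_written = llama_copy_state_data(ctx, state.data());   // n_written <= state.size()

    // ... later, on a context created from the same model ...
    const size_t n_read = llama_set_state_data(ctx, state.data());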
@@ -6943,64 +7967,102 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
6943
7967
 
6944
7968
  int llama_eval(
6945
7969
  struct llama_context * ctx,
6946
- const llama_token * tokens,
6947
- int n_tokens,
6948
- int n_past,
6949
- int n_threads) {
6950
- if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
6951
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6952
- return 1;
6953
- }
7970
+ llama_token * tokens,
7971
+ int32_t n_tokens,
7972
+ int n_past) {
7973
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
6954
7974
 
6955
- // get a more accurate load time, upon first eval
6956
- // TODO: fix this
6957
- if (!ctx->has_evaluated_once) {
6958
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
6959
- ctx->has_evaluated_once = true;
7975
+ const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
7976
+ if (ret < 0) {
7977
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
6960
7978
  }
6961
7979
 
6962
- return 0;
7980
+ return ret;
6963
7981
  }
6964
7982
 
6965
7983
  int llama_eval_embd(
6966
7984
  struct llama_context * ctx,
6967
- const float * embd,
6968
- int n_tokens,
6969
- int n_past,
6970
- int n_threads) {
6971
- if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
6972
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6973
- return 1;
6974
- }
7985
+ float * embd,
7986
+ int32_t n_tokens,
7987
+ int n_past) {
7988
+ llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
6975
7989
 
6976
- // get a more accurate load time, upon first eval
6977
- // TODO: fix this
6978
- if (!ctx->has_evaluated_once) {
6979
- ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
6980
- ctx->has_evaluated_once = true;
7990
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
7991
+
7992
+ const int ret = llama_decode_internal(*ctx, batch);
7993
+ if (ret < 0) {
7994
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
6981
7995
  }
6982
7996
 
6983
- return 0;
7997
+ return ret;
6984
7998
  }
6985
7999
 
6986
- int llama_eval_export(struct llama_context * ctx, const char * fname) {
6987
- const int n_batch = 1;
6988
- const int n_ctx = 512 - n_batch;
8000
+ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
8001
+ ctx->cparams.n_threads = n_threads;
8002
+ ctx->cparams.n_threads_batch = n_threads_batch;
8003
+ }
8004
+
8005
+ struct llama_batch llama_batch_get_one(
8006
+ llama_token * tokens,
8007
+ int32_t n_tokens,
8008
+ llama_pos pos_0,
8009
+ llama_seq_id seq_id) {
8010
+ return {
8011
+ /*n_tokens =*/ n_tokens,
8012
+ /*tokens =*/ tokens,
8013
+ /*embd =*/ nullptr,
8014
+ /*pos =*/ nullptr,
8015
+ /*seq_id =*/ nullptr,
8016
+ /*logits =*/ nullptr,
8017
+ /*all_pos_0 =*/ pos_0,
8018
+ /*all_pos_1 =*/ 1,
8019
+ /*all_seq_id =*/ seq_id,
8020
+ };
8021
+ }
6989
8022
 
6990
- const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
8023
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
8024
+ llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
6991
8025
 
6992
- if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
6993
- LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
6994
- return 1;
8026
+ if (embd) {
8027
+ batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
8028
+ } else {
8029
+ batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
6995
8030
  }
6996
8031
 
6997
- return 0;
8032
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
8033
+ batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
8034
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
8035
+
8036
+ return batch;
8037
+ }
8038
+
8039
+ void llama_batch_free(struct llama_batch batch) {
8040
+ if (batch.token) free(batch.token);
8041
+ if (batch.embd) free(batch.embd);
8042
+ if (batch.pos) free(batch.pos);
8043
+ if (batch.seq_id) free(batch.seq_id);
8044
+ if (batch.logits) free(batch.logits);
8045
+ }
8046
+
8047
+ int llama_decode(
8048
+ struct llama_context * ctx,
8049
+ struct llama_batch batch) {
8050
+ const int ret = llama_decode_internal(*ctx, batch);
8051
+ if (ret < 0) {
8052
+ LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
8053
+ }
8054
+
8055
+ return ret;
6998
8056
  }
6999
8057
 
7000
8058
  float * llama_get_logits(struct llama_context * ctx) {
7001
8059
  return ctx->logits.data();
7002
8060
  }
7003
8061
 
8062
+ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
8063
+ return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
8064
+ }
8065
+
7004
8066
  float * llama_get_embeddings(struct llama_context * ctx) {
7005
8067
  return ctx->embedding.data();
7006
8068
  }
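llama_eval and llama_eval_embd are kept as thin wrappers that clear the cache past n_past and forward to llama_decode; new code is expected to fill a llama_batch itself. A hedged sketch that decodes a prompt as one batch in sequence 0 and requests logits only for the final position (ctx and prompt_tokens are placeholders; the meaning of the non-zero return values is an assumption):

    const int n_tokens = (int) prompt_tokens.size();

    llama_batch batch = llama_batch_init(n_tokens, /*embd=*/ 0);
    batch.n_tokens = n_tokens;                        // llama_batch_init leaves this for the caller
    for (int i = 0; i < n_tokens; ++i) {
        batch.token [i] = prompt_tokens[i];
        batch.pos   [i] = i;                          // absolute positions
        batch.seq_id[i] = 0;                          // single sequence
        batch.logits[i] = (i == n_tokens - 1);        // logits only for the last token
    }

    const int rc = llama_decode(ctx, batch);
    if (rc != 0) {
        // rc < 0: internal error; rc > 0: no KV cache slot was found (assumed convention)
    }

    const float * logits = llama_get_logits_ith(ctx, n_tokens - 1);
    llama_batch_free(batch);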
@@ -7028,18 +8090,24 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
7028
8090
  llama_token llama_token_nl(const struct llama_context * ctx) {
7029
8091
  return ctx->model.vocab.linefeed_id;
7030
8092
  }
8093
+ llama_token llama_token_prefix(const struct llama_context * ctx) {
8094
+ return ctx->model.vocab.special_prefix_id;
8095
+ }
7031
8096
 
7032
- int llama_tokenize(
7033
- struct llama_context * ctx,
7034
- const char * text,
7035
- int text_len,
7036
- llama_token * tokens,
7037
- int n_max_tokens,
7038
- bool add_bos) {
7039
- return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
8097
+ llama_token llama_token_middle(const struct llama_context * ctx) {
8098
+ return ctx->model.vocab.special_middle_id;
7040
8099
  }
7041
8100
 
7042
- int llama_tokenize_with_model(
8101
+ llama_token llama_token_suffix(const struct llama_context * ctx) {
8102
+ return ctx->model.vocab.special_suffix_id;
8103
+ }
8104
+
8105
+ llama_token llama_token_eot(const struct llama_context * ctx) {
8106
+ return ctx->model.vocab.special_eot_id;
8107
+ }
8108
+
8109
+
8110
+ int llama_tokenize(
7043
8111
  const struct llama_model * model,
7044
8112
  const char * text,
7045
8113
  int text_len,
@@ -7060,39 +8128,66 @@ int llama_tokenize_with_model(
7060
8128
  return res.size();
7061
8129
  }
7062
8130
 
7063
- int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
7064
- return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
8131
+ static std::string llama_decode_text(const std::string & text) {
8132
+ std::string decoded_text;
8133
+ auto unicode_sequences = codepoints_from_utf8(text);
8134
+ for (auto& unicode_sequence : unicode_sequences) {
8135
+ decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
8136
+ }
8137
+
8138
+ return decoded_text;
7065
8139
  }
7066
8140
 
7067
8141
  // does not write null-terminator to buf
7068
- int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
7069
- if (0 <= token && token < llama_model_n_vocab(model)) {
7070
- if (llama_is_normal_token(model->vocab, token)) {
7071
- std::string result = model->vocab.id_to_token[token].text;
7072
- if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
8142
+ int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
8143
+ if (0 <= token && token < llama_n_vocab(model)) {
8144
+ switch (llama_vocab_get_type(model->vocab)) {
8145
+ case LLAMA_VOCAB_TYPE_SPM: {
8146
+ if (llama_is_normal_token(model->vocab, token)) {
8147
+ std::string result = model->vocab.id_to_token[token].text;
7073
8148
  llama_unescape_whitespace(result);
8149
+ if (length < (int) result.length()) {
8150
+ return -result.length();
8151
+ }
8152
+ memcpy(buf, result.c_str(), result.length());
8153
+ return result.length();
8154
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
8155
+ if (length < 3) {
8156
+ return -3;
8157
+ }
8158
+ memcpy(buf, "\xe2\x96\x85", 3);
8159
+ return 3;
8160
+ } else if (llama_is_control_token(model->vocab, token)) {
8161
+ ;
8162
+ } else if (llama_is_byte_token(model->vocab, token)) {
8163
+ if (length < 1) {
8164
+ return -1;
8165
+ }
8166
+ buf[0] = llama_token_to_byte(model->vocab, token);
8167
+ return 1;
8168
+ } else {
8169
+ GGML_ASSERT(false);
7074
8170
  }
7075
- if (length < (int) result.length()) {
7076
- return -result.length();
7077
- }
7078
- memcpy(buf, result.c_str(), result.length());
7079
- return result.length();
7080
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
7081
- if (length < 3) {
7082
- return -3;
7083
- }
7084
- buf[0] = '\xe2';
7085
- buf[1] = '\x96';
7086
- buf[2] = '\x85';
7087
- return 3;
7088
- } else if (llama_is_control_token(model->vocab, token)) {
7089
- ;
7090
- } else if (llama_is_byte_token(model->vocab, token)) {
7091
- if (length < 1) {
7092
- return -1;
8171
+ break;
8172
+ }
8173
+ case LLAMA_VOCAB_TYPE_BPE: {
8174
+ if (llama_is_normal_token(model->vocab, token)) {
8175
+ std::string result = model->vocab.id_to_token[token].text;
8176
+ result = llama_decode_text(result);
8177
+ if (length < (int) result.length()) {
8178
+ return -result.length();
8179
+ }
8180
+ memcpy(buf, result.c_str(), result.length());
8181
+ return result.length();
8182
+ } else if (llama_is_control_token(model->vocab, token)) {
8183
+ ;
8184
+ } else {
8185
+ GGML_ASSERT(false);
7093
8186
  }
7094
- buf[0] = llama_token_to_byte(model->vocab, token);
7095
- return 1;
8187
+ break;
8188
+ }
8189
+ default:
8190
+ GGML_ASSERT(false);
7096
8191
  }
7097
8192
  }
7098
8193
  return 0;
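Tokenization and detokenization likewise move to the model handle: llama_tokenize takes a const llama_model * (the *_with_model variants are folded in), and llama_token_to_piece decodes BPE pieces back to raw bytes via the unicode helpers. A hedged round-trip sketch (the negative return values signal a too-small buffer, mirroring the checks above):

    const llama_model * model = llama_get_model(ctx);

    std::vector<llama_token> toks(64);
    int n = llama_tokenize(model, "Hello world", 11, toks.data(), (int) toks.size(), /*add_bos=*/ true);
    if (n < 0) {
        toks.resize(-n);   // buffer was too small; -n is the required count (assumed convention)
        n = llama_tokenize(model, "Hello world", 11, toks.data(), (int) toks.size(), /*add_bos=*/ true);
    }

    char piece[32];
    const int len = llama_token_to_piece(model, toks[n - 1], piece, (int) sizeof(piece));
    // piece[0..len) holds the token text; no terminating null is written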
@@ -7119,14 +8214,14 @@ void llama_print_timings(struct llama_context * ctx) {
7119
8214
  const llama_timings timings = llama_get_timings(ctx);
7120
8215
 
7121
8216
  LLAMA_LOG_INFO("\n");
7122
- LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
7123
- LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
8217
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
8218
+ LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7124
8219
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
7125
- LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
8220
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
7126
8221
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
7127
- LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
8222
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7128
8223
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
7129
- LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
8224
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
7130
8225
  }
7131
8226
 
7132
8227
  void llama_reset_timings(struct llama_context * ctx) {
@@ -7194,12 +8289,12 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
7194
8289
  return ctx->model.tensors_by_name;
7195
8290
  }
7196
8291
 
7197
- void llama_log_set(llama_log_callback log_callback, void * user_data) {
8292
+ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
7198
8293
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
7199
8294
  g_state.log_callback_user_data = user_data;
7200
8295
  }
7201
8296
 
7202
- static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
8297
+ static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
7203
8298
  va_list args_copy;
7204
8299
  va_copy(args_copy, args);
7205
8300
  char buffer[128];
@@ -7216,14 +8311,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
7216
8311
  va_end(args_copy);
7217
8312
  }
7218
8313
 
7219
- static void llama_log_internal(llama_log_level level, const char * format, ...) {
8314
+ static void llama_log_internal(ggml_log_level level, const char * format, ...) {
7220
8315
  va_list args;
7221
8316
  va_start(args, format);
7222
8317
  llama_log_internal_v(level, format, args);
7223
8318
  va_end(args);
7224
8319
  }
7225
8320
 
7226
- static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
8321
+ static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
7227
8322
  (void) level;
7228
8323
  (void) user_data;
7229
8324
  fputs(text, stderr);
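Logging is now expressed in terms of ggml's callback type, so the same sink can also receive the Metal backend messages wired up earlier in this diff. A minimal sketch of installing a custom callback (my_log_sink is a placeholder name):

    static void my_log_sink(ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_ERROR) {
            fputs(text, stderr);   // keep errors, silently drop info/warn output
        }
    }

    // somewhere during start-up:
    llama_log_set(my_log_sink, /*user_data=*/ NULL);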