llama_cpp 0.6.0 → 0.7.1

@@ -1,6 +1,8 @@
1
1
  #define LLAMA_API_INTERNAL
2
2
  #include "llama.h"
3
3
 
4
+ #include "unicode.h"
5
+
4
6
  #include "ggml.h"
5
7
 
6
8
  #include "ggml-alloc.h"
@@ -123,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
123
125
  }
124
126
  s = std::move(result);
125
127
  }
128
+
129
+ static bool is_float_close(float a, float b, float abs_tol) {
130
+ // Check for non-negative tolerance
131
+ if (abs_tol < 0.0) {
132
+ throw std::invalid_argument("Tolerance must be non-negative");
133
+ }
134
+
135
+ // Exact equality check
136
+ if (a == b) {
137
+ return true;
138
+ }
139
+
140
+ // Check for infinities
141
+ if (std::isinf(a) || std::isinf(b)) {
142
+ return false;
143
+ }
144
+
145
+ // Regular comparison using the provided absolute tolerance
146
+ return std::fabs(b - a) <= abs_tol;
147
+ }
148
+
126
149
  #ifdef GGML_USE_CPU_HBM
127
150
  #include <hbwmalloc.h>
128
151
  #endif
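
The hunk above adds an is_float_close helper for tolerant float comparison. A minimal standalone sketch (the helper is copied verbatim so the example compiles on its own) of how it behaves at the edge cases:

    #include <cassert>
    #include <cmath>
    #include <limits>
    #include <stdexcept>

    // Copy of the helper added above, so this sketch is self-contained.
    static bool is_float_close(float a, float b, float abs_tol) {
        if (abs_tol < 0.0) {
            throw std::invalid_argument("Tolerance must be non-negative");
        }
        if (a == b) {
            return true;                       // also matches two equal infinities
        }
        if (std::isinf(a) || std::isinf(b)) {
            return false;                      // finite vs. infinite is never "close"
        }
        return std::fabs(b - a) <= abs_tol;
    }

    int main() {
        const float inf = std::numeric_limits<float>::infinity();
        assert( is_float_close(1.0f, 1.0f, 0.0f));        // exact match, zero tolerance
        assert( is_float_close(1.0f, 1.0000005f, 1e-5f)); // within absolute tolerance
        assert(!is_float_close(1.0f, 2.0f, 0.5f));        // outside tolerance
        assert( is_float_close(inf, inf, 0.0f));          // equal infinities compare equal
        assert(!is_float_close(inf, 1.0f, 1e9f));         // infinite vs. finite
        return 0;
    }
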
@@ -163,6 +186,9 @@ enum llm_arch {
163
186
  LLM_ARCH_GPTNEOX,
164
187
  LLM_ARCH_MPT,
165
188
  LLM_ARCH_STARCODER,
189
+ LLM_ARCH_PERSIMMON,
190
+ LLM_ARCH_REFACT,
191
+ LLM_ARCH_BLOOM,
166
192
  LLM_ARCH_UNKNOWN,
167
193
  };
168
194
 
@@ -175,6 +201,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
175
201
  { LLM_ARCH_MPT, "mpt" },
176
202
  { LLM_ARCH_BAICHUAN, "baichuan" },
177
203
  { LLM_ARCH_STARCODER, "starcoder" },
204
+ { LLM_ARCH_PERSIMMON, "persimmon" },
205
+ { LLM_ARCH_REFACT, "refact" },
206
+ { LLM_ARCH_BLOOM, "bloom" },
178
207
  };
179
208
 
180
209
  enum llm_kv {
@@ -277,6 +306,7 @@ struct LLM_KV {
277
306
 
278
307
  enum llm_tensor {
279
308
  LLM_TENSOR_TOKEN_EMBD,
309
+ LLM_TENSOR_TOKEN_EMBD_NORM,
280
310
  LLM_TENSOR_POS_EMBD,
281
311
  LLM_TENSOR_OUTPUT,
282
312
  LLM_TENSOR_OUTPUT_NORM,
@@ -293,6 +323,8 @@ enum llm_tensor {
293
323
  LLM_TENSOR_FFN_DOWN,
294
324
  LLM_TENSOR_FFN_UP,
295
325
  LLM_TENSOR_FFN_NORM,
326
+ LLM_TENSOR_ATTN_Q_NORM,
327
+ LLM_TENSOR_ATTN_K_NORM,
296
328
  };
297
329
 
298
330
  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -374,10 +406,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
374
406
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
375
407
  },
376
408
  },
409
+ {
410
+ LLM_ARCH_PERSIMMON,
411
+ {
412
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
413
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
414
+ { LLM_TENSOR_OUTPUT, "output"},
415
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
416
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
417
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
418
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
419
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
420
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
421
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
422
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
423
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
424
+ },
425
+ },
377
426
  {
378
427
  LLM_ARCH_MPT,
379
428
  {
380
429
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
430
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
431
+ { LLM_TENSOR_OUTPUT, "output" },
432
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
433
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
434
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
435
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
436
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
437
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
381
438
  },
382
439
  },
383
440
  {
@@ -395,6 +452,38 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
395
452
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
396
453
  },
397
454
  },
455
+ {
456
+ LLM_ARCH_REFACT,
457
+ {
458
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
459
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
460
+ { LLM_TENSOR_OUTPUT, "output" },
461
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
462
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
463
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
464
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
465
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
466
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
467
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
468
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
469
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
470
+ },
471
+ },
472
+ {
473
+ LLM_ARCH_BLOOM,
474
+ {
475
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
476
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
477
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
478
+ { LLM_TENSOR_OUTPUT, "output" },
479
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
480
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
481
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
482
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
483
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
484
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
485
+ },
486
+ },
398
487
  {
399
488
  LLM_ARCH_UNKNOWN,
400
489
  {
@@ -912,6 +1001,7 @@ enum e_model {
912
1001
  MODEL_1B,
913
1002
  MODEL_3B,
914
1003
  MODEL_7B,
1004
+ MODEL_8B,
915
1005
  MODEL_13B,
916
1006
  MODEL_15B,
917
1007
  MODEL_30B,
@@ -942,8 +1032,28 @@ struct llama_hparams {
942
1032
  float rope_freq_base_train;
943
1033
  float rope_freq_scale_train;
944
1034
 
1035
+ float f_clamp_kqv;
1036
+ float f_max_alibi_bias;
1037
+
945
1038
  bool operator!=(const llama_hparams & other) const {
946
- return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
1039
+ if (this->vocab_only != other.vocab_only) return true;
1040
+ if (this->n_vocab != other.n_vocab) return true;
1041
+ if (this->n_ctx_train != other.n_ctx_train) return true;
1042
+ if (this->n_embd != other.n_embd) return true;
1043
+ if (this->n_head != other.n_head) return true;
1044
+ if (this->n_head_kv != other.n_head_kv) return true;
1045
+ if (this->n_layer != other.n_layer) return true;
1046
+ if (this->n_rot != other.n_rot) return true;
1047
+ if (this->n_ff != other.n_ff) return true;
1048
+
1049
+ const float EPSILON = 1e-9;
1050
+
1051
+ if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
1052
+ if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
1053
+ if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
1054
+ if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
1055
+
1056
+ return false;
947
1057
  }
948
1058
 
949
1059
  uint32_t n_gqa() const {
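
The rewritten operator!= above replaces a bytewise memcmp of the whole struct with field-by-field checks plus an absolute epsilon for the float fields; memcmp also compares padding bytes and distinct bit patterns of numerically equal floats. A small, hypothetical illustration of the difference (the params struct here is an illustrative stand-in, not the real llama_hparams):

    #include <cmath>
    #include <cstdio>
    #include <cstring>

    struct params {              // hypothetical stand-in, not llama_hparams
        int   n_layer;
        float rope_freq_scale;
    };

    int main() {
        params a{32,  0.0f};
        params b{32, -0.0f};     // numerically equal to +0.0f, different bit pattern

        // Bytewise comparison reports a difference ...
        std::printf("memcmp     : %s\n",
                    std::memcmp(&a, &b, sizeof(params)) == 0 ? "equal" : "different");

        // ... while a field-wise comparison with a tolerance reports a match,
        // which is what the new operator!= does via is_float_close().
        const bool close = a.n_layer == b.n_layer &&
                           std::fabs(a.rope_freq_scale - b.rope_freq_scale) <= 1e-9f;
        std::printf("field-wise : %s\n", close ? "equal" : "different");
        return 0;
    }
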
@@ -977,6 +1087,10 @@ struct llama_layer {
977
1087
  struct ggml_tensor * attn_norm_b;
978
1088
  struct ggml_tensor * attn_norm_2;
979
1089
  struct ggml_tensor * attn_norm_2_b;
1090
+ struct ggml_tensor * attn_q_norm;
1091
+ struct ggml_tensor * attn_q_norm_b;
1092
+ struct ggml_tensor * attn_k_norm;
1093
+ struct ggml_tensor * attn_k_norm_b;
980
1094
 
981
1095
  // attention
982
1096
  struct ggml_tensor * wq;
@@ -1018,6 +1132,9 @@ struct llama_kv_cell {
1018
1132
  struct llama_kv_cache {
1019
1133
  bool has_shift = false;
1020
1134
 
1135
+ // Note: The value of head isn't only used to optimize searching
1136
+ // for a free KV slot. llama_decode_internal also uses it, so it
1137
+ // cannot be freely changed after a slot has been allocated.
1021
1138
  uint32_t head = 0;
1022
1139
  uint32_t size = 0;
1023
1140
 
@@ -1071,6 +1188,10 @@ struct llama_vocab {
1071
1188
  id special_pad_id = -1;
1072
1189
 
1073
1190
  id linefeed_id = 13;
1191
+ id special_prefix_id = 32007;
1192
+ id special_middle_id = 32009;
1193
+ id special_suffix_id = 32008;
1194
+ id special_eot_id = 32010;
1074
1195
 
1075
1196
  int find_bpe_rank(std::string token_left, std::string token_right) const {
1076
1197
  replace_all(token_left, " ", "\u0120");
@@ -1099,6 +1220,8 @@ struct llama_model {
1099
1220
 
1100
1221
  struct ggml_tensor * tok_embeddings;
1101
1222
  struct ggml_tensor * pos_embeddings;
1223
+ struct ggml_tensor * tok_norm;
1224
+ struct ggml_tensor * tok_norm_b;
1102
1225
 
1103
1226
  struct ggml_tensor * output_norm;
1104
1227
  struct ggml_tensor * output_norm_b;
@@ -1228,7 +1351,11 @@ static bool llama_kv_cache_init(
1228
1351
  cache.cells.clear();
1229
1352
  cache.cells.resize(n_ctx);
1230
1353
 
1354
+ // TODO: this should be:
1355
+ // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1356
+ // change it and test that it works
1231
1357
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1358
+ memset(cache.buf.data, 0, cache.buf.size);
1232
1359
 
1233
1360
  struct ggml_init_params params;
1234
1361
  params.mem_size = cache.buf.size;
@@ -1271,9 +1398,11 @@ static bool llama_kv_cache_init(
1271
1398
 
1272
1399
  // find an empty slot of size "n_tokens" in the cache
1273
1400
  // updates the cache head
1401
+ // Note: On success, it's important that cache.head points
1402
+ // to the first cell of the slot.
1274
1403
  static bool llama_kv_cache_find_slot(
1275
- struct llama_kv_cache & cache,
1276
- const struct llama_batch & batch) {
1404
+ struct llama_kv_cache & cache,
1405
+ const struct llama_batch & batch) {
1277
1406
  const uint32_t n_ctx = cache.size;
1278
1407
  const uint32_t n_tokens = batch.n_tokens;
1279
1408
 
@@ -1286,8 +1415,8 @@ static bool llama_kv_cache_find_slot(
1286
1415
 
1287
1416
  while (true) {
1288
1417
  if (cache.head + n_tokens > n_ctx) {
1418
+ n_tested += n_ctx - cache.head;
1289
1419
  cache.head = 0;
1290
- n_tested += n_ctx - cache.head;
1291
1420
  continue;
1292
1421
  }
1293
1422
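
The two reordered lines above fix the wrap-around bookkeeping in the slot search: n_tested has to be bumped by the size of the untested tail before cache.head is reset to 0. With n_ctx = 8 and cache.head = 6, for example, the tail holds 8 - 6 = 2 cells; the old order zeroed the head first and then added 8 - 0 = 8, over-counting the tested cells and letting the search give up before every cell had actually been examined.
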
 
@@ -1338,29 +1467,46 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
1338
1467
  cache.cells[i].pos = -1;
1339
1468
  cache.cells[i].seq_id.clear();
1340
1469
  }
1470
+
1471
+ // Searching for a free slot can start here since we know it will be empty.
1472
+ cache.head = uint32_t(c0);
1341
1473
  }
1342
1474
 
1343
1475
  static void llama_kv_cache_seq_rm(
1344
- struct llama_kv_cache & cache,
1345
- llama_seq_id seq_id,
1346
- llama_pos p0,
1347
- llama_pos p1) {
1476
+ struct llama_kv_cache & cache,
1477
+ llama_seq_id seq_id,
1478
+ llama_pos p0,
1479
+ llama_pos p1) {
1480
+ uint32_t new_head = cache.size;
1481
+
1482
+ if (p0 < 0) p0 = 0;
1483
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1484
+
1348
1485
  for (uint32_t i = 0; i < cache.size; ++i) {
1349
1486
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1350
1487
  cache.cells[i].seq_id.erase(seq_id);
1351
1488
  if (cache.cells[i].seq_id.empty()) {
1352
1489
  cache.cells[i].pos = -1;
1490
+ if (new_head == cache.size) new_head = i;
1353
1491
  }
1354
1492
  }
1355
1493
  }
1494
+
1495
+ // If we freed up a slot, set head to it so searching can start there.
1496
+ if (new_head != cache.size) cache.head = new_head;
1356
1497
  }
1357
1498
 
1358
1499
  static void llama_kv_cache_seq_cp(
1359
- struct llama_kv_cache & cache,
1360
- llama_seq_id seq_id_src,
1361
- llama_seq_id seq_id_dst,
1362
- llama_pos p0,
1363
- llama_pos p1) {
1500
+ struct llama_kv_cache & cache,
1501
+ llama_seq_id seq_id_src,
1502
+ llama_seq_id seq_id_dst,
1503
+ llama_pos p0,
1504
+ llama_pos p1) {
1505
+ if (p0 < 0) p0 = 0;
1506
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1507
+
1508
+ cache.head = 0;
1509
+
1364
1510
  for (uint32_t i = 0; i < cache.size; ++i) {
1365
1511
  if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1366
1512
  cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1369,32 +1515,48 @@ static void llama_kv_cache_seq_cp(
1369
1515
  }
1370
1516
 
1371
1517
  static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
1518
+ uint32_t new_head = cache.size;
1519
+
1372
1520
  for (uint32_t i = 0; i < cache.size; ++i) {
1373
1521
  if (!cache.cells[i].has_seq_id(seq_id)) {
1374
1522
  cache.cells[i].pos = -1;
1375
1523
  cache.cells[i].seq_id.clear();
1524
+ if (new_head == cache.size) new_head = i;
1376
1525
  }
1377
1526
  }
1527
+
1528
+ // If we freed up a slot, set head to it so searching can start there.
1529
+ if (new_head != cache.size) cache.head = new_head;
1378
1530
  }
1379
1531
 
1380
1532
  static void llama_kv_cache_seq_shift(
1381
- struct llama_kv_cache & cache,
1382
- llama_seq_id seq_id,
1383
- llama_pos p0,
1384
- llama_pos p1,
1385
- llama_pos delta) {
1533
+ struct llama_kv_cache & cache,
1534
+ llama_seq_id seq_id,
1535
+ llama_pos p0,
1536
+ llama_pos p1,
1537
+ llama_pos delta) {
1538
+ uint32_t new_head = cache.size;
1539
+
1540
+ if (p0 < 0) p0 = 0;
1541
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1542
+
1386
1543
  for (uint32_t i = 0; i < cache.size; ++i) {
1387
1544
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1388
1545
  cache.cells[i].pos += delta;
1389
1546
  if (cache.cells[i].pos < 0) {
1390
1547
  cache.cells[i].pos = -1;
1391
1548
  cache.cells[i].seq_id.clear();
1549
+ if (new_head == cache.size) new_head = i;
1392
1550
  } else {
1393
1551
  cache.has_shift = true;
1394
1552
  cache.cells[i].delta = delta;
1395
1553
  }
1396
1554
  }
1397
1555
  }
1556
+
1557
+ // If we freed up a slot, set head to it so searching can start there.
1558
+ // Otherwise we just start the next search from the beginning.
1559
+ cache.head = new_head != cache.size ? new_head : 0;
1398
1560
  }
1399
1561
 
1400
1562
  //
@@ -1598,7 +1760,7 @@ struct llama_model_loader {
1598
1760
  }
1599
1761
  }
1600
1762
 
1601
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
1763
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
1602
1764
  if (backend != GGML_BACKEND_CPU) {
1603
1765
  ggml_set_no_alloc(ctx, true);
1604
1766
  }
@@ -1616,7 +1778,7 @@ struct llama_model_loader {
1616
1778
  return tensor;
1617
1779
  }
1618
1780
 
1619
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
1781
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
1620
1782
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1621
1783
 
1622
1784
  if (cur == NULL) {
@@ -1795,6 +1957,7 @@ static const char * llama_model_type_name(e_model type) {
1795
1957
  case MODEL_1B: return "1B";
1796
1958
  case MODEL_3B: return "3B";
1797
1959
  case MODEL_7B: return "7B";
1960
+ case MODEL_8B: return "8B";
1798
1961
  case MODEL_13B: return "13B";
1799
1962
  case MODEL_15B: return "15B";
1800
1963
  case MODEL_30B: return "30B";
@@ -1907,6 +2070,49 @@ static void llm_load_hparams(
1907
2070
  default: model.type = e_model::MODEL_UNKNOWN;
1908
2071
  }
1909
2072
  } break;
2073
+ case LLM_ARCH_PERSIMMON:
2074
+ {
2075
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2076
+ switch (hparams.n_layer) {
2077
+ case 36: model.type = e_model::MODEL_8B; break;
2078
+ default: model.type = e_model::MODEL_UNKNOWN;
2079
+ }
2080
+ } break;
2081
+ case LLM_ARCH_REFACT:
2082
+ {
2083
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2084
+ switch (hparams.n_layer) {
2085
+ case 32: model.type = e_model::MODEL_1B; break;
2086
+ default: model.type = e_model::MODEL_UNKNOWN;
2087
+ }
2088
+ } break;
2089
+ case LLM_ARCH_BLOOM:
2090
+ {
2091
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2092
+
2093
+ switch (hparams.n_layer) {
2094
+ case 24: model.type = e_model::MODEL_1B; break;
2095
+ case 30:
2096
+ switch (hparams.n_embd) {
2097
+ case 2560: model.type = e_model::MODEL_3B; break;
2098
+ case 4096: model.type = e_model::MODEL_7B; break;
2099
+ } break;
2100
+ }
2101
+ } break;
2102
+ case LLM_ARCH_MPT:
2103
+ {
2104
+ hparams.f_clamp_kqv = 0.0f;
2105
+
2106
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2107
+ GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2108
+ GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2109
+
2110
+ switch (hparams.n_layer) {
2111
+ case 32: model.type = e_model::MODEL_7B; break;
2112
+ case 48: model.type = e_model::MODEL_30B; break;
2113
+ default: model.type = e_model::MODEL_UNKNOWN;
2114
+ }
2115
+ } break;
1910
2116
  default: (void)0;
1911
2117
  }
1912
2118
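
The new MPT branch above also reads f_max_alibi_bias, which controls the ALiBi attention bias these ALiBi-based models use instead of RoPE: each head h adds a linear penalty to its attention scores whose slope shrinks geometrically with h. A hedged sketch of the slope schedule from the ALiBi paper for power-of-two head counts (ggml_alibi's handling of other head counts differs slightly):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 8;
        const float max_bias = 8.0f;   // what hparams.f_max_alibi_bias holds for MPT-style models

        for (int h = 0; h < n_head; ++h) {
            // geometric sequence 2^-1, 2^-2, ... when max_bias == n_head == 8
            const float slope = std::pow(2.0f, -max_bias * (h + 1) / n_head);
            std::printf("head %d: bias = %g * (j - i)\n", h, slope);  // j = key pos, i = query pos
        }
        return 0;
    }
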
 
@@ -1971,6 +2177,7 @@ static void llm_load_vocab(
1971
2177
 
1972
2178
  for (int i = 0; i < n_merges; i++) {
1973
2179
  const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
2180
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
1974
2181
 
1975
2182
  std::string first;
1976
2183
  std::string second;
@@ -2005,6 +2212,7 @@ static void llm_load_vocab(
2005
2212
 
2006
2213
  for (uint32_t i = 0; i < n_vocab; i++) {
2007
2214
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
2215
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
2008
2216
 
2009
2217
  vocab.token_to_id[word] = i;
2010
2218
 
@@ -2013,12 +2221,13 @@ static void llm_load_vocab(
2013
2221
  token_data.score = scores ? scores[i] : 0.0f;
2014
2222
  token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
2015
2223
  }
2224
+ GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
2016
2225
 
2017
2226
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2018
2227
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
2019
2228
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
2020
2229
  } else {
2021
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
2230
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
2022
2231
  }
2023
2232
 
2024
2233
  // special tokens
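
The linefeed change above matters for byte-level BPE vocabularies: such vocabs do not store a raw '\n' but its GPT-2 byte-to-codepoint remapping, which for byte 0x0A is U+010A, so that is the string tokenized now. A self-contained sketch of that remapping (the same scheme GPT-2 style tokenizers use; not a call into llama.cpp's own tables):

    #include <cstdio>

    int main() {
        // GPT-2 byte-level BPE maps every byte to a printable codepoint:
        // printable bytes map to themselves, the rest get 256, 257, ... in byte order.
        int n = 0;
        for (int b = 0; b < 256; ++b) {
            const bool printable = (b >= 0x21 && b <= 0x7E) ||
                                   (b >= 0xA1 && b <= 0xAC) ||
                                   (b >= 0xAE && b <= 0xFF);
            const int cp = printable ? b : 256 + n++;
            if (b == 0x0A) {
                std::printf("byte 0x0A ('\\n') -> U+%04X\n", cp);   // prints U+010A
            }
        }
        return 0;
    }
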
@@ -2048,6 +2257,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2048
2257
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2049
2258
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2050
2259
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2260
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
2261
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
2051
2262
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2052
2263
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2053
2264
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2141,13 +2352,14 @@ static void llm_load_tensors(
2141
2352
  const auto tn = LLM_TN(model.arch);
2142
2353
  switch (model.arch) {
2143
2354
  case LLM_ARCH_LLAMA:
2355
+ case LLM_ARCH_REFACT:
2144
2356
  {
2145
2357
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2146
2358
 
2147
2359
  // output
2148
2360
  {
2149
- ggml_backend backend_norm;
2150
- ggml_backend backend_output;
2361
+ ggml_backend_type backend_norm;
2362
+ ggml_backend_type backend_output;
2151
2363
 
2152
2364
  if (n_gpu_layers > int(n_layer)) {
2153
2365
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2182,8 +2394,8 @@ static void llm_load_tensors(
2182
2394
  model.layers.resize(n_layer);
2183
2395
 
2184
2396
  for (uint32_t i = 0; i < n_layer; ++i) {
2185
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2186
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2397
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2398
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2187
2399
 
2188
2400
  auto & layer = model.layers[i];
2189
2401
 
@@ -2212,8 +2424,8 @@ static void llm_load_tensors(
2212
2424
  {
2213
2425
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2214
2426
  {
2215
- ggml_backend backend_norm;
2216
- ggml_backend backend_output;
2427
+ ggml_backend_type backend_norm;
2428
+ ggml_backend_type backend_output;
2217
2429
 
2218
2430
  if (n_gpu_layers > int(n_layer)) {
2219
2431
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2248,8 +2460,8 @@ static void llm_load_tensors(
2248
2460
  model.layers.resize(n_layer);
2249
2461
 
2250
2462
  for (uint32_t i = 0; i < n_layer; ++i) {
2251
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2252
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2463
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2464
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2253
2465
 
2254
2466
  auto & layer = model.layers[i];
2255
2467
 
@@ -2282,8 +2494,8 @@ static void llm_load_tensors(
2282
2494
 
2283
2495
  // output
2284
2496
  {
2285
- ggml_backend backend_norm;
2286
- ggml_backend backend_output;
2497
+ ggml_backend_type backend_norm;
2498
+ ggml_backend_type backend_output;
2287
2499
 
2288
2500
  if (n_gpu_layers > int(n_layer)) {
2289
2501
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2320,8 +2532,8 @@ static void llm_load_tensors(
2320
2532
  model.layers.resize(n_layer);
2321
2533
 
2322
2534
  for (uint32_t i = 0; i < n_layer; ++i) {
2323
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2324
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2535
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2536
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2325
2537
 
2326
2538
  auto & layer = model.layers[i];
2327
2539
 
@@ -2359,8 +2571,8 @@ static void llm_load_tensors(
2359
2571
 
2360
2572
  // output
2361
2573
  {
2362
- ggml_backend backend_norm;
2363
- ggml_backend backend_output;
2574
+ ggml_backend_type backend_norm;
2575
+ ggml_backend_type backend_output;
2364
2576
 
2365
2577
  if (n_gpu_layers > int(n_layer)) {
2366
2578
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2397,8 +2609,8 @@ static void llm_load_tensors(
2397
2609
  model.layers.resize(n_layer);
2398
2610
 
2399
2611
  for (uint32_t i = 0; i < n_layer; ++i) {
2400
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2401
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2612
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2613
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2402
2614
 
2403
2615
  auto & layer = model.layers[i];
2404
2616
 
@@ -2431,103 +2643,313 @@ static void llm_load_tensors(
2431
2643
  }
2432
2644
  }
2433
2645
  } break;
2434
- default:
2435
- throw std::runtime_error("unknown architecture");
2436
- }
2437
- }
2646
+ case LLM_ARCH_PERSIMMON:
2647
+ {
2648
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2438
2649
 
2439
- ml.done_getting_tensors();
2650
+ {
2651
+ ggml_backend_type backend_norm;
2652
+ ggml_backend_type backend_output;
2440
2653
 
2441
- // print memory requirements
2442
- {
2443
- // this is the total memory required to run the inference
2444
- size_t mem_required =
2445
- ctx_size +
2446
- mmapped_size - vram_weights; // weights in VRAM not in memory
2654
+ if (n_gpu_layers > int(n_layer)) {
2655
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2656
+ // on Windows however this is detrimental unless everything is on the GPU
2657
+ #ifndef _WIN32
2658
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2659
+ #else
2660
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2661
+ #endif // _WIN32
2447
2662
 
2448
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2663
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2664
+ } else {
2665
+ backend_norm = GGML_BACKEND_CPU;
2666
+ backend_output = GGML_BACKEND_CPU;
2667
+ }
2449
2668
 
2450
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2451
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2669
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2670
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2671
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2452
2672
 
2453
- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2454
- if (n_gpu_layers > (int) hparams.n_layer) {
2455
- LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2456
- }
2673
+ if (backend_norm == GGML_BACKEND_GPU) {
2674
+ vram_weights += ggml_nbytes(model.output_norm);
2675
+ vram_weights += ggml_nbytes(model.output_norm_b);
2676
+ }
2677
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2678
+ vram_weights += ggml_nbytes(model.output);
2679
+ }
2680
+ }
2457
2681
 
2458
- #ifdef GGML_USE_CUBLAS
2459
- const int max_backend_supported_layers = hparams.n_layer + 3;
2460
- const int max_offloadable_layers = hparams.n_layer + 3;
2461
- #elif defined(GGML_USE_CLBLAST)
2462
- const int max_backend_supported_layers = hparams.n_layer + 1;
2463
- const int max_offloadable_layers = hparams.n_layer + 1;
2464
- #endif // GGML_USE_CUBLAS
2682
+ const uint32_t n_ff = hparams.n_ff;
2683
+ const int i_gpu_start = n_layer - n_gpu_layers;
2684
+ model.layers.resize(n_layer);
2685
+ for (uint32_t i = 0; i < n_layer; ++i) {
2686
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2687
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2688
+ auto & layer = model.layers[i];
2689
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2690
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2691
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2692
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2693
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2694
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2695
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2696
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2697
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2698
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2699
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2700
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2701
+ layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
2702
+ layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
2703
+ layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
2704
+ layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2705
+ }
2706
+ } break;
2707
+ case LLM_ARCH_BLOOM:
2708
+ {
2709
+ // TODO: CPU-only for now
2465
2710
 
2466
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2467
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2711
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2712
+ model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
2713
+ model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
2714
+
2715
+ // output
2716
+ {
2717
+ ggml_backend_type backend_norm;
2718
+ ggml_backend_type backend_output;
2719
+
2720
+ if (n_gpu_layers > int(n_layer)) {
2721
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2722
+ // on Windows however this is detrimental unless everything is on the GPU
2723
+ #ifndef _WIN32
2724
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2468
2725
  #else
2469
- (void) n_gpu_layers;
2470
- #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2471
- }
2726
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2727
+ #endif // _WIN32
2472
2728
 
2473
- // populate `tensors_by_name`
2474
- for (int i = 0; i < ml.n_tensors; ++i) {
2475
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2476
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2477
- }
2729
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2730
+ } else {
2731
+ backend_norm = GGML_BACKEND_CPU;
2732
+ backend_output = GGML_BACKEND_CPU;
2733
+ }
2478
2734
 
2479
- (void) tensor_split;
2480
- #ifdef GGML_USE_CUBLAS
2481
- {
2482
- ggml_cuda_set_tensor_split(tensor_split);
2483
- }
2484
- #endif
2735
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2736
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2737
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2485
2738
 
2486
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2739
+ if (backend_norm == GGML_BACKEND_GPU) {
2740
+ vram_weights += ggml_nbytes(model.output_norm);
2741
+ vram_weights += ggml_nbytes(model.output_norm_b);
2742
+ }
2743
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2744
+ vram_weights += ggml_nbytes(model.output);
2745
+ }
2746
+ }
2487
2747
 
2488
- if (progress_callback) {
2489
- progress_callback(1.0f, progress_callback_user_data);
2490
- }
2748
+ const uint32_t n_ff = hparams.n_ff;
2491
2749
 
2492
- model.mapping = std::move(ml.mapping);
2750
+ const int i_gpu_start = n_layer - n_gpu_layers;
2493
2751
 
2494
- // loading time will be recalculate after the first eval, so
2495
- // we take page faults deferred by mmap() into consideration
2496
- model.t_load_us = ggml_time_us() - model.t_start_us;
2497
- }
2752
+ model.layers.resize(n_layer);
2498
2753
 
2499
- static bool llama_model_load(
2500
- const std::string & fname,
2501
- llama_model & model,
2502
- int n_gpu_layers,
2503
- int main_gpu,
2504
- const float * tensor_split,
2505
- bool use_mmap,
2506
- bool use_mlock,
2507
- bool vocab_only,
2508
- llama_progress_callback progress_callback,
2509
- void *progress_callback_user_data) {
2510
- try {
2511
- llama_model_loader ml(fname, use_mmap);
2754
+ for (uint32_t i = 0; i < n_layer; ++i) {
2755
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2756
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2512
2757
 
2513
- model.hparams.vocab_only = vocab_only;
2758
+ auto & layer = model.layers[i];
2514
2759
 
2515
- llm_load_arch (ml, model);
2516
- llm_load_hparams(ml, model);
2517
- llm_load_vocab (ml, model);
2760
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2761
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2518
2762
 
2519
- llm_load_print_meta(ml, model);
2763
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2764
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2520
2765
 
2521
- if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2522
- throw std::runtime_error("vocab size mismatch");
2523
- }
2766
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2767
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2524
2768
 
2525
- if (vocab_only) {
2526
- LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2527
- return true;
2528
- }
2769
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2770
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2529
2771
 
2530
- llm_load_tensors(
2772
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2773
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2774
+
2775
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2776
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2777
+
2778
+ if (backend == GGML_BACKEND_GPU) {
2779
+ vram_weights +=
2780
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2781
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2782
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2783
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2784
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
2785
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
2786
+ }
2787
+ }
2788
+ } break;
2789
+ case LLM_ARCH_MPT:
2790
+ {
2791
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2792
+
2793
+ // output
2794
+ {
2795
+ ggml_backend_type backend_norm;
2796
+ ggml_backend_type backend_output;
2797
+
2798
+ if (n_gpu_layers > int(n_layer)) {
2799
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2800
+ // on Windows however this is detrimental unless everything is on the GPU
2801
+ #ifndef _WIN32
2802
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2803
+ #else
2804
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2805
+ #endif // _WIN32
2806
+
2807
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2808
+ } else {
2809
+ backend_norm = GGML_BACKEND_CPU;
2810
+ backend_output = GGML_BACKEND_CPU;
2811
+ }
2812
+
2813
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2814
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2815
+
2816
+ if (backend_norm == GGML_BACKEND_GPU) {
2817
+ vram_weights += ggml_nbytes(model.output_norm);
2818
+ }
2819
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2820
+ vram_weights += ggml_nbytes(model.output);
2821
+ }
2822
+ }
2823
+
2824
+ const uint32_t n_ff = hparams.n_ff;
2825
+
2826
+ const int i_gpu_start = n_layer - n_gpu_layers;
2827
+
2828
+ model.layers.resize(n_layer);
2829
+
2830
+ for (uint32_t i = 0; i < n_layer; ++i) {
2831
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2832
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2833
+
2834
+ auto & layer = model.layers[i];
2835
+
2836
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2837
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
2838
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2839
+
2840
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2841
+
2842
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2843
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2844
+
2845
+ if (backend == GGML_BACKEND_GPU) {
2846
+ vram_weights +=
2847
+ ggml_nbytes(layer.attn_norm) +
2848
+ ggml_nbytes(layer.wqkv) +
2849
+ ggml_nbytes(layer.wo) +
2850
+ ggml_nbytes(layer.ffn_norm) +
2851
+ ggml_nbytes(layer.w2) +
2852
+ ggml_nbytes(layer.w3);
2853
+ }
2854
+ }
2855
+ } break;
2856
+ default:
2857
+ throw std::runtime_error("unknown architecture");
2858
+ }
2859
+ }
2860
+
2861
+ ml.done_getting_tensors();
2862
+
2863
+ // print memory requirements
2864
+ {
2865
+ // this is the total memory required to run the inference
2866
+ size_t mem_required =
2867
+ ctx_size +
2868
+ mmapped_size - vram_weights; // weights in VRAM not in memory
2869
+
2870
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2871
+
2872
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2873
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2874
+
2875
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2876
+ if (n_gpu_layers > (int) hparams.n_layer) {
2877
+ LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2878
+ }
2879
+
2880
+ #ifdef GGML_USE_CUBLAS
2881
+ const int max_backend_supported_layers = hparams.n_layer + 3;
2882
+ const int max_offloadable_layers = hparams.n_layer + 3;
2883
+ #elif defined(GGML_USE_CLBLAST)
2884
+ const int max_backend_supported_layers = hparams.n_layer + 1;
2885
+ const int max_offloadable_layers = hparams.n_layer + 1;
2886
+ #endif // GGML_USE_CUBLAS
2887
+
2888
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2889
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2890
+ #else
2891
+ (void) n_gpu_layers;
2892
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2893
+ }
2894
+
2895
+ // populate `tensors_by_name`
2896
+ for (int i = 0; i < ml.n_tensors; ++i) {
2897
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2898
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2899
+ }
2900
+
2901
+ (void) tensor_split;
2902
+ #ifdef GGML_USE_CUBLAS
2903
+ {
2904
+ ggml_cuda_set_tensor_split(tensor_split);
2905
+ }
2906
+ #endif
2907
+
2908
+ ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2909
+
2910
+ if (progress_callback) {
2911
+ progress_callback(1.0f, progress_callback_user_data);
2912
+ }
2913
+
2914
+ model.mapping = std::move(ml.mapping);
2915
+
2916
+ // loading time will be recalculate after the first eval, so
2917
+ // we take page faults deferred by mmap() into consideration
2918
+ model.t_load_us = ggml_time_us() - model.t_start_us;
2919
+ }
2920
+
2921
+ static bool llama_model_load(
2922
+ const std::string & fname,
2923
+ llama_model & model,
2924
+ int n_gpu_layers,
2925
+ int main_gpu,
2926
+ const float * tensor_split,
2927
+ bool use_mmap,
2928
+ bool use_mlock,
2929
+ bool vocab_only,
2930
+ llama_progress_callback progress_callback,
2931
+ void *progress_callback_user_data) {
2932
+ try {
2933
+ llama_model_loader ml(fname, use_mmap);
2934
+
2935
+ model.hparams.vocab_only = vocab_only;
2936
+
2937
+ llm_load_arch (ml, model);
2938
+ llm_load_hparams(ml, model);
2939
+ llm_load_vocab (ml, model);
2940
+
2941
+ llm_load_print_meta(ml, model);
2942
+
2943
+ if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2944
+ throw std::runtime_error("vocab size mismatch");
2945
+ }
2946
+
2947
+ if (vocab_only) {
2948
+ LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2949
+ return true;
2950
+ }
2951
+
2952
+ llm_load_tensors(
2531
2953
  ml, model, n_gpu_layers,
2532
2954
  main_gpu, tensor_split,
2533
2955
  use_mlock, progress_callback, progress_callback_user_data);
@@ -2540,8 +2962,8 @@ static bool llama_model_load(
2540
2962
  }
2541
2963
 
2542
2964
  static struct ggml_cgraph * llm_build_llama(
2543
- llama_context & lctx,
2544
- const llama_batch & batch) {
2965
+ llama_context & lctx,
2966
+ const llama_batch & batch) {
2545
2967
  const auto & model = lctx.model;
2546
2968
  const auto & hparams = model.hparams;
2547
2969
  const auto & cparams = lctx.cparams;
@@ -2579,11 +3001,9 @@ static struct ggml_cgraph * llm_build_llama(
2579
3001
  struct ggml_init_params params = {
2580
3002
  /*.mem_size =*/ buf_compute.size,
2581
3003
  /*.mem_buffer =*/ buf_compute.data,
2582
- /*.no_alloc =*/ false,
3004
+ /*.no_alloc =*/ true,
2583
3005
  };
2584
3006
 
2585
- params.no_alloc = true;
2586
-
2587
3007
  struct ggml_context * ctx0 = ggml_init(params);
2588
3008
 
2589
3009
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -2967,11 +3387,9 @@ static struct ggml_cgraph * llm_build_baichaun(
2967
3387
  struct ggml_init_params params = {
2968
3388
  /*.mem_size =*/ buf_compute.size,
2969
3389
  /*.mem_buffer =*/ buf_compute.data,
2970
- /*.no_alloc =*/ false,
3390
+ /*.no_alloc =*/ true,
2971
3391
  };
2972
3392
 
2973
- params.no_alloc = true;
2974
-
2975
3393
  struct ggml_context * ctx0 = ggml_init(params);
2976
3394
 
2977
3395
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3334,7 +3752,7 @@ static struct ggml_cgraph * llm_build_baichaun(
3334
3752
  return gf;
3335
3753
  }
3336
3754
 
3337
- static struct ggml_cgraph * llm_build_falcon(
3755
+ static struct ggml_cgraph * llm_build_refact(
3338
3756
  llama_context & lctx,
3339
3757
  const llama_batch & batch) {
3340
3758
  const auto & model = lctx.model;
@@ -3353,11 +3771,7 @@ static struct ggml_cgraph * llm_build_falcon(
3353
3771
  const int64_t n_embd_head = hparams.n_embd_head();
3354
3772
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
3355
3773
 
3356
- GGML_ASSERT(n_embd_head == hparams.n_rot);
3357
-
3358
- const float freq_base = cparams.rope_freq_base;
3359
- const float freq_scale = cparams.rope_freq_scale;
3360
- const float norm_eps = hparams.f_norm_eps;
3774
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
3361
3775
 
3362
3776
  const int n_gpu_layers = model.n_gpu_layers;
3363
3777
 
@@ -3365,21 +3779,16 @@ static struct ggml_cgraph * llm_build_falcon(
3365
3779
  const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3366
3780
  const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3367
3781
 
3368
- const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3369
-
3370
- //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
3371
- // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
3782
+ // printf("n_kv = %d\n", n_kv);
3372
3783
 
3373
3784
  auto & buf_compute = lctx.buf_compute;
3374
3785
 
3375
3786
  struct ggml_init_params params = {
3376
3787
  /*.mem_size =*/ buf_compute.size,
3377
3788
  /*.mem_buffer =*/ buf_compute.data,
3378
- /*.no_alloc =*/ false,
3789
+ /*.no_alloc =*/ true,
3379
3790
  };
3380
3791
 
3381
- params.no_alloc = true;
3382
-
3383
3792
  struct ggml_context * ctx0 = ggml_init(params);
3384
3793
 
3385
3794
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3436,7 +3845,7 @@ static struct ggml_cgraph * llm_build_falcon(
3436
3845
  ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3437
3846
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3438
3847
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3439
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3848
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
3440
3849
  }
3441
3850
 
3442
3851
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
@@ -3462,47 +3871,8 @@ static struct ggml_cgraph * llm_build_falcon(
3462
3871
  }
3463
3872
  }
3464
3873
 
3465
- // KQ_pos - contains the positions
3466
- struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3467
- offload_func_kq(KQ_pos);
3468
- ggml_set_name(KQ_pos, "KQ_pos");
3469
- ggml_allocr_alloc(lctx.alloc, KQ_pos);
3470
- if (!ggml_allocr_is_measure(lctx.alloc)) {
3471
- int * data = (int *) KQ_pos->data;
3472
- for (int i = 0; i < n_tokens; ++i) {
3473
- data[i] = batch.pos[i];
3474
- }
3475
- }
3476
-
3477
- // shift the entire K-cache if needed
3478
- if (do_rope_shift) {
3479
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3480
- offload_func_kq(K_shift);
3481
- ggml_set_name(K_shift, "K_shift");
3482
- ggml_allocr_alloc(lctx.alloc, K_shift);
3483
- if (!ggml_allocr_is_measure(lctx.alloc)) {
3484
- int * data = (int *) K_shift->data;
3485
- for (int i = 0; i < n_ctx; ++i) {
3486
- data[i] = kv_self.cells[i].delta;
3487
- }
3488
- }
3489
-
3490
- for (int il = 0; il < n_layer; ++il) {
3491
- struct ggml_tensor * tmp =
3492
- ggml_rope_custom_inplace(ctx0,
3493
- ggml_view_3d(ctx0, kv_self.k,
3494
- n_embd_head, n_head_kv, n_ctx,
3495
- ggml_element_size(kv_self.k)*n_embd_head,
3496
- ggml_element_size(kv_self.k)*n_embd_gqa,
3497
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3498
- K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
3499
- offload_func_kq(tmp);
3500
- ggml_build_forward_expand(gf, tmp);
3501
- }
3502
- }
3503
-
3504
3874
  for (int il = 0; il < n_layer; ++il) {
3505
- struct ggml_tensor * attn_norm;
3875
+ ggml_format_name(inpL, "layer_inp_%d", il);
3506
3876
 
3507
3877
  offload_func_t offload_func = llama_nop;
3508
3878
 
@@ -3512,80 +3882,49 @@ static struct ggml_cgraph * llm_build_falcon(
3512
3882
  }
3513
3883
  #endif // GGML_USE_CUBLAS
3514
3884
 
3515
- // self-attention
3516
- // TODO: refactor into common function (shared with LLaMA)
3517
- {
3518
- attn_norm = ggml_norm(ctx0, inpL, norm_eps);
3519
- offload_func(attn_norm);
3885
+ struct ggml_tensor * inpSA = inpL;
3520
3886
 
3521
- attn_norm = ggml_add(ctx0,
3522
- ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
3523
- model.layers[il].attn_norm_b);
3524
- offload_func(attn_norm->src[0]);
3525
- offload_func(attn_norm);
3887
+ // norm
3888
+ {
3889
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
3890
+ offload_func(cur);
3891
+ ggml_set_name(cur, "rms_norm_0");
3526
3892
 
3527
- if (model.layers[il].attn_norm_2) { // Falcon-40B
3528
- cur = ggml_norm(ctx0, inpL, norm_eps);
3529
- offload_func(cur);
3893
+ // cur = cur*attn_norm(broadcasted)
3894
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
3895
+ offload_func(cur);
3896
+ ggml_set_name(cur, "attention_norm_0");
3897
+ }
3530
3898
 
3531
- cur = ggml_add(ctx0,
3532
- ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
3533
- model.layers[il].attn_norm_2_b);
3534
- offload_func(cur->src[0]);
3535
- offload_func(cur);
3536
- } else { // Falcon 7B
3537
- cur = attn_norm;
3538
- }
3539
-
3540
- // compute QKV
3541
-
3542
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
3543
- offload_func_kq(cur);
3544
-
3545
- // Note that the strides for Kcur, Vcur are set up so that the
3546
- // resulting views are misaligned with the tensor's storage
3547
- // (by applying the K/V offset we shift the tensor's original
3548
- // view to stick out behind the viewed QKV tensor's allocated
3549
- // memory, so to say). This is ok because no actual accesses
3550
- // happen to that out-of-range memory, but it can require some
3551
- // trickery when trying to accurately dump these views for
3552
- // debugging.
3553
-
3554
- const size_t wsize = ggml_type_size(cur->type);
3899
+ // self-attention
3900
+ {
3901
+ // compute Q and K
3902
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3903
+ offload_func_kq(tmpk);
3904
+ ggml_set_name(tmpk, "tmpk");
3555
3905
 
3556
- // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
3557
- // non-contiguous views is added for the rope operator
3558
- struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
3559
- ctx0, cur, n_embd_head, n_head, n_tokens,
3560
- wsize * n_embd_head,
3561
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3562
- 0));
3906
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3563
3907
  offload_func_kq(tmpq);
3908
+ ggml_set_name(tmpq, "tmpq");
3564
3909
 
3565
- struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3566
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3567
- wsize * n_embd_head,
3568
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3569
- wsize * n_embd_head * n_head));
3570
- offload_func_kq(tmpk);
3571
-
3572
- struct ggml_tensor * tmpv = ggml_view_3d(
3573
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3574
- wsize * n_embd_head,
3575
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3576
- wsize * n_embd_head * (n_head + n_head_kv));
3577
- offload_func_v(tmpv);
3910
+ struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
3911
+ offload_func_kq(Kcur);
3912
+ ggml_set_name(Kcur, "Kcur");
3578
3913
 
3579
- // using mode = 2 for neox mode
3580
- struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3914
+ struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
3581
3915
  offload_func_kq(Qcur);
3582
- struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3583
- offload_func_kq(Kcur);
3916
+ ggml_set_name(Qcur, "Qcur");
3584
3917
 
3918
+ // store key and value to memory
3585
3919
  {
3586
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3920
+ // compute the transposed [n_tokens, n_embd] V matrix
3921
+
3922
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3923
+ offload_func_v(tmpv);
3924
+ ggml_set_name(tmpv, "tmpv");
3925
+
3926
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
3587
3927
  offload_func_v(Vcur);
3588
- offload_func_v(Vcur->src[0]->src[0]);
3589
3928
  ggml_set_name(Vcur, "Vcur");
3590
3929
 
3591
3930
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
@@ -3596,6 +3935,7 @@ static struct ggml_cgraph * llm_build_falcon(
3596
3935
  ( n_ctx)*ggml_element_size(kv_self.v),
3597
3936
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3598
3937
  offload_func_v(v);
3938
+ ggml_set_name(v, "v");
3599
3939
 
3600
3940
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3601
3941
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3614,22 +3954,31 @@ static struct ggml_cgraph * llm_build_falcon(
3614
3954
  offload_func_kq(K);
3615
3955
  ggml_set_name(K, "K");
3616
3956
 
3957
+ // K * Q
3617
3958
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3618
3959
  offload_func_kq(KQ);
3619
3960
  ggml_set_name(KQ, "KQ");
3620
3961
 
3962
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3963
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
3621
3964
  struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3622
3965
  offload_func_kq(KQ_scaled);
3623
3966
  ggml_set_name(KQ_scaled, "KQ_scaled");
3624
3967
 
3625
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3968
+ // KQ_masked = mask_past(KQ_scaled)
3969
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
3970
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
3971
+
3972
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
3626
3973
  offload_func_kq(KQ_masked);
3627
3974
  ggml_set_name(KQ_masked, "KQ_masked");
3628
3975
 
3976
+ // KQ = soft_max(KQ_masked)
3629
3977
  struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3630
3978
  offload_func_v(KQ_soft_max);
3631
3979
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3632
3980
 
3981
+ // split cached V into n_head heads
3633
3982
  struct ggml_tensor * V =
3634
3983
  ggml_view_3d(ctx0, kv_self.v,
3635
3984
  n_kv, n_embd_head, n_head_kv,
@@ -3639,42 +3988,85 @@ static struct ggml_cgraph * llm_build_falcon(
3639
3988
  offload_func_v(V);
3640
3989
  ggml_set_name(V, "V");
3641
3990
 
3991
+ #if 1
3642
3992
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3643
3993
  offload_func_v(KQV);
3644
3994
  ggml_set_name(KQV, "KQV");
3995
+ #else
3996
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3997
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3998
+ // is there a better way?
3999
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
4000
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
4001
+ #endif
3645
4002
 
4003
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3646
4004
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3647
4005
  offload_func_v(KQV_merged);
3648
4006
  ggml_set_name(KQV_merged, "KQV_merged");
3649
4007
 
4008
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3650
4009
  cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3651
4010
  offload_func_v(cur);
3652
4011
  ggml_set_name(cur, "KQV_merged_contiguous");
3653
4012
 
3654
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
4013
+ // projection (no bias)
4014
+ cur = ggml_mul_mat(ctx0,
4015
+ model.layers[il].wo,
4016
+ cur);
3655
4017
  offload_func(cur);
3656
4018
  ggml_set_name(cur, "result_wo");
3657
4019
  }
3658
4020
 
3659
- struct ggml_tensor * attn_out = cur;
4021
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
4022
+ offload_func(inpFF);
4023
+ ggml_set_name(inpFF, "inpFF");
3660
4024
 
3661
- // feed forward
4025
+ // feed-forward network
3662
4026
  {
3663
- struct ggml_tensor * inpFF = attn_norm;
4027
+ // norm
4028
+ {
4029
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
4030
+ offload_func(cur);
4031
+ ggml_set_name(cur, "rms_norm_1");
3664
4032
 
3665
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
4033
+ // cur = cur*ffn_norm(broadcasted)
4034
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
4035
+ offload_func(cur);
4036
+ ggml_set_name(cur, "ffn_norm");
4037
+ }
4038
+
4039
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
4040
+ model.layers[il].w3,
4041
+ cur);
4042
+ offload_func(tmp);
4043
+ ggml_set_name(tmp, "result_w3");
4044
+
4045
+ cur = ggml_mul_mat(ctx0,
4046
+ model.layers[il].w1,
4047
+ cur);
3666
4048
  offload_func(cur);
4049
+ ggml_set_name(cur, "result_w1");
3667
4050
 
3668
- cur = ggml_gelu(ctx0, cur);
4051
+ // SILU activation
4052
+ cur = ggml_silu(ctx0, cur);
3669
4053
  offload_func(cur);
3670
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
4054
+ ggml_set_name(cur, "silu");
4055
+
4056
+ cur = ggml_mul(ctx0, cur, tmp);
3671
4057
  offload_func(cur);
4058
+ ggml_set_name(cur, "silu_x_result_w3");
4059
+
4060
+ cur = ggml_mul_mat(ctx0,
4061
+ model.layers[il].w2,
4062
+ cur);
4063
+ offload_func(cur);
4064
+ ggml_set_name(cur, "result_w2");
3672
4065
  }
3673
4066
 
3674
- cur = ggml_add(ctx0, cur, attn_out);
3675
- offload_func(cur);
3676
- cur = ggml_add(ctx0, cur, inpL);
4067
+ cur = ggml_add(ctx0, cur, inpFF);
3677
4068
  offload_func(cur);
4069
+ ggml_set_name(cur, "inpFF_+_result_w2");
3678
4070
 
3679
4071
  // input for next layer
3680
4072
  inpL = cur;
@@ -3684,15 +4076,17 @@ static struct ggml_cgraph * llm_build_falcon(
3684
4076
 
3685
4077
  // norm
3686
4078
  {
3687
- cur = ggml_norm(ctx0, cur, norm_eps);
4079
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
3688
4080
  offload_func_nr(cur);
4081
+ ggml_set_name(cur, "rms_norm_2");
3689
4082
 
3690
- cur = ggml_add(ctx0,
3691
- ggml_mul(ctx0, cur, model.output_norm),
3692
- model.output_norm_b);
4083
+ // cur = cur*norm(broadcasted)
4084
+ cur = ggml_mul(ctx0, cur, model.output_norm);
4085
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
3693
4086
  ggml_set_name(cur, "result_norm");
3694
4087
  }
3695
4088
 
4089
+ // lm_head
3696
4090
  cur = ggml_mul_mat(ctx0, model.output, cur);
3697
4091
  ggml_set_name(cur, "result_output");
3698
4092
 
@@ -3703,7 +4097,7 @@ static struct ggml_cgraph * llm_build_falcon(
3703
4097
  return gf;
3704
4098
  }
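The feed-forward block assembled in the builder that closes above follows the LLaMA-style gated-SiLU pattern: the RMS-normalized input is projected by w1 ("result_w1", the gate) and w3 ("result_w3", the up projection), SiLU is applied to the gate, the two are multiplied element-wise ("silu_x_result_w3"), and w2 projects back down ("result_w2"). As a reference, here is a minimal standalone sketch of the same arithmetic on plain vectors; the dimensions and weight values are invented for illustration, and the real graph of course does this with ggml tensors rather than std::vector.

    // Gated-SiLU FFN on plain buffers: out = W2 * (silu(W1*x) ⊙ (W3*x)).
    // Sizes and weights are made up; only the data flow mirrors the graph above.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    // y = W * x, W stored row-major as n_out rows of n_in floats
    static std::vector<float> matvec(const std::vector<float> & W,
                                     const std::vector<float> & x,
                                     int n_out, int n_in) {
        std::vector<float> y(n_out, 0.0f);
        for (int r = 0; r < n_out; ++r) {
            for (int c = 0; c < n_in; ++c) {
                y[r] += W[r*n_in + c] * x[c];
            }
        }
        return y;
    }

    int main() {
        const int n_embd = 4; // hypothetical embedding size
        const int n_ff   = 8; // hypothetical FFN size

        std::vector<float> x(n_embd, 0.5f);        // normalized input ("ffn_norm")
        std::vector<float> w1(n_ff*n_embd, 0.01f); // gate ("result_w1")
        std::vector<float> w3(n_ff*n_embd, 0.02f); // up   ("result_w3")
        std::vector<float> w2(n_embd*n_ff, 0.03f); // down ("result_w2")

        std::vector<float> gate = matvec(w1, x, n_ff, n_embd);
        std::vector<float> up   = matvec(w3, x, n_ff, n_embd);
        for (int i = 0; i < n_ff; ++i) {
            gate[i] = silu(gate[i]) * up[i];       // "silu_x_result_w3"
        }
        std::vector<float> out = matvec(w2, gate, n_embd, n_ff);

        printf("out[0] = %f\n", out[0]);
        return 0;
    }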
3705
4099
 
3706
- static struct ggml_cgraph * llm_build_starcoder(
4100
+ static struct ggml_cgraph * llm_build_falcon(
3707
4101
  llama_context & lctx,
3708
4102
  const llama_batch & batch) {
3709
4103
  const auto & model = lctx.model;
@@ -3724,29 +4118,34 @@ static struct ggml_cgraph * llm_build_starcoder(
3724
4118
 
3725
4119
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3726
4120
 
3727
- const float norm_eps = hparams.f_norm_eps;
4121
+ const float freq_base = cparams.rope_freq_base;
4122
+ const float freq_scale = cparams.rope_freq_scale;
4123
+ const float norm_eps = hparams.f_norm_eps;
4124
+
4125
+ const int n_gpu_layers = model.n_gpu_layers;
3728
4126
 
3729
4127
  const int32_t n_tokens = batch.n_tokens;
3730
4128
  const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3731
4129
  const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3732
4130
 
4131
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
4132
+
4133
+ //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
4134
+ // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
4135
+
3733
4136
  auto & buf_compute = lctx.buf_compute;
3734
4137
 
3735
4138
  struct ggml_init_params params = {
3736
4139
  /*.mem_size =*/ buf_compute.size,
3737
4140
  /*.mem_buffer =*/ buf_compute.data,
3738
- /*.no_alloc =*/ false,
4141
+ /*.no_alloc =*/ true,
3739
4142
  };
3740
4143
 
3741
- params.no_alloc = true;
3742
-
3743
4144
  struct ggml_context * ctx0 = ggml_init(params);
3744
4145
 
3745
4146
  ggml_cgraph * gf = ggml_new_graph(ctx0);
3746
4147
 
3747
4148
  struct ggml_tensor * cur;
3748
- struct ggml_tensor * token;
3749
- struct ggml_tensor * position;
3750
4149
  struct ggml_tensor * inpL;
3751
4150
 
3752
4151
  if (batch.token) {
@@ -3758,30 +4157,390 @@ static struct ggml_cgraph * llm_build_starcoder(
3758
4157
  }
3759
4158
  ggml_set_name(inp_tokens, "inp_tokens");
3760
4159
 
3761
- token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4160
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3762
4161
  } else {
3763
4162
  #ifdef GGML_USE_MPI
3764
4163
  GGML_ASSERT(false && "not implemented");
3765
4164
  #endif
3766
4165
 
3767
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4166
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3768
4167
 
3769
- ggml_allocr_alloc(lctx.alloc, token);
4168
+ ggml_allocr_alloc(lctx.alloc, inpL);
3770
4169
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3771
- memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
4170
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3772
4171
  }
3773
4172
  }
3774
4173
 
3775
- {
3776
- // Compute position embeddings.
3777
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3778
- ggml_allocr_alloc(lctx.alloc, inp_positions);
4174
+ const int i_gpu_start = n_layer - n_gpu_layers;
4175
+ (void) i_gpu_start;
4176
+
4177
+ // offload functions set the tensor output backend to GPU
4178
+ // tensors are GPU-accelerated if any input or the output has been offloaded
4179
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
4180
+ offload_func_t offload_func_kq = llama_nop;
4181
+ offload_func_t offload_func_v = llama_nop;
4182
+
4183
+ #ifdef GGML_USE_CUBLAS
4184
+ if (n_gpu_layers > n_layer) {
4185
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
4186
+ }
4187
+ if (n_gpu_layers > n_layer + 1) {
4188
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
4189
+ }
4190
+ if (n_gpu_layers > n_layer + 2) {
4191
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
4192
+ }
4193
+ #endif // GGML_USE_CUBLAS
4194
+
4195
+ // KQ_scale
4196
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4197
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4198
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4199
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4200
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
4201
+ }
4202
+
4203
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4204
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4205
+ offload_func_kq(KQ_mask);
4206
+ ggml_set_name(KQ_mask, "KQ_mask");
4207
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4208
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4209
+ float * data = (float *) KQ_mask->data;
4210
+ memset(data, 0, ggml_nbytes(KQ_mask));
4211
+
4212
+ for (int h = 0; h < 1; ++h) {
4213
+ for (int j = 0; j < n_tokens; ++j) {
4214
+ const llama_pos pos = batch.pos[j];
4215
+ const llama_seq_id seq_id = batch.seq_id[j];
4216
+
4217
+ for (int i = 0; i < n_kv; ++i) {
4218
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4219
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4220
+ }
4221
+ }
4222
+ }
4223
+ }
4224
+ }
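The KQ_mask fill above encodes two rules for every (query token j, cache cell i) pair: the cell must belong to the query's sequence and its position must not be later than the query's position; every other entry is set to -INFINITY so it vanishes after the soft-max. A small self-contained sketch of the same loop follows; the Cell struct and the hard-coded cache contents are invented stand-ins for kv_self, and each toy cell carries a single seq_id, whereas the real cells can belong to several sequences (hence has_seq_id()).

    // Toy version of the KQ_mask construction above: one head, a handful of
    // cache cells, -INFINITY wherever attention must be blocked.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    struct Cell { int pos; int seq_id; }; // simplified stand-in for a KV cell

    int main() {
        const std::vector<Cell> cells   = { {0,0}, {1,0}, {2,0}, {0,1} }; // n_kv = 4
        const std::vector<int>  tok_pos = { 1, 2 };                       // n_tokens = 2
        const std::vector<int>  tok_seq = { 0, 0 };

        const int n_kv     = (int) cells.size();
        const int n_tokens = (int) tok_pos.size();

        std::vector<float> mask(n_kv*n_tokens, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                if (cells[i].seq_id != tok_seq[j] || cells[i].pos > tok_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }

        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                printf("%6.0f ", mask[j*n_kv + i]);
            }
            printf("\n");
        }
        return 0;
    }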
4225
+
4226
+ // KQ_pos - contains the positions
4227
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4228
+ offload_func_kq(KQ_pos);
4229
+ ggml_set_name(KQ_pos, "KQ_pos");
4230
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
4231
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4232
+ int * data = (int *) KQ_pos->data;
4233
+ for (int i = 0; i < n_tokens; ++i) {
4234
+ data[i] = batch.pos[i];
4235
+ }
4236
+ }
4237
+
4238
+ // shift the entire K-cache if needed
4239
+ if (do_rope_shift) {
4240
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4241
+ offload_func_kq(K_shift);
4242
+ ggml_set_name(K_shift, "K_shift");
4243
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3779
4244
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3780
- for (int i = 0; i < n_tokens; ++i) {
3781
- ((int32_t *) inp_positions->data)[i] = batch.pos[i];
4245
+ int * data = (int *) K_shift->data;
4246
+ for (int i = 0; i < n_ctx; ++i) {
4247
+ data[i] = kv_self.cells[i].delta;
3782
4248
  }
3783
4249
  }
3784
- ggml_set_name(inp_positions, "inp_positions");
4250
+
4251
+ for (int il = 0; il < n_layer; ++il) {
4252
+ struct ggml_tensor * tmp =
4253
+ ggml_rope_custom_inplace(ctx0,
4254
+ ggml_view_3d(ctx0, kv_self.k,
4255
+ n_embd_head, n_head_kv, n_ctx,
4256
+ ggml_element_size(kv_self.k)*n_embd_head,
4257
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4258
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
4259
+ K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
4260
+ offload_func_kq(tmp);
4261
+ ggml_build_forward_expand(gf, tmp);
4262
+ }
4263
+ }
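The K-cache shift above works because RoPE is additive in the position: each cell stores how far its position moved (delta), and applying ggml_rope_custom_inplace with K_shift rotates the already-rotated keys by that extra angle, which is the same as rotating the original keys by the new position. A sketch of that identity on a single 2-D pair; the base frequency is arbitrary and adjacent-pair (GPT-J-style) rotation is used for brevity, whereas the graph uses ggml's mode-2 (NeoX) pairing.

    // rotate(x, p + d) == rotate(rotate(x, p), d): why shifting the cached K
    // by each cell's delta is enough, without recomputing the cache.
    #include <cmath>
    #include <cstdio>

    static void rope_pair(float & x0, float & x1, float angle) {
        const float c = std::cos(angle), s = std::sin(angle);
        const float r0 = x0*c - x1*s;
        const float r1 = x0*s + x1*c;
        x0 = r0; x1 = r1;
    }

    int main() {
        const float theta = 1.0f/10000.0f; // per-dimension frequency (illustrative)
        const int   p = 7, d = 3;          // original position and shift delta

        float a0 = 0.25f, a1 = -0.5f;      // "cached" value: rotate(x, p)
        rope_pair(a0, a1, p*theta);
        rope_pair(a0, a1, d*theta);        // what the K-shift pass applies

        float b0 = 0.25f, b1 = -0.5f;      // reference: rotate(x, p + d)
        rope_pair(b0, b1, (p + d)*theta);

        printf("shifted: (%f, %f)  direct: (%f, %f)\n", a0, a1, b0, b1);
        return 0;
    }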
4264
+
4265
+ for (int il = 0; il < n_layer; ++il) {
4266
+ struct ggml_tensor * attn_norm;
4267
+
4268
+ offload_func_t offload_func = llama_nop;
4269
+
4270
+ #ifdef GGML_USE_CUBLAS
4271
+ if (il >= i_gpu_start) {
4272
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
4273
+ }
4274
+ #endif // GGML_USE_CUBLAS
4275
+
4276
+ // self-attention
4277
+ // TODO: refactor into common function (shared with LLaMA)
4278
+ {
4279
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
4280
+ offload_func(attn_norm);
4281
+
4282
+ attn_norm = ggml_add(ctx0,
4283
+ ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
4284
+ model.layers[il].attn_norm_b);
4285
+ offload_func(attn_norm->src[0]);
4286
+ offload_func(attn_norm);
4287
+
4288
+ if (model.layers[il].attn_norm_2) { // Falcon-40B
4289
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4290
+ offload_func(cur);
4291
+
4292
+ cur = ggml_add(ctx0,
4293
+ ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
4294
+ model.layers[il].attn_norm_2_b);
4295
+ offload_func(cur->src[0]);
4296
+ offload_func(cur);
4297
+ } else { // Falcon 7B
4298
+ cur = attn_norm;
4299
+ }
4300
+
4301
+ // compute QKV
4302
+
4303
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
4304
+ offload_func_kq(cur);
4305
+
4306
+ // Note that the strides for Kcur, Vcur are set up so that the
4307
+ // resulting views are misaligned with the tensor's storage
4308
+ // (by applying the K/V offset we shift the tensor's original
4309
+ // view to stick out behind the viewed QKV tensor's allocated
4310
+ // memory, so to say). This is ok because no actual accesses
4311
+ // happen to that out-of-range memory, but it can require some
4312
+ // trickery when trying to accurately dump these views for
4313
+ // debugging.
4314
+
4315
+ const size_t wsize = ggml_type_size(cur->type);
4316
+
4317
+ // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
4318
+ // non-contiguous views is added for the rope operator
4319
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
4320
+ ctx0, cur, n_embd_head, n_head, n_tokens,
4321
+ wsize * n_embd_head,
4322
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
4323
+ 0));
4324
+ offload_func_kq(tmpq);
4325
+
4326
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
4327
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
4328
+ wsize * n_embd_head,
4329
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
4330
+ wsize * n_embd_head * n_head));
4331
+ offload_func_kq(tmpk);
4332
+
4333
+ struct ggml_tensor * tmpv = ggml_view_3d(
4334
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
4335
+ wsize * n_embd_head,
4336
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
4337
+ wsize * n_embd_head * (n_head + n_head_kv));
4338
+ offload_func_v(tmpv);
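The three views above carve Q, K and V out of a single fused wqkv output row of n_embd_head * (n_head + 2*n_head_kv) elements: Q starts at byte offset 0, K at n_embd_head*n_head, V at n_embd_head*(n_head + n_head_kv), and all three keep the full fused row as their stride, which is what the comment about "misaligned" views refers to. A worked example of those offsets with small, made-up head counts:

    // Worked example of the Q/K/V byte offsets used by the ggml_view_3d calls
    // above. The head counts are invented purely to make the layout visible.
    #include <cstdio>

    int main() {
        const int    n_embd_head = 64;  // hypothetical head size
        const int    n_head      = 8;   // query heads
        const int    n_head_kv   = 2;   // shared K/V heads (GQA/MQA)
        const size_t wsize       = sizeof(float);

        const size_t row   = wsize * n_embd_head * (n_head + 2*n_head_kv); // fused row stride
        const size_t q_off = 0;
        const size_t k_off = wsize * n_embd_head *  n_head;
        const size_t v_off = wsize * n_embd_head * (n_head + n_head_kv);

        printf("row stride = %zu bytes\n", row);
        printf("Q offset   = %zu, K offset = %zu, V offset = %zu\n", q_off, k_off, v_off);
        return 0;
    }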
4339
+
4340
+ // using mode = 2 for neox mode
4341
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
4342
+ offload_func_kq(Qcur);
4343
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
4344
+ offload_func_kq(Kcur);
4345
+
4346
+ {
4347
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
4348
+ offload_func_v(Vcur);
4349
+ offload_func_v(Vcur->src[0]->src[0]);
4350
+ ggml_set_name(Vcur, "Vcur");
4351
+
4352
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
4353
+ offload_func_kq(k);
4354
+ ggml_set_name(k, "k");
4355
+
4356
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4357
+ ( n_ctx)*ggml_element_size(kv_self.v),
4358
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4359
+ offload_func_v(v);
4360
+
4361
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4362
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4363
+ }
4364
+
4365
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
4366
+ offload_func_kq(Q);
4367
+ ggml_set_name(Q, "Q");
4368
+
4369
+ struct ggml_tensor * K =
4370
+ ggml_view_3d(ctx0, kv_self.k,
4371
+ n_embd_head, n_kv, n_head_kv,
4372
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4373
+ ggml_element_size(kv_self.k)*n_embd_head,
4374
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
4375
+ offload_func_kq(K);
4376
+ ggml_set_name(K, "K");
4377
+
4378
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
4379
+ offload_func_kq(KQ);
4380
+ ggml_set_name(KQ, "KQ");
4381
+
4382
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
4383
+ offload_func_kq(KQ_scaled);
4384
+ ggml_set_name(KQ_scaled, "KQ_scaled");
4385
+
4386
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
4387
+ offload_func_kq(KQ_masked);
4388
+ ggml_set_name(KQ_masked, "KQ_masked");
4389
+
4390
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
4391
+ offload_func_v(KQ_soft_max);
4392
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
4393
+
4394
+ struct ggml_tensor * V =
4395
+ ggml_view_3d(ctx0, kv_self.v,
4396
+ n_kv, n_embd_head, n_head_kv,
4397
+ ggml_element_size(kv_self.v)*n_ctx,
4398
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
4399
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
4400
+ offload_func_v(V);
4401
+ ggml_set_name(V, "V");
4402
+
4403
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
4404
+ offload_func_v(KQV);
4405
+ ggml_set_name(KQV, "KQV");
4406
+
4407
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
4408
+ offload_func_v(KQV_merged);
4409
+ ggml_set_name(KQV_merged, "KQV_merged");
4410
+
4411
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
4412
+ offload_func_v(cur);
4413
+ ggml_set_name(cur, "KQV_merged_contiguous");
4414
+
4415
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
4416
+ offload_func(cur);
4417
+ ggml_set_name(cur, "result_wo");
4418
+ }
4419
+
4420
+ struct ggml_tensor * attn_out = cur;
4421
+
4422
+ // feed forward
4423
+ {
4424
+ struct ggml_tensor * inpFF = attn_norm;
4425
+
4426
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
4427
+ offload_func(cur);
4428
+
4429
+ cur = ggml_gelu(ctx0, cur);
4430
+ offload_func(cur);
4431
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
4432
+ offload_func(cur);
4433
+ }
4434
+
4435
+ cur = ggml_add(ctx0, cur, attn_out);
4436
+ offload_func(cur);
4437
+ cur = ggml_add(ctx0, cur, inpL);
4438
+ offload_func(cur);
4439
+
4440
+ // input for next layer
4441
+ inpL = cur;
4442
+ }
4443
+
4444
+ cur = inpL;
4445
+
4446
+ // norm
4447
+ {
4448
+ cur = ggml_norm(ctx0, cur, norm_eps);
4449
+ offload_func_nr(cur);
4450
+
4451
+ cur = ggml_add(ctx0,
4452
+ ggml_mul(ctx0, cur, model.output_norm),
4453
+ model.output_norm_b);
4454
+ ggml_set_name(cur, "result_norm");
4455
+ }
4456
+
4457
+ cur = ggml_mul_mat(ctx0, model.output, cur);
4458
+ ggml_set_name(cur, "result_output");
4459
+
4460
+ ggml_build_forward_expand(gf, cur);
4461
+
4462
+ ggml_free(ctx0);
4463
+
4464
+ return gf;
4465
+ }
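The builder that ends above wires each layer as a parallel residual: the feed-forward branch reads the attn_norm output (inpFF = attn_norm) rather than the attention result, and the block output is the sum of the GELU FFN result, the attention result (attn_out) and the raw layer input (inpL). This differs from the sequential attention-then-FFN residual used by the LLaMA-style builder earlier. A compact sketch of the two arrangements on scalars; attn() and ffn() are dummy placeholders, only the data flow matters.

    // Parallel vs. sequential residual wiring, reduced to scalars.
    #include <cstdio>

    static float attn(float x) { return 0.50f*x; } // placeholder sub-block
    static float ffn (float x) { return 0.25f*x; } // placeholder sub-block

    int main() {
        const float inpL   = 1.0f;
        const float normed = inpL; // pretend layernorm output

        // Falcon-style parallel residual: both branches see the same normed input.
        const float parallel = ffn(normed) + attn(normed) + inpL;

        // LLaMA-style sequential residual: the FFN sees attention's residual output.
        const float after_attn = inpL + attn(normed);
        const float sequential = after_attn + ffn(after_attn /* re-normed in practice */);

        printf("parallel = %f, sequential = %f\n", parallel, sequential);
        return 0;
    }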
4466
+
4467
+ static struct ggml_cgraph * llm_build_starcoder(
4468
+ llama_context & lctx,
4469
+ const llama_batch & batch) {
4470
+ const auto & model = lctx.model;
4471
+ const auto & hparams = model.hparams;
4472
+ const auto & cparams = lctx.cparams;
4473
+
4474
+ const auto & kv_self = lctx.kv_self;
4475
+
4476
+ GGML_ASSERT(!!kv_self.ctx);
4477
+
4478
+ const int64_t n_embd = hparams.n_embd;
4479
+ const int64_t n_layer = hparams.n_layer;
4480
+ const int64_t n_ctx = cparams.n_ctx;
4481
+ const int64_t n_head = hparams.n_head;
4482
+ const int64_t n_head_kv = hparams.n_head_kv;
4483
+ const int64_t n_embd_head = hparams.n_embd_head();
4484
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
4485
+
4486
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
4487
+
4488
+ const float norm_eps = hparams.f_norm_eps;
4489
+
4490
+ const int32_t n_tokens = batch.n_tokens;
4491
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4492
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
4493
+
4494
+ auto & buf_compute = lctx.buf_compute;
4495
+
4496
+ struct ggml_init_params params = {
4497
+ /*.mem_size =*/ buf_compute.size,
4498
+ /*.mem_buffer =*/ buf_compute.data,
4499
+ /*.no_alloc =*/ true,
4500
+ };
4501
+
4502
+ struct ggml_context * ctx0 = ggml_init(params);
4503
+
4504
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
4505
+
4506
+ struct ggml_tensor * cur;
4507
+ struct ggml_tensor * token;
4508
+ struct ggml_tensor * position;
4509
+ struct ggml_tensor * inpL;
4510
+
4511
+ if (batch.token) {
4512
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4513
+
4514
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
4515
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4516
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
4517
+ }
4518
+ ggml_set_name(inp_tokens, "inp_tokens");
4519
+
4520
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4521
+ } else {
4522
+ #ifdef GGML_USE_MPI
4523
+ GGML_ASSERT(false && "not implemented");
4524
+ #endif
4525
+
4526
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4527
+
4528
+ ggml_allocr_alloc(lctx.alloc, token);
4529
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4530
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
4531
+ }
4532
+ }
4533
+
4534
+ {
4535
+ // Compute position embeddings.
4536
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4537
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
4538
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4539
+ for (int i = 0; i < n_tokens; ++i) {
4540
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
4541
+ }
4542
+ }
4543
+ ggml_set_name(inp_positions, "inp_positions");
3785
4544
 
3786
4545
  position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3787
4546
  }
@@ -3816,48 +4575,984 @@ static struct ggml_cgraph * llm_build_starcoder(
3816
4575
  }
3817
4576
  }
3818
4577
 
3819
- inpL = ggml_add(ctx0, token, position);
3820
- ggml_set_name(inpL, "inpL");
3821
-
4578
+ inpL = ggml_add(ctx0, token, position);
4579
+ ggml_set_name(inpL, "inpL");
4580
+
4581
+ for (int il = 0; il < n_layer; ++il) {
4582
+ {
4583
+ // Norm
4584
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4585
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
4586
+ }
4587
+
4588
+ {
4589
+ // Self Attention
4590
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
4591
+
4592
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
4593
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
4594
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
4595
+
4596
+ struct ggml_tensor * Qcur = tmpq;
4597
+ struct ggml_tensor * Kcur = tmpk;
4598
+
4599
+ {
4600
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
4601
+ ggml_set_name(Vcur, "Vcur");
4602
+
4603
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
4604
+ ggml_set_name(k, "k");
4605
+
4606
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4607
+ ( n_ctx)*ggml_element_size(kv_self.v),
4608
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4609
+
4610
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4611
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4612
+ }
4613
+
4614
+ struct ggml_tensor * Q =
4615
+ ggml_permute(ctx0,
4616
+ ggml_cpy(ctx0,
4617
+ Qcur,
4618
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
4619
+ 0, 2, 1, 3);
4620
+ ggml_set_name(Q, "Q");
4621
+
4622
+ struct ggml_tensor * K =
4623
+ ggml_view_3d(ctx0, kv_self.k,
4624
+ n_embd_head, n_kv, n_head_kv,
4625
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4626
+ ggml_element_size(kv_self.k)*n_embd_head,
4627
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
4628
+ ggml_set_name(K, "K");
4629
+
4630
+ // K * Q
4631
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
4632
+ ggml_set_name(KQ, "KQ");
4633
+
4634
+ // KQ_scaled = KQ / sqrt(n_embd_head)
4635
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
4636
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
4637
+ ggml_set_name(KQ_scaled, "KQ_scaled");
4638
+
4639
+ // KQ_masked = mask_past(KQ_scaled)
4640
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
4641
+ ggml_set_name(KQ_masked, "KQ_masked");
4642
+
4643
+ // KQ = soft_max(KQ_masked)
4644
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
4645
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
4646
+
4647
+ // split cached V into n_head heads
4648
+ struct ggml_tensor * V =
4649
+ ggml_view_3d(ctx0, kv_self.v,
4650
+ n_kv, n_embd_head, n_head_kv,
4651
+ ggml_element_size(kv_self.v)*n_ctx,
4652
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
4653
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
4654
+ ggml_set_name(V, "V");
4655
+
4656
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
4657
+ ggml_set_name(KQV, "KQV");
4658
+
4659
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
4660
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
4661
+ ggml_set_name(KQV_merged, "KQV_merged");
4662
+
4663
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
4664
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
4665
+ ggml_set_name(cur, "KQV_merged_contiguous");
4666
+ }
4667
+
4668
+ // Projection
4669
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
4670
+
4671
+ // Add the input
4672
+ cur = ggml_add(ctx0, cur, inpL);
4673
+
4674
+ struct ggml_tensor * inpFF = cur;
4675
+
4676
+ // FF
4677
+ {
4678
+ // Norm
4679
+ {
4680
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
4681
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
4682
+ }
4683
+
4684
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
4685
+
4686
+ // GELU activation
4687
+ cur = ggml_gelu(ctx0, cur);
4688
+
4689
+ // Projection
4690
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
4691
+ }
4692
+
4693
+ inpL = ggml_add(ctx0, cur, inpFF);
4694
+ }
4695
+
4696
+ // Output Norm
4697
+ {
4698
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4699
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
4700
+ }
4701
+ ggml_set_name(cur, "result_norm");
4702
+
4703
+ cur = ggml_mul_mat(ctx0, model.output, cur);
4704
+ ggml_set_name(cur, "result_output");
4705
+
4706
+ ggml_build_forward_expand(gf, cur);
4707
+ ggml_free(ctx0);
4708
+
4709
+ return gf;
4710
+ }
4711
+
4712
+ static struct ggml_cgraph * llm_build_persimmon(
4713
+ llama_context & lctx,
4714
+ const llama_batch & batch) {
4715
+ const auto & model = lctx.model;
4716
+ const auto & hparams = model.hparams;
4717
+
4718
+ const auto & kv_self = lctx.kv_self;
4719
+
4720
+ GGML_ASSERT(!!kv_self.ctx);
4721
+
4722
+ const auto & cparams = lctx.cparams;
4723
+ const int64_t n_embd = hparams.n_embd;
4724
+ const int64_t n_layer = hparams.n_layer;
4725
+ const int64_t n_ctx = cparams.n_ctx;
4726
+ const int64_t n_head_kv = hparams.n_head_kv;
4727
+ const int64_t n_head = hparams.n_head;
4728
+ const int64_t n_embd_head = hparams.n_embd_head();
4729
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
4730
+ const size_t n_rot = n_embd_head / 2;
4731
+
4732
+ const float freq_base = cparams.rope_freq_base;
4733
+ const float freq_scale = cparams.rope_freq_scale;
4734
+ const float norm_eps = hparams.f_norm_eps;
4735
+
4736
+ const int n_gpu_layers = model.n_gpu_layers;
4737
+
4738
+
4739
+ const int32_t n_tokens = batch.n_tokens;
4740
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4741
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
4742
+
4743
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
4744
+
4745
+ auto & buf_compute = lctx.buf_compute;
4746
+ struct ggml_init_params params = {
4747
+ /*.mem_size =*/ buf_compute.size,
4748
+ /*.mem_buffer =*/ buf_compute.data,
4749
+ /*.no_alloc =*/ true,
4750
+ };
4751
+
4752
+ struct ggml_context * ctx0 = ggml_init(params);
4753
+
4754
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
4755
+
4756
+ struct ggml_tensor * cur;
4757
+ struct ggml_tensor * inpL;
4758
+
4759
+ if (batch.token) {
4760
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4761
+
4762
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
4763
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4764
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
4765
+ }
4766
+ ggml_set_name(inp_tokens, "inp_tokens");
4767
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4768
+ } else {
4769
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4770
+ ggml_allocr_alloc(lctx.alloc, inpL);
4771
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4772
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
4773
+ }
4774
+ }
4775
+ const int i_gpu_start = n_layer - n_gpu_layers;
4776
+ (void) i_gpu_start;
4777
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
4778
+ offload_func_t offload_func_kq = llama_nop;
4779
+ offload_func_t offload_func_v = llama_nop;
4780
+ // KQ_scale
4781
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4782
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4783
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4784
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
4785
+ }
4786
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4787
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4788
+ offload_func_kq(KQ_mask);
4789
+ ggml_set_name(KQ_mask, "KQ_mask");
4790
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4791
+
4792
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4793
+ float * data = (float *) KQ_mask->data;
4794
+ memset(data, 0, ggml_nbytes(KQ_mask));
4795
+ for (int h = 0; h < 1; ++h) {
4796
+ for (int j = 0; j < n_tokens; ++j) {
4797
+ const llama_pos pos = batch.pos[j];
4798
+ const llama_seq_id seq_id = batch.seq_id[j];
4799
+ for (int i = 0; i < n_kv; ++i) {
4800
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4801
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4802
+ }
4803
+ }
4804
+ }
4805
+ }
4806
+ }
4807
+
4808
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4809
+ offload_func_kq(KQ_pos);
4810
+ ggml_set_name(KQ_pos, "KQ_pos");
4811
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
4812
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4813
+ int * data = (int *) KQ_pos->data;
4814
+ for (int i = 0; i < n_tokens; ++i) {
4815
+ data[i] = batch.pos[i];
4816
+ }
4817
+ }
4818
+ if (do_rope_shift) {
4819
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4820
+ offload_func_kq(K_shift);
4821
+ ggml_set_name(K_shift, "K_shift");
4822
+ ggml_allocr_alloc(lctx.alloc, K_shift);
4823
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4824
+ int * data = (int *) K_shift->data;
4825
+ for (int i = 0; i < n_ctx; ++i) {
4826
+ data[i] = kv_self.cells[i].delta;
4827
+ }
4828
+ }
4829
+ for (int il = 0; il < n_layer; ++il) {
4830
+ struct ggml_tensor * tmp =
4831
+ // we rotate only the first n_rot dimensions.
4832
+ ggml_rope_custom_inplace(ctx0,
4833
+ ggml_view_3d(ctx0, kv_self.k,
4834
+ n_rot, n_head, n_ctx,
4835
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4836
+ ggml_element_size(kv_self.k)*n_embd_head,
4837
+ ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
4838
+ ),
4839
+ K_shift, n_rot, 2, 0, freq_base, freq_scale);
4840
+ offload_func_kq(tmp);
4841
+ ggml_build_forward_expand(gf, tmp);
4842
+ }
4843
+ }
4844
+ for (int il=0; il < n_layer; ++il) {
4845
+ struct ggml_tensor * residual = inpL;
4846
+ offload_func_t offload_func = llama_nop;
4847
+ {
4848
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4849
+ offload_func(cur);
4850
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
4851
+ offload_func(cur);
4852
+ cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
4853
+ offload_func(cur);
4854
+ ggml_format_name(cur, "input_layernorm_%d", il);
4855
+ }
4856
+ // self attention
4857
+ {
4858
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
4859
+ offload_func_kq(cur);
4860
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
4861
+ offload_func_kq(cur);
4862
+
4863
+ // split qkv
4864
+ GGML_ASSERT(n_head_kv == n_head);
4865
+ ggml_set_name(cur, format("qkv_%d", il).c_str());
4866
+ struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
4867
+ offload_func_kq(tmpqkv);
4868
+ struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
4869
+ offload_func_kq(tmpqkv_perm);
4870
+ ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
4871
+ struct ggml_tensor * tmpq = ggml_view_3d(
4872
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4873
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4874
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4875
+ 0
4876
+ );
4877
+ offload_func_kq(tmpq);
4878
+ struct ggml_tensor * tmpk = ggml_view_3d(
4879
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4880
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4881
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4882
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
4883
+ );
4884
+ offload_func_kq(tmpk);
4885
+ // Q/K Layernorm
4886
+ tmpq = ggml_norm(ctx0, tmpq, norm_eps);
4887
+ offload_func_kq(tmpq);
4888
+ tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
4889
+ offload_func_kq(tmpq);
4890
+ tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
4891
+ offload_func_kq(tmpq);
4892
+
4893
+ tmpk = ggml_norm(ctx0, tmpk, norm_eps);
4894
+ offload_func_v(tmpk);
4895
+ tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
4896
+ offload_func_v(tmpk);
4897
+ tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
4898
+ offload_func_v(tmpk);
4899
+
4900
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
4901
+ struct ggml_tensor * qrot = ggml_view_3d(
4902
+ ctx0, tmpq, n_rot, n_head, n_tokens,
4903
+ ggml_element_size(tmpq) * n_embd_head,
4904
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4905
+ 0
4906
+ );
4907
+ offload_func_kq(qrot);
4908
+ ggml_format_name(qrot, "qrot_%d", il);
4909
+ struct ggml_tensor * krot = ggml_view_3d(
4910
+ ctx0, tmpk, n_rot, n_head, n_tokens,
4911
+ ggml_element_size(tmpk) * n_embd_head,
4912
+ ggml_element_size(tmpk) * n_embd_head * n_head,
4913
+ 0
4914
+ );
4915
+ offload_func_kq(krot);
4916
+ ggml_format_name(krot, "krot_%d", il);
4917
+
4918
+ // get the second half of tmpq, i.e. tmpq[n_rot:, :, :]
4919
+ struct ggml_tensor * qpass = ggml_view_3d(
4920
+ ctx0, tmpq, n_rot, n_head, n_tokens,
4921
+ ggml_element_size(tmpq) * n_embd_head,
4922
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4923
+ ggml_element_size(tmpq) * n_rot
4924
+ );
4925
+ offload_func_kq(qpass);
4926
+ ggml_format_name(qpass, "qpass_%d", il);
4927
+ struct ggml_tensor * kpass = ggml_view_3d(
4928
+ ctx0, tmpk, n_rot, n_head, n_tokens,
4929
+ ggml_element_size(tmpk) * n_embd_head,
4930
+ ggml_element_size(tmpk) * n_embd_head * n_head,
4931
+ ggml_element_size(tmpk) * n_rot
4932
+ );
4933
+ offload_func_kq(kpass);
4934
+ ggml_format_name(kpass, "kpass_%d", il);
4935
+
4936
+ struct ggml_tensor * qrotated = ggml_rope_custom(
4937
+ ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
4938
+ );
4939
+ offload_func_kq(qrotated);
4940
+ struct ggml_tensor * krotated = ggml_rope_custom(
4941
+ ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
4942
+ );
4943
+ offload_func_kq(krotated);
4944
+ // ggml currently only supports concatenation on dim=2
4945
+ // so we need to permute qrot, qpass, concat, then permute back.
4946
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
4947
+ offload_func_kq(qrotated);
4948
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
4949
+ offload_func_kq(krotated);
4950
+
4951
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
4952
+ offload_func_kq(qpass);
4953
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
4954
+ offload_func_kq(kpass);
4955
+
4956
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
4957
+ offload_func_kq(Qcur);
4958
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
4959
+ offload_func_kq(Kcur);
4960
+
4961
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
4962
+ offload_func_kq(Q);
4963
+
4964
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
4965
+ offload_func_kq(Kcur);
4966
+ {
4967
+ struct ggml_tensor * tmpv = ggml_view_3d(
4968
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4969
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4970
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4971
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
4972
+ );
4973
+ offload_func_v(tmpv);
4974
+ // store K, V in cache
4975
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
4976
+ offload_func_v(Vcur);
4977
+ ggml_set_name(Vcur, "Vcur");
4978
+
4979
+ struct ggml_tensor * k = ggml_view_1d(
4980
+ ctx0, kv_self.k, n_tokens*n_embd_gqa,
4981
+ (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
4982
+ );
4983
+ offload_func_kq(k);
4984
+ ggml_set_name(k, "k");
4985
+
4986
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4987
+ ( n_ctx)*ggml_element_size(kv_self.v),
4988
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4989
+ offload_func_v(v);
4990
+ ggml_set_name(v, "v");
4991
+
4992
+ // important: storing RoPE-ed version of K in the KV cache!
4993
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4994
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4995
+ }
4996
+ struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
4997
+ n_embd_head, n_kv, n_head_kv,
4998
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4999
+ ggml_element_size(kv_self.k)*n_embd_head,
5000
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5001
+
5002
+ offload_func_kq(K);
5003
+ ggml_format_name(K, "K_%d", il);
5004
+
5005
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5006
+ offload_func_kq(KQ);
5007
+ ggml_set_name(KQ, "KQ");
5008
+
5009
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5010
+ offload_func_kq(KQ_scaled);
5011
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5012
+
5013
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5014
+ offload_func_kq(KQ_masked);
5015
+ ggml_set_name(KQ_masked, "KQ_masked");
5016
+
5017
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5018
+ offload_func_kq(KQ_soft_max);
5019
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5020
+
5021
+ struct ggml_tensor * V =
5022
+ ggml_view_3d(ctx0, kv_self.v,
5023
+ n_kv, n_embd_head, n_head_kv,
5024
+ ggml_element_size(kv_self.v)*n_ctx,
5025
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5026
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5027
+ offload_func_v(V);
5028
+ ggml_set_name(V, "V");
5029
+
5030
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5031
+ offload_func_v(KQV);
5032
+ ggml_set_name(KQV, "KQV");
5033
+
5034
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5035
+ offload_func_v(KQV_merged);
5036
+ ggml_set_name(KQV_merged, "KQV_merged");
5037
+
5038
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5039
+ offload_func_v(cur);
5040
+ ggml_set_name(cur, "KQV_merged_contiguous");
5041
+
5042
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5043
+ offload_func(cur);
5044
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
5045
+ offload_func(cur);
5046
+ ggml_set_name(cur, "result_wo");
5047
+ }
5048
+
5049
+ struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
5050
+ offload_func(inpFF);
5051
+ ggml_set_name(inpFF, "inpFF");
5052
+ {
5053
+ // MLP
5054
+ {
5055
+ // Norm
5056
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5057
+ offload_func(cur);
5058
+ cur = ggml_add(ctx0,
5059
+ ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
5060
+ model.layers[il].ffn_norm_b
5061
+ );
5062
+ ggml_set_name(cur, "ffn_norm");
5063
+ offload_func(cur);
5064
+ }
5065
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5066
+ offload_func(cur);
5067
+
5068
+ cur = ggml_add(ctx0, cur, model.layers[il].b3);
5069
+ offload_func(cur);
5070
+ ggml_set_name(cur, "result_ffn_up");
5071
+
5072
+ cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
5073
+ ggml_set_name(cur, "result_ffn_act");
5074
+ offload_func(cur);
5075
+ offload_func(cur->src[0]);
5076
+
5077
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5078
+ offload_func(cur);
5079
+ cur = ggml_add(ctx0,
5080
+ cur,
5081
+ model.layers[il].b2);
5082
+ offload_func(cur);
5083
+ ggml_set_name(cur, "outFF");
5084
+ }
5085
+ cur = ggml_add(ctx0, cur, inpFF);
5086
+ offload_func(cur);
5087
+ ggml_set_name(cur, "inpFF_+_outFF");
5088
+ inpL = cur;
5089
+ }
5090
+ cur = inpL;
5091
+ {
5092
+ cur = ggml_norm(ctx0, cur, norm_eps);
5093
+ offload_func_nr(cur);
5094
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5095
+ offload_func_nr(cur);
5096
+
5097
+ cur = ggml_add(ctx0, cur, model.output_norm_b);
5098
+ // offload_func_nr(cur);
5099
+
5100
+ ggml_set_name(cur, "result_norm");
5101
+ }
5102
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5103
+ ggml_set_name(cur, "result_output");
5104
+ ggml_build_forward_expand(gf, cur);
5105
+ ggml_free(ctx0);
5106
+ return gf;
5107
+ }
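Two things distinguish the Persimmon builder that ends above: Q and K each get their own layernorm before any rotation, and RoPE is applied to only the first n_rot = n_embd_head/2 dimensions of every head; the remaining half ("qpass"/"kpass") is passed through untouched and re-concatenated, via the permute/concat/permute dance noted in the comments, since ggml_concat only works on dim 2. A sketch of the partial-rotation idea on a single head vector; adjacent-pair rotation is used for brevity, whereas the graph uses NeoX-style pairing, and all values are illustrative.

    // Partial RoPE: rotate only the first n_rot dimensions of a head vector,
    // leave the rest as-is, conceptually concatenating the two halves back.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int   n_embd_head = 8;
        const int   n_rot       = n_embd_head/2; // only the first half is rotated
        const int   pos         = 3;             // token position
        const float freq_base   = 10000.0f;

        std::vector<float> x(n_embd_head);
        for (int i = 0; i < n_embd_head; ++i) x[i] = 0.1f*(i + 1);

        for (int i = 0; i < n_rot; i += 2) {
            const float theta = pos * std::pow(freq_base, -(float)i/n_rot);
            const float c = std::cos(theta), s = std::sin(theta);
            const float x0 = x[i], x1 = x[i + 1];
            x[i]     = x0*c - x1*s; // "qrot"/"krot": the rotated part
            x[i + 1] = x0*s + x1*c;
        }
        // x[n_rot .. n_embd_head) is the "qpass"/"kpass" part and stays untouched.

        for (int i = 0; i < n_embd_head; ++i) printf("%f ", x[i]);
        printf("\n");
        return 0;
    }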
5108
+
5109
+ static struct ggml_cgraph * llm_build_bloom(
5110
+ llama_context & lctx,
5111
+ const llama_batch & batch) {
5112
+ const auto & model = lctx.model;
5113
+ const auto & hparams = model.hparams;
5114
+ const auto & cparams = lctx.cparams;
5115
+
5116
+ const auto & kv_self = lctx.kv_self;
5117
+
5118
+ GGML_ASSERT(!!kv_self.ctx);
5119
+
5120
+ const int64_t n_embd = hparams.n_embd;
5121
+ const int64_t n_layer = hparams.n_layer;
5122
+ const int64_t n_ctx = cparams.n_ctx;
5123
+ const int64_t n_head = hparams.n_head;
5124
+ const int64_t n_head_kv = hparams.n_head_kv;
5125
+ const int64_t n_embd_head = hparams.n_embd_head();
5126
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5127
+
5128
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5129
+
5130
+ const float norm_eps = hparams.f_norm_eps;
5131
+
5132
+ const int32_t n_tokens = batch.n_tokens;
5133
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5134
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5135
+
5136
+ auto & buf_compute = lctx.buf_compute;
5137
+
5138
+ struct ggml_init_params params = {
5139
+ /*.mem_size =*/ buf_compute.size,
5140
+ /*.mem_buffer =*/ buf_compute.data,
5141
+ /*.no_alloc =*/ false,
5142
+ };
5143
+
5144
+ params.no_alloc = true;
5145
+
5146
+ struct ggml_context * ctx0 = ggml_init(params);
5147
+
5148
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5149
+
5150
+ struct ggml_tensor * cur;
5151
+ struct ggml_tensor * token;
5152
+ struct ggml_tensor * inpL;
5153
+
5154
+ if (batch.token) {
5155
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5156
+
5157
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5158
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5159
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5160
+ }
5161
+ ggml_set_name(inp_tokens, "inp_tokens");
5162
+
5163
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5164
+ } else {
5165
+ #ifdef GGML_USE_MPI
5166
+ GGML_ASSERT(false && "not implemented");
5167
+ #endif
5168
+
5169
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5170
+
5171
+ ggml_allocr_alloc(lctx.alloc, token);
5172
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5173
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5174
+ }
5175
+ }
5176
+
5177
+ // KQ_scale
5178
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5179
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5180
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5181
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5182
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5183
+ }
5184
+
5185
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5186
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5187
+ ggml_set_name(KQ_mask, "KQ_mask");
5188
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5189
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5190
+ float * data = (float *) KQ_mask->data;
5191
+ memset(data, 0, ggml_nbytes(KQ_mask));
5192
+
5193
+ for (int h = 0; h < 1; ++h) {
5194
+ for (int j = 0; j < n_tokens; ++j) {
5195
+ const llama_pos pos = batch.pos[j];
5196
+ const llama_seq_id seq_id = batch.seq_id[j];
5197
+
5198
+ for (int i = 0; i < n_kv; ++i) {
5199
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5200
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5201
+ }
5202
+ }
5203
+ }
5204
+ }
5205
+ }
5206
+
5207
+ // norm
5208
+ {
5209
+ inpL = ggml_norm(ctx0, token, norm_eps);
5210
+ inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5211
+ }
5212
+
5213
+ ggml_set_name(inpL, "inpL");
5214
+
5215
+ for (int il = 0; il < n_layer; ++il) {
5216
+ {
5217
+ // Norm
5218
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5219
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5220
+ }
5221
+
5222
+ {
5223
+ // Self Attention
5224
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5225
+
5226
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5227
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5228
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5229
+
5230
+ struct ggml_tensor * Qcur = tmpq;
5231
+ struct ggml_tensor * Kcur = tmpk;
5232
+
5233
+ // store key and value to memory
5234
+ {
5235
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5236
+ ggml_set_name(Vcur, "Vcur");
5237
+
5238
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5239
+ ggml_set_name(k, "k");
5240
+
5241
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5242
+ ( n_ctx)*ggml_element_size(kv_self.v),
5243
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5244
+
5245
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5246
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5247
+ }
5248
+
5249
+ struct ggml_tensor * Q =
5250
+ ggml_permute(ctx0,
5251
+ ggml_cpy(ctx0,
5252
+ Qcur,
5253
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5254
+ 0, 2, 1, 3);
5255
+ ggml_set_name(Q, "Q");
5256
+
5257
+ struct ggml_tensor * K =
5258
+ ggml_view_3d(ctx0, kv_self.k,
5259
+ n_embd_head, n_kv, n_head_kv,
5260
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5261
+ ggml_element_size(kv_self.k)*n_embd_head,
5262
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5263
+ ggml_set_name(K, "K");
5264
+
5265
+ // K * Q
5266
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5267
+ ggml_set_name(KQ, "KQ");
5268
+
5269
+ // KQ_scaled = KQ / sqrt(n_embd_head)
5270
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5271
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5272
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5273
+
5274
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5275
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5276
+
5277
+ // KQ_masked = mask_past(KQ_scaled)
5278
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5279
+ ggml_set_name(KQ_masked, "KQ_masked");
5280
+
5281
+ // KQ = soft_max(KQ_masked)
5282
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5283
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5284
+
5285
+ // split cached V into n_head heads
5286
+ struct ggml_tensor * V =
5287
+ ggml_view_3d(ctx0, kv_self.v,
5288
+ n_kv, n_embd_head, n_head_kv,
5289
+ ggml_element_size(kv_self.v)*n_ctx,
5290
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5291
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5292
+ ggml_set_name(V, "V");
5293
+
5294
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5295
+ ggml_set_name(KQV, "KQV");
5296
+
5297
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
5298
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5299
+ ggml_set_name(KQV_merged, "KQV_merged");
5300
+
5301
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5302
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5303
+ ggml_set_name(cur, "KQV_merged_contiguous");
5304
+ }
5305
+
5306
+ // Projection
5307
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5308
+
5309
+ // Add the input
5310
+ cur = ggml_add(ctx0, cur, inpL);
5311
+
5312
+ struct ggml_tensor * inpFF = cur;
5313
+
5314
+ // FF
5315
+ {
5316
+ // Norm
5317
+ {
5318
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5319
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5320
+ }
5321
+
5322
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5323
+
5324
+ // GELU activation
5325
+ cur = ggml_gelu(ctx0, cur);
5326
+
5327
+ // Projection
5328
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5329
+ }
5330
+
5331
+ inpL = ggml_add(ctx0, cur, inpFF);
5332
+ }
5333
+
5334
+ // Output Norm
5335
+ {
5336
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5337
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5338
+ }
5339
+ ggml_set_name(cur, "result_norm");
5340
+
5341
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5342
+ ggml_set_name(cur, "result_output");
5343
+
5344
+ ggml_build_forward_expand(gf, cur);
5345
+
5346
+ ggml_free(ctx0);
5347
+
5348
+ return gf;
5349
+ }
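The BLOOM builder that ends above uses no rotary or learned position encoding; instead ggml_alibi adds, per head, a bias that grows linearly with how far back a key sits, with head-specific slopes derived from the max_bias value of 8 passed above (note also BLOOM's extra embedding layernorm, tok_norm, applied right after the token lookup). Below is a hedged sketch of the usual ALiBi slope computation for a power-of-two head count; ggml's implementation handles non-power-of-two head counts with an additional rule, so treat the exact constants as illustrative.

    // Typical ALiBi head slopes: m0 = 2^(-max_bias / n_head), slope of head h
    // is m0^(h+1); the bias added to score(query i, key j) is slope*(j - i),
    // which is non-positive under causal masking.
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 8;
        const float max_bias = 8.0f;   // the "8" passed to ggml_alibi above

        const float m0 = std::pow(2.0f, -max_bias / n_head);
        for (int h = 0; h < n_head; ++h) {
            const float slope = std::pow(m0, (float)(h + 1));
            // example: bias for a key 5 positions behind the query
            printf("head %d: slope = %g, bias(dist=5) = %g\n", h, slope, -5.0f*slope);
        }
        return 0;
    }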
5350
+
5351
+ static struct ggml_cgraph * llm_build_mpt(
5352
+ llama_context & lctx,
5353
+ const llama_batch & batch) {
5354
+ const auto & model = lctx.model;
5355
+ const auto & hparams = model.hparams;
5356
+ const auto & cparams = lctx.cparams;
5357
+
5358
+ const auto & kv_self = lctx.kv_self;
5359
+
5360
+ GGML_ASSERT(!!kv_self.ctx);
5361
+
5362
+ const int64_t n_embd = hparams.n_embd;
5363
+ const int64_t n_layer = hparams.n_layer;
5364
+ const int64_t n_ctx = cparams.n_ctx;
5365
+ const int64_t n_head = hparams.n_head;
5366
+ const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
5367
+ const int64_t n_embd_head = hparams.n_embd_head();
5368
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5369
+
5370
+ const float norm_eps = hparams.f_norm_eps;
5371
+ const float clamp_kqv = hparams.f_clamp_kqv;
5372
+ const float max_alibi_bias = hparams.f_max_alibi_bias;
5373
+
5374
+ const int n_gpu_layers = model.n_gpu_layers;
5375
+
5376
+ const int32_t n_tokens = batch.n_tokens;
5377
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5378
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5379
+
5380
+ auto & buf_compute = lctx.buf_compute;
5381
+
5382
+ struct ggml_init_params params = {
5383
+ /*.mem_size =*/ buf_compute.size,
5384
+ /*.mem_buffer =*/ buf_compute.data,
5385
+ /*.no_alloc =*/ false,
5386
+ };
5387
+
5388
+ params.no_alloc = true;
5389
+
5390
+ struct ggml_context * ctx0 = ggml_init(params);
5391
+
5392
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5393
+
5394
+ struct ggml_tensor * cur;
5395
+ struct ggml_tensor * inpL;
5396
+
5397
+ //int warmup = 0;
5398
+ if (batch.token) {
5399
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5400
+
5401
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5402
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5403
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5404
+ //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
5405
+ }
5406
+
5407
+ ggml_set_name(inp_tokens, "inp_tokens");
5408
+
5409
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5410
+ } else {
5411
+ #ifdef GGML_USE_MPI
5412
+ GGML_ASSERT(false && "not implemented");
5413
+ #endif
5414
+
5415
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5416
+
5417
+ ggml_allocr_alloc(lctx.alloc, inpL);
5418
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5419
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
5420
+ }
5421
+ }
5422
+
5423
+ const int i_gpu_start = n_layer - n_gpu_layers;
5424
+ (void) i_gpu_start;
5425
+
5426
+ // offload functions set the tensor output backend to GPU
5427
+ // tensors are GPU-accelerated if any input or the output has been offloaded
5428
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
5429
+ offload_func_t offload_func_kq = llama_nop;
5430
+ offload_func_t offload_func_v = llama_nop;
5431
+
5432
+ #ifdef GGML_USE_CUBLAS
5433
+ if (n_gpu_layers > n_layer) {
5434
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
5435
+ }
5436
+ if (n_gpu_layers > n_layer + 1) {
5437
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
5438
+ }
5439
+ if (n_gpu_layers > n_layer + 2) {
5440
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
5441
+ }
5442
+ #endif // GGML_USE_CUBLAS
5443
+
5444
+ // KQ_scale
5445
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5446
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5447
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5448
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5449
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5450
+ }
5451
+
5452
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5453
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5454
+ offload_func_kq(KQ_mask);
5455
+ ggml_set_name(KQ_mask, "KQ_mask");
5456
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5457
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5458
+ float * data = (float *) KQ_mask->data;
5459
+ memset(data, 0, ggml_nbytes(KQ_mask));
5460
+
5461
+ for (int h = 0; h < 1; ++h) {
5462
+ for (int j = 0; j < n_tokens; ++j) {
5463
+ const llama_pos pos = batch.pos[j];
5464
+ const llama_seq_id seq_id = batch.seq_id[j];
5465
+
5466
+ for (int i = 0; i < n_kv; ++i) {
5467
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5468
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5469
+ }
5470
+ }
5471
+ }
5472
+ }
5473
+ }
5474
+
3822
5475
  for (int il = 0; il < n_layer; ++il) {
3823
- {
3824
- // Norm
3825
- cur = ggml_norm(ctx0, inpL, norm_eps);
3826
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5476
+ struct ggml_tensor * attn_norm;
5477
+
5478
+ offload_func_t offload_func = llama_nop;
5479
+
5480
+ #ifdef GGML_USE_CUBLAS
5481
+ if (il >= i_gpu_start) {
5482
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
3827
5483
  }
5484
+ #endif // GGML_USE_CUBLAS
3828
5485
 
5486
+ // self-attention
5487
+ // TODO: refactor into common function (shared with LLaMA)
3829
5488
  {
3830
- // Self Attention
3831
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5489
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
5490
+ offload_func(attn_norm);
3832
5491
 
3833
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
3834
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
3835
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5492
+ attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
5493
+ offload_func(attn_norm);
3836
5494
 
3837
- struct ggml_tensor * Qcur = tmpq;
3838
- struct ggml_tensor * Kcur = tmpk;
5495
+ if (1) {
5496
+ cur = attn_norm;
5497
+ }
5498
+
5499
+ // compute QKV
5500
+
5501
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5502
+ offload_func_kq(cur);
5503
+
5504
+ if (clamp_kqv > 0.0f) {
5505
+ cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
5506
+ offload_func_kq(cur);
5507
+ }
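When clamp_kqv is set, the MPT builder clips the fused QKV activations to [-clamp_kqv, clamp_kqv] before splitting them into heads, which keeps outliers from blowing up the attention logits. The ggml_clamp call above is all there is to it; as a plain-C++ reference, the same element-wise rule (buffer contents and the clamp value are invented):

    // Element-wise equivalent of ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv).
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        const float clamp_kqv = 6.0f;   // hypothetical hparams.f_clamp_kqv
        std::vector<float> qkv = { -9.5f, -2.0f, 0.0f, 3.3f, 12.7f };

        for (float & v : qkv) {
            v = std::min(std::max(v, -clamp_kqv), clamp_kqv);
        }

        for (float v : qkv) printf("%g ", v);
        printf("\n");
        return 0;
    }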
5508
+
5509
+ const size_t wsize = ggml_type_size(cur->type);
5510
+
5511
+ struct ggml_tensor * Qcur = ggml_view_3d(
5512
+ ctx0, cur, n_embd_head, n_head, n_tokens,
5513
+ wsize * n_embd_head,
5514
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5515
+ 0);
5516
+ offload_func_kq(Qcur);
5517
+
5518
+ struct ggml_tensor * Kcur = ggml_view_3d(
5519
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5520
+ wsize * n_embd_head,
5521
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5522
+ wsize * n_embd_head * n_head);
5523
+ offload_func_kq(Kcur);
5524
+
5525
+ struct ggml_tensor * tmpv = ggml_view_3d(
5526
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5527
+ wsize * n_embd_head,
5528
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5529
+ wsize * n_embd_head * (n_head + n_head_kv));
5530
+ offload_func_kq(Kcur);
5531
+
5532
+ ggml_set_name(Qcur, "Qcur");
5533
+ ggml_set_name(Kcur, "Kcur");
3839
5534
 
3840
5535
  {
3841
5536
  struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5537
+ offload_func_v(Vcur);
5538
+ offload_func_v(Vcur->src[0]->src[0]);
3842
5539
  ggml_set_name(Vcur, "Vcur");
3843
5540
 
3844
5541
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5542
+ offload_func_kq(k);
3845
5543
  ggml_set_name(k, "k");
3846
5544
 
3847
5545
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3848
5546
  ( n_ctx)*ggml_element_size(kv_self.v),
3849
5547
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5548
+ offload_func_v(v);
3850
5549
 
3851
5550
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3852
5551
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3853
5552
  }
3854
5553
 
3855
- struct ggml_tensor * Q =
3856
- ggml_permute(ctx0,
3857
- ggml_cpy(ctx0,
3858
- Qcur,
3859
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
3860
- 0, 2, 1, 3);
5554
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
5555
+ offload_func_kq(Q);
3861
5556
  ggml_set_name(Q, "Q");
3862
5557
 
3863
5558
  struct ggml_tensor * K =
@@ -3866,85 +5561,105 @@ static struct ggml_cgraph * llm_build_starcoder(
3866
5561
  ggml_element_size(kv_self.k)*n_embd_gqa,
3867
5562
  ggml_element_size(kv_self.k)*n_embd_head,
3868
5563
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5564
+ offload_func_kq(K);
3869
5565
  ggml_set_name(K, "K");
3870
5566
 
3871
- // K * Q
3872
5567
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5568
+ offload_func_kq(KQ);
3873
5569
  ggml_set_name(KQ, "KQ");
3874
5570
 
3875
- // KQ_scaled = KQ / sqrt(n_embd_head)
3876
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3877
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5571
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5572
+ offload_func_kq(KQ_scaled);
3878
5573
  ggml_set_name(KQ_scaled, "KQ_scaled");
3879
5574
 
3880
- // KQ_masked = mask_past(KQ_scaled)
3881
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5575
+ // TODO: replace with ggml_add()
5576
+ struct ggml_tensor * KQ_scaled_alibi =
5577
+ ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
5578
+ offload_func_kq(KQ_scaled_alibi);
5579
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5580
+
5581
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5582
+ offload_func_kq(KQ_masked);
3882
5583
  ggml_set_name(KQ_masked, "KQ_masked");
3883
5584
 
3884
- // KQ = soft_max(KQ_masked)
3885
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5585
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
5586
+ offload_func_v(KQ_soft_max);
3886
5587
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3887
5588
 
3888
- // split cached V into n_head heads
3889
5589
  struct ggml_tensor * V =
3890
5590
  ggml_view_3d(ctx0, kv_self.v,
3891
5591
  n_kv, n_embd_head, n_head_kv,
3892
5592
  ggml_element_size(kv_self.v)*n_ctx,
3893
5593
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3894
5594
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5595
+ offload_func_v(V);
3895
5596
  ggml_set_name(V, "V");
3896
5597
 
3897
5598
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5599
+ offload_func_v(KQV);
3898
5600
  ggml_set_name(KQV, "KQV");
3899
5601
 
3900
- // KQV_merged = KQV.permute(0, 2, 1, 3)
3901
5602
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5603
+ offload_func_v(KQV_merged);
3902
5604
  ggml_set_name(KQV_merged, "KQV_merged");
3903
5605
 
3904
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3905
5606
  cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5607
+ offload_func_v(cur);
3906
5608
  ggml_set_name(cur, "KQV_merged_contiguous");
3907
- }
3908
5609
 
3909
- // Projection
3910
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5610
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5611
+ offload_func(cur);
5612
+ ggml_set_name(cur, "result_wo");
5613
+ }
3911
5614
 
3912
5615
  // Add the input
3913
5616
  cur = ggml_add(ctx0, cur, inpL);
5617
+ offload_func(cur);
3914
5618
 
3915
- struct ggml_tensor * inpFF = cur;
5619
+ struct ggml_tensor * attn_out = cur;
3916
5620
 
3917
- // FF
5621
+ // feed forward
3918
5622
  {
3919
5623
  // Norm
3920
5624
  {
3921
- cur = ggml_norm(ctx0, inpFF, norm_eps);
3922
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5625
+ cur = ggml_norm(ctx0, attn_out, norm_eps);
5626
+ offload_func(cur);
5627
+
5628
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
5629
+ offload_func(cur);
3923
5630
  }
3924
5631
 
3925
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5632
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5633
+ offload_func(cur);
3926
5634
 
3927
- // GELU activation
3928
5635
  cur = ggml_gelu(ctx0, cur);
3929
-
3930
- // Projection
3931
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5636
+ offload_func(cur);
5637
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5638
+ offload_func(cur);
3932
5639
  }
3933
5640
 
3934
- inpL = ggml_add(ctx0, cur, inpFF);
5641
+ cur = ggml_add(ctx0, cur, attn_out);
5642
+ offload_func(cur);
5643
+ // input for next layer
5644
+ inpL = cur;
3935
5645
  }
3936
5646
 
3937
- // Output Norm
5647
+ cur = inpL;
5648
+
5649
+ // norm
3938
5650
  {
3939
- cur = ggml_norm(ctx0, inpL, norm_eps);
3940
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5651
+ cur = ggml_norm(ctx0, cur, norm_eps);
5652
+ offload_func_nr(cur);
5653
+
5654
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5655
+ ggml_set_name(cur, "result_norm");
3941
5656
  }
3942
- ggml_set_name(cur, "result_norm");
3943
5657
 
3944
5658
  cur = ggml_mul_mat(ctx0, model.output, cur);
3945
5659
  ggml_set_name(cur, "result_output");
3946
5660
 
3947
5661
  ggml_build_forward_expand(gf, cur);
5662
+
3948
5663
  ggml_free(ctx0);
3949
5664
 
3950
5665
  return gf;
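The rewritten attention block above slices a single fused attn_qkv output whose per-token row is laid out as [Q heads | K heads | V heads], so the three ggml_view_3d calls share one row stride and differ only in their starting byte offset. A minimal sketch of that offset arithmetic, with illustrative names that are not part of llama.cpp:

    #include <cstddef>

    struct qkv_offsets {
        size_t row_stride; // bytes per token in the fused QKV output
        size_t q, k, v;    // byte offsets of the Q, K and V slices within a row
    };

    static qkv_offsets fused_qkv_offsets(size_t wsize, int n_embd_head, int n_head, int n_head_kv) {
        qkv_offsets o;
        o.row_stride = wsize * n_embd_head * (n_head + 2 * n_head_kv);
        o.q = 0;                                          // Qcur view starts at the row
        o.k = wsize * n_embd_head * n_head;               // Kcur view starts after the Q heads
        o.v = wsize * n_embd_head * (n_head + n_head_kv); // tmpv view starts after the Q and K heads
        return o;
    }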
@@ -3974,6 +5689,22 @@ static struct ggml_cgraph * llama_build_graph(
3974
5689
  {
3975
5690
  result = llm_build_starcoder(lctx, batch);
3976
5691
  } break;
5692
+ case LLM_ARCH_PERSIMMON:
5693
+ {
5694
+ result = llm_build_persimmon(lctx, batch);
5695
+ } break;
5696
+ case LLM_ARCH_REFACT:
5697
+ {
5698
+ result = llm_build_refact(lctx, batch);
5699
+ } break;
5700
+ case LLM_ARCH_BLOOM:
5701
+ {
5702
+ result = llm_build_bloom(lctx, batch);
5703
+ } break;
5704
+ case LLM_ARCH_MPT:
5705
+ {
5706
+ result = llm_build_mpt(lctx, batch);
5707
+ } break;
3977
5708
  default:
3978
5709
  GGML_ASSERT(false);
3979
5710
  }
@@ -3985,7 +5716,6 @@ static struct ggml_cgraph * llama_build_graph(
3985
5716
  //
3986
5717
  // - lctx: llama context
3987
5718
  // - batch: batch to evaluate
3988
- // - n_threads: number of threads to use
3989
5719
  //
3990
5720
  // return 0 on success
3991
5721
  // return positive int on warning
@@ -4052,10 +5782,6 @@ static int llama_decode_internal(
4052
5782
  batch.seq_id = seq_id.data();
4053
5783
  }
4054
5784
 
4055
- // we always start to search for a free slot from the start of the cache
4056
- // TODO: better strategies can be implemented
4057
- kv_self.head = 0;
4058
-
4059
5785
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
4060
5786
  return 1;
4061
5787
  }
@@ -4107,7 +5833,9 @@ static int llama_decode_internal(
4107
5833
  // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
4108
5834
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
4109
5835
  model.arch == LLM_ARCH_BAICHUAN ||
4110
- model.arch == LLM_ARCH_FALCON;
5836
+ model.arch == LLM_ARCH_FALCON ||
5837
+ model.arch == LLM_ARCH_REFACT ||
5838
+ model.arch == LLM_ARCH_MPT;
4111
5839
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
4112
5840
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
4113
5841
  n_threads = 1;
@@ -4140,8 +5868,12 @@ static int llama_decode_internal(
4140
5868
  #endif
4141
5869
 
4142
5870
  // update the kv ring buffer
4143
- lctx.kv_self.head += n_tokens;
4144
5871
  lctx.kv_self.has_shift = false;
5872
+ lctx.kv_self.head += n_tokens;
5873
+ // Ensure kv cache head points to a valid index.
5874
+ if (lctx.kv_self.head >= lctx.kv_self.size) {
5875
+ lctx.kv_self.head = 0;
5876
+ }
4145
5877
 
4146
5878
  #ifdef GGML_PERF
4147
5879
  // print timing information per ggml operation (for debugging purposes)
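The decode changes above move the kv bookkeeping: the cache head is no longer reset to 0 before llama_kv_cache_find_slot, and after a batch it advances past the decoded tokens and wraps to 0 at the end of the cache. A minimal sketch of that ring-buffer update, assuming a cache with only head and size fields:

    #include <cstdint>

    struct kv_cache_sketch {
        uint32_t head = 0; // next slot to try
        uint32_t size = 0; // total number of cells
    };

    static void kv_advance(kv_cache_sketch & kv, uint32_t n_tokens) {
        kv.head += n_tokens;
        if (kv.head >= kv.size) {
            kv.head = 0; // keep the head pointing at a valid index
        }
    }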
@@ -4227,18 +5959,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
4227
5959
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
4228
5960
  }
4229
5961
 
4230
- static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
5962
+ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
5963
+ return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
5964
+ }
5965
+
5966
+ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
4231
5967
  GGML_ASSERT(llama_is_byte_token(vocab, id));
4232
5968
  const auto& token_data = vocab.id_to_token.at(id);
4233
- auto buf = token_data.text.substr(3, 2);
4234
- return strtol(buf.c_str(), NULL, 16);
5969
+ switch (llama_vocab_get_type(vocab)) {
5970
+ case LLAMA_VOCAB_TYPE_SPM: {
5971
+ auto buf = token_data.text.substr(3, 2);
5972
+ return strtol(buf.c_str(), NULL, 16);
5973
+ }
5974
+ case LLAMA_VOCAB_TYPE_BPE: {
5975
+ GGML_ASSERT(false);
5976
+ return unicode_to_bytes_bpe(token_data.text);
5977
+ }
5978
+ default:
5979
+ GGML_ASSERT(false);
5980
+ }
4235
5981
  }
4236
5982
 
4237
5983
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
4238
- char buf[7];
4239
- int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
4240
- GGML_ASSERT(0 <= result && result < 7);
4241
- return vocab.token_to_id.at(buf);
5984
+ switch (llama_vocab_get_type(vocab)) {
5985
+ case LLAMA_VOCAB_TYPE_SPM: {
5986
+ char buf[7];
5987
+ int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
5988
+ GGML_ASSERT(0 <= result && result < 7);
5989
+ return vocab.token_to_id.at(buf);
5990
+ }
5991
+ case LLAMA_VOCAB_TYPE_BPE: {
5992
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
5993
+ }
5994
+ default:
5995
+ GGML_ASSERT(false);
5996
+ }
4242
5997
  }
4243
5998
 
4244
5999
  static void llama_escape_whitespace(std::string & text) {
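llama_token_to_byte and llama_byte_to_token above now branch on the vocab type: SPM byte tokens are literal pieces of the form "<0xNN>", while BPE vocabs go through the byte-to-unicode mapping from unicode.h. A small self-contained sketch of the SPM round trip, mirroring the snprintf/strtol pair above:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    static std::string spm_byte_piece(uint8_t ch) {
        char buf[7];
        snprintf(buf, sizeof(buf), "<0x%02X>", ch); // e.g. 0x41 -> "<0x41>"
        return std::string(buf);
    }

    static uint8_t spm_piece_byte(const std::string & piece) {
        // the two hex digits start at offset 3 of "<0xNN>"
        return (uint8_t) strtol(piece.substr(3, 2).c_str(), NULL, 16);
    }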
@@ -4518,15 +6273,9 @@ struct llm_tokenizer_bpe {
4518
6273
  std::string byte_str(1, *j);
4519
6274
  auto token_multibyte = vocab.token_to_id.find(byte_str);
4520
6275
  if (token_multibyte == vocab.token_to_id.end()) {
4521
- try {
4522
- llama_token token_byte = llama_byte_to_token(vocab, *j);
4523
- output.push_back(token_byte);
4524
- } catch (const std::out_of_range & err) {
4525
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
4526
- }
4527
- } else {
4528
- output.push_back((*token_multibyte).second);
6276
+ throw std::runtime_error("ERROR: byte not found in vocab");
4529
6277
  }
6278
+ output.push_back((*token_multibyte).second);
4530
6279
  }
4531
6280
  } else {
4532
6281
  output.push_back((*token).second);
@@ -4563,23 +6312,143 @@ private:
4563
6312
  work_queue.push(bigram);
4564
6313
  }
4565
6314
 
4566
- // probably not 100% correct
4567
- static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
4568
- std::vector<std::string> words;
6315
+ std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
6316
+ std::vector<std::string> bpe_words;
6317
+ std::vector<std::string> bpe_encoded_words;
6318
+
6319
+ std::string token = "";
6320
+ // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
6321
+ bool collecting_numeric = false;
6322
+ bool collecting_letter = false;
6323
+ bool collecting_special = false;
6324
+ bool collecting_whitespace_lookahead = false;
6325
+ bool collecting = false;
6326
+
6327
+ std::vector<std::string> text_utf;
6328
+ text_utf.reserve(text.size());
6329
+ bpe_words.reserve(text.size());
6330
+ bpe_encoded_words.reserve(text.size());
6331
+
6332
+ auto cps = codepoints_from_utf8(text);
6333
+ for (size_t i = 0; i < cps.size(); ++i)
6334
+ text_utf.emplace_back(codepoint_to_utf8(cps[i]));
6335
+
6336
+ for (int i = 0; i < (int)text_utf.size(); i++) {
6337
+ const std::string & utf_char = text_utf[i];
6338
+ bool split_condition = false;
6339
+ int bytes_remain = text_utf.size() - i;
6340
+ // forward backward lookups
6341
+ const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
6342
+ const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
6343
+
6344
+ // handling contractions
6345
+ if (!split_condition && bytes_remain >= 2) {
6346
+ // 's|'t|'m|'d
6347
+ if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
6348
+ split_condition = true;
6349
+ }
6350
+ if (split_condition) {
6351
+ if (token.size()) {
6352
+ bpe_words.emplace_back(token); // push previous content as token
6353
+ }
6354
+ token = utf_char + utf_char_next;
6355
+ bpe_words.emplace_back(token);
6356
+ token = "";
6357
+ i++;
6358
+ continue;
6359
+ }
6360
+ }
6361
+ if (!split_condition && bytes_remain >= 3) {
6362
+ // 're|'ve|'ll
6363
+ if (utf_char == "\'" && (
6364
+ (utf_char_next == "r" && utf_char_next_next == "e") ||
6365
+ (utf_char_next == "v" && utf_char_next_next == "e") ||
6366
+ (utf_char_next == "l" && utf_char_next_next == "l"))
6367
+ ) {
6368
+ split_condition = true;
6369
+ }
6370
+ if (split_condition) {
6371
+ // current token + next token can be defined
6372
+ if (token.size()) {
6373
+ bpe_words.emplace_back(token); // push previous content as token
6374
+ }
6375
+ token = utf_char + utf_char_next + utf_char_next_next;
6376
+ bpe_words.emplace_back(token); // the contraction
6377
+ token = "";
6378
+ i += 2;
6379
+ continue;
6380
+ }
6381
+ }
6382
+
6383
+ if (!split_condition && !collecting) {
6384
+ if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
6385
+ collecting_letter = true;
6386
+ collecting = true;
6387
+ }
6388
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
6389
+ collecting_numeric = true;
6390
+ collecting = true;
6391
+ }
6392
+ else if (
6393
+ ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
6394
+ (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
6395
+ ) {
6396
+ collecting_special = true;
6397
+ collecting = true;
6398
+ }
6399
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
6400
+ collecting_whitespace_lookahead = true;
6401
+ collecting = true;
6402
+ }
6403
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
6404
+ split_condition = true;
6405
+ }
6406
+ }
6407
+ else if (!split_condition && collecting) {
6408
+ if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
6409
+ split_condition = true;
6410
+ }
6411
+ else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
6412
+ split_condition = true;
6413
+ }
6414
+ else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
6415
+ split_condition = true;
6416
+ }
6417
+ else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
6418
+ split_condition = true;
6419
+ }
6420
+ }
6421
+
6422
+ if (utf_char_next == "") {
6423
+ split_condition = true; // final
6424
+ token += utf_char;
6425
+ }
4569
6426
 
4570
- // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
4571
- const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
4572
- const std::regex re(pattern);
6427
+ if (split_condition) {
6428
+ if (token.size()) {
6429
+ bpe_words.emplace_back(token);
6430
+ }
6431
+ token = utf_char;
6432
+ collecting = false;
6433
+ collecting_letter = false;
6434
+ collecting_numeric = false;
6435
+ collecting_special = false;
6436
+ collecting_whitespace_lookahead = false;
6437
+ }
6438
+ else {
6439
+ token += utf_char;
6440
+ }
6441
+ }
4573
6442
 
4574
- auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
4575
- auto words_end = std::sregex_iterator();
4576
- auto n_words = std::distance(words_begin, words_end);
4577
- words.reserve(n_words);
4578
- for (auto it = words_begin; it != words_end; ++it) {
4579
- words.push_back(it->str());
6443
+ for (std::string & word : bpe_words) {
6444
+ std::string encoded_token = "";
6445
+ for (char & c : word) {
6446
+ encoded_token += bytes_to_unicode_bpe(c);
6447
+ }
6448
+ bpe_encoded_words.emplace_back(encoded_token);
4580
6449
  }
4581
- return words;
4582
6450
 
6451
+ return bpe_encoded_words;
4583
6452
  }
4584
6453
 
4585
6454
  const llama_vocab & vocab;
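The rewritten bpe_gpt2_preprocess above replaces std::regex (which cannot express \p{L} or \p{N}) with a hand-rolled scanner over codepoints, then byte-encodes each word with bytes_to_unicode_bpe from unicode.h. A hedged sketch of the GPT-2 byte-to-unicode idea behind that last step, not the actual unicode.h implementation: printable bytes keep their codepoint, and the remaining bytes are shifted into the 256+ range so every byte becomes a visible character.

    #include <cstdint>
    #include <map>

    static std::map<uint8_t, int> gpt2_byte_to_codepoint() {
        std::map<uint8_t, int> m;
        int n = 0;
        for (int b = 0; b < 256; ++b) {
            const bool keep =
                (b >= 0x21 && b <= 0x7E) ||  // '!'..'~'
                (b >= 0xA1 && b <= 0xAC) ||  // '¡'..'¬'
                (b >= 0xAE && b <= 0xFF);    // '®'..'ÿ'
            m[(uint8_t) b] = keep ? b : 256 + n++;
        }
        return m;
    }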
@@ -6022,7 +7891,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6022
7891
  nthread = std::thread::hardware_concurrency();
6023
7892
  }
6024
7893
 
6025
- llama_model_loader ml(fname_inp, /*use_mmap*/ false);
7894
+ // mmap consistently increases speed on Linux, and also increases speed on Windows with
7895
+ // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
7896
+ #if defined(__linux__) || defined(_WIN32)
7897
+ constexpr bool use_mmap = true;
7898
+ #else
7899
+ constexpr bool use_mmap = false;
7900
+ #endif
7901
+
7902
+ llama_model_loader ml(fname_inp, use_mmap);
7903
+ if (ml.use_mmap) {
7904
+ ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
7905
+ }
6026
7906
 
6027
7907
  llama_model model;
6028
7908
  llm_load_arch(ml, model);
@@ -6050,7 +7930,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6050
7930
  const std::string name = ggml_get_name(meta);
6051
7931
 
6052
7932
  // TODO: avoid hardcoded tensor names - use the TN_* constants
6053
- if (name.find("attn_v.weight") != std::string::npos) {
7933
+ if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
6054
7934
  ++n_attention_wv;
6055
7935
  }
6056
7936
  else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6087,6 +7967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6087
7967
  }
6088
7968
 
6089
7969
  std::ofstream fout(fname_out, std::ios::binary);
7970
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
6090
7971
 
6091
7972
  const size_t meta_size = gguf_get_meta_size(ctx_out);
6092
7973
 
@@ -6100,10 +7981,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6100
7981
 
6101
7982
  const std::string name = ggml_get_name(tensor);
6102
7983
 
6103
- if (read_data.size() < ggml_nbytes(tensor)) {
6104
- read_data.resize(ggml_nbytes(tensor));
7984
+ if (!ml.use_mmap) {
7985
+ if (read_data.size() < ggml_nbytes(tensor)) {
7986
+ read_data.resize(ggml_nbytes(tensor));
7987
+ }
7988
+ tensor->data = read_data.data();
6105
7989
  }
6106
- tensor->data = read_data.data();
6107
7990
  ml.load_data_for(tensor);
6108
7991
 
6109
7992
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
@@ -6738,13 +8621,14 @@ struct llama_context * llama_new_context_with_model(
6738
8621
 
6739
8622
  #ifdef GGML_USE_METAL
6740
8623
  if (model->n_gpu_layers > 0) {
8624
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
8625
+
6741
8626
  ctx->ctx_metal = ggml_metal_init(1);
6742
8627
  if (!ctx->ctx_metal) {
6743
8628
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6744
8629
  llama_free(ctx);
6745
8630
  return NULL;
6746
8631
  }
6747
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
6748
8632
  //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6749
8633
  //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6750
8634
  }
@@ -6872,6 +8756,10 @@ int llama_n_embd(const struct llama_model * model) {
6872
8756
  return model->hparams.n_embd;
6873
8757
  }
6874
8758
 
8759
+ float llama_rope_freq_scale_train(const struct llama_model * model) {
8760
+ return model->hparams.rope_freq_scale_train;
8761
+ }
8762
+
6875
8763
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
6876
8764
  return snprintf(buf, buf_size, "%s %s %s",
6877
8765
  llama_model_arch_name(model->arch).c_str(),
@@ -7039,16 +8927,6 @@ struct llama_data_file_context : llama_data_context {
7039
8927
  *
7040
8928
  */
7041
8929
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
7042
- // TODO: does not support multi-sequence states
7043
- {
7044
- const auto & kv_self = ctx->kv_self;
7045
- for (uint32_t i = 0; i < kv_self.head; ++i) {
7046
- GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
7047
- GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
7048
- GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
7049
- }
7050
- }
7051
-
7052
8930
  // copy rng
7053
8931
  {
7054
8932
  std::stringstream rng_ss;
@@ -7101,36 +8979,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
7101
8979
  const auto & hparams = ctx->model.hparams;
7102
8980
  const auto & cparams = ctx->cparams;
7103
8981
 
7104
- const int n_layer = hparams.n_layer;
7105
- const int n_embd = hparams.n_embd_gqa();
7106
- const int n_ctx = cparams.n_ctx;
8982
+ const auto n_layer = hparams.n_layer;
8983
+ const auto n_embd = hparams.n_embd_gqa();
8984
+ const auto n_ctx = cparams.n_ctx;
7107
8985
 
7108
- const size_t kv_size = kv_self.buf.size;
7109
- const int kv_ntok = kv_self.head;
8986
+ const size_t kv_buf_size = kv_self.buf.size;
8987
+ const uint32_t kv_head = kv_self.head;
8988
+ const uint32_t kv_size = kv_self.size;
7110
8989
 
7111
- data_ctx->write(&kv_size, sizeof(kv_size));
7112
- data_ctx->write(&kv_ntok, sizeof(kv_ntok));
8990
+ data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
8991
+ data_ctx->write(&kv_head, sizeof(kv_head));
8992
+ data_ctx->write(&kv_size, sizeof(kv_size));
7113
8993
 
7114
- if (kv_size) {
8994
+ if (kv_buf_size) {
7115
8995
  const size_t elt_size = ggml_element_size(kv_self.k);
7116
8996
 
7117
8997
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
7118
8998
  ggml_cgraph gf{};
7119
8999
 
7120
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
9000
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
7121
9001
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
7122
9002
  kout3d->data = kout3d_data.data();
7123
9003
 
7124
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
9004
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
7125
9005
  std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
7126
9006
  vout3d->data = vout3d_data.data();
7127
9007
 
7128
9008
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
7129
- n_embd, kv_ntok, n_layer,
9009
+ n_embd, kv_head, n_layer,
7130
9010
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
7131
9011
 
7132
9012
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
7133
- kv_ntok, n_embd, n_layer,
9013
+ kv_head, n_embd, n_layer,
7134
9014
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
7135
9015
 
7136
9016
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -7144,6 +9024,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
7144
9024
  data_ctx->write(kout3d_data.data(), kout3d_data.size());
7145
9025
  data_ctx->write(vout3d_data.data(), vout3d_data.size());
7146
9026
  }
9027
+
9028
+ for (uint32_t i = 0; i < kv_size; ++i) {
9029
+ const auto & cell = kv_self.cells[i];
9030
+
9031
+ const llama_pos pos = cell.pos;
9032
+ const size_t seq_id_size = cell.seq_id.size();
9033
+
9034
+ data_ctx->write(&pos, sizeof(pos));
9035
+ data_ctx->write(&seq_id_size, sizeof(seq_id_size));
9036
+
9037
+ for (auto seq_id : cell.seq_id) {
9038
+ data_ctx->write(&seq_id, sizeof(seq_id));
9039
+ }
9040
+ }
7147
9041
  }
7148
9042
  }
7149
9043
 
@@ -7215,34 +9109,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
7215
9109
  const int n_embd = hparams.n_embd_gqa();
7216
9110
  const int n_ctx = cparams.n_ctx;
7217
9111
 
7218
- size_t kv_size;
7219
- int kv_ntok;
9112
+ size_t kv_buf_size;
9113
+ uint32_t kv_head;
9114
+ uint32_t kv_size;
7220
9115
 
7221
- memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
7222
- memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
9116
+ memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
9117
+ memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
9118
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
7223
9119
 
7224
- if (kv_size) {
7225
- GGML_ASSERT(kv_self.buf.size == kv_size);
9120
+ if (kv_buf_size) {
9121
+ GGML_ASSERT(kv_self.buf.size == kv_buf_size);
7226
9122
 
7227
9123
  const size_t elt_size = ggml_element_size(kv_self.k);
7228
9124
 
7229
9125
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
7230
9126
  ggml_cgraph gf{};
7231
9127
 
7232
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
9128
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
7233
9129
  kin3d->data = (void *) inp;
7234
9130
  inp += ggml_nbytes(kin3d);
7235
9131
 
7236
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
9132
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
7237
9133
  vin3d->data = (void *) inp;
7238
9134
  inp += ggml_nbytes(vin3d);
7239
9135
 
7240
9136
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
7241
- n_embd, kv_ntok, n_layer,
9137
+ n_embd, kv_head, n_layer,
7242
9138
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
7243
9139
 
7244
9140
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
7245
- kv_ntok, n_embd, n_layer,
9141
+ kv_head, n_embd, n_layer,
7246
9142
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
7247
9143
 
7248
9144
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -7252,8 +9148,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
7252
9148
  ggml_free(cpy_ctx);
7253
9149
  }
7254
9150
 
7255
- ctx->kv_self.head = kv_ntok;
9151
+ ctx->kv_self.head = kv_head;
7256
9152
  ctx->kv_self.size = kv_size;
9153
+
9154
+ ctx->kv_self.cells.resize(kv_size);
9155
+
9156
+ for (uint32_t i = 0; i < kv_size; ++i) {
9157
+ llama_pos pos;
9158
+ size_t seq_id_size;
9159
+
9160
+ memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
9161
+ memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
9162
+
9163
+ ctx->kv_self.cells[i].pos = pos;
9164
+
9165
+ llama_seq_id seq_id;
9166
+
9167
+ for (size_t j = 0; j < seq_id_size; ++j) {
9168
+ memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
9169
+ ctx->kv_self.cells[i].seq_id.insert(seq_id);
9170
+ }
9171
+ }
7257
9172
  }
7258
9173
 
7259
9174
  const size_t nread = inp - src;
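The two state hunks above extend the session format: after the kv buffer size, head and cell count, the K/V tensor data is followed by one record per cell holding its position and sequence ids, and llama_set_state_data reads the records back in the same order. A minimal writer sketch for that per-cell record, with illustrative types mirroring the fields used above:

    #include <cstdint>
    #include <set>
    #include <vector>

    typedef int32_t llama_pos_sketch;
    typedef int32_t llama_seq_id_sketch;

    struct kv_cell_sketch {
        llama_pos_sketch pos;
        std::set<llama_seq_id_sketch> seq_id;
    };

    static void append_bytes(std::vector<uint8_t> & out, const void * p, size_t n) {
        const uint8_t * b = (const uint8_t *) p;
        out.insert(out.end(), b, b + n);
    }

    static void write_cell(std::vector<uint8_t> & out, const kv_cell_sketch & cell) {
        const size_t n_seq = cell.seq_id.size();
        append_bytes(out, &cell.pos, sizeof(cell.pos)); // llama_pos pos
        append_bytes(out, &n_seq,    sizeof(n_seq));    // size_t count of seq ids
        for (llama_seq_id_sketch id : cell.seq_id) {
            append_bytes(out, &id, sizeof(id));         // each llama_seq_id
        }
    }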
@@ -7471,6 +9386,22 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
7471
9386
  llama_token llama_token_nl(const struct llama_context * ctx) {
7472
9387
  return ctx->model.vocab.linefeed_id;
7473
9388
  }
9389
+ llama_token llama_token_prefix(const struct llama_context * ctx) {
9390
+ return ctx->model.vocab.special_prefix_id;
9391
+ }
9392
+
9393
+ llama_token llama_token_middle(const struct llama_context * ctx) {
9394
+ return ctx->model.vocab.special_middle_id;
9395
+ }
9396
+
9397
+ llama_token llama_token_suffix(const struct llama_context * ctx) {
9398
+ return ctx->model.vocab.special_suffix_id;
9399
+ }
9400
+
9401
+ llama_token llama_token_eot(const struct llama_context * ctx) {
9402
+ return ctx->model.vocab.special_eot_id;
9403
+ }
9404
+
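// The four accessors above expose the fill-in-the-middle special tokens. A hedged
// usage sketch (prefix-suffix-middle order is a common convention; the exact layout
// expected by a given infill model should be checked against its documentation):
//
//     std::vector<llama_token> infill;
//     infill.push_back(llama_token_prefix(ctx));
//     infill.insert(infill.end(), prefix_tokens.begin(), prefix_tokens.end());
//     infill.push_back(llama_token_suffix(ctx));
//     infill.insert(infill.end(), suffix_tokens.begin(), suffix_tokens.end());
//     infill.push_back(llama_token_middle(ctx));
//     // generation is stopped when llama_token_eot(ctx) is sampled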
7474
9405
 
7475
9406
  int llama_tokenize(
7476
9407
  const struct llama_model * model,
@@ -7493,35 +9424,70 @@ int llama_tokenize(
7493
9424
  return res.size();
7494
9425
  }
7495
9426
 
9427
+ static std::string llama_decode_text(const std::string & text) {
9428
+ std::string decoded_text;
9429
+ auto unicode_sequences = codepoints_from_utf8(text);
9430
+ for (auto& unicode_sequence : unicode_sequences) {
9431
+ decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
9432
+ }
9433
+
9434
+ return decoded_text;
9435
+ }
9436
+
7496
9437
  // does not write null-terminator to buf
7497
9438
  int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
7498
9439
  if (0 <= token && token < llama_n_vocab(model)) {
7499
- if (llama_is_normal_token(model->vocab, token)) {
7500
- std::string result = model->vocab.id_to_token[token].text;
7501
- if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
9440
+ switch (llama_vocab_get_type(model->vocab)) {
9441
+ case LLAMA_VOCAB_TYPE_SPM: {
9442
+ if (llama_is_normal_token(model->vocab, token)) {
9443
+ std::string result = model->vocab.id_to_token[token].text;
7502
9444
  llama_unescape_whitespace(result);
9445
+ if (length < (int) result.length()) {
9446
+ return -result.length();
9447
+ }
9448
+ memcpy(buf, result.c_str(), result.length());
9449
+ return result.length();
9450
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
9451
+ if (length < 3) {
9452
+ return -3;
9453
+ }
9454
+ memcpy(buf, "\xe2\x96\x85", 3);
9455
+ return 3;
9456
+ } else if (llama_is_control_token(model->vocab, token)) {
9457
+ ;
9458
+ } else if (llama_is_byte_token(model->vocab, token)) {
9459
+ if (length < 1) {
9460
+ return -1;
9461
+ }
9462
+ buf[0] = llama_token_to_byte(model->vocab, token);
9463
+ return 1;
9464
+ } else {
9465
+ // TODO: for now we accept all unsupported token types,
9466
+ // suppressing them like CONTROL tokens.
9467
+ // GGML_ASSERT(false);
7503
9468
  }
7504
- if (length < (int) result.length()) {
7505
- return -result.length();
7506
- }
7507
- memcpy(buf, result.c_str(), result.length());
7508
- return result.length();
7509
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
7510
- if (length < 3) {
7511
- return -3;
7512
- }
7513
- buf[0] = '\xe2';
7514
- buf[1] = '\x96';
7515
- buf[2] = '\x85';
7516
- return 3;
7517
- } else if (llama_is_control_token(model->vocab, token)) {
7518
- // do nothing
7519
- } else if (llama_is_byte_token(model->vocab, token)) {
7520
- if (length < 1) {
7521
- return -1;
9469
+ break;
9470
+ }
9471
+ case LLAMA_VOCAB_TYPE_BPE: {
9472
+ if (llama_is_normal_token(model->vocab, token)) {
9473
+ std::string result = model->vocab.id_to_token[token].text;
9474
+ result = llama_decode_text(result);
9475
+ if (length < (int) result.length()) {
9476
+ return -result.length();
9477
+ }
9478
+ memcpy(buf, result.c_str(), result.length());
9479
+ return result.length();
9480
+ } else if (llama_is_control_token(model->vocab, token)) {
9481
+ ;
9482
+ } else {
9483
+ // TODO: for now we accept all unsupported token types,
9484
+ // suppressing them like CONTROL tokens.
9485
+ // GGML_ASSERT(false);
7522
9486
  }
7523
- buf[0] = llama_token_to_byte(model->vocab, token);
7524
- return 1;
9487
+ break;
9488
+ }
9489
+ default:
9490
+ GGML_ASSERT(false);
7525
9491
  }
7526
9492
  }
7527
9493
  return 0;
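llama_token_to_piece above keeps its convention of returning a negative value when the caller's buffer is too small, now for both SPM and BPE vocabs. A hedged usage sketch of the resize-and-retry pattern that convention implies, assuming llama.h is available:

    #include <algorithm>
    #include <string>
    #include <vector>
    #include "llama.h"

    static std::string piece_of(const llama_model * model, llama_token token) {
        std::vector<char> buf(8);
        int n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
        if (n < 0) {
            buf.resize((size_t) -n); // a negative return is the required length
            n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
        }
        return std::string(buf.data(), (size_t) std::max(n, 0));
    }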
@@ -7548,14 +9514,14 @@ void llama_print_timings(struct llama_context * ctx) {
7548
9514
  const llama_timings timings = llama_get_timings(ctx);
7549
9515
 
7550
9516
  LLAMA_LOG_INFO("\n");
7551
- LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
7552
- LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
9517
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
9518
+ LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7553
9519
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
7554
- LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
9520
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
7555
9521
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
7556
- LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
9522
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7557
9523
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
7558
- LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
9524
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
7559
9525
  }
7560
9526
 
7561
9527
  void llama_reset_timings(struct llama_context * ctx) {