llama_cpp 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
 #define LLAMA_API_INTERNAL
 #include "llama.h"
 
+#include "unicode.h"
+
 #include "ggml.h"
 
 #include "ggml-alloc.h"
@@ -123,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }
+
+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
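
The is_float_close helper added above is used further down to compare floating-point hyperparameters with an absolute tolerance. A minimal standalone sketch of its behaviour, same logic as the diff, wrapped in a main() purely for illustration:

    #include <cmath>
    #include <cstdio>
    #include <stdexcept>

    static bool is_float_close(float a, float b, float abs_tol) {
        if (abs_tol < 0.0f) {
            throw std::invalid_argument("Tolerance must be non-negative");
        }
        if (a == b)                         return true;   // covers equal infinities
        if (std::isinf(a) || std::isinf(b)) return false;  // a one-sided infinity never matches
        return std::fabs(b - a) <= abs_tol;
    }

    int main() {
        std::printf("%d\n", is_float_close(1e-9f, 2e-9f, 1e-8f));        // 1: within tolerance
        std::printf("%d\n", is_float_close(10000.0f, 10000.5f, 1e-9f));  // 0: outside tolerance
        std::printf("%d\n", is_float_close(INFINITY, INFINITY, 1e-9f));  // 1: exact equality short-circuits
        return 0;
    }
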
@@ -163,6 +186,9 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_PERSIMMON,
+    LLM_ARCH_REFACT,
+    LLM_ARCH_BLOOM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -175,6 +201,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT, "mpt" },
     { LLM_ARCH_BAICHUAN, "baichuan" },
     { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_PERSIMMON, "persimmon" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BLOOM, "bloom" },
 };
 
 enum llm_kv {
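
The strings registered here ("persimmon", "refact", "bloom") are what a GGUF file carries as its architecture name; the loader matches that string against this table to pick the enum value. A small sketch of such a reverse lookup, assuming a map shaped like LLM_ARCH_NAMES above (the trimmed-down map and the helper name llm_arch_from_string are illustrative, not the library's API):

    #include <cstdio>
    #include <map>
    #include <string>

    enum llm_arch { LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, LLM_ARCH_BLOOM, LLM_ARCH_UNKNOWN };

    // trimmed-down stand-in for the LLM_ARCH_NAMES table above
    static const std::map<llm_arch, std::string> k_arch_names = {
        { LLM_ARCH_PERSIMMON, "persimmon" },
        { LLM_ARCH_REFACT,    "refact"    },
        { LLM_ARCH_BLOOM,     "bloom"     },
    };

    // reverse lookup: architecture string from the model file -> enum
    static llm_arch llm_arch_from_string(const std::string & name) {
        for (const auto & kv : k_arch_names) {
            if (kv.second == name) {
                return kv.first;
            }
        }
        return LLM_ARCH_UNKNOWN;
    }

    int main() {
        std::printf("%d\n", (int) llm_arch_from_string("refact"));   // 1
        std::printf("%d\n", (int) llm_arch_from_string("falcon-x")); // 3 (unknown)
        return 0;
    }
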
@@ -277,6 +306,7 @@ struct LLM_KV {
 
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
+    LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -293,6 +323,8 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_ATTN_Q_NORM,
+    LLM_TENSOR_ATTN_K_NORM,
 };
 
 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -374,10 +406,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PERSIMMON,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
+            { LLM_TENSOR_OUTPUT, "output"},
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
+        },
+    },
     {
         LLM_ARCH_MPT,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
     {
@@ -395,6 +452,38 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_REFACT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_BLOOM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -912,6 +1001,7 @@ enum e_model {
     MODEL_1B,
     MODEL_3B,
     MODEL_7B,
+    MODEL_8B,
     MODEL_13B,
     MODEL_15B,
     MODEL_30B,
@@ -942,8 +1032,28 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
 
+    float f_clamp_kqv;
+    float f_max_alibi_bias;
+
     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+        if (this->vocab_only  != other.vocab_only)  return true;
+        if (this->n_vocab     != other.n_vocab)     return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd      != other.n_embd)      return true;
+        if (this->n_head      != other.n_head)      return true;
+        if (this->n_head_kv   != other.n_head_kv)   return true;
+        if (this->n_layer     != other.n_layer)     return true;
+        if (this->n_rot       != other.n_rot)       return true;
+        if (this->n_ff        != other.n_ff)        return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }
 
     uint32_t n_gqa() const {
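
The previous operator!= compared the raw bytes of the struct with memcmp. That is fragile once float fields are involved: padding bytes between members have unspecified values, and two numerically equivalent hyperparameter sets can still differ in the last bit. The new field-by-field comparison with an absolute tolerance avoids both problems. A short sketch of the padding failure mode, using a hypothetical two-field struct rather than the real llama_hparams:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct params {
        uint8_t flag;   // 1 byte, typically followed by 3 padding bytes
        float   eps;
    };

    int main() {
        params a, b;
        std::memset(&a, 0xAA, sizeof(a));  // scribble over the padding bytes
        std::memset(&b, 0x55, sizeof(b));
        a.flag = b.flag = 1;
        a.eps  = b.eps  = 1e-5f;

        // field-wise comparison: equal
        std::printf("fields equal: %d\n", a.flag == b.flag && a.eps == b.eps);
        // byte-wise comparison: reports a difference because of the padding bytes
        std::printf("memcmp equal: %d\n", std::memcmp(&a, &b, sizeof(a)) == 0);
        return 0;
    }
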
@@ -977,6 +1087,10 @@ struct llama_layer {
     struct ggml_tensor * attn_norm_b;
     struct ggml_tensor * attn_norm_2;
     struct ggml_tensor * attn_norm_2_b;
+    struct ggml_tensor * attn_q_norm;
+    struct ggml_tensor * attn_q_norm_b;
+    struct ggml_tensor * attn_k_norm;
+    struct ggml_tensor * attn_k_norm_b;
 
     // attention
     struct ggml_tensor * wq;
@@ -1018,6 +1132,9 @@ struct llama_kv_cell {
 struct llama_kv_cache {
     bool has_shift = false;
 
+    // Note: The value of head isn't only used to optimize searching
+    // for a free KV slot. llama_decode_internal also uses it, so it
+    // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
 
@@ -1071,6 +1188,10 @@ struct llama_vocab {
     id special_pad_id = -1;
 
     id linefeed_id = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
         replace_all(token_left, " ", "\u0120");
@@ -1099,6 +1220,8 @@ struct llama_model {
 
     struct ggml_tensor * tok_embeddings;
     struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_norm;
+    struct ggml_tensor * tok_norm_b;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1228,7 +1351,11 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
+    // TODO: this should be:
+    //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    //   change it and test that it works
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -1271,9 +1398,11 @@ static bool llama_kv_cache_init(
 
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
 static bool llama_kv_cache_find_slot(
-        struct llama_kv_cache & cache,
-       const struct llama_batch & batch) {
+           struct llama_kv_cache & cache,
+        const struct llama_batch & batch) {
     const uint32_t n_ctx    = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
@@ -1286,8 +1415,8 @@ static bool llama_kv_cache_find_slot(
 
     while (true) {
         if (cache.head + n_tokens > n_ctx) {
+            n_tested += n_ctx - cache.head;
             cache.head = 0;
-            n_tested += n_ctx - cache.head;
             continue;
         }
 
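
The reordering above fixes the wrap-around accounting: the old code reset cache.head to 0 before adding n_ctx - cache.head, so every wrap added n_ctx to n_tested instead of the number of cells actually skipped, which could make the search report a full cache too early. A condensed sketch of this ring-style slot search with the corrected ordering, simplified to one free/busy flag per cell instead of the real cell bookkeeping:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // returns the index of the first run of n_tokens free cells, or -1 if none exists
    static int find_slot(std::vector<bool> & busy, uint32_t & head, uint32_t n_tokens) {
        const uint32_t n_ctx = (uint32_t) busy.size();
        uint32_t n_tested = 0;

        while (true) {
            if (head + n_tokens > n_ctx) {
                // count the cells we skip *before* wrapping head back to 0
                n_tested += n_ctx - head;
                head = 0;
                if (n_tested >= n_ctx) return -1;
                continue;
            }
            bool found = true;
            for (uint32_t i = 0; i < n_tokens; ++i) {
                if (busy[head + i]) {          // collision: restart just past it
                    head     += i + 1;
                    n_tested += i + 1;
                    found = false;
                    break;
                }
            }
            if (found) break;
            if (n_tested >= n_ctx) return -1;
        }
        for (uint32_t i = 0; i < n_tokens; ++i) busy[head + i] = true;
        return (int) head;
    }

    int main() {
        std::vector<bool> busy(8, false);
        uint32_t head = 6;
        std::printf("slot at %d\n", find_slot(busy, head, 4));  // wraps, allocates at 0
        return 0;
    }

In the real code the resulting head value is also consumed by llama_decode_internal, which is why the comment added to llama_kv_cache earlier in this diff warns against changing it once a slot has been allocated.
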
@@ -1338,29 +1467,46 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
1338
1467
  cache.cells[i].pos = -1;
1339
1468
  cache.cells[i].seq_id.clear();
1340
1469
  }
1470
+
1471
+ // Searching for a free slot can start here since we know it will be empty.
1472
+ cache.head = uint32_t(c0);
1341
1473
  }
1342
1474
 
1343
1475
  static void llama_kv_cache_seq_rm(
1344
- struct llama_kv_cache & cache,
1345
- llama_seq_id seq_id,
1346
- llama_pos p0,
1347
- llama_pos p1) {
1476
+ struct llama_kv_cache & cache,
1477
+ llama_seq_id seq_id,
1478
+ llama_pos p0,
1479
+ llama_pos p1) {
1480
+ uint32_t new_head = cache.size;
1481
+
1482
+ if (p0 < 0) p0 = 0;
1483
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1484
+
1348
1485
  for (uint32_t i = 0; i < cache.size; ++i) {
1349
1486
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1350
1487
  cache.cells[i].seq_id.erase(seq_id);
1351
1488
  if (cache.cells[i].seq_id.empty()) {
1352
1489
  cache.cells[i].pos = -1;
1490
+ if (new_head == cache.size) new_head = i;
1353
1491
  }
1354
1492
  }
1355
1493
  }
1494
+
1495
+ // If we freed up a slot, set head to it so searching can start there.
1496
+ if (new_head != cache.size) cache.head = new_head;
1356
1497
  }
1357
1498
 
1358
1499
  static void llama_kv_cache_seq_cp(
1359
- struct llama_kv_cache & cache,
1360
- llama_seq_id seq_id_src,
1361
- llama_seq_id seq_id_dst,
1362
- llama_pos p0,
1363
- llama_pos p1) {
1500
+ struct llama_kv_cache & cache,
1501
+ llama_seq_id seq_id_src,
1502
+ llama_seq_id seq_id_dst,
1503
+ llama_pos p0,
1504
+ llama_pos p1) {
1505
+ if (p0 < 0) p0 = 0;
1506
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1507
+
1508
+ cache.head = 0;
1509
+
1364
1510
  for (uint32_t i = 0; i < cache.size; ++i) {
1365
1511
  if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1366
1512
  cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1369,32 +1515,48 @@ static void llama_kv_cache_seq_cp(
1369
1515
  }
1370
1516
 
1371
1517
  static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
1518
+ uint32_t new_head = cache.size;
1519
+
1372
1520
  for (uint32_t i = 0; i < cache.size; ++i) {
1373
1521
  if (!cache.cells[i].has_seq_id(seq_id)) {
1374
1522
  cache.cells[i].pos = -1;
1375
1523
  cache.cells[i].seq_id.clear();
1524
+ if (new_head == cache.size) new_head = i;
1376
1525
  }
1377
1526
  }
1527
+
1528
+ // If we freed up a slot, set head to it so searching can start there.
1529
+ if (new_head != cache.size) cache.head = new_head;
1378
1530
  }
1379
1531
 
1380
1532
  static void llama_kv_cache_seq_shift(
1381
- struct llama_kv_cache & cache,
1382
- llama_seq_id seq_id,
1383
- llama_pos p0,
1384
- llama_pos p1,
1385
- llama_pos delta) {
1533
+ struct llama_kv_cache & cache,
1534
+ llama_seq_id seq_id,
1535
+ llama_pos p0,
1536
+ llama_pos p1,
1537
+ llama_pos delta) {
1538
+ uint32_t new_head = cache.size;
1539
+
1540
+ if (p0 < 0) p0 = 0;
1541
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1542
+
1386
1543
  for (uint32_t i = 0; i < cache.size; ++i) {
1387
1544
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1388
1545
  cache.cells[i].pos += delta;
1389
1546
  if (cache.cells[i].pos < 0) {
1390
1547
  cache.cells[i].pos = -1;
1391
1548
  cache.cells[i].seq_id.clear();
1549
+ if (new_head == cache.size) new_head = i;
1392
1550
  } else {
1393
1551
  cache.has_shift = true;
1394
1552
  cache.cells[i].delta = delta;
1395
1553
  }
1396
1554
  }
1397
1555
  }
1556
+
1557
+ // If we freed up a slot, set head to it so searching can start there.
1558
+ // Otherwise we just start the next search from the beginning.
1559
+ cache.head = new_head != cache.size ? new_head : 0;
1398
1560
  }
1399
1561
 
1400
1562
  //
@@ -1598,7 +1760,7 @@ struct llama_model_loader {
1598
1760
  }
1599
1761
  }
1600
1762
 
1601
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
1763
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
1602
1764
  if (backend != GGML_BACKEND_CPU) {
1603
1765
  ggml_set_no_alloc(ctx, true);
1604
1766
  }
@@ -1616,7 +1778,7 @@ struct llama_model_loader {
1616
1778
  return tensor;
1617
1779
  }
1618
1780
 
1619
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
1781
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
1620
1782
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1621
1783
 
1622
1784
  if (cur == NULL) {
@@ -1795,6 +1957,7 @@ static const char * llama_model_type_name(e_model type) {
1795
1957
  case MODEL_1B: return "1B";
1796
1958
  case MODEL_3B: return "3B";
1797
1959
  case MODEL_7B: return "7B";
1960
+ case MODEL_8B: return "8B";
1798
1961
  case MODEL_13B: return "13B";
1799
1962
  case MODEL_15B: return "15B";
1800
1963
  case MODEL_30B: return "30B";
@@ -1907,6 +2070,49 @@ static void llm_load_hparams(
1907
2070
  default: model.type = e_model::MODEL_UNKNOWN;
1908
2071
  }
1909
2072
  } break;
2073
+ case LLM_ARCH_PERSIMMON:
2074
+ {
2075
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2076
+ switch (hparams.n_layer) {
2077
+ case 36: model.type = e_model::MODEL_8B; break;
2078
+ default: model.type = e_model::MODEL_UNKNOWN;
2079
+ }
2080
+ } break;
2081
+ case LLM_ARCH_REFACT:
2082
+ {
2083
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
2084
+ switch (hparams.n_layer) {
2085
+ case 32: model.type = e_model::MODEL_1B; break;
2086
+ default: model.type = e_model::MODEL_UNKNOWN;
2087
+ }
2088
+ } break;
2089
+ case LLM_ARCH_BLOOM:
2090
+ {
2091
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2092
+
2093
+ switch (hparams.n_layer) {
2094
+ case 24: model.type = e_model::MODEL_1B; break;
2095
+ case 30:
2096
+ switch (hparams.n_embd) {
2097
+ case 2560: model.type = e_model::MODEL_3B; break;
2098
+ case 4096: model.type = e_model::MODEL_7B; break;
2099
+ } break;
2100
+ }
2101
+ } break;
2102
+ case LLM_ARCH_MPT:
2103
+ {
2104
+ hparams.f_clamp_kqv = 0.0f;
2105
+
2106
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2107
+ GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2108
+ GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2109
+
2110
+ switch (hparams.n_layer) {
2111
+ case 32: model.type = e_model::MODEL_7B; break;
2112
+ case 48: model.type = e_model::MODEL_30B; break;
2113
+ default: model.type = e_model::MODEL_UNKNOWN;
2114
+ }
2115
+ } break;
1910
2116
  default: (void)0;
1911
2117
  }
1912
2118
 
@@ -1971,6 +2177,7 @@ static void llm_load_vocab(
1971
2177
 
1972
2178
  for (int i = 0; i < n_merges; i++) {
1973
2179
  const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
2180
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
1974
2181
 
1975
2182
  std::string first;
1976
2183
  std::string second;
@@ -2005,6 +2212,7 @@ static void llm_load_vocab(
2005
2212
 
2006
2213
  for (uint32_t i = 0; i < n_vocab; i++) {
2007
2214
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
2215
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
2008
2216
 
2009
2217
  vocab.token_to_id[word] = i;
2010
2218
 
@@ -2013,12 +2221,13 @@
         token_data.score = scores ? scores[i] : 0.0f;
         token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
+    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
     }
 
     // special tokens
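
For BPE (GPT-2-style byte-level) vocabularies a raw "\n" is not looked up directly: every byte is first remapped to a printable codepoint, and the linefeed byte 0x0A ends up at U+010A, which is why the tokenizer is now fed "\u010A" instead of "\n". A sketch of that byte-to-codepoint remapping; the real table lives in the BPE tokenizer (and the new unicode.h), so this standalone helper is only illustrative:

    #include <cstdio>

    // GPT-2 style byte-level BPE keeps printable bytes as-is and shifts the rest
    // to 256, 257, ... in increasing byte order. Byte 0x0A ('\n') is the 11th
    // non-printable byte, so it becomes U+010A -- hence tokenizing "\u010A" above.
    static int byte_to_codepoint(unsigned char b) {
        auto printable = [](unsigned char c) {
            return (c >= 0x21 && c <= 0x7E) || (c >= 0xA1 && c <= 0xAC) || (c >= 0xAE);
        };
        if (printable(b)) return b;
        int shift = 0;
        for (int c = 0; c < b; ++c) {
            if (!printable((unsigned char) c)) ++shift;
        }
        return 256 + shift;
    }

    int main() {
        std::printf("U+%04X\n", byte_to_codepoint('\n'));  // prints U+010A
        return 0;
    }
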
@@ -2048,6 +2257,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2048
2257
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2049
2258
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2050
2259
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2260
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
2261
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
2051
2262
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2052
2263
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2053
2264
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2141,13 +2352,14 @@ static void llm_load_tensors(
2141
2352
  const auto tn = LLM_TN(model.arch);
2142
2353
  switch (model.arch) {
2143
2354
  case LLM_ARCH_LLAMA:
2355
+ case LLM_ARCH_REFACT:
2144
2356
  {
2145
2357
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2146
2358
 
2147
2359
  // output
2148
2360
  {
2149
- ggml_backend backend_norm;
2150
- ggml_backend backend_output;
2361
+ ggml_backend_type backend_norm;
2362
+ ggml_backend_type backend_output;
2151
2363
 
2152
2364
  if (n_gpu_layers > int(n_layer)) {
2153
2365
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2182,8 +2394,8 @@ static void llm_load_tensors(
2182
2394
  model.layers.resize(n_layer);
2183
2395
 
2184
2396
  for (uint32_t i = 0; i < n_layer; ++i) {
2185
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2186
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2397
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2398
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2187
2399
 
2188
2400
  auto & layer = model.layers[i];
2189
2401
 
@@ -2212,8 +2424,8 @@ static void llm_load_tensors(
2212
2424
  {
2213
2425
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2214
2426
  {
2215
- ggml_backend backend_norm;
2216
- ggml_backend backend_output;
2427
+ ggml_backend_type backend_norm;
2428
+ ggml_backend_type backend_output;
2217
2429
 
2218
2430
  if (n_gpu_layers > int(n_layer)) {
2219
2431
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2248,8 +2460,8 @@ static void llm_load_tensors(
2248
2460
  model.layers.resize(n_layer);
2249
2461
 
2250
2462
  for (uint32_t i = 0; i < n_layer; ++i) {
2251
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2252
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2463
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2464
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2253
2465
 
2254
2466
  auto & layer = model.layers[i];
2255
2467
 
@@ -2282,8 +2494,8 @@ static void llm_load_tensors(
2282
2494
 
2283
2495
  // output
2284
2496
  {
2285
- ggml_backend backend_norm;
2286
- ggml_backend backend_output;
2497
+ ggml_backend_type backend_norm;
2498
+ ggml_backend_type backend_output;
2287
2499
 
2288
2500
  if (n_gpu_layers > int(n_layer)) {
2289
2501
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2320,8 +2532,8 @@ static void llm_load_tensors(
2320
2532
  model.layers.resize(n_layer);
2321
2533
 
2322
2534
  for (uint32_t i = 0; i < n_layer; ++i) {
2323
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2324
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2535
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2536
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2325
2537
 
2326
2538
  auto & layer = model.layers[i];
2327
2539
 
@@ -2359,8 +2571,8 @@ static void llm_load_tensors(
2359
2571
 
2360
2572
  // output
2361
2573
  {
2362
- ggml_backend backend_norm;
2363
- ggml_backend backend_output;
2574
+ ggml_backend_type backend_norm;
2575
+ ggml_backend_type backend_output;
2364
2576
 
2365
2577
  if (n_gpu_layers > int(n_layer)) {
2366
2578
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2397,8 +2609,8 @@ static void llm_load_tensors(
2397
2609
  model.layers.resize(n_layer);
2398
2610
 
2399
2611
  for (uint32_t i = 0; i < n_layer; ++i) {
2400
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2401
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2612
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2613
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2402
2614
 
2403
2615
  auto & layer = model.layers[i];
2404
2616
 
@@ -2431,103 +2643,313 @@ static void llm_load_tensors(
2431
2643
  }
2432
2644
  }
2433
2645
  } break;
2434
- default:
2435
- throw std::runtime_error("unknown architecture");
2436
- }
2437
- }
2646
+ case LLM_ARCH_PERSIMMON:
2647
+ {
2648
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2438
2649
 
2439
- ml.done_getting_tensors();
2650
+ {
2651
+ ggml_backend_type backend_norm;
2652
+ ggml_backend_type backend_output;
2440
2653
 
2441
- // print memory requirements
2442
- {
2443
- // this is the total memory required to run the inference
2444
- size_t mem_required =
2445
- ctx_size +
2446
- mmapped_size - vram_weights; // weights in VRAM not in memory
2654
+ if (n_gpu_layers > int(n_layer)) {
2655
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2656
+ // on Windows however this is detrimental unless everything is on the GPU
2657
+ #ifndef _WIN32
2658
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2659
+ #else
2660
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2661
+ #endif // _WIN32
2447
2662
 
2448
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2663
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2664
+ } else {
2665
+ backend_norm = GGML_BACKEND_CPU;
2666
+ backend_output = GGML_BACKEND_CPU;
2667
+ }
2449
2668
 
2450
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2451
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2669
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2670
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2671
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2452
2672
 
2453
- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2454
- if (n_gpu_layers > (int) hparams.n_layer) {
2455
- LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2456
- }
2673
+ if (backend_norm == GGML_BACKEND_GPU) {
2674
+ vram_weights += ggml_nbytes(model.output_norm);
2675
+ vram_weights += ggml_nbytes(model.output_norm_b);
2676
+ }
2677
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2678
+ vram_weights += ggml_nbytes(model.output);
2679
+ }
2680
+ }
2457
2681
 
2458
- #ifdef GGML_USE_CUBLAS
2459
- const int max_backend_supported_layers = hparams.n_layer + 3;
2460
- const int max_offloadable_layers = hparams.n_layer + 3;
2461
- #elif defined(GGML_USE_CLBLAST)
2462
- const int max_backend_supported_layers = hparams.n_layer + 1;
2463
- const int max_offloadable_layers = hparams.n_layer + 1;
2464
- #endif // GGML_USE_CUBLAS
2682
+ const uint32_t n_ff = hparams.n_ff;
2683
+ const int i_gpu_start = n_layer - n_gpu_layers;
2684
+ model.layers.resize(n_layer);
2685
+ for (uint32_t i = 0; i < n_layer; ++i) {
2686
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2687
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2688
+ auto & layer = model.layers[i];
2689
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2690
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2691
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2692
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2693
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2694
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2695
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2696
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2697
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2698
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2699
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2700
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2701
+ layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
2702
+ layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
2703
+ layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
2704
+ layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2705
+ }
2706
+ } break;
2707
+ case LLM_ARCH_BLOOM:
2708
+ {
2709
+ // TODO: CPU-only for now
2465
2710
 
2466
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2467
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2711
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2712
+ model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
2713
+ model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
2714
+
2715
+ // output
2716
+ {
2717
+ ggml_backend_type backend_norm;
2718
+ ggml_backend_type backend_output;
2719
+
2720
+ if (n_gpu_layers > int(n_layer)) {
2721
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2722
+ // on Windows however this is detrimental unless everything is on the GPU
2723
+ #ifndef _WIN32
2724
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2468
2725
  #else
2469
- (void) n_gpu_layers;
2470
- #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2471
- }
2726
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2727
+ #endif // _WIN32
2472
2728
 
2473
- // populate `tensors_by_name`
2474
- for (int i = 0; i < ml.n_tensors; ++i) {
2475
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2476
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2477
- }
2729
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2730
+ } else {
2731
+ backend_norm = GGML_BACKEND_CPU;
2732
+ backend_output = GGML_BACKEND_CPU;
2733
+ }
2478
2734
 
2479
- (void) tensor_split;
2480
- #ifdef GGML_USE_CUBLAS
2481
- {
2482
- ggml_cuda_set_tensor_split(tensor_split);
2483
- }
2484
- #endif
2735
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2736
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2737
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2485
2738
 
2486
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2739
+ if (backend_norm == GGML_BACKEND_GPU) {
2740
+ vram_weights += ggml_nbytes(model.output_norm);
2741
+ vram_weights += ggml_nbytes(model.output_norm_b);
2742
+ }
2743
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2744
+ vram_weights += ggml_nbytes(model.output);
2745
+ }
2746
+ }
2487
2747
 
2488
- if (progress_callback) {
2489
- progress_callback(1.0f, progress_callback_user_data);
2490
- }
2748
+ const uint32_t n_ff = hparams.n_ff;
2491
2749
 
2492
- model.mapping = std::move(ml.mapping);
2750
+ const int i_gpu_start = n_layer - n_gpu_layers;
2493
2751
 
2494
- // loading time will be recalculate after the first eval, so
2495
- // we take page faults deferred by mmap() into consideration
2496
- model.t_load_us = ggml_time_us() - model.t_start_us;
2497
- }
2752
+ model.layers.resize(n_layer);
2498
2753
 
2499
- static bool llama_model_load(
2500
- const std::string & fname,
2501
- llama_model & model,
2502
- int n_gpu_layers,
2503
- int main_gpu,
2504
- const float * tensor_split,
2505
- bool use_mmap,
2506
- bool use_mlock,
2507
- bool vocab_only,
2508
- llama_progress_callback progress_callback,
2509
- void *progress_callback_user_data) {
2510
- try {
2511
- llama_model_loader ml(fname, use_mmap);
2754
+ for (uint32_t i = 0; i < n_layer; ++i) {
2755
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2756
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2512
2757
 
2513
- model.hparams.vocab_only = vocab_only;
2758
+ auto & layer = model.layers[i];
2514
2759
 
2515
- llm_load_arch (ml, model);
2516
- llm_load_hparams(ml, model);
2517
- llm_load_vocab (ml, model);
2760
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2761
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2518
2762
 
2519
- llm_load_print_meta(ml, model);
2763
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2764
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2520
2765
 
2521
- if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2522
- throw std::runtime_error("vocab size mismatch");
2523
- }
2766
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2767
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2524
2768
 
2525
- if (vocab_only) {
2526
- LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2527
- return true;
2528
- }
2769
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2770
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2529
2771
 
2530
- llm_load_tensors(
2772
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2773
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2774
+
2775
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2776
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2777
+
2778
+ if (backend == GGML_BACKEND_GPU) {
2779
+ vram_weights +=
2780
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2781
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2782
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2783
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2784
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
2785
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
2786
+ }
2787
+ }
2788
+ } break;
2789
+ case LLM_ARCH_MPT:
2790
+ {
2791
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2792
+
2793
+ // output
2794
+ {
2795
+ ggml_backend_type backend_norm;
2796
+ ggml_backend_type backend_output;
2797
+
2798
+ if (n_gpu_layers > int(n_layer)) {
2799
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2800
+ // on Windows however this is detrimental unless everything is on the GPU
2801
+ #ifndef _WIN32
2802
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2803
+ #else
2804
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2805
+ #endif // _WIN32
2806
+
2807
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2808
+ } else {
2809
+ backend_norm = GGML_BACKEND_CPU;
2810
+ backend_output = GGML_BACKEND_CPU;
2811
+ }
2812
+
2813
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2814
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2815
+
2816
+ if (backend_norm == GGML_BACKEND_GPU) {
2817
+ vram_weights += ggml_nbytes(model.output_norm);
2818
+ }
2819
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2820
+ vram_weights += ggml_nbytes(model.output);
2821
+ }
2822
+ }
2823
+
2824
+ const uint32_t n_ff = hparams.n_ff;
2825
+
2826
+ const int i_gpu_start = n_layer - n_gpu_layers;
2827
+
2828
+ model.layers.resize(n_layer);
2829
+
2830
+ for (uint32_t i = 0; i < n_layer; ++i) {
2831
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2832
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2833
+
2834
+ auto & layer = model.layers[i];
2835
+
2836
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2837
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
2838
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2839
+
2840
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2841
+
2842
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2843
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2844
+
2845
+ if (backend == GGML_BACKEND_GPU) {
2846
+ vram_weights +=
2847
+ ggml_nbytes(layer.attn_norm) +
2848
+ ggml_nbytes(layer.wqkv) +
2849
+ ggml_nbytes(layer.wo) +
2850
+ ggml_nbytes(layer.ffn_norm) +
2851
+ ggml_nbytes(layer.w2) +
2852
+ ggml_nbytes(layer.w3);
2853
+ }
2854
+ }
2855
+ } break;
2856
+ default:
2857
+ throw std::runtime_error("unknown architecture");
2858
+ }
2859
+ }
2860
+
2861
+ ml.done_getting_tensors();
2862
+
2863
+ // print memory requirements
2864
+ {
2865
+ // this is the total memory required to run the inference
2866
+ size_t mem_required =
2867
+ ctx_size +
2868
+ mmapped_size - vram_weights; // weights in VRAM not in memory
2869
+
2870
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2871
+
2872
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2873
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2874
+
2875
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2876
+ if (n_gpu_layers > (int) hparams.n_layer) {
2877
+ LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2878
+ }
2879
+
2880
+ #ifdef GGML_USE_CUBLAS
2881
+ const int max_backend_supported_layers = hparams.n_layer + 3;
2882
+ const int max_offloadable_layers = hparams.n_layer + 3;
2883
+ #elif defined(GGML_USE_CLBLAST)
2884
+ const int max_backend_supported_layers = hparams.n_layer + 1;
2885
+ const int max_offloadable_layers = hparams.n_layer + 1;
2886
+ #endif // GGML_USE_CUBLAS
2887
+
2888
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2889
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2890
+ #else
2891
+ (void) n_gpu_layers;
2892
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2893
+ }
2894
+
2895
+ // populate `tensors_by_name`
2896
+ for (int i = 0; i < ml.n_tensors; ++i) {
2897
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2898
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2899
+ }
2900
+
2901
+ (void) tensor_split;
2902
+ #ifdef GGML_USE_CUBLAS
2903
+ {
2904
+ ggml_cuda_set_tensor_split(tensor_split);
2905
+ }
2906
+ #endif
2907
+
2908
+ ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2909
+
2910
+ if (progress_callback) {
2911
+ progress_callback(1.0f, progress_callback_user_data);
2912
+ }
2913
+
2914
+ model.mapping = std::move(ml.mapping);
2915
+
2916
+ // loading time will be recalculate after the first eval, so
2917
+ // we take page faults deferred by mmap() into consideration
2918
+ model.t_load_us = ggml_time_us() - model.t_start_us;
2919
+ }
2920
+
2921
+ static bool llama_model_load(
2922
+ const std::string & fname,
2923
+ llama_model & model,
2924
+ int n_gpu_layers,
2925
+ int main_gpu,
2926
+ const float * tensor_split,
2927
+ bool use_mmap,
2928
+ bool use_mlock,
2929
+ bool vocab_only,
2930
+ llama_progress_callback progress_callback,
2931
+ void *progress_callback_user_data) {
2932
+ try {
2933
+ llama_model_loader ml(fname, use_mmap);
2934
+
2935
+ model.hparams.vocab_only = vocab_only;
2936
+
2937
+ llm_load_arch (ml, model);
2938
+ llm_load_hparams(ml, model);
2939
+ llm_load_vocab (ml, model);
2940
+
2941
+ llm_load_print_meta(ml, model);
2942
+
2943
+ if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2944
+ throw std::runtime_error("vocab size mismatch");
2945
+ }
2946
+
2947
+ if (vocab_only) {
2948
+ LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2949
+ return true;
2950
+ }
2951
+
2952
+ llm_load_tensors(
2531
2953
  ml, model, n_gpu_layers,
2532
2954
  main_gpu, tensor_split,
2533
2955
  use_mlock, progress_callback, progress_callback_user_data);
@@ -2540,8 +2962,8 @@ static bool llama_model_load(
2540
2962
  }
2541
2963
 
2542
2964
  static struct ggml_cgraph * llm_build_llama(
2543
- llama_context & lctx,
2544
- const llama_batch & batch) {
2965
+ llama_context & lctx,
2966
+ const llama_batch & batch) {
2545
2967
  const auto & model = lctx.model;
2546
2968
  const auto & hparams = model.hparams;
2547
2969
  const auto & cparams = lctx.cparams;
@@ -2579,11 +3001,9 @@ static struct ggml_cgraph * llm_build_llama(
2579
3001
  struct ggml_init_params params = {
2580
3002
  /*.mem_size =*/ buf_compute.size,
2581
3003
  /*.mem_buffer =*/ buf_compute.data,
2582
- /*.no_alloc =*/ false,
3004
+ /*.no_alloc =*/ true,
2583
3005
  };
2584
3006
 
2585
- params.no_alloc = true;
2586
-
2587
3007
  struct ggml_context * ctx0 = ggml_init(params);
2588
3008
 
2589
3009
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -2967,11 +3387,9 @@ static struct ggml_cgraph * llm_build_baichaun(
2967
3387
  struct ggml_init_params params = {
2968
3388
  /*.mem_size =*/ buf_compute.size,
2969
3389
  /*.mem_buffer =*/ buf_compute.data,
2970
- /*.no_alloc =*/ false,
3390
+ /*.no_alloc =*/ true,
2971
3391
  };
2972
3392
 
2973
- params.no_alloc = true;
2974
-
2975
3393
  struct ggml_context * ctx0 = ggml_init(params);
2976
3394
 
2977
3395
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3334,7 +3752,7 @@ static struct ggml_cgraph * llm_build_baichaun(
3334
3752
  return gf;
3335
3753
  }
3336
3754
 
3337
- static struct ggml_cgraph * llm_build_falcon(
3755
+ static struct ggml_cgraph * llm_build_refact(
3338
3756
  llama_context & lctx,
3339
3757
  const llama_batch & batch) {
3340
3758
  const auto & model = lctx.model;
@@ -3353,11 +3771,7 @@ static struct ggml_cgraph * llm_build_falcon(
3353
3771
  const int64_t n_embd_head = hparams.n_embd_head();
3354
3772
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
3355
3773
 
3356
- GGML_ASSERT(n_embd_head == hparams.n_rot);
3357
-
3358
- const float freq_base = cparams.rope_freq_base;
3359
- const float freq_scale = cparams.rope_freq_scale;
3360
- const float norm_eps = hparams.f_norm_eps;
3774
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
3361
3775
 
3362
3776
  const int n_gpu_layers = model.n_gpu_layers;
3363
3777
 
@@ -3365,21 +3779,16 @@ static struct ggml_cgraph * llm_build_falcon(
3365
3779
  const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3366
3780
  const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3367
3781
 
3368
- const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
3369
-
3370
- //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
3371
- // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
3782
+ // printf("n_kv = %d\n", n_kv);
3372
3783
 
3373
3784
  auto & buf_compute = lctx.buf_compute;
3374
3785
 
3375
3786
  struct ggml_init_params params = {
3376
3787
  /*.mem_size =*/ buf_compute.size,
3377
3788
  /*.mem_buffer =*/ buf_compute.data,
3378
- /*.no_alloc =*/ false,
3789
+ /*.no_alloc =*/ true,
3379
3790
  };
3380
3791
 
3381
- params.no_alloc = true;
3382
-
3383
3792
  struct ggml_context * ctx0 = ggml_init(params);
3384
3793
 
3385
3794
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3436,7 +3845,7 @@ static struct ggml_cgraph * llm_build_falcon(
3436
3845
  ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3437
3846
  ggml_allocr_alloc(lctx.alloc, KQ_scale);
3438
3847
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3439
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
3848
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
3440
3849
  }
3441
3850
 
3442
3851
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
@@ -3462,47 +3871,8 @@ static struct ggml_cgraph * llm_build_falcon(
3462
3871
  }
3463
3872
  }
3464
3873
 
3465
- // KQ_pos - contains the positions
3466
- struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3467
- offload_func_kq(KQ_pos);
3468
- ggml_set_name(KQ_pos, "KQ_pos");
3469
- ggml_allocr_alloc(lctx.alloc, KQ_pos);
3470
- if (!ggml_allocr_is_measure(lctx.alloc)) {
3471
- int * data = (int *) KQ_pos->data;
3472
- for (int i = 0; i < n_tokens; ++i) {
3473
- data[i] = batch.pos[i];
3474
- }
3475
- }
3476
-
3477
- // shift the entire K-cache if needed
3478
- if (do_rope_shift) {
3479
- struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
3480
- offload_func_kq(K_shift);
3481
- ggml_set_name(K_shift, "K_shift");
3482
- ggml_allocr_alloc(lctx.alloc, K_shift);
3483
- if (!ggml_allocr_is_measure(lctx.alloc)) {
3484
- int * data = (int *) K_shift->data;
3485
- for (int i = 0; i < n_ctx; ++i) {
3486
- data[i] = kv_self.cells[i].delta;
3487
- }
3488
- }
3489
-
3490
- for (int il = 0; il < n_layer; ++il) {
3491
- struct ggml_tensor * tmp =
3492
- ggml_rope_custom_inplace(ctx0,
3493
- ggml_view_3d(ctx0, kv_self.k,
3494
- n_embd_head, n_head_kv, n_ctx,
3495
- ggml_element_size(kv_self.k)*n_embd_head,
3496
- ggml_element_size(kv_self.k)*n_embd_gqa,
3497
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
3498
- K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
3499
- offload_func_kq(tmp);
3500
- ggml_build_forward_expand(gf, tmp);
3501
- }
3502
- }
3503
-
3504
3874
  for (int il = 0; il < n_layer; ++il) {
3505
- struct ggml_tensor * attn_norm;
3875
+ ggml_format_name(inpL, "layer_inp_%d", il);
3506
3876
 
3507
3877
  offload_func_t offload_func = llama_nop;
3508
3878
 
@@ -3512,80 +3882,49 @@ static struct ggml_cgraph * llm_build_falcon(
3512
3882
  }
3513
3883
  #endif // GGML_USE_CUBLAS
3514
3884
 
3515
- // self-attention
3516
- // TODO: refactor into common function (shared with LLaMA)
3517
- {
3518
- attn_norm = ggml_norm(ctx0, inpL, norm_eps);
3519
- offload_func(attn_norm);
3885
+ struct ggml_tensor * inpSA = inpL;
3520
3886
 
3521
- attn_norm = ggml_add(ctx0,
3522
- ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
3523
- model.layers[il].attn_norm_b);
3524
- offload_func(attn_norm->src[0]);
3525
- offload_func(attn_norm);
3887
+ // norm
3888
+ {
3889
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
3890
+ offload_func(cur);
3891
+ ggml_set_name(cur, "rms_norm_0");
3526
3892
 
3527
- if (model.layers[il].attn_norm_2) { // Falcon-40B
3528
- cur = ggml_norm(ctx0, inpL, norm_eps);
3529
- offload_func(cur);
3893
+ // cur = cur*attn_norm(broadcasted)
3894
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
3895
+ offload_func(cur);
3896
+ ggml_set_name(cur, "attention_norm_0");
3897
+ }
3530
3898
 
3531
- cur = ggml_add(ctx0,
3532
- ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
3533
- model.layers[il].attn_norm_2_b);
3534
- offload_func(cur->src[0]);
3535
- offload_func(cur);
3536
- } else { // Falcon 7B
3537
- cur = attn_norm;
3538
- }
3539
-
3540
- // compute QKV
3541
-
3542
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
3543
- offload_func_kq(cur);
3544
-
3545
- // Note that the strides for Kcur, Vcur are set up so that the
3546
- // resulting views are misaligned with the tensor's storage
3547
- // (by applying the K/V offset we shift the tensor's original
3548
- // view to stick out behind the viewed QKV tensor's allocated
3549
- // memory, so to say). This is ok because no actual accesses
3550
- // happen to that out-of-range memory, but it can require some
3551
- // trickery when trying to accurately dump these views for
3552
- // debugging.
3553
-
3554
- const size_t wsize = ggml_type_size(cur->type);
3899
+ // self-attention
3900
+ {
3901
+ // compute Q and K
3902
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3903
+ offload_func_kq(tmpk);
3904
+ ggml_set_name(tmpk, "tmpk");
3555
3905
 
3556
- // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
3557
- // non-contiguous views is added for the rope operator
3558
- struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
3559
- ctx0, cur, n_embd_head, n_head, n_tokens,
3560
- wsize * n_embd_head,
3561
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3562
- 0));
3906
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3563
3907
  offload_func_kq(tmpq);
3908
+ ggml_set_name(tmpq, "tmpq");
3564
3909
 
3565
- struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
3566
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3567
- wsize * n_embd_head,
3568
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3569
- wsize * n_embd_head * n_head));
3570
- offload_func_kq(tmpk);
3571
-
3572
- struct ggml_tensor * tmpv = ggml_view_3d(
3573
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
3574
- wsize * n_embd_head,
3575
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
3576
- wsize * n_embd_head * (n_head + n_head_kv));
3577
- offload_func_v(tmpv);
3910
+ struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
3911
+ offload_func_kq(Kcur);
3912
+ ggml_set_name(Kcur, "Kcur");
3578
3913
 
3579
- // using mode = 2 for neox mode
3580
- struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3914
+ struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
3581
3915
  offload_func_kq(Qcur);
3582
- struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
3583
- offload_func_kq(Kcur);
3916
+ ggml_set_name(Qcur, "Qcur");
3584
3917
 
3918
+ // store key and value to memory
3585
3919
  {
3586
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
3920
+ // compute the transposed [n_tokens, n_embd] V matrix
3921
+
3922
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3923
+ offload_func_v(tmpv);
3924
+ ggml_set_name(tmpv, "tmpv");
3925
+
3926
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
3587
3927
  offload_func_v(Vcur);
3588
- offload_func_v(Vcur->src[0]->src[0]);
3589
3928
  ggml_set_name(Vcur, "Vcur");
3590
3929
 
3591
3930
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
@@ -3596,6 +3935,7 @@ static struct ggml_cgraph * llm_build_falcon(
3596
3935
  ( n_ctx)*ggml_element_size(kv_self.v),
3597
3936
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3598
3937
  offload_func_v(v);
3938
+ ggml_set_name(v, "v");
3599
3939
 
3600
3940
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3601
3941
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3614,22 +3954,31 @@ static struct ggml_cgraph * llm_build_falcon(
3614
3954
  offload_func_kq(K);
3615
3955
  ggml_set_name(K, "K");
3616
3956
 
3957
+ // K * Q
3617
3958
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3618
3959
  offload_func_kq(KQ);
3619
3960
  ggml_set_name(KQ, "KQ");
3620
3961
 
3962
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3963
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
3621
3964
  struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3622
3965
  offload_func_kq(KQ_scaled);
3623
3966
  ggml_set_name(KQ_scaled, "KQ_scaled");
3624
3967
 
3625
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
3968
+ // KQ_masked = mask_past(KQ_scaled)
3969
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
3970
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
3971
+
3972
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
3626
3973
  offload_func_kq(KQ_masked);
3627
3974
  ggml_set_name(KQ_masked, "KQ_masked");
3628
3975
 
3976
+ // KQ = soft_max(KQ_masked)
3629
3977
  struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3630
3978
  offload_func_v(KQ_soft_max);
3631
3979
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3632
3980
 
3981
+ // split cached V into n_head heads
3633
3982
  struct ggml_tensor * V =
3634
3983
  ggml_view_3d(ctx0, kv_self.v,
3635
3984
  n_kv, n_embd_head, n_head_kv,
@@ -3639,42 +3988,85 @@ static struct ggml_cgraph * llm_build_falcon(
3639
3988
  offload_func_v(V);
3640
3989
  ggml_set_name(V, "V");
3641
3990
 
3991
+ #if 1
3642
3992
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3643
3993
  offload_func_v(KQV);
3644
3994
  ggml_set_name(KQV, "KQV");
3995
+ #else
3996
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3997
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3998
+ // is there a better way?
3999
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
4000
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
4001
+ #endif
3645
4002
 
4003
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3646
4004
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3647
4005
  offload_func_v(KQV_merged);
3648
4006
  ggml_set_name(KQV_merged, "KQV_merged");
3649
4007
 
4008
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3650
4009
  cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3651
4010
  offload_func_v(cur);
3652
4011
  ggml_set_name(cur, "KQV_merged_contiguous");
3653
4012
 
3654
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
4013
+ // projection (no bias)
4014
+ cur = ggml_mul_mat(ctx0,
4015
+ model.layers[il].wo,
4016
+ cur);
3655
4017
  offload_func(cur);
3656
4018
  ggml_set_name(cur, "result_wo");
3657
4019
  }
3658
4020
 
3659
- struct ggml_tensor * attn_out = cur;
4021
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
4022
+ offload_func(inpFF);
4023
+ ggml_set_name(inpFF, "inpFF");
3660
4024
 
3661
- // feed forward
4025
+ // feed-forward network
3662
4026
  {
3663
- struct ggml_tensor * inpFF = attn_norm;
4027
+ // norm
4028
+ {
4029
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
4030
+ offload_func(cur);
4031
+ ggml_set_name(cur, "rms_norm_1");
3664
4032
 
3665
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
4033
+ // cur = cur*ffn_norm(broadcasted)
4034
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
4035
+ offload_func(cur);
4036
+ ggml_set_name(cur, "ffn_norm");
4037
+ }
4038
+
4039
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
4040
+ model.layers[il].w3,
4041
+ cur);
4042
+ offload_func(tmp);
4043
+ ggml_set_name(tmp, "result_w3");
4044
+
4045
+ cur = ggml_mul_mat(ctx0,
4046
+ model.layers[il].w1,
4047
+ cur);
3666
4048
  offload_func(cur);
4049
+ ggml_set_name(cur, "result_w1");
3667
4050
 
3668
- cur = ggml_gelu(ctx0, cur);
4051
+ // SILU activation
4052
+ cur = ggml_silu(ctx0, cur);
3669
4053
  offload_func(cur);
3670
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
4054
+ ggml_set_name(cur, "silu");
4055
+
4056
+ cur = ggml_mul(ctx0, cur, tmp);
3671
4057
  offload_func(cur);
4058
+ ggml_set_name(cur, "silu_x_result_w3");
4059
+
4060
+ cur = ggml_mul_mat(ctx0,
4061
+ model.layers[il].w2,
4062
+ cur);
4063
+ offload_func(cur);
4064
+ ggml_set_name(cur, "result_w2");
3672
4065
  }
3673
4066
 
3674
- cur = ggml_add(ctx0, cur, attn_out);
3675
- offload_func(cur);
3676
- cur = ggml_add(ctx0, cur, inpL);
4067
+ cur = ggml_add(ctx0, cur, inpFF);
3677
4068
  offload_func(cur);
4069
+ ggml_set_name(cur, "inpFF_+_result_w2");
3678
4070
 
3679
4071
  // input for next layer
3680
4072
  inpL = cur;
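
The feed-forward block above is the gated-SiLU (SwiGLU) formulation: two up-projections (w1, w3), a SiLU on the w1 branch, an element-wise gate, then the w2 down-projection. As a sketch of the math behind the tensor names result_w1, result_w3, silu, silu_x_result_w3 and result_w2:

$$ \mathrm{FFN}(x) \;=\; W_2\,\big(\mathrm{SiLU}(W_1 x)\,\odot\,W_3 x\big), \qquad \mathrm{SiLU}(z) = \frac{z}{1+e^{-z}} $$
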
@@ -3684,15 +4076,17 @@ static struct ggml_cgraph * llm_build_falcon(
3684
4076
 
3685
4077
  // norm
3686
4078
  {
3687
- cur = ggml_norm(ctx0, cur, norm_eps);
4079
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
3688
4080
  offload_func_nr(cur);
4081
+ ggml_set_name(cur, "rms_norm_2");
3689
4082
 
3690
- cur = ggml_add(ctx0,
3691
- ggml_mul(ctx0, cur, model.output_norm),
3692
- model.output_norm_b);
4083
+ // cur = cur*norm(broadcasted)
4084
+ cur = ggml_mul(ctx0, cur, model.output_norm);
4085
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
3693
4086
  ggml_set_name(cur, "result_norm");
3694
4087
  }
3695
4088
 
4089
+ // lm_head
3696
4090
  cur = ggml_mul_mat(ctx0, model.output, cur);
3697
4091
  ggml_set_name(cur, "result_output");
3698
4092
 
@@ -3703,7 +4097,7 @@ static struct ggml_cgraph * llm_build_falcon(
3703
4097
  return gf;
3704
4098
  }
3705
4099
 
3706
- static struct ggml_cgraph * llm_build_starcoder(
4100
+ static struct ggml_cgraph * llm_build_falcon(
3707
4101
  llama_context & lctx,
3708
4102
  const llama_batch & batch) {
3709
4103
  const auto & model = lctx.model;
@@ -3724,29 +4118,34 @@ static struct ggml_cgraph * llm_build_starcoder(
3724
4118
 
3725
4119
  GGML_ASSERT(n_embd_head == hparams.n_rot);
3726
4120
 
3727
- const float norm_eps = hparams.f_norm_eps;
4121
+ const float freq_base = cparams.rope_freq_base;
4122
+ const float freq_scale = cparams.rope_freq_scale;
4123
+ const float norm_eps = hparams.f_norm_eps;
4124
+
4125
+ const int n_gpu_layers = model.n_gpu_layers;
3728
4126
 
3729
4127
  const int32_t n_tokens = batch.n_tokens;
3730
4128
  const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3731
4129
  const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3732
4130
 
4131
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
4132
+
4133
+ //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
4134
+ // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
4135
+
3733
4136
  auto & buf_compute = lctx.buf_compute;
3734
4137
 
3735
4138
  struct ggml_init_params params = {
3736
4139
  /*.mem_size =*/ buf_compute.size,
3737
4140
  /*.mem_buffer =*/ buf_compute.data,
3738
- /*.no_alloc =*/ false,
4141
+ /*.no_alloc =*/ true,
3739
4142
  };
3740
4143
 
3741
- params.no_alloc = true;
3742
-
3743
4144
  struct ggml_context * ctx0 = ggml_init(params);
3744
4145
 
3745
4146
  ggml_cgraph * gf = ggml_new_graph(ctx0);
3746
4147
 
3747
4148
  struct ggml_tensor * cur;
3748
- struct ggml_tensor * token;
3749
- struct ggml_tensor * position;
3750
4149
  struct ggml_tensor * inpL;
3751
4150
 
3752
4151
  if (batch.token) {
@@ -3758,30 +4157,390 @@ static struct ggml_cgraph * llm_build_starcoder(
3758
4157
  }
3759
4158
  ggml_set_name(inp_tokens, "inp_tokens");
3760
4159
 
3761
- token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4160
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3762
4161
  } else {
3763
4162
  #ifdef GGML_USE_MPI
3764
4163
  GGML_ASSERT(false && "not implemented");
3765
4164
  #endif
3766
4165
 
3767
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4166
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3768
4167
 
3769
- ggml_allocr_alloc(lctx.alloc, token);
4168
+ ggml_allocr_alloc(lctx.alloc, inpL);
3770
4169
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3771
- memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
4170
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3772
4171
  }
3773
4172
  }
3774
4173
 
3775
- {
3776
- // Compute position embeddings.
3777
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3778
- ggml_allocr_alloc(lctx.alloc, inp_positions);
4174
+ const int i_gpu_start = n_layer - n_gpu_layers;
4175
+ (void) i_gpu_start;
4176
+
4177
+ // offload functions set the tensor output backend to GPU
4178
+ // tensors are GPU-accelerated if any input or the output has been offloaded
4179
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
4180
+ offload_func_t offload_func_kq = llama_nop;
4181
+ offload_func_t offload_func_v = llama_nop;
4182
+
4183
+ #ifdef GGML_USE_CUBLAS
4184
+ if (n_gpu_layers > n_layer) {
4185
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
4186
+ }
4187
+ if (n_gpu_layers > n_layer + 1) {
4188
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
4189
+ }
4190
+ if (n_gpu_layers > n_layer + 2) {
4191
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
4192
+ }
4193
+ #endif // GGML_USE_CUBLAS
4194
+
4195
+ // KQ_scale
4196
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4197
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4198
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4199
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4200
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
4201
+ }
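
KQ_scale above holds the usual scaled-dot-product factor, 1/sqrt(d_k) with d_k = n_embd/n_head = n_embd_head. A sketch of the attention each of these graphs assembles, where M is the KQ_mask filled in just below (0 for visible cells, -inf otherwise):

$$ \mathrm{Attention}(Q,K,V) \;=\; \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k}} + M\right) V, \qquad d_k = \frac{n_{\mathrm{embd}}}{n_{\mathrm{head}}} $$
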
4202
+
4203
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4204
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4205
+ offload_func_kq(KQ_mask);
4206
+ ggml_set_name(KQ_mask, "KQ_mask");
4207
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4208
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4209
+ float * data = (float *) KQ_mask->data;
4210
+ memset(data, 0, ggml_nbytes(KQ_mask));
4211
+
4212
+ for (int h = 0; h < 1; ++h) {
4213
+ for (int j = 0; j < n_tokens; ++j) {
4214
+ const llama_pos pos = batch.pos[j];
4215
+ const llama_seq_id seq_id = batch.seq_id[j];
4216
+
4217
+ for (int i = 0; i < n_kv; ++i) {
4218
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4219
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4220
+ }
4221
+ }
4222
+ }
4223
+ }
4224
+ }
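
The loop above writes -INFINITY into every (token, cache cell) pair the token must not attend to, so the subsequent ggml_add + soft_max zeroes those attention weights. A minimal standalone C++ sketch of the same indexing with hypothetical toy values (3 cached cells, 2 new tokens; the per-sequence has_seq_id check is dropped for brevity):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_kv = 3, n_tokens = 2;
        std::vector<int> cache_pos = {0, 1, 2};   // positions of the cached cells (hypothetical)
        std::vector<int> batch_pos = {1, 2};      // positions of the new tokens  (hypothetical)

        std::vector<float> mask(n_kv*n_tokens, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                // a token may only attend to cells at the same or an earlier position
                if (cache_pos[i] > batch_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                printf("%6.1f ", mask[j*n_kv + i]);
            }
            printf("\n");
        }
        return 0;
    }
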
4225
+
4226
+ // KQ_pos - contains the positions
4227
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4228
+ offload_func_kq(KQ_pos);
4229
+ ggml_set_name(KQ_pos, "KQ_pos");
4230
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
4231
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4232
+ int * data = (int *) KQ_pos->data;
4233
+ for (int i = 0; i < n_tokens; ++i) {
4234
+ data[i] = batch.pos[i];
4235
+ }
4236
+ }
4237
+
4238
+ // shift the entire K-cache if needed
4239
+ if (do_rope_shift) {
4240
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4241
+ offload_func_kq(K_shift);
4242
+ ggml_set_name(K_shift, "K_shift");
4243
+ ggml_allocr_alloc(lctx.alloc, K_shift);
3779
4244
  if (!ggml_allocr_is_measure(lctx.alloc)) {
3780
- for (int i = 0; i < n_tokens; ++i) {
3781
- ((int32_t *) inp_positions->data)[i] = batch.pos[i];
4245
+ int * data = (int *) K_shift->data;
4246
+ for (int i = 0; i < n_ctx; ++i) {
4247
+ data[i] = kv_self.cells[i].delta;
3782
4248
  }
3783
4249
  }
3784
- ggml_set_name(inp_positions, "inp_positions");
4250
+
4251
+ for (int il = 0; il < n_layer; ++il) {
4252
+ struct ggml_tensor * tmp =
4253
+ ggml_rope_custom_inplace(ctx0,
4254
+ ggml_view_3d(ctx0, kv_self.k,
4255
+ n_embd_head, n_head_kv, n_ctx,
4256
+ ggml_element_size(kv_self.k)*n_embd_head,
4257
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4258
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
4259
+ K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
4260
+ offload_func_kq(tmp);
4261
+ ggml_build_forward_expand(gf, tmp);
4262
+ }
4263
+ }
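
The do_rope_shift block above re-rotates the cached keys in place by each cell's accumulated position delta. This works because RoPE rotates each pair of dimensions by an angle proportional to the position, and 2-D rotations compose additively; roughly (ignoring freq_scale, which only rescales the position):

$$ \theta_k = \texttt{freq\_base}^{-2k/n_{\mathrm{rot}}}, \qquad R\big(\theta_k (p+\Delta)\big) = R(\theta_k \Delta)\,R(\theta_k p) $$

so applying ggml_rope_custom_inplace with the per-cell K_shift deltas to the already-rotated keys is equivalent to having stored them at their shifted positions.
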
4264
+
4265
+ for (int il = 0; il < n_layer; ++il) {
4266
+ struct ggml_tensor * attn_norm;
4267
+
4268
+ offload_func_t offload_func = llama_nop;
4269
+
4270
+ #ifdef GGML_USE_CUBLAS
4271
+ if (il >= i_gpu_start) {
4272
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
4273
+ }
4274
+ #endif // GGML_USE_CUBLAS
4275
+
4276
+ // self-attention
4277
+ // TODO: refactor into common function (shared with LLaMA)
4278
+ {
4279
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
4280
+ offload_func(attn_norm);
4281
+
4282
+ attn_norm = ggml_add(ctx0,
4283
+ ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
4284
+ model.layers[il].attn_norm_b);
4285
+ offload_func(attn_norm->src[0]);
4286
+ offload_func(attn_norm);
4287
+
4288
+ if (model.layers[il].attn_norm_2) { // Falcon-40B
4289
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4290
+ offload_func(cur);
4291
+
4292
+ cur = ggml_add(ctx0,
4293
+ ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
4294
+ model.layers[il].attn_norm_2_b);
4295
+ offload_func(cur->src[0]);
4296
+ offload_func(cur);
4297
+ } else { // Falcon 7B
4298
+ cur = attn_norm;
4299
+ }
4300
+
4301
+ // compute QKV
4302
+
4303
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
4304
+ offload_func_kq(cur);
4305
+
4306
+ // Note that the strides for Kcur, Vcur are set up so that the
4307
+ // resulting views are misaligned with the tensor's storage
4308
+ // (by applying the K/V offset we shift the tensor's original
4309
+ // view to stick out behind the viewed QKV tensor's allocated
4310
+ // memory, so to say). This is ok because no actual accesses
4311
+ // happen to that out-of-range memory, but it can require some
4312
+ // trickery when trying to accurately dump these views for
4313
+ // debugging.
4314
+
4315
+ const size_t wsize = ggml_type_size(cur->type);
4316
+
4317
+ // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
4318
+ // non-contiguous views is added for the rope operator
4319
+ struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
4320
+ ctx0, cur, n_embd_head, n_head, n_tokens,
4321
+ wsize * n_embd_head,
4322
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
4323
+ 0));
4324
+ offload_func_kq(tmpq);
4325
+
4326
+ struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
4327
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
4328
+ wsize * n_embd_head,
4329
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
4330
+ wsize * n_embd_head * n_head));
4331
+ offload_func_kq(tmpk);
4332
+
4333
+ struct ggml_tensor * tmpv = ggml_view_3d(
4334
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
4335
+ wsize * n_embd_head,
4336
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
4337
+ wsize * n_embd_head * (n_head + n_head_kv));
4338
+ offload_func_v(tmpv);
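
The three strided views above slice Falcon's fused wqkv output, whose per-token row is laid out as [n_head query heads | n_head_kv key heads | n_head_kv value heads], each head n_embd_head elements wide. A small sketch that computes the same byte offsets, using hypothetical Falcon-7B-like sizes; the row stride is exactly the second stride each ggml_view_3d above passes:

    #include <cstddef>
    #include <cstdio>

    int main() {
        // hypothetical Falcon-7B-like sizes
        const size_t n_head      = 71;
        const size_t n_head_kv   = 1;
        const size_t n_embd_head = 64;
        const size_t wsize       = sizeof(float);  // element size of the fused QKV tensor

        // per-token row stride and the byte offsets used by the three views above
        const size_t row_stride = wsize * n_embd_head * (n_head + 2*n_head_kv);
        const size_t q_offset   = 0;
        const size_t k_offset   = wsize * n_embd_head *  n_head;
        const size_t v_offset   = wsize * n_embd_head * (n_head + n_head_kv);

        printf("row stride = %zu bytes; Q at %zu, K at %zu, V at %zu\n",
               row_stride, q_offset, k_offset, v_offset);
        return 0;
    }
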
4339
+
4340
+ // using mode = 2 for neox mode
4341
+ struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
4342
+ offload_func_kq(Qcur);
4343
+ struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale);
4344
+ offload_func_kq(Kcur);
4345
+
4346
+ {
4347
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
4348
+ offload_func_v(Vcur);
4349
+ offload_func_v(Vcur->src[0]->src[0]);
4350
+ ggml_set_name(Vcur, "Vcur");
4351
+
4352
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
4353
+ offload_func_kq(k);
4354
+ ggml_set_name(k, "k");
4355
+
4356
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4357
+ ( n_ctx)*ggml_element_size(kv_self.v),
4358
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4359
+ offload_func_v(v);
4360
+
4361
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4362
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4363
+ }
4364
+
4365
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
4366
+ offload_func_kq(Q);
4367
+ ggml_set_name(Q, "Q");
4368
+
4369
+ struct ggml_tensor * K =
4370
+ ggml_view_3d(ctx0, kv_self.k,
4371
+ n_embd_head, n_kv, n_head_kv,
4372
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4373
+ ggml_element_size(kv_self.k)*n_embd_head,
4374
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
4375
+ offload_func_kq(K);
4376
+ ggml_set_name(K, "K");
4377
+
4378
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
4379
+ offload_func_kq(KQ);
4380
+ ggml_set_name(KQ, "KQ");
4381
+
4382
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
4383
+ offload_func_kq(KQ_scaled);
4384
+ ggml_set_name(KQ_scaled, "KQ_scaled");
4385
+
4386
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
4387
+ offload_func_kq(KQ_masked);
4388
+ ggml_set_name(KQ_masked, "KQ_masked");
4389
+
4390
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
4391
+ offload_func_v(KQ_soft_max);
4392
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
4393
+
4394
+ struct ggml_tensor * V =
4395
+ ggml_view_3d(ctx0, kv_self.v,
4396
+ n_kv, n_embd_head, n_head_kv,
4397
+ ggml_element_size(kv_self.v)*n_ctx,
4398
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
4399
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
4400
+ offload_func_v(V);
4401
+ ggml_set_name(V, "V");
4402
+
4403
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
4404
+ offload_func_v(KQV);
4405
+ ggml_set_name(KQV, "KQV");
4406
+
4407
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
4408
+ offload_func_v(KQV_merged);
4409
+ ggml_set_name(KQV_merged, "KQV_merged");
4410
+
4411
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
4412
+ offload_func_v(cur);
4413
+ ggml_set_name(cur, "KQV_merged_contiguous");
4414
+
4415
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
4416
+ offload_func(cur);
4417
+ ggml_set_name(cur, "result_wo");
4418
+ }
4419
+
4420
+ struct ggml_tensor * attn_out = cur;
4421
+
4422
+ // feed forward
4423
+ {
4424
+ struct ggml_tensor * inpFF = attn_norm;
4425
+
4426
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
4427
+ offload_func(cur);
4428
+
4429
+ cur = ggml_gelu(ctx0, cur);
4430
+ offload_func(cur);
4431
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
4432
+ offload_func(cur);
4433
+ }
4434
+
4435
+ cur = ggml_add(ctx0, cur, attn_out);
4436
+ offload_func(cur);
4437
+ cur = ggml_add(ctx0, cur, inpL);
4438
+ offload_func(cur);
4439
+
4440
+ // input for next layer
4441
+ inpL = cur;
4442
+ }
4443
+
4444
+ cur = inpL;
4445
+
4446
+ // norm
4447
+ {
4448
+ cur = ggml_norm(ctx0, cur, norm_eps);
4449
+ offload_func_nr(cur);
4450
+
4451
+ cur = ggml_add(ctx0,
4452
+ ggml_mul(ctx0, cur, model.output_norm),
4453
+ model.output_norm_b);
4454
+ ggml_set_name(cur, "result_norm");
4455
+ }
4456
+
4457
+ cur = ggml_mul_mat(ctx0, model.output, cur);
4458
+ ggml_set_name(cur, "result_output");
4459
+
4460
+ ggml_build_forward_expand(gf, cur);
4461
+
4462
+ ggml_free(ctx0);
4463
+
4464
+ return gf;
4465
+ }
4466
+
4467
+ static struct ggml_cgraph * llm_build_starcoder(
4468
+ llama_context & lctx,
4469
+ const llama_batch & batch) {
4470
+ const auto & model = lctx.model;
4471
+ const auto & hparams = model.hparams;
4472
+ const auto & cparams = lctx.cparams;
4473
+
4474
+ const auto & kv_self = lctx.kv_self;
4475
+
4476
+ GGML_ASSERT(!!kv_self.ctx);
4477
+
4478
+ const int64_t n_embd = hparams.n_embd;
4479
+ const int64_t n_layer = hparams.n_layer;
4480
+ const int64_t n_ctx = cparams.n_ctx;
4481
+ const int64_t n_head = hparams.n_head;
4482
+ const int64_t n_head_kv = hparams.n_head_kv;
4483
+ const int64_t n_embd_head = hparams.n_embd_head();
4484
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
4485
+
4486
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
4487
+
4488
+ const float norm_eps = hparams.f_norm_eps;
4489
+
4490
+ const int32_t n_tokens = batch.n_tokens;
4491
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4492
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
4493
+
4494
+ auto & buf_compute = lctx.buf_compute;
4495
+
4496
+ struct ggml_init_params params = {
4497
+ /*.mem_size =*/ buf_compute.size,
4498
+ /*.mem_buffer =*/ buf_compute.data,
4499
+ /*.no_alloc =*/ true,
4500
+ };
4501
+
4502
+ struct ggml_context * ctx0 = ggml_init(params);
4503
+
4504
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
4505
+
4506
+ struct ggml_tensor * cur;
4507
+ struct ggml_tensor * token;
4508
+ struct ggml_tensor * position;
4509
+ struct ggml_tensor * inpL;
4510
+
4511
+ if (batch.token) {
4512
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4513
+
4514
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
4515
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4516
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
4517
+ }
4518
+ ggml_set_name(inp_tokens, "inp_tokens");
4519
+
4520
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4521
+ } else {
4522
+ #ifdef GGML_USE_MPI
4523
+ GGML_ASSERT(false && "not implemented");
4524
+ #endif
4525
+
4526
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4527
+
4528
+ ggml_allocr_alloc(lctx.alloc, token);
4529
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4530
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
4531
+ }
4532
+ }
4533
+
4534
+ {
4535
+ // Compute position embeddings.
4536
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4537
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
4538
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4539
+ for (int i = 0; i < n_tokens; ++i) {
4540
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
4541
+ }
4542
+ }
4543
+ ggml_set_name(inp_positions, "inp_positions");
3785
4544
 
3786
4545
  position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
3787
4546
  }
@@ -3816,48 +4575,984 @@ static struct ggml_cgraph * llm_build_starcoder(
3816
4575
  }
3817
4576
  }
3818
4577
 
3819
- inpL = ggml_add(ctx0, token, position);
3820
- ggml_set_name(inpL, "inpL");
3821
-
4578
+ inpL = ggml_add(ctx0, token, position);
4579
+ ggml_set_name(inpL, "inpL");
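
StarCoder uses GPT-2-style learned absolute position embeddings rather than RoPE: the input to the first layer is the sum of a token-embedding row and a position-embedding row,

$$ h^{(0)}_i \;=\; E_{\mathrm{tok}}[\,t_i\,] + E_{\mathrm{pos}}[\,p_i\,] $$

where E_tok is model.tok_embeddings and E_pos is model.pos_embeddings.
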
4580
+
4581
+ for (int il = 0; il < n_layer; ++il) {
4582
+ {
4583
+ // Norm
4584
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4585
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
4586
+ }
4587
+
4588
+ {
4589
+ // Self Attention
4590
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
4591
+
4592
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
4593
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
4594
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
4595
+
4596
+ struct ggml_tensor * Qcur = tmpq;
4597
+ struct ggml_tensor * Kcur = tmpk;
4598
+
4599
+ {
4600
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
4601
+ ggml_set_name(Vcur, "Vcur");
4602
+
4603
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
4604
+ ggml_set_name(k, "k");
4605
+
4606
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4607
+ ( n_ctx)*ggml_element_size(kv_self.v),
4608
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4609
+
4610
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4611
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4612
+ }
4613
+
4614
+ struct ggml_tensor * Q =
4615
+ ggml_permute(ctx0,
4616
+ ggml_cpy(ctx0,
4617
+ Qcur,
4618
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
4619
+ 0, 2, 1, 3);
4620
+ ggml_set_name(Q, "Q");
4621
+
4622
+ struct ggml_tensor * K =
4623
+ ggml_view_3d(ctx0, kv_self.k,
4624
+ n_embd_head, n_kv, n_head_kv,
4625
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4626
+ ggml_element_size(kv_self.k)*n_embd_head,
4627
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
4628
+ ggml_set_name(K, "K");
4629
+
4630
+ // K * Q
4631
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
4632
+ ggml_set_name(KQ, "KQ");
4633
+
4634
+ // KQ_scaled = KQ / sqrt(n_embd_head)
4635
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
4636
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
4637
+ ggml_set_name(KQ_scaled, "KQ_scaled");
4638
+
4639
+ // KQ_masked = mask_past(KQ_scaled)
4640
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
4641
+ ggml_set_name(KQ_masked, "KQ_masked");
4642
+
4643
+ // KQ = soft_max(KQ_masked)
4644
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
4645
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
4646
+
4647
+ // split cached V into n_head heads
4648
+ struct ggml_tensor * V =
4649
+ ggml_view_3d(ctx0, kv_self.v,
4650
+ n_kv, n_embd_head, n_head_kv,
4651
+ ggml_element_size(kv_self.v)*n_ctx,
4652
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
4653
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
4654
+ ggml_set_name(V, "V");
4655
+
4656
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
4657
+ ggml_set_name(KQV, "KQV");
4658
+
4659
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
4660
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
4661
+ ggml_set_name(KQV_merged, "KQV_merged");
4662
+
4663
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
4664
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
4665
+ ggml_set_name(cur, "KQV_merged_contiguous");
4666
+ }
4667
+
4668
+ // Projection
4669
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
4670
+
4671
+ // Add the input
4672
+ cur = ggml_add(ctx0, cur, inpL);
4673
+
4674
+ struct ggml_tensor * inpFF = cur;
4675
+
4676
+ // FF
4677
+ {
4678
+ // Norm
4679
+ {
4680
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
4681
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
4682
+ }
4683
+
4684
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
4685
+
4686
+ // GELU activation
4687
+ cur = ggml_gelu(ctx0, cur);
4688
+
4689
+ // Projection
4690
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
4691
+ }
4692
+
4693
+ inpL = ggml_add(ctx0, cur, inpFF);
4694
+ }
4695
+
4696
+ // Output Norm
4697
+ {
4698
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4699
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
4700
+ }
4701
+ ggml_set_name(cur, "result_norm");
4702
+
4703
+ cur = ggml_mul_mat(ctx0, model.output, cur);
4704
+ ggml_set_name(cur, "result_output");
4705
+
4706
+ ggml_build_forward_expand(gf, cur);
4707
+ ggml_free(ctx0);
4708
+
4709
+ return gf;
4710
+ }
4711
+
4712
+ static struct ggml_cgraph * llm_build_persimmon(
4713
+ llama_context & lctx,
4714
+ const llama_batch & batch) {
4715
+ const auto & model = lctx.model;
4716
+ const auto & hparams = model.hparams;
4717
+
4718
+ const auto & kv_self = lctx.kv_self;
4719
+
4720
+ GGML_ASSERT(!!kv_self.ctx);
4721
+
4722
+ const auto & cparams = lctx.cparams;
4723
+ const int64_t n_embd = hparams.n_embd;
4724
+ const int64_t n_layer = hparams.n_layer;
4725
+ const int64_t n_ctx = cparams.n_ctx;
4726
+ const int64_t n_head_kv = hparams.n_head_kv;
4727
+ const int64_t n_head = hparams.n_head;
4728
+ const int64_t n_embd_head = hparams.n_embd_head();
4729
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
4730
+ const size_t n_rot = n_embd_head / 2;
4731
+
4732
+ const float freq_base = cparams.rope_freq_base;
4733
+ const float freq_scale = cparams.rope_freq_scale;
4734
+ const float norm_eps = hparams.f_norm_eps;
4735
+
4736
+ const int n_gpu_layers = model.n_gpu_layers;
4737
+
4738
+
4739
+ const int32_t n_tokens = batch.n_tokens;
4740
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4741
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
4742
+
4743
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
4744
+
4745
+ auto & buf_compute = lctx.buf_compute;
4746
+ struct ggml_init_params params = {
4747
+ /*.mem_size =*/ buf_compute.size,
4748
+ /*.mem_buffer =*/ buf_compute.data,
4749
+ /*.no_alloc =*/ true,
4750
+ };
4751
+
4752
+ struct ggml_context * ctx0 = ggml_init(params);
4753
+
4754
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
4755
+
4756
+ struct ggml_tensor * cur;
4757
+ struct ggml_tensor * inpL;
4758
+
4759
+ if (batch.token) {
4760
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4761
+
4762
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
4763
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4764
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
4765
+ }
4766
+ ggml_set_name(inp_tokens, "inp_tokens");
4767
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4768
+ } else {
4769
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4770
+ ggml_allocr_alloc(lctx.alloc, inpL);
4771
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4772
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
4773
+ }
4774
+ }
4775
+ const int i_gpu_start = n_layer - n_gpu_layers;
4776
+ (void) i_gpu_start;
4777
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
4778
+ offload_func_t offload_func_kq = llama_nop;
4779
+ offload_func_t offload_func_v = llama_nop;
4780
+ // KQ_scale
4781
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4782
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4783
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4784
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
4785
+ }
4786
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4787
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4788
+ offload_func_kq(KQ_mask);
4789
+ ggml_set_name(KQ_mask, "KQ_mask");
4790
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4791
+
4792
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4793
+ float * data = (float *) KQ_mask->data;
4794
+ memset(data, 0, ggml_nbytes(KQ_mask));
4795
+ for (int h = 0; h < 1; ++h) {
4796
+ for (int j = 0; j < n_tokens; ++j) {
4797
+ const llama_pos pos = batch.pos[j];
4798
+ const llama_seq_id seq_id = batch.seq_id[j];
4799
+ for (int i = 0; i < n_kv; ++i) {
4800
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4801
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4802
+ }
4803
+ }
4804
+ }
4805
+ }
4806
+ }
4807
+
4808
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4809
+ offload_func_kq(KQ_pos);
4810
+ ggml_set_name(KQ_pos, "KQ_pos");
4811
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
4812
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4813
+ int * data = (int *) KQ_pos->data;
4814
+ for (int i = 0; i < n_tokens; ++i) {
4815
+ data[i] = batch.pos[i];
4816
+ }
4817
+ }
4818
+ if (do_rope_shift) {
4819
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4820
+ offload_func_kq(K_shift);
4821
+ ggml_set_name(K_shift, "K_shift");
4822
+ ggml_allocr_alloc(lctx.alloc, K_shift);
4823
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4824
+ int * data = (int *) K_shift->data;
4825
+ for (int i = 0; i < n_ctx; ++i) {
4826
+ data[i] = kv_self.cells[i].delta;
4827
+ }
4828
+ }
4829
+ for (int il = 0; il < n_layer; ++il) {
4830
+ struct ggml_tensor * tmp =
4831
+ // we rotate only the first n_rot dimensions.
4832
+ ggml_rope_custom_inplace(ctx0,
4833
+ ggml_view_3d(ctx0, kv_self.k,
4834
+ n_rot, n_head, n_ctx,
4835
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4836
+ ggml_element_size(kv_self.k)*n_embd_head,
4837
+ ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
4838
+ ),
4839
+ K_shift, n_rot, 2, 0, freq_base, freq_scale);
4840
+ offload_func_kq(tmp);
4841
+ ggml_build_forward_expand(gf, tmp);
4842
+ }
4843
+ }
4844
+ for (int il=0; il < n_layer; ++il) {
4845
+ struct ggml_tensor * residual = inpL;
4846
+ offload_func_t offload_func = llama_nop;
4847
+ {
4848
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4849
+ offload_func(cur);
4850
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
4851
+ offload_func(cur);
4852
+ cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
4853
+ offload_func(cur);
4854
+ ggml_format_name(cur, "input_layernorm_%d", il);
4855
+ }
4856
+ // self attention
4857
+ {
4858
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
4859
+ offload_func_kq(cur);
4860
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
4861
+ offload_func_kq(cur);
4862
+
4863
+ // split qkv
4864
+ GGML_ASSERT(n_head_kv == n_head);
4865
+ ggml_set_name(cur, format("qkv_%d", il).c_str());
4866
+ struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
4867
+ offload_func_kq(tmpqkv);
4868
+ struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
4869
+ offload_func_kq(tmpqkv_perm);
4870
+ ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
4871
+ struct ggml_tensor * tmpq = ggml_view_3d(
4872
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4873
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4874
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4875
+ 0
4876
+ );
4877
+ offload_func_kq(tmpq);
4878
+ struct ggml_tensor * tmpk = ggml_view_3d(
4879
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4880
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4881
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4882
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
4883
+ );
4884
+ offload_func_kq(tmpk);
4885
+ // Q/K Layernorm
4886
+ tmpq = ggml_norm(ctx0, tmpq, norm_eps);
4887
+ offload_func_kq(tmpq);
4888
+ tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
4889
+ offload_func_kq(tmpq);
4890
+ tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
4891
+ offload_func_kq(tmpq);
4892
+
4893
+ tmpk = ggml_norm(ctx0, tmpk, norm_eps);
4894
+ offload_func_v(tmpk);
4895
+ tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
4896
+ offload_func_v(tmpk);
4897
+ tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
4898
+ offload_func_v(tmpk);
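
Persimmon applies Q/K LayerNorm before the rotary embedding: ggml_norm standardizes each head vector along its n_embd_head dimension, and attn_q_norm / attn_k_norm (with their _b biases) provide the learned scale and shift:

$$ \mathrm{LN}(x) \;=\; \frac{x - \mu}{\sqrt{\sigma^2 + \varepsilon}}\,\gamma + \beta, \qquad \varepsilon = \texttt{norm\_eps} $$
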
4899
+
4900
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
4901
+ struct ggml_tensor * qrot = ggml_view_3d(
4902
+ ctx0, tmpq, n_rot, n_head, n_tokens,
4903
+ ggml_element_size(tmpq) * n_embd_head,
4904
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4905
+ 0
4906
+ );
4907
+ offload_func_kq(qrot);
4908
+ ggml_format_name(qrot, "qrot_%d", il);
4909
+ struct ggml_tensor * krot = ggml_view_3d(
4910
+ ctx0, tmpk, n_rot, n_head, n_tokens,
4911
+ ggml_element_size(tmpk) * n_embd_head,
4912
+ ggml_element_size(tmpk) * n_embd_head * n_head,
4913
+ 0
4914
+ );
4915
+ offload_func_kq(krot);
4916
+ ggml_format_name(krot, "krot_%d", il);
4917
+
4918
+ // get the second half of tmpq, e.g. tmpq[n_rot:, :, :]
4919
+ struct ggml_tensor * qpass = ggml_view_3d(
4920
+ ctx0, tmpq, n_rot, n_head, n_tokens,
4921
+ ggml_element_size(tmpq) * n_embd_head,
4922
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4923
+ ggml_element_size(tmpq) * n_rot
4924
+ );
4925
+ offload_func_kq(qpass);
4926
+ ggml_format_name(qpass, "qpass_%d", il);
4927
+ struct ggml_tensor * kpass = ggml_view_3d(
4928
+ ctx0, tmpk, n_rot, n_head, n_tokens,
4929
+ ggml_element_size(tmpk) * n_embd_head,
4930
+ ggml_element_size(tmpk) * n_embd_head * n_head,
4931
+ ggml_element_size(tmpk) * n_rot
4932
+ );
4933
+ offload_func_kq(kpass);
4934
+ ggml_format_name(kpass, "kpass_%d", il);
4935
+
4936
+ struct ggml_tensor * qrotated = ggml_rope_custom(
4937
+ ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
4938
+ );
4939
+ offload_func_kq(qrotated);
4940
+ struct ggml_tensor * krotated = ggml_rope_custom(
4941
+ ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
4942
+ );
4943
+ offload_func_kq(krotated);
4944
+ // ggml currently only supports concatenation on dim=2
4945
+ // so we need to permute qrot, qpass, concat, then permute back.
4946
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
4947
+ offload_func_kq(qrotated);
4948
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
4949
+ offload_func_kq(krotated);
4950
+
4951
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
4952
+ offload_func_kq(qpass);
4953
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
4954
+ offload_func_kq(kpass);
4955
+
4956
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
4957
+ offload_func_kq(Qcur);
4958
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
4959
+ offload_func_kq(Kcur);
4960
+
4961
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
4962
+ offload_func_kq(Q);
4963
+
4964
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
4965
+ offload_func_kq(Kcur);
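
Only the first n_rot (= n_embd_head/2) dimensions of each query/key head are rotated here; the qpass/kpass halves are carried through untouched and concatenated back on. A scalar reference sketch of that idea (hypothetical sizes; ggml's neox-mode rope pairs dimensions differently than the adjacent-pair convention used below):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // scalar sketch of a "partial" rotary embedding: only the first n_rot dimensions
    // of a head vector are rotated, the rest pass through unchanged
    std::vector<float> partial_rope(const std::vector<float> & x, int n_rot, int pos, float freq_base) {
        std::vector<float> out = x;  // the tail x[n_rot:] is passed through untouched
        for (int i = 0; i < n_rot; i += 2) {
            const float theta = pos * std::pow(freq_base, -(float)i/n_rot);
            const float c = std::cos(theta), s = std::sin(theta);
            out[i]     = x[i]*c - x[i + 1]*s;
            out[i + 1] = x[i]*s + x[i + 1]*c;
        }
        return out;
    }

    int main() {
        std::vector<float> head(8, 1.0f);  // n_embd_head = 8, n_rot = 4 (hypothetical)
        std::vector<float> r = partial_rope(head, /*n_rot =*/ 4, /*pos =*/ 3, /*freq_base =*/ 10000.0f);
        for (float v : r) printf("%.3f ", v);
        printf("\n");
        return 0;
    }
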
4966
+ {
4967
+ struct ggml_tensor * tmpv = ggml_view_3d(
4968
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4969
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4970
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4971
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
4972
+ );
4973
+ offload_func_v(tmpv);
4974
+ // store K, V in cache
4975
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
4976
+ offload_func_v(Vcur);
4977
+ ggml_set_name(Vcur, "Vcur");
4978
+
4979
+ struct ggml_tensor * k = ggml_view_1d(
4980
+ ctx0, kv_self.k, n_tokens*n_embd_gqa,
4981
+ (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
4982
+ );
4983
+ offload_func_kq(k);
4984
+ ggml_set_name(k, "k");
4985
+
4986
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4987
+ ( n_ctx)*ggml_element_size(kv_self.v),
4988
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4989
+ offload_func_v(v);
4990
+ ggml_set_name(v, "v");
4991
+
4992
+ // important: storing RoPE-ed version of K in the KV cache!
4993
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4994
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4995
+ }
4996
+ struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
4997
+ n_embd_head, n_kv, n_head_kv,
4998
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4999
+ ggml_element_size(kv_self.k)*n_embd_head,
5000
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5001
+
5002
+ offload_func_kq(K);
5003
+ ggml_format_name(K, "K_%d", il);
5004
+
5005
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5006
+ offload_func_kq(KQ);
5007
+ ggml_set_name(KQ, "KQ");
5008
+
5009
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5010
+ offload_func_kq(KQ_scaled);
5011
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5012
+
5013
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5014
+ offload_func_kq(KQ_masked);
5015
+ ggml_set_name(KQ_masked, "KQ_masked");
5016
+
5017
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5018
+ offload_func_kq(KQ_soft_max);
5019
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5020
+
5021
+ struct ggml_tensor * V =
5022
+ ggml_view_3d(ctx0, kv_self.v,
5023
+ n_kv, n_embd_head, n_head_kv,
5024
+ ggml_element_size(kv_self.v)*n_ctx,
5025
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5026
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5027
+ offload_func_v(V);
5028
+ ggml_set_name(V, "V");
5029
+
5030
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5031
+ offload_func_v(KQV);
5032
+ ggml_set_name(KQV, "KQV");
5033
+
5034
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5035
+ offload_func_v(KQV_merged);
5036
+ ggml_set_name(KQV_merged, "KQV_merged");
5037
+
5038
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5039
+ offload_func_v(cur);
5040
+ ggml_set_name(cur, "KQV_merged_contiguous");
5041
+
5042
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5043
+ offload_func(cur);
5044
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
5045
+ offload_func(cur);
5046
+ ggml_set_name(cur, "result_wo");
5047
+ }
5048
+
5049
+ struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
5050
+ offload_func(inpFF);
5051
+ ggml_set_name(inpFF, "inpFF");
5052
+ {
5053
+ // MLP
5054
+ {
5055
+ // Norm
5056
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5057
+ offload_func(cur);
5058
+ cur = ggml_add(ctx0,
5059
+ ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
5060
+ model.layers[il].ffn_norm_b
5061
+ );
5062
+ ggml_set_name(cur, "ffn_norm");
5063
+ offload_func(cur);
5064
+ }
5065
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5066
+ offload_func(cur);
5067
+
5068
+ cur = ggml_add(ctx0, cur, model.layers[il].b3);
5069
+ offload_func(cur);
5070
+ ggml_set_name(cur, "result_ffn_up");
5071
+
5072
+ cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
5073
+ ggml_set_name(cur, "result_ffn_act");
5074
+ offload_func(cur);
5075
+ offload_func(cur->src[0]);
5076
+
5077
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5078
+ offload_func(cur);
5079
+ cur = ggml_add(ctx0,
5080
+ cur,
5081
+ model.layers[il].b2);
5082
+ offload_func(cur);
5083
+ ggml_set_name(cur, "outFF");
5084
+ }
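
The Persimmon MLP above uses a squared-ReLU activation between its up- and down-projections (the ggml_sqr(ggml_relu(...)) pair named result_ffn_act):

$$ \mathrm{FFN}(x) \;=\; W_2\,\big(\mathrm{relu}(W_3 x + b_3)\big)^{2} + b_2 $$
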
5085
+ cur = ggml_add(ctx0, cur, inpFF);
5086
+ offload_func(cur);
5087
+ ggml_set_name(cur, "inpFF_+_outFF");
5088
+ inpL = cur;
5089
+ }
5090
+ cur = inpL;
5091
+ {
5092
+ cur = ggml_norm(ctx0, cur, norm_eps);
5093
+ offload_func_nr(cur);
5094
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5095
+ offload_func_nr(cur);
5096
+
5097
+ cur = ggml_add(ctx0, cur, model.output_norm_b);
5098
+ // offload_func_nr(cur);
5099
+
5100
+ ggml_set_name(cur, "result_norm");
5101
+ }
5102
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5103
+ ggml_set_name(cur, "result_output");
5104
+ ggml_build_forward_expand(gf, cur);
5105
+ ggml_free(ctx0);
5106
+ return gf;
5107
+ }
5108
+
5109
+ static struct ggml_cgraph * llm_build_bloom(
5110
+ llama_context & lctx,
5111
+ const llama_batch & batch) {
5112
+ const auto & model = lctx.model;
5113
+ const auto & hparams = model.hparams;
5114
+ const auto & cparams = lctx.cparams;
5115
+
5116
+ const auto & kv_self = lctx.kv_self;
5117
+
5118
+ GGML_ASSERT(!!kv_self.ctx);
5119
+
5120
+ const int64_t n_embd = hparams.n_embd;
5121
+ const int64_t n_layer = hparams.n_layer;
5122
+ const int64_t n_ctx = cparams.n_ctx;
5123
+ const int64_t n_head = hparams.n_head;
5124
+ const int64_t n_head_kv = hparams.n_head_kv;
5125
+ const int64_t n_embd_head = hparams.n_embd_head();
5126
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5127
+
5128
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5129
+
5130
+ const float norm_eps = hparams.f_norm_eps;
5131
+
5132
+ const int32_t n_tokens = batch.n_tokens;
5133
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5134
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5135
+
5136
+ auto & buf_compute = lctx.buf_compute;
5137
+
5138
+ struct ggml_init_params params = {
5139
+ /*.mem_size =*/ buf_compute.size,
5140
+ /*.mem_buffer =*/ buf_compute.data,
5141
+ /*.no_alloc =*/ false,
5142
+ };
5143
+
5144
+ params.no_alloc = true;
5145
+
5146
+ struct ggml_context * ctx0 = ggml_init(params);
5147
+
5148
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5149
+
5150
+ struct ggml_tensor * cur;
5151
+ struct ggml_tensor * token;
5152
+ struct ggml_tensor * inpL;
5153
+
5154
+ if (batch.token) {
5155
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5156
+
5157
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5158
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5159
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5160
+ }
5161
+ ggml_set_name(inp_tokens, "inp_tokens");
5162
+
5163
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5164
+ } else {
5165
+ #ifdef GGML_USE_MPI
5166
+ GGML_ASSERT(false && "not implemented");
5167
+ #endif
5168
+
5169
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5170
+
5171
+ ggml_allocr_alloc(lctx.alloc, token);
5172
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5173
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5174
+ }
5175
+ }
5176
+
5177
+ // KQ_scale
5178
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5179
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5180
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5181
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5182
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5183
+ }
5184
+
5185
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5186
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5187
+ ggml_set_name(KQ_mask, "KQ_mask");
5188
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5189
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5190
+ float * data = (float *) KQ_mask->data;
5191
+ memset(data, 0, ggml_nbytes(KQ_mask));
5192
+
5193
+ for (int h = 0; h < 1; ++h) {
5194
+ for (int j = 0; j < n_tokens; ++j) {
5195
+ const llama_pos pos = batch.pos[j];
5196
+ const llama_seq_id seq_id = batch.seq_id[j];
5197
+
5198
+ for (int i = 0; i < n_kv; ++i) {
5199
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5200
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5201
+ }
5202
+ }
5203
+ }
5204
+ }
5205
+ }
5206
+
5207
+ // norm
5208
+ {
5209
+ inpL = ggml_norm(ctx0, token, norm_eps);
5210
+ inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5211
+ }
5212
+
5213
+ ggml_set_name(inpL, "inpL");
5214
+
5215
+ for (int il = 0; il < n_layer; ++il) {
5216
+ {
5217
+ // Norm
5218
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5219
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5220
+ }
5221
+
5222
+ {
5223
+ // Self Attention
5224
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5225
+
5226
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5227
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5228
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5229
+
5230
+ struct ggml_tensor * Qcur = tmpq;
5231
+ struct ggml_tensor * Kcur = tmpk;
5232
+
5233
+ // store key and value to memory
5234
+ {
5235
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5236
+ ggml_set_name(Vcur, "Vcur");
5237
+
5238
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5239
+ ggml_set_name(k, "k");
5240
+
5241
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5242
+ ( n_ctx)*ggml_element_size(kv_self.v),
5243
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5244
+
5245
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5246
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5247
+ }
5248
+
5249
+ struct ggml_tensor * Q =
5250
+ ggml_permute(ctx0,
5251
+ ggml_cpy(ctx0,
5252
+ Qcur,
5253
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5254
+ 0, 2, 1, 3);
5255
+ ggml_set_name(Q, "Q");
5256
+
5257
+ struct ggml_tensor * K =
5258
+ ggml_view_3d(ctx0, kv_self.k,
5259
+ n_embd_head, n_kv, n_head_kv,
5260
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5261
+ ggml_element_size(kv_self.k)*n_embd_head,
5262
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5263
+ ggml_set_name(K, "K");
5264
+
5265
+ // K * Q
5266
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5267
+ ggml_set_name(KQ, "KQ");
5268
+
5269
+ // KQ_scaled = KQ / sqrt(n_embd_head)
5270
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5271
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5272
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5273
+
5274
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5275
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5276
+
5277
+ // KQ_masked = mask_past(KQ_scaled)
5278
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5279
+ ggml_set_name(KQ_masked, "KQ_masked");
5280
+
5281
+ // KQ = soft_max(KQ_masked)
5282
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5283
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5284
+
5285
+ // split cached V into n_head heads
5286
+ struct ggml_tensor * V =
5287
+ ggml_view_3d(ctx0, kv_self.v,
5288
+ n_kv, n_embd_head, n_head_kv,
5289
+ ggml_element_size(kv_self.v)*n_ctx,
5290
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5291
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5292
+ ggml_set_name(V, "V");
5293
+
5294
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5295
+ ggml_set_name(KQV, "KQV");
5296
+
5297
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
5298
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5299
+ ggml_set_name(KQV_merged, "KQV_merged");
5300
+
5301
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5302
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5303
+ ggml_set_name(cur, "KQV_merged_contiguous");
5304
+ }
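
BLOOM carries no positional information in the hidden states; instead ggml_alibi adds a per-head linear bias to the scaled scores before the causal mask, so distant keys are progressively penalized. Roughly, for query position i and key position j <= i:

$$ \mathrm{score}_h(i,j) \;=\; \frac{q_i \cdot k_j}{\sqrt{d_k}} + m_h\,(j - i) $$

where the slopes m_h decrease geometrically across heads; ggml_alibi derives them from n_head and the maximum-bias argument (the literal 8 in the call above).
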
5305
+
5306
+ // Projection
5307
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5308
+
5309
+ // Add the input
5310
+ cur = ggml_add(ctx0, cur, inpL);
5311
+
5312
+ struct ggml_tensor * inpFF = cur;
5313
+
5314
+ // FF
5315
+ {
5316
+ // Norm
5317
+ {
5318
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5319
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5320
+ }
5321
+
5322
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5323
+
5324
+ // GELU activation
5325
+ cur = ggml_gelu(ctx0, cur);
5326
+
5327
+ // Projection
5328
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5329
+ }
5330
+
5331
+ inpL = ggml_add(ctx0, cur, inpFF);
5332
+ }
5333
+
5334
+ // Output Norm
5335
+ {
5336
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5337
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5338
+ }
5339
+ ggml_set_name(cur, "result_norm");
5340
+
5341
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5342
+ ggml_set_name(cur, "result_output");
5343
+
5344
+ ggml_build_forward_expand(gf, cur);
5345
+
5346
+ ggml_free(ctx0);
5347
+
5348
+ return gf;
5349
+ }
5350
+
5351
+ static struct ggml_cgraph * llm_build_mpt(
5352
+ llama_context & lctx,
5353
+ const llama_batch & batch) {
5354
+ const auto & model = lctx.model;
5355
+ const auto & hparams = model.hparams;
5356
+ const auto & cparams = lctx.cparams;
5357
+
5358
+ const auto & kv_self = lctx.kv_self;
5359
+
5360
+ GGML_ASSERT(!!kv_self.ctx);
5361
+
5362
+ const int64_t n_embd = hparams.n_embd;
5363
+ const int64_t n_layer = hparams.n_layer;
5364
+ const int64_t n_ctx = cparams.n_ctx;
5365
+ const int64_t n_head = hparams.n_head;
5366
+ const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
5367
+ const int64_t n_embd_head = hparams.n_embd_head();
5368
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5369
+
5370
+ const float norm_eps = hparams.f_norm_eps;
5371
+ const float clamp_kqv = hparams.f_clamp_kqv;
5372
+ const float max_alibi_bias = hparams.f_max_alibi_bias;
5373
+
5374
+ const int n_gpu_layers = model.n_gpu_layers;
5375
+
5376
+ const int32_t n_tokens = batch.n_tokens;
5377
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5378
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5379
+
5380
+ auto & buf_compute = lctx.buf_compute;
5381
+
5382
+ struct ggml_init_params params = {
5383
+ /*.mem_size =*/ buf_compute.size,
5384
+ /*.mem_buffer =*/ buf_compute.data,
5385
+ /*.no_alloc =*/ false,
5386
+ };
5387
+
5388
+ params.no_alloc = true;
5389
+
5390
+ struct ggml_context * ctx0 = ggml_init(params);
5391
+
5392
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5393
+
5394
+ struct ggml_tensor * cur;
5395
+ struct ggml_tensor * inpL;
5396
+
5397
+ //int warmup = 0;
5398
+ if (batch.token) {
5399
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5400
+
5401
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5402
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5403
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5404
+ //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
5405
+ }
5406
+
5407
+ ggml_set_name(inp_tokens, "inp_tokens");
5408
+
5409
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5410
+ } else {
5411
+ #ifdef GGML_USE_MPI
5412
+ GGML_ASSERT(false && "not implemented");
5413
+ #endif
5414
+
5415
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5416
+
5417
+ ggml_allocr_alloc(lctx.alloc, inpL);
5418
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5419
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
5420
+ }
5421
+ }
5422
+
5423
+ const int i_gpu_start = n_layer - n_gpu_layers;
5424
+ (void) i_gpu_start;
5425
+
5426
+ // offload functions set the tensor output backend to GPU
5427
+ // tensors are GPU-accelerated if any input or the output has been offloaded
5428
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
5429
+ offload_func_t offload_func_kq = llama_nop;
5430
+ offload_func_t offload_func_v = llama_nop;
5431
+
5432
+ #ifdef GGML_USE_CUBLAS
5433
+ if (n_gpu_layers > n_layer) {
5434
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
5435
+ }
5436
+ if (n_gpu_layers > n_layer + 1) {
5437
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
5438
+ }
5439
+ if (n_gpu_layers > n_layer + 2) {
5440
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
5441
+ }
5442
+ #endif // GGML_USE_CUBLAS
5443
+
5444
+ // KQ_scale
5445
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5446
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5447
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5448
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5449
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5450
+ }
5451
+
5452
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5453
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5454
+ offload_func_kq(KQ_mask);
5455
+ ggml_set_name(KQ_mask, "KQ_mask");
5456
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5457
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5458
+ float * data = (float *) KQ_mask->data;
5459
+ memset(data, 0, ggml_nbytes(KQ_mask));
5460
+
5461
+ for (int h = 0; h < 1; ++h) {
5462
+ for (int j = 0; j < n_tokens; ++j) {
5463
+ const llama_pos pos = batch.pos[j];
5464
+ const llama_seq_id seq_id = batch.seq_id[j];
5465
+
5466
+ for (int i = 0; i < n_kv; ++i) {
5467
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5468
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5469
+ }
5470
+ }
5471
+ }
5472
+ }
5473
+ }
5474
+
3822
5475
  for (int il = 0; il < n_layer; ++il) {
3823
- {
3824
- // Norm
3825
- cur = ggml_norm(ctx0, inpL, norm_eps);
3826
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5476
+ struct ggml_tensor * attn_norm;
5477
+
5478
+ offload_func_t offload_func = llama_nop;
5479
+
5480
+ #ifdef GGML_USE_CUBLAS
5481
+ if (il >= i_gpu_start) {
5482
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
3827
5483
  }
5484
+ #endif // GGML_USE_CUBLAS
3828
5485
 
5486
+ // self-attention
5487
+ // TODO: refactor into common function (shared with LLaMA)
3829
5488
  {
3830
- // Self Attention
3831
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5489
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
5490
+ offload_func(attn_norm);
3832
5491
 
3833
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
3834
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
3835
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5492
+ attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
5493
+ offload_func(attn_norm);
3836
5494
 
3837
- struct ggml_tensor * Qcur = tmpq;
3838
- struct ggml_tensor * Kcur = tmpk;
5495
+ if (1) {
5496
+ cur = attn_norm;
5497
+ }
5498
+
5499
+ // compute QKV
5500
+
5501
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5502
+ offload_func_kq(cur);
5503
+
5504
+ if (clamp_kqv > 0.0f) {
5505
+ cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
5506
+ offload_func_kq(cur);
5507
+ }
5508
+
5509
+ const size_t wsize = ggml_type_size(cur->type);
5510
+
5511
+ struct ggml_tensor * Qcur = ggml_view_3d(
5512
+ ctx0, cur, n_embd_head, n_head, n_tokens,
5513
+ wsize * n_embd_head,
5514
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5515
+ 0);
5516
+ offload_func_kq(Qcur);
5517
+
5518
+ struct ggml_tensor * Kcur = ggml_view_3d(
5519
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5520
+ wsize * n_embd_head,
5521
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5522
+ wsize * n_embd_head * n_head);
5523
+ offload_func_kq(Kcur);
5524
+
5525
+ struct ggml_tensor * tmpv = ggml_view_3d(
5526
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5527
+ wsize * n_embd_head,
5528
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5529
+ wsize * n_embd_head * (n_head + n_head_kv));
5530
+ offload_func_kq(Kcur);
5531
+
5532
+ ggml_set_name(Qcur, "Qcur");
5533
+ ggml_set_name(Kcur, "Kcur");
3839
5534
 
3840
5535
  {
3841
5536
  struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5537
+ offload_func_v(Vcur);
5538
+ offload_func_v(Vcur->src[0]->src[0]);
3842
5539
  ggml_set_name(Vcur, "Vcur");
3843
5540
 
3844
5541
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5542
+ offload_func_kq(k);
3845
5543
  ggml_set_name(k, "k");
3846
5544
 
3847
5545
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3848
5546
  ( n_ctx)*ggml_element_size(kv_self.v),
3849
5547
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5548
+ offload_func_v(v);
3850
5549
 
3851
5550
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3852
5551
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3853
5552
  }
3854
5553
 
3855
- struct ggml_tensor * Q =
3856
- ggml_permute(ctx0,
3857
- ggml_cpy(ctx0,
3858
- Qcur,
3859
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
3860
- 0, 2, 1, 3);
5554
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
5555
+ offload_func_kq(Q);
3861
5556
  ggml_set_name(Q, "Q");
3862
5557
 
3863
5558
  struct ggml_tensor * K =
@@ -3866,85 +5561,105 @@ static struct ggml_cgraph * llm_build_starcoder(
3866
5561
  ggml_element_size(kv_self.k)*n_embd_gqa,
3867
5562
  ggml_element_size(kv_self.k)*n_embd_head,
3868
5563
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5564
+ offload_func_kq(K);
3869
5565
  ggml_set_name(K, "K");
3870
5566
 
3871
- // K * Q
3872
5567
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5568
+ offload_func_kq(KQ);
3873
5569
  ggml_set_name(KQ, "KQ");
3874
5570
 
3875
- // KQ_scaled = KQ / sqrt(n_embd_head)
3876
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
3877
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5571
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5572
+ offload_func_kq(KQ_scaled);
3878
5573
  ggml_set_name(KQ_scaled, "KQ_scaled");
3879
5574
 
3880
- // KQ_masked = mask_past(KQ_scaled)
3881
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5575
+ // TODO: replace with ggml_add()
5576
+ struct ggml_tensor * KQ_scaled_alibi =
5577
+ ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
5578
+ offload_func_kq(KQ_scaled_alibi);
5579
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5580
+
5581
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5582
+ offload_func_kq(KQ_masked);
3882
5583
  ggml_set_name(KQ_masked, "KQ_masked");
3883
5584
 
3884
- // KQ = soft_max(KQ_masked)
3885
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5585
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
5586
+ offload_func_v(KQ_soft_max);
3886
5587
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
3887
5588
 
3888
- // split cached V into n_head heads
3889
5589
  struct ggml_tensor * V =
3890
5590
  ggml_view_3d(ctx0, kv_self.v,
3891
5591
  n_kv, n_embd_head, n_head_kv,
3892
5592
  ggml_element_size(kv_self.v)*n_ctx,
3893
5593
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3894
5594
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5595
+ offload_func_v(V);
3895
5596
  ggml_set_name(V, "V");
3896
5597
 
3897
5598
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5599
+ offload_func_v(KQV);
3898
5600
  ggml_set_name(KQV, "KQV");
3899
5601
 
3900
- // KQV_merged = KQV.permute(0, 2, 1, 3)
3901
5602
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5603
+ offload_func_v(KQV_merged);
3902
5604
  ggml_set_name(KQV_merged, "KQV_merged");
3903
5605
 
3904
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3905
5606
  cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5607
+ offload_func_v(cur);
3906
5608
  ggml_set_name(cur, "KQV_merged_contiguous");
3907
- }
3908
5609
 
3909
- // Projection
3910
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5610
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5611
+ offload_func(cur);
5612
+ ggml_set_name(cur, "result_wo");
5613
+ }
3911
5614
 
3912
5615
  // Add the input
3913
5616
  cur = ggml_add(ctx0, cur, inpL);
5617
+ offload_func(cur);
3914
5618
 
3915
- struct ggml_tensor * inpFF = cur;
5619
+ struct ggml_tensor * attn_out = cur;
3916
5620
 
3917
- // FF
5621
+ // feed forward
3918
5622
  {
3919
5623
  // Norm
3920
5624
  {
3921
- cur = ggml_norm(ctx0, inpFF, norm_eps);
3922
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5625
+ cur = ggml_norm(ctx0, attn_out, norm_eps);
5626
+ offload_func(cur);
5627
+
5628
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
5629
+ offload_func(cur);
3923
5630
  }
3924
5631
 
3925
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5632
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5633
+ offload_func(cur);
3926
5634
 
3927
- // GELU activation
3928
5635
  cur = ggml_gelu(ctx0, cur);
3929
-
3930
- // Projection
3931
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5636
+ offload_func(cur);
5637
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5638
+ offload_func(cur);
3932
5639
  }
3933
5640
 
3934
- inpL = ggml_add(ctx0, cur, inpFF);
5641
+ cur = ggml_add(ctx0, cur, attn_out);
5642
+ offload_func(cur);
5643
+ // input for next layer
5644
+ inpL = cur;
3935
5645
  }
3936
5646
 
3937
- // Output Norm
5647
+ cur = inpL;
5648
+
5649
+ // norm
3938
5650
  {
3939
- cur = ggml_norm(ctx0, inpL, norm_eps);
3940
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5651
+ cur = ggml_norm(ctx0, cur, norm_eps);
5652
+ offload_func_nr(cur);
5653
+
5654
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5655
+ ggml_set_name(cur, "result_norm");
3941
5656
  }
3942
- ggml_set_name(cur, "result_norm");
3943
5657
 
3944
5658
  cur = ggml_mul_mat(ctx0, model.output, cur);
3945
5659
  ggml_set_name(cur, "result_output");
3946
5660
 
3947
5661
  ggml_build_forward_expand(gf, cur);
5662
+
3948
5663
  ggml_free(ctx0);
3949
5664
 
3950
5665
  return gf;
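
The attention block above swaps the plain causal mask for an ALiBi bias: ggml_alibi is applied to KQ_scaled before KQ_mask is added, with max_alibi_bias coming from the model hyperparameters. As a rough standalone illustration of the idea (the slope schedule below is the one from the ALiBi paper for power-of-two head counts and is an assumption, not something read out of ggml_alibi): each head h gets a slope m_h, and the score of query position i against key position j is shifted by -m_h * (i - j) before the softmax.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // ALiBi slopes as in the paper for a power-of-two number of heads:
    // m_h = 2^(-max_bias * (h + 1) / n_head); max_bias = 8 is the usual choice.
    static std::vector<float> alibi_slopes(int n_head, float max_bias = 8.0f) {
        std::vector<float> m(n_head);
        for (int h = 0; h < n_head; ++h) {
            m[h] = std::pow(2.0f, -max_bias * float(h + 1) / float(n_head));
        }
        return m;
    }

    int main() {
        const int n_head = 4;
        const int i      = 4; // a query position
        const auto m     = alibi_slopes(n_head);

        // Bias added to the score of query i against each key j <= i:
        // nearby keys are barely penalized, distant keys more so, per head.
        for (int h = 0; h < n_head; ++h) {
            std::printf("head %d (slope %.4f):", h, m[h]);
            for (int j = 0; j <= i; ++j) {
                std::printf(" %+.3f", -m[h] * float(i - j));
            }
            std::printf("\n");
        }
        return 0;
    }
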
@@ -3974,6 +5689,22 @@ static struct ggml_cgraph * llama_build_graph(
3974
5689
  {
3975
5690
  result = llm_build_starcoder(lctx, batch);
3976
5691
  } break;
5692
+ case LLM_ARCH_PERSIMMON:
5693
+ {
5694
+ result = llm_build_persimmon(lctx, batch);
5695
+ } break;
5696
+ case LLM_ARCH_REFACT:
5697
+ {
5698
+ result = llm_build_refact(lctx, batch);
5699
+ } break;
5700
+ case LLM_ARCH_BLOOM:
5701
+ {
5702
+ result = llm_build_bloom(lctx, batch);
5703
+ } break;
5704
+ case LLM_ARCH_MPT:
5705
+ {
5706
+ result = llm_build_mpt(lctx, batch);
5707
+ } break;
3977
5708
  default:
3978
5709
  GGML_ASSERT(false);
3979
5710
  }
@@ -3985,7 +5716,6 @@ static struct ggml_cgraph * llama_build_graph(
3985
5716
  //
3986
5717
  // - lctx: llama context
3987
5718
  // - batch: batch to evaluate
3988
- // - n_threads: number of threads to use
3989
5719
  //
3990
5720
  // return 0 on success
3991
5721
  // return positive int on warning
@@ -4052,10 +5782,6 @@ static int llama_decode_internal(
4052
5782
  batch.seq_id = seq_id.data();
4053
5783
  }
4054
5784
 
4055
- // we always start to search for a free slot from the start of the cache
4056
- // TODO: better strategies can be implemented
4057
- kv_self.head = 0;
4058
-
4059
5785
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
4060
5786
  return 1;
4061
5787
  }
@@ -4107,7 +5833,9 @@ static int llama_decode_internal(
4107
5833
  // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
4108
5834
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
4109
5835
  model.arch == LLM_ARCH_BAICHUAN ||
4110
- model.arch == LLM_ARCH_FALCON;
5836
+ model.arch == LLM_ARCH_FALCON ||
5837
+ model.arch == LLM_ARCH_REFACT ||
5838
+ model.arch == LLM_ARCH_MPT;
4111
5839
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
4112
5840
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
4113
5841
  n_threads = 1;
@@ -4140,8 +5868,12 @@ static int llama_decode_internal(
4140
5868
  #endif
4141
5869
 
4142
5870
  // update the kv ring buffer
4143
- lctx.kv_self.head += n_tokens;
4144
5871
  lctx.kv_self.has_shift = false;
5872
+ lctx.kv_self.head += n_tokens;
5873
+ // Ensure kv cache head points to a valid index.
5874
+ if (lctx.kv_self.head >= lctx.kv_self.size) {
5875
+ lctx.kv_self.head = 0;
5876
+ }
4145
5877
 
4146
5878
  #ifdef GGML_PERF
4147
5879
  // print timing information per ggml operation (for debugging purposes)
@@ -4227,18 +5959,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
4227
5959
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
4228
5960
  }
4229
5961
 
4230
- static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
5962
+ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
5963
+ return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
5964
+ }
5965
+
5966
+ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
4231
5967
  GGML_ASSERT(llama_is_byte_token(vocab, id));
4232
5968
  const auto& token_data = vocab.id_to_token.at(id);
4233
- auto buf = token_data.text.substr(3, 2);
4234
- return strtol(buf.c_str(), NULL, 16);
5969
+ switch (llama_vocab_get_type(vocab)) {
5970
+ case LLAMA_VOCAB_TYPE_SPM: {
5971
+ auto buf = token_data.text.substr(3, 2);
5972
+ return strtol(buf.c_str(), NULL, 16);
5973
+ }
5974
+ case LLAMA_VOCAB_TYPE_BPE: {
5975
+ GGML_ASSERT(false);
5976
+ return unicode_to_bytes_bpe(token_data.text);
5977
+ }
5978
+ default:
5979
+ GGML_ASSERT(false);
5980
+ }
4235
5981
  }
4236
5982
 
4237
5983
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
4238
- char buf[7];
4239
- int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
4240
- GGML_ASSERT(0 <= result && result < 7);
4241
- return vocab.token_to_id.at(buf);
5984
+ switch (llama_vocab_get_type(vocab)) {
5985
+ case LLAMA_VOCAB_TYPE_SPM: {
5986
+ char buf[7];
5987
+ int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
5988
+ GGML_ASSERT(0 <= result && result < 7);
5989
+ return vocab.token_to_id.at(buf);
5990
+ }
5991
+ case LLAMA_VOCAB_TYPE_BPE: {
5992
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
5993
+ }
5994
+ default:
5995
+ GGML_ASSERT(false);
5996
+ }
4242
5997
  }
4243
5998
 
4244
5999
  static void llama_escape_whitespace(std::string & text) {
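
For context on the SPM branch of llama_token_to_byte / llama_byte_to_token above: SentencePiece-style vocabularies spell raw bytes as literal tokens of the form "<0xXX>", so the conversion is plain hex formatting and parsing, while the BPE branch goes through the byte-to-codepoint tables from the newly included unicode.h. A minimal sketch of the SPM round trip (the helper names here are illustrative, not llama.cpp API):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    // Format a raw byte the way SPM vocabularies spell byte tokens: "<0x2A>".
    static std::string byte_to_spm_token(uint8_t ch) {
        char buf[8];
        std::snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned) ch);
        return std::string(buf);
    }

    // Parse the two hex digits back out of a "<0xXX>" token;
    // characters 3 and 4 are the digits, as in text.substr(3, 2) above.
    static uint8_t spm_token_to_byte(const std::string & text) {
        return (uint8_t) std::strtol(text.substr(3, 2).c_str(), nullptr, 16);
    }

    int main() {
        for (int b = 0; b < 256; ++b) {
            const std::string tok = byte_to_spm_token((uint8_t) b);
            assert(spm_token_to_byte(tok) == (uint8_t) b);
        }
        std::printf("0x2A -> %s -> 0x%02X\n",
                    byte_to_spm_token(0x2A).c_str(),
                    (unsigned) spm_token_to_byte(byte_to_spm_token(0x2A)));
        return 0;
    }
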
@@ -4518,15 +6273,9 @@ struct llm_tokenizer_bpe {
4518
6273
  std::string byte_str(1, *j);
4519
6274
  auto token_multibyte = vocab.token_to_id.find(byte_str);
4520
6275
  if (token_multibyte == vocab.token_to_id.end()) {
4521
- try {
4522
- llama_token token_byte = llama_byte_to_token(vocab, *j);
4523
- output.push_back(token_byte);
4524
- } catch (const std::out_of_range & err) {
4525
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
4526
- }
4527
- } else {
4528
- output.push_back((*token_multibyte).second);
6276
+ throw std::runtime_error("ERROR: byte not found in vocab");
4529
6277
  }
6278
+ output.push_back((*token_multibyte).second);
4530
6279
  }
4531
6280
  } else {
4532
6281
  output.push_back((*token).second);
@@ -4563,23 +6312,143 @@ private:
4563
6312
  work_queue.push(bigram);
4564
6313
  }
4565
6314
 
4566
- // probably not 100% correct
4567
- static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
4568
- std::vector<std::string> words;
6315
+ std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
6316
+ std::vector<std::string> bpe_words;
6317
+ std::vector<std::string> bpe_encoded_words;
6318
+
6319
+ std::string token = "";
6320
+ // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
6321
+ bool collecting_numeric = false;
6322
+ bool collecting_letter = false;
6323
+ bool collecting_special = false;
6324
+ bool collecting_whitespace_lookahead = false;
6325
+ bool collecting = false;
6326
+
6327
+ std::vector<std::string> text_utf;
6328
+ text_utf.reserve(text.size());
6329
+ bpe_words.reserve(text.size());
6330
+ bpe_encoded_words.reserve(text.size());
6331
+
6332
+ auto cps = codepoints_from_utf8(text);
6333
+ for (size_t i = 0; i < cps.size(); ++i)
6334
+ text_utf.emplace_back(codepoint_to_utf8(cps[i]));
6335
+
6336
+ for (int i = 0; i < (int)text_utf.size(); i++) {
6337
+ const std::string & utf_char = text_utf[i];
6338
+ bool split_condition = false;
6339
+ int bytes_remain = text_utf.size() - i;
6340
+ // forward backward lookups
6341
+ const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
6342
+ const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
6343
+
6344
+ // handling contractions
6345
+ if (!split_condition && bytes_remain >= 2) {
6346
+ // 's|'t|'m|'d
6347
+ if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
6348
+ split_condition = true;
6349
+ }
6350
+ if (split_condition) {
6351
+ if (token.size()) {
6352
+ bpe_words.emplace_back(token); // push previous content as token
6353
+ }
6354
+ token = utf_char + utf_char_next;
6355
+ bpe_words.emplace_back(token);
6356
+ token = "";
6357
+ i++;
6358
+ continue;
6359
+ }
6360
+ }
6361
+ if (!split_condition && bytes_remain >= 3) {
6362
+ // 're|'ve|'ll
6363
+ if (utf_char == "\'" && (
6364
+ (utf_char_next == "r" && utf_char_next_next == "e") ||
6365
+ (utf_char_next == "v" && utf_char_next_next == "e") ||
6366
+ (utf_char_next == "l" && utf_char_next_next == "l"))
6367
+ ) {
6368
+ split_condition = true;
6369
+ }
6370
+ if (split_condition) {
6371
+ // current token + next token can be defined
6372
+ if (token.size()) {
6373
+ bpe_words.emplace_back(token); // push previous content as token
6374
+ }
6375
+ token = utf_char + utf_char_next + utf_char_next_next;
6376
+ bpe_words.emplace_back(token); // the contraction
6377
+ token = "";
6378
+ i += 2;
6379
+ continue;
6380
+ }
6381
+ }
6382
+
6383
+ if (!split_condition && !collecting) {
6384
+ if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
6385
+ collecting_letter = true;
6386
+ collecting = true;
6387
+ }
6388
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
6389
+ collecting_numeric = true;
6390
+ collecting = true;
6391
+ }
6392
+ else if (
6393
+ ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
6394
+ (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
6395
+ ) {
6396
+ collecting_special = true;
6397
+ collecting = true;
6398
+ }
6399
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
6400
+ collecting_whitespace_lookahead = true;
6401
+ collecting = true;
6402
+ }
6403
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
6404
+ split_condition = true;
6405
+ }
6406
+ }
6407
+ else if (!split_condition && collecting) {
6408
+ if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
6409
+ split_condition = true;
6410
+ }
6411
+ else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
6412
+ split_condition = true;
6413
+ }
6414
+ else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
6415
+ split_condition = true;
6416
+ }
6417
+ else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
6418
+ split_condition = true;
6419
+ }
6420
+ }
6421
+
6422
+ if (utf_char_next == "") {
6423
+ split_condition = true; // final
6424
+ token += utf_char;
6425
+ }
4569
6426
 
4570
- // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
4571
- const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
4572
- const std::regex re(pattern);
6427
+ if (split_condition) {
6428
+ if (token.size()) {
6429
+ bpe_words.emplace_back(token);
6430
+ }
6431
+ token = utf_char;
6432
+ collecting = false;
6433
+ collecting_letter = false;
6434
+ collecting_numeric = false;
6435
+ collecting_special = false;
6436
+ collecting_whitespace_lookahead = false;
6437
+ }
6438
+ else {
6439
+ token += utf_char;
6440
+ }
6441
+ }
4573
6442
 
4574
- auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
4575
- auto words_end = std::sregex_iterator();
4576
- auto n_words = std::distance(words_begin, words_end);
4577
- words.reserve(n_words);
4578
- for (auto it = words_begin; it != words_end; ++it) {
4579
- words.push_back(it->str());
6443
+ for (std::string & word : bpe_words) {
6444
+ std::string encoded_token = "";
6445
+ for (char & c : word) {
6446
+ encoded_token += bytes_to_unicode_bpe(c);
6447
+ }
6448
+ bpe_encoded_words.emplace_back(encoded_token);
4580
6449
  }
4581
- return words;
4582
6450
 
6451
+ return bpe_encoded_words;
4583
6452
  }
4584
6453
 
4585
6454
  const llama_vocab & vocab;
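
The bpe_gpt2_preprocess rewrite above replaces the std::regex approximation (the removed lines even flag it as "probably not 100% correct") with a hand-rolled scanner over UTF-8 codepoints, since [[:alpha:]]/[[:digit:]] under the default locale only cover ASCII while the GPT-2 pattern uses the Unicode \p{L}/\p{N} categories. A small sketch of what the splitting is meant to produce, run through the old regex on a pure-ASCII sample where the two approaches should agree (the expected pieces are an assumption based on the GPT-2 pattern, not output captured from llama.cpp):

    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
        // The pattern the removed code passed to std::regex; it approximates
        // the GPT-2 pre-tokenizer but only knows ASCII character classes.
        const std::string pattern =
            R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
        const std::regex re(pattern);

        const std::string text = "Hello world, it's 2023!";
        for (auto it = std::sregex_iterator(text.begin(), text.end(), re);
             it != std::sregex_iterator(); ++it) {
            std::printf("[%s]\n", it->str().c_str());
        }
        // Expected pieces: [Hello] [ world] [,] [ it] ['s] [ 2023] [!]
        return 0;
    }

Each resulting piece is then re-encoded byte-by-byte through bytes_to_unicode_bpe before the merge loop runs, which is what the final loop over bpe_words above does.
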
@@ -6022,7 +7891,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6022
7891
  nthread = std::thread::hardware_concurrency();
6023
7892
  }
6024
7893
 
6025
- llama_model_loader ml(fname_inp, /*use_mmap*/ false);
7894
+ // mmap consistently increases speed on Linux, and also increases speed on Windows with
7895
+ // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
7896
+ #if defined(__linux__) || defined(_WIN32)
7897
+ constexpr bool use_mmap = true;
7898
+ #else
7899
+ constexpr bool use_mmap = false;
7900
+ #endif
7901
+
7902
+ llama_model_loader ml(fname_inp, use_mmap);
7903
+ if (ml.use_mmap) {
7904
+ ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
7905
+ }
6026
7906
 
6027
7907
  llama_model model;
6028
7908
  llm_load_arch(ml, model);
@@ -6050,7 +7930,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6050
7930
  const std::string name = ggml_get_name(meta);
6051
7931
 
6052
7932
  // TODO: avoid hardcoded tensor names - use the TN_* constants
6053
- if (name.find("attn_v.weight") != std::string::npos) {
7933
+ if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
6054
7934
  ++n_attention_wv;
6055
7935
  }
6056
7936
  else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6087,6 +7967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6087
7967
  }
6088
7968
 
6089
7969
  std::ofstream fout(fname_out, std::ios::binary);
7970
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
6090
7971
 
6091
7972
  const size_t meta_size = gguf_get_meta_size(ctx_out);
6092
7973
 
@@ -6100,10 +7981,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6100
7981
 
6101
7982
  const std::string name = ggml_get_name(tensor);
6102
7983
 
6103
- if (read_data.size() < ggml_nbytes(tensor)) {
6104
- read_data.resize(ggml_nbytes(tensor));
7984
+ if (!ml.use_mmap) {
7985
+ if (read_data.size() < ggml_nbytes(tensor)) {
7986
+ read_data.resize(ggml_nbytes(tensor));
7987
+ }
7988
+ tensor->data = read_data.data();
6105
7989
  }
6106
- tensor->data = read_data.data();
6107
7990
  ml.load_data_for(tensor);
6108
7991
 
6109
7992
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
@@ -6738,13 +8621,14 @@ struct llama_context * llama_new_context_with_model(
6738
8621
 
6739
8622
  #ifdef GGML_USE_METAL
6740
8623
  if (model->n_gpu_layers > 0) {
8624
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
8625
+
6741
8626
  ctx->ctx_metal = ggml_metal_init(1);
6742
8627
  if (!ctx->ctx_metal) {
6743
8628
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6744
8629
  llama_free(ctx);
6745
8630
  return NULL;
6746
8631
  }
6747
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
6748
8632
  //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6749
8633
  //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6750
8634
  }
@@ -6872,6 +8756,10 @@ int llama_n_embd(const struct llama_model * model) {
6872
8756
  return model->hparams.n_embd;
6873
8757
  }
6874
8758
 
8759
+ float llama_rope_freq_scale_train(const struct llama_model * model) {
8760
+ return model->hparams.rope_freq_scale_train;
8761
+ }
8762
+
6875
8763
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
6876
8764
  return snprintf(buf, buf_size, "%s %s %s",
6877
8765
  llama_model_arch_name(model->arch).c_str(),
@@ -7039,16 +8927,6 @@ struct llama_data_file_context : llama_data_context {
7039
8927
  *
7040
8928
  */
7041
8929
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
7042
- // TODO: does not support multi-sequence states
7043
- {
7044
- const auto & kv_self = ctx->kv_self;
7045
- for (uint32_t i = 0; i < kv_self.head; ++i) {
7046
- GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
7047
- GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
7048
- GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
7049
- }
7050
- }
7051
-
7052
8930
  // copy rng
7053
8931
  {
7054
8932
  std::stringstream rng_ss;
@@ -7101,36 +8979,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
7101
8979
  const auto & hparams = ctx->model.hparams;
7102
8980
  const auto & cparams = ctx->cparams;
7103
8981
 
7104
- const int n_layer = hparams.n_layer;
7105
- const int n_embd = hparams.n_embd_gqa();
7106
- const int n_ctx = cparams.n_ctx;
8982
+ const auto n_layer = hparams.n_layer;
8983
+ const auto n_embd = hparams.n_embd_gqa();
8984
+ const auto n_ctx = cparams.n_ctx;
7107
8985
 
7108
- const size_t kv_size = kv_self.buf.size;
7109
- const int kv_ntok = kv_self.head;
8986
+ const size_t kv_buf_size = kv_self.buf.size;
8987
+ const uint32_t kv_head = kv_self.head;
8988
+ const uint32_t kv_size = kv_self.size;
7110
8989
 
7111
- data_ctx->write(&kv_size, sizeof(kv_size));
7112
- data_ctx->write(&kv_ntok, sizeof(kv_ntok));
8990
+ data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
8991
+ data_ctx->write(&kv_head, sizeof(kv_head));
8992
+ data_ctx->write(&kv_size, sizeof(kv_size));
7113
8993
 
7114
- if (kv_size) {
8994
+ if (kv_buf_size) {
7115
8995
  const size_t elt_size = ggml_element_size(kv_self.k);
7116
8996
 
7117
8997
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
7118
8998
  ggml_cgraph gf{};
7119
8999
 
7120
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
9000
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
7121
9001
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
7122
9002
  kout3d->data = kout3d_data.data();
7123
9003
 
7124
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
9004
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
7125
9005
  std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
7126
9006
  vout3d->data = vout3d_data.data();
7127
9007
 
7128
9008
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
7129
- n_embd, kv_ntok, n_layer,
9009
+ n_embd, kv_head, n_layer,
7130
9010
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
7131
9011
 
7132
9012
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
7133
- kv_ntok, n_embd, n_layer,
9013
+ kv_head, n_embd, n_layer,
7134
9014
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
7135
9015
 
7136
9016
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -7144,6 +9024,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
7144
9024
  data_ctx->write(kout3d_data.data(), kout3d_data.size());
7145
9025
  data_ctx->write(vout3d_data.data(), vout3d_data.size());
7146
9026
  }
9027
+
9028
+ for (uint32_t i = 0; i < kv_size; ++i) {
9029
+ const auto & cell = kv_self.cells[i];
9030
+
9031
+ const llama_pos pos = cell.pos;
9032
+ const size_t seq_id_size = cell.seq_id.size();
9033
+
9034
+ data_ctx->write(&pos, sizeof(pos));
9035
+ data_ctx->write(&seq_id_size, sizeof(seq_id_size));
9036
+
9037
+ for (auto seq_id : cell.seq_id) {
9038
+ data_ctx->write(&seq_id, sizeof(seq_id));
9039
+ }
9040
+ }
7147
9041
  }
7148
9042
  }
7149
9043
 
@@ -7215,34 +9109,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
7215
9109
  const int n_embd = hparams.n_embd_gqa();
7216
9110
  const int n_ctx = cparams.n_ctx;
7217
9111
 
7218
- size_t kv_size;
7219
- int kv_ntok;
9112
+ size_t kv_buf_size;
9113
+ uint32_t kv_head;
9114
+ uint32_t kv_size;
7220
9115
 
7221
- memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
7222
- memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
9116
+ memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
9117
+ memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
9118
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
7223
9119
 
7224
- if (kv_size) {
7225
- GGML_ASSERT(kv_self.buf.size == kv_size);
9120
+ if (kv_buf_size) {
9121
+ GGML_ASSERT(kv_self.buf.size == kv_buf_size);
7226
9122
 
7227
9123
  const size_t elt_size = ggml_element_size(kv_self.k);
7228
9124
 
7229
9125
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
7230
9126
  ggml_cgraph gf{};
7231
9127
 
7232
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
9128
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
7233
9129
  kin3d->data = (void *) inp;
7234
9130
  inp += ggml_nbytes(kin3d);
7235
9131
 
7236
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
9132
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
7237
9133
  vin3d->data = (void *) inp;
7238
9134
  inp += ggml_nbytes(vin3d);
7239
9135
 
7240
9136
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
7241
- n_embd, kv_ntok, n_layer,
9137
+ n_embd, kv_head, n_layer,
7242
9138
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
7243
9139
 
7244
9140
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
7245
- kv_ntok, n_embd, n_layer,
9141
+ kv_head, n_embd, n_layer,
7246
9142
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
7247
9143
 
7248
9144
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -7252,8 +9148,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
7252
9148
  ggml_free(cpy_ctx);
7253
9149
  }
7254
9150
 
7255
- ctx->kv_self.head = kv_ntok;
9151
+ ctx->kv_self.head = kv_head;
7256
9152
  ctx->kv_self.size = kv_size;
9153
+
9154
+ ctx->kv_self.cells.resize(kv_size);
9155
+
9156
+ for (uint32_t i = 0; i < kv_size; ++i) {
9157
+ llama_pos pos;
9158
+ size_t seq_id_size;
9159
+
9160
+ memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
9161
+ memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
9162
+
9163
+ ctx->kv_self.cells[i].pos = pos;
9164
+
9165
+ llama_seq_id seq_id;
9166
+
9167
+ for (size_t j = 0; j < seq_id_size; ++j) {
9168
+ memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
9169
+ ctx->kv_self.cells[i].seq_id.insert(seq_id);
9170
+ }
9171
+ }
7257
9172
  }
7258
9173
 
7259
9174
  const size_t nread = inp - src;
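
Taken together, the save and load hunks above change the session-state layout: the header now carries kv_buf_size, kv_head and kv_size instead of the old (kv_size, kv_ntok) pair, and after the copied K/V tensor data every one of the kv_size cells is written as its position followed by the count and values of its sequence ids. A minimal round-trip sketch of just that per-cell section (the pos/seq-id typedefs and the put() helper are stand-ins, not llama.cpp API):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <set>
    #include <utility>
    #include <vector>

    // Illustrative stand-ins for llama_pos / llama_seq_id (both 32-bit ints in llama.h).
    using pos_t    = int32_t;
    using seq_id_t = int32_t;

    // Append the raw bytes of a trivially copyable value, mirroring data_ctx->write().
    template <typename T>
    static void put(std::vector<uint8_t> & buf, const T & v) {
        const uint8_t * p = reinterpret_cast<const uint8_t *>(&v);
        buf.insert(buf.end(), p, p + sizeof(T));
    }

    int main() {
        // Serialize two cells the way the save loop does:
        // pos, seq-id count, then the seq ids themselves.
        std::vector<uint8_t> buf;
        const std::vector<std::pair<pos_t, std::set<seq_id_t>>> cells = {
            {0, {0}},    // cell 0: position 0, sequence 0
            {1, {0, 2}}, // cell 1: position 1, shared by sequences 0 and 2
        };
        for (const auto & c : cells) {
            put(buf, c.first);
            put(buf, c.second.size());
            for (seq_id_t id : c.second) put(buf, id);
        }

        // Read it back exactly as the load loop above does.
        const uint8_t * inp = buf.data();
        for (size_t i = 0; i < cells.size(); ++i) {
            pos_t pos; size_t n_seq;
            std::memcpy(&pos, inp, sizeof(pos));     inp += sizeof(pos);
            std::memcpy(&n_seq, inp, sizeof(n_seq)); inp += sizeof(n_seq);
            std::printf("cell %zu: pos=%d, %zu seq id(s):", i, pos, n_seq);
            for (size_t j = 0; j < n_seq; ++j) {
                seq_id_t id;
                std::memcpy(&id, inp, sizeof(id)); inp += sizeof(id);
                std::printf(" %d", id);
            }
            std::printf("\n");
        }
        return 0;
    }
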
@@ -7471,6 +9386,22 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
7471
9386
  llama_token llama_token_nl(const struct llama_context * ctx) {
7472
9387
  return ctx->model.vocab.linefeed_id;
7473
9388
  }
9389
+ llama_token llama_token_prefix(const struct llama_context * ctx) {
9390
+ return ctx->model.vocab.special_prefix_id;
9391
+ }
9392
+
9393
+ llama_token llama_token_middle(const struct llama_context * ctx) {
9394
+ return ctx->model.vocab.special_middle_id;
9395
+ }
9396
+
9397
+ llama_token llama_token_suffix(const struct llama_context * ctx) {
9398
+ return ctx->model.vocab.special_suffix_id;
9399
+ }
9400
+
9401
+ llama_token llama_token_eot(const struct llama_context * ctx) {
9402
+ return ctx->model.vocab.special_eot_id;
9403
+ }
9404
+
7474
9405
 
7475
9406
  int llama_tokenize(
7476
9407
  const struct llama_model * model,
@@ -7493,35 +9424,70 @@ int llama_tokenize(
7493
9424
  return res.size();
7494
9425
  }
7495
9426
 
9427
+ static std::string llama_decode_text(const std::string & text) {
9428
+ std::string decoded_text;
9429
+ auto unicode_sequences = codepoints_from_utf8(text);
9430
+ for (auto& unicode_sequence : unicode_sequences) {
9431
+ decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
9432
+ }
9433
+
9434
+ return decoded_text;
9435
+ }
9436
+
7496
9437
  // does not write null-terminator to buf
7497
9438
  int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
7498
9439
  if (0 <= token && token < llama_n_vocab(model)) {
7499
- if (llama_is_normal_token(model->vocab, token)) {
7500
- std::string result = model->vocab.id_to_token[token].text;
7501
- if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
9440
+ switch (llama_vocab_get_type(model->vocab)) {
9441
+ case LLAMA_VOCAB_TYPE_SPM: {
9442
+ if (llama_is_normal_token(model->vocab, token)) {
9443
+ std::string result = model->vocab.id_to_token[token].text;
7502
9444
  llama_unescape_whitespace(result);
9445
+ if (length < (int) result.length()) {
9446
+ return -result.length();
9447
+ }
9448
+ memcpy(buf, result.c_str(), result.length());
9449
+ return result.length();
9450
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
9451
+ if (length < 3) {
9452
+ return -3;
9453
+ }
9454
+ memcpy(buf, "\xe2\x96\x85", 3);
9455
+ return 3;
9456
+ } else if (llama_is_control_token(model->vocab, token)) {
9457
+ ;
9458
+ } else if (llama_is_byte_token(model->vocab, token)) {
9459
+ if (length < 1) {
9460
+ return -1;
9461
+ }
9462
+ buf[0] = llama_token_to_byte(model->vocab, token);
9463
+ return 1;
9464
+ } else {
9465
+ // TODO: for now we accept all unsupported token types,
9466
+ // suppressing them like CONTROL tokens.
9467
+ // GGML_ASSERT(false);
7503
9468
  }
7504
- if (length < (int) result.length()) {
7505
- return -result.length();
7506
- }
7507
- memcpy(buf, result.c_str(), result.length());
7508
- return result.length();
7509
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
7510
- if (length < 3) {
7511
- return -3;
7512
- }
7513
- buf[0] = '\xe2';
7514
- buf[1] = '\x96';
7515
- buf[2] = '\x85';
7516
- return 3;
7517
- } else if (llama_is_control_token(model->vocab, token)) {
7518
- // do nothing
7519
- } else if (llama_is_byte_token(model->vocab, token)) {
7520
- if (length < 1) {
7521
- return -1;
9469
+ break;
9470
+ }
9471
+ case LLAMA_VOCAB_TYPE_BPE: {
9472
+ if (llama_is_normal_token(model->vocab, token)) {
9473
+ std::string result = model->vocab.id_to_token[token].text;
9474
+ result = llama_decode_text(result);
9475
+ if (length < (int) result.length()) {
9476
+ return -result.length();
9477
+ }
9478
+ memcpy(buf, result.c_str(), result.length());
9479
+ return result.length();
9480
+ } else if (llama_is_control_token(model->vocab, token)) {
9481
+ ;
9482
+ } else {
9483
+ // TODO: for now we accept all unsupported token types,
9484
+ // suppressing them like CONTROL tokens.
9485
+ // GGML_ASSERT(false);
7522
9486
  }
7523
- buf[0] = llama_token_to_byte(model->vocab, token);
7524
- return 1;
9487
+ break;
9488
+ }
9489
+ default:
9490
+ GGML_ASSERT(false);
7525
9491
  }
7526
9492
  }
7527
9493
  return 0;
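
For callers, the reworked llama_token_to_piece keeps the contract visible above: a non-negative return is the number of bytes written, a negative return is minus the required buffer size, and 0 comes back for control and other suppressed tokens. A hedged usage sketch of the resize-and-retry pattern (the wrapper name is illustrative; it compiles against llama.h but needs a loaded model to actually run):

    #include <string>
    #include "llama.h"

    // Convert one token to its text piece, growing the buffer if the first
    // call reports (as a negative value) that more space is needed.
    static std::string token_to_piece(const llama_model * model, llama_token token) {
        std::string piece(8, '\0');  // small initial guess
        int n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
        if (n < 0) {
            piece.resize(-n);        // the size the callee asked for
            n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
        }
        piece.resize(n > 0 ? n : 0); // 0 for control/suppressed tokens
        return piece;
    }
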
@@ -7548,14 +9514,14 @@ void llama_print_timings(struct llama_context * ctx) {
7548
9514
  const llama_timings timings = llama_get_timings(ctx);
7549
9515
 
7550
9516
  LLAMA_LOG_INFO("\n");
7551
- LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
7552
- LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
9517
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
9518
+ LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7553
9519
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
7554
- LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
9520
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
7555
9521
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
7556
- LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
9522
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7557
9523
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
7558
- LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
9524
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
7559
9525
  }
7560
9526
 
7561
9527
  void llama_reset_timings(struct llama_context * ctx) {