llama_cpp 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,6 +75,7 @@
75
75
  #include <thread>
76
76
  #include <unordered_map>
77
77
  #include <set>
78
+ #include <forward_list>
78
79
 
79
80
  #if defined(_MSC_VER)
80
81
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -186,7 +187,9 @@ enum llm_arch {
186
187
  LLM_ARCH_GPTNEOX,
187
188
  LLM_ARCH_MPT,
188
189
  LLM_ARCH_STARCODER,
190
+ LLM_ARCH_PERSIMMON,
189
191
  LLM_ARCH_REFACT,
192
+ LLM_ARCH_BLOOM,
190
193
  LLM_ARCH_UNKNOWN,
191
194
  };
192
195
 
@@ -199,7 +202,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
199
202
  { LLM_ARCH_MPT, "mpt" },
200
203
  { LLM_ARCH_BAICHUAN, "baichuan" },
201
204
  { LLM_ARCH_STARCODER, "starcoder" },
202
- { LLM_ARCH_REFACT, "refact" },
205
+ { LLM_ARCH_PERSIMMON, "persimmon" },
206
+ { LLM_ARCH_REFACT, "refact" },
207
+ { LLM_ARCH_BLOOM, "bloom" },
203
208
  };
204
209
 
205
210
  enum llm_kv {
@@ -302,6 +307,7 @@ struct LLM_KV {
302
307
 
303
308
  enum llm_tensor {
304
309
  LLM_TENSOR_TOKEN_EMBD,
310
+ LLM_TENSOR_TOKEN_EMBD_NORM,
305
311
  LLM_TENSOR_POS_EMBD,
306
312
  LLM_TENSOR_OUTPUT,
307
313
  LLM_TENSOR_OUTPUT_NORM,
@@ -318,6 +324,8 @@ enum llm_tensor {
318
324
  LLM_TENSOR_FFN_DOWN,
319
325
  LLM_TENSOR_FFN_UP,
320
326
  LLM_TENSOR_FFN_NORM,
327
+ LLM_TENSOR_ATTN_Q_NORM,
328
+ LLM_TENSOR_ATTN_K_NORM,
321
329
  };
322
330
 
323
331
  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -399,10 +407,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
399
407
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
400
408
  },
401
409
  },
410
+ {
411
+ LLM_ARCH_PERSIMMON,
412
+ {
413
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
414
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
415
+ { LLM_TENSOR_OUTPUT, "output"},
416
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
417
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
418
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
419
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
420
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
421
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
422
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
423
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
424
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
425
+ },
426
+ },
402
427
  {
403
428
  LLM_ARCH_MPT,
404
429
  {
405
430
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
431
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
432
+ { LLM_TENSOR_OUTPUT, "output" },
433
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
434
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
435
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
436
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
437
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
438
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
406
439
  },
407
440
  },
408
441
  {
@@ -437,6 +470,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
437
470
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
438
471
  },
439
472
  },
473
+ {
474
+ LLM_ARCH_BLOOM,
475
+ {
476
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
477
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
478
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
479
+ { LLM_TENSOR_OUTPUT, "output" },
480
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
481
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
482
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
483
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
484
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
485
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
486
+ },
487
+ },
440
488
  {
441
489
  LLM_ARCH_UNKNOWN,
442
490
  {
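
The per-architecture tables above map each llm_tensor to a printf-style name pattern; the "blk.%d" patterns are expanded with the layer index and a "weight"/"bias" suffix when the loader creates tensors (see the tn(...) calls further down in this diff). A minimal standalone sketch of that expansion (the helper name format_tensor_name is hypothetical):

    #include <cstdio>
    #include <string>

    // Hypothetical helper mirroring how full GGUF tensor names are built from the
    // patterns above: expand "%d" with the layer index, then append the suffix.
    static std::string format_tensor_name(const char * pattern, const char * suffix, int layer) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), pattern, layer);
        return std::string(buf) + "." + suffix;
    }

    // format_tensor_name("blk.%d.attn_qkv", "weight", 3) -> "blk.3.attn_qkv.weight"
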
@@ -954,6 +1002,7 @@ enum e_model {
954
1002
  MODEL_1B,
955
1003
  MODEL_3B,
956
1004
  MODEL_7B,
1005
+ MODEL_8B,
957
1006
  MODEL_13B,
958
1007
  MODEL_15B,
959
1008
  MODEL_30B,
@@ -984,6 +1033,9 @@ struct llama_hparams {
984
1033
  float rope_freq_base_train;
985
1034
  float rope_freq_scale_train;
986
1035
 
1036
+ float f_clamp_kqv;
1037
+ float f_max_alibi_bias;
1038
+
987
1039
  bool operator!=(const llama_hparams & other) const {
988
1040
  if (this->vocab_only != other.vocab_only) return true;
989
1041
  if (this->n_vocab != other.n_vocab) return true;
@@ -1036,6 +1088,10 @@ struct llama_layer {
1036
1088
  struct ggml_tensor * attn_norm_b;
1037
1089
  struct ggml_tensor * attn_norm_2;
1038
1090
  struct ggml_tensor * attn_norm_2_b;
1091
+ struct ggml_tensor * attn_q_norm;
1092
+ struct ggml_tensor * attn_q_norm_b;
1093
+ struct ggml_tensor * attn_k_norm;
1094
+ struct ggml_tensor * attn_k_norm_b;
1039
1095
 
1040
1096
  // attention
1041
1097
  struct ggml_tensor * wq;
@@ -1077,6 +1133,9 @@ struct llama_kv_cell {
1077
1133
  struct llama_kv_cache {
1078
1134
  bool has_shift = false;
1079
1135
 
1136
+ // Note: The value of head isn't only used to optimize searching
1137
+ // for a free KV slot. llama_decode_internal also uses it, so it
1138
+ // cannot be freely changed after a slot has been allocated.
1080
1139
  uint32_t head = 0;
1081
1140
  uint32_t size = 0;
1082
1141
 
@@ -1120,6 +1179,8 @@ struct llama_vocab {
1120
1179
  std::unordered_map<token, id> token_to_id;
1121
1180
  std::vector<token_data> id_to_token;
1122
1181
 
1182
+ std::unordered_map<token, id> special_tokens_cache;
1183
+
1123
1184
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;
1124
1185
 
1125
1186
  // default LLaMA special tokens
@@ -1162,6 +1223,8 @@ struct llama_model {
1162
1223
 
1163
1224
  struct ggml_tensor * tok_embeddings;
1164
1225
  struct ggml_tensor * pos_embeddings;
1226
+ struct ggml_tensor * tok_norm;
1227
+ struct ggml_tensor * tok_norm_b;
1165
1228
 
1166
1229
  struct ggml_tensor * output_norm;
1167
1230
  struct ggml_tensor * output_norm_b;
@@ -1291,7 +1354,11 @@ static bool llama_kv_cache_init(
1291
1354
  cache.cells.clear();
1292
1355
  cache.cells.resize(n_ctx);
1293
1356
 
1357
+ // TODO: this should be:
1358
+ // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1359
+ // change it and test that it works
1294
1360
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1361
+ memset(cache.buf.data, 0, cache.buf.size);
1295
1362
 
1296
1363
  struct ggml_init_params params;
1297
1364
  params.mem_size = cache.buf.size;
@@ -1334,6 +1401,8 @@ static bool llama_kv_cache_init(
1334
1401
 
1335
1402
  // find an empty slot of size "n_tokens" in the cache
1336
1403
  // updates the cache head
1404
+ // Note: On success, it's important that cache.head points
1405
+ // to the first cell of the slot.
1337
1406
  static bool llama_kv_cache_find_slot(
1338
1407
  struct llama_kv_cache & cache,
1339
1408
  const struct llama_batch & batch) {
@@ -1349,8 +1418,8 @@ static bool llama_kv_cache_find_slot(
1349
1418
 
1350
1419
  while (true) {
1351
1420
  if (cache.head + n_tokens > n_ctx) {
1421
+ n_tested += n_ctx - cache.head;
1352
1422
  cache.head = 0;
1353
- n_tested += n_ctx - cache.head;
1354
1423
  continue;
1355
1424
  }
1356
1425
 
@@ -1376,7 +1445,10 @@ static bool llama_kv_cache_find_slot(
1376
1445
 
1377
1446
  for (uint32_t i = 0; i < n_tokens; i++) {
1378
1447
  cache.cells[cache.head + i].pos = batch.pos[i];
1379
- cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
1448
+
1449
+ for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
1450
+ cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
1451
+ }
1380
1452
  }
1381
1453
 
1382
1454
  return true;
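
Two things change in the slot search above: the cells skipped at the end of the cache are now counted before head is wrapped back to 0 (the old order added n_ctx - 0 to n_tested, making the search give up too early), and each newly filled cell can carry several sequence ids, taken from batch.n_seq_id[i] / batch.seq_id[i][j]. A simplified, self-contained sketch of the corrected ring-style search, with cells reduced to just a pos field:

    #include <cstdint>
    #include <vector>

    struct cell { int32_t pos = -1; };

    // Minimal sketch of the slot search above: the cells skipped at the tail must be
    // counted *before* head is reset, otherwise n_tested over-counts by `head` cells.
    static bool find_slot(std::vector<cell> & cells, uint32_t & head, uint32_t n_tokens) {
        const uint32_t n_ctx = (uint32_t) cells.size();
        uint32_t n_tested = 0;

        while (true) {
            if (head + n_tokens > n_ctx) {
                n_tested += n_ctx - head; // count the skipped tail first...
                head = 0;                 // ...then wrap around
                continue;
            }

            bool found = true;
            for (uint32_t i = 0; i < n_tokens; i++) {
                if (cells[head + i].pos >= 0) { // cell already occupied
                    found = false;
                    head     += i + 1;
                    n_tested += i + 1;
                    break;
                }
            }

            if (found) {
                return true; // head now points at the first cell of the free slot
            }
            if (n_tested >= n_ctx) {
                return false; // no contiguous run of n_tokens free cells
            }
        }
    }
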
@@ -1401,6 +1473,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
1401
1473
  cache.cells[i].pos = -1;
1402
1474
  cache.cells[i].seq_id.clear();
1403
1475
  }
1476
+
1477
+ // Searching for a free slot can start here since we know it will be empty.
1478
+ cache.head = uint32_t(c0);
1404
1479
  }
1405
1480
 
1406
1481
  static void llama_kv_cache_seq_rm(
@@ -1408,6 +1483,8 @@ static void llama_kv_cache_seq_rm(
1408
1483
  llama_seq_id seq_id,
1409
1484
  llama_pos p0,
1410
1485
  llama_pos p1) {
1486
+ uint32_t new_head = cache.size;
1487
+
1411
1488
  if (p0 < 0) p0 = 0;
1412
1489
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1413
1490
 
@@ -1416,9 +1493,13 @@ static void llama_kv_cache_seq_rm(
1416
1493
  cache.cells[i].seq_id.erase(seq_id);
1417
1494
  if (cache.cells[i].seq_id.empty()) {
1418
1495
  cache.cells[i].pos = -1;
1496
+ if (new_head == cache.size) new_head = i;
1419
1497
  }
1420
1498
  }
1421
1499
  }
1500
+
1501
+ // If we freed up a slot, set head to it so searching can start there.
1502
+ if (new_head != cache.size) cache.head = new_head;
1422
1503
  }
1423
1504
 
1424
1505
  static void llama_kv_cache_seq_cp(
@@ -1430,6 +1511,8 @@ static void llama_kv_cache_seq_cp(
1430
1511
  if (p0 < 0) p0 = 0;
1431
1512
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1432
1513
 
1514
+ cache.head = 0;
1515
+
1433
1516
  for (uint32_t i = 0; i < cache.size; ++i) {
1434
1517
  if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1435
1518
  cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1438,12 +1521,21 @@ static void llama_kv_cache_seq_cp(
1438
1521
  }
1439
1522
 
1440
1523
  static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
1524
+ uint32_t new_head = cache.size;
1525
+
1441
1526
  for (uint32_t i = 0; i < cache.size; ++i) {
1442
1527
  if (!cache.cells[i].has_seq_id(seq_id)) {
1443
1528
  cache.cells[i].pos = -1;
1444
1529
  cache.cells[i].seq_id.clear();
1530
+ if (new_head == cache.size) new_head = i;
1531
+ } else {
1532
+ cache.cells[i].seq_id.clear();
1533
+ cache.cells[i].seq_id.insert(seq_id);
1445
1534
  }
1446
1535
  }
1536
+
1537
+ // If we freed up a slot, set head to it so searching can start there.
1538
+ if (new_head != cache.size) cache.head = new_head;
1447
1539
  }
1448
1540
 
1449
1541
  static void llama_kv_cache_seq_shift(
@@ -1452,6 +1544,8 @@ static void llama_kv_cache_seq_shift(
1452
1544
  llama_pos p0,
1453
1545
  llama_pos p1,
1454
1546
  llama_pos delta) {
1547
+ uint32_t new_head = cache.size;
1548
+
1455
1549
  if (p0 < 0) p0 = 0;
1456
1550
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1457
1551
 
@@ -1461,12 +1555,17 @@ static void llama_kv_cache_seq_shift(
1461
1555
  if (cache.cells[i].pos < 0) {
1462
1556
  cache.cells[i].pos = -1;
1463
1557
  cache.cells[i].seq_id.clear();
1558
+ if (new_head == cache.size) new_head = i;
1464
1559
  } else {
1465
1560
  cache.has_shift = true;
1466
1561
  cache.cells[i].delta = delta;
1467
1562
  }
1468
1563
  }
1469
1564
  }
1565
+
1566
+ // If we freed up a slot, set head to it so searching can start there.
1567
+ // Otherwise we just start the next search from the beginning.
1568
+ cache.head = new_head != cache.size ? new_head : 0;
1470
1569
  }
1471
1570
 
1472
1571
  //
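
A pattern shared by the llama_kv_cache_seq_rm / _seq_keep / _seq_shift hunks above: each function now remembers the index of the first cell it frees and, if it freed anything, moves cache.head there so the next llama_kv_cache_find_slot call starts scanning from a cell that is known to be empty (seq_shift falls back to 0 otherwise). A self-contained sketch of that bookkeeping, with a hypothetical keep predicate standing in for the per-function condition:

    #include <cstdint>
    #include <set>
    #include <vector>

    struct kv_cell {
        int32_t pos = -1;
        std::set<int32_t> seq_id;
    };

    // Sketch of the "first freed cell" bookkeeping added in the hunks above:
    // new_head starts at cells.size() ("nothing freed yet") and is set to the index
    // of the first cell we clear; head is only moved if a cell was actually freed.
    template <typename Pred>
    static void free_cells_and_update_head(std::vector<kv_cell> & cells, uint32_t & head, Pred keep) {
        const uint32_t size = (uint32_t) cells.size();
        uint32_t new_head = size;

        for (uint32_t i = 0; i < size; ++i) {
            if (!keep(cells[i])) {
                cells[i].pos = -1;
                cells[i].seq_id.clear();
                if (new_head == size) new_head = i;
            }
        }

        if (new_head != size) head = new_head;
    }
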
@@ -1670,7 +1769,7 @@ struct llama_model_loader {
1670
1769
  }
1671
1770
  }
1672
1771
 
1673
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
1772
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
1674
1773
  if (backend != GGML_BACKEND_CPU) {
1675
1774
  ggml_set_no_alloc(ctx, true);
1676
1775
  }
@@ -1688,7 +1787,7 @@ struct llama_model_loader {
1688
1787
  return tensor;
1689
1788
  }
1690
1789
 
1691
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
1790
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
1692
1791
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1693
1792
 
1694
1793
  if (cur == NULL) {
@@ -1867,6 +1966,7 @@ static const char * llama_model_type_name(e_model type) {
1867
1966
  case MODEL_1B: return "1B";
1868
1967
  case MODEL_3B: return "3B";
1869
1968
  case MODEL_7B: return "7B";
1969
+ case MODEL_8B: return "8B";
1870
1970
  case MODEL_13B: return "13B";
1871
1971
  case MODEL_15B: return "15B";
1872
1972
  case MODEL_30B: return "30B";
@@ -1979,6 +2079,14 @@ static void llm_load_hparams(
1979
2079
  default: model.type = e_model::MODEL_UNKNOWN;
1980
2080
  }
1981
2081
  } break;
2082
+ case LLM_ARCH_PERSIMMON:
2083
+ {
2084
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2085
+ switch (hparams.n_layer) {
2086
+ case 36: model.type = e_model::MODEL_8B; break;
2087
+ default: model.type = e_model::MODEL_UNKNOWN;
2088
+ }
2089
+ } break;
1982
2090
  case LLM_ARCH_REFACT:
1983
2091
  {
1984
2092
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -1987,6 +2095,33 @@ static void llm_load_hparams(
1987
2095
  default: model.type = e_model::MODEL_UNKNOWN;
1988
2096
  }
1989
2097
  } break;
2098
+ case LLM_ARCH_BLOOM:
2099
+ {
2100
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2101
+
2102
+ switch (hparams.n_layer) {
2103
+ case 24: model.type = e_model::MODEL_1B; break;
2104
+ case 30:
2105
+ switch (hparams.n_embd) {
2106
+ case 2560: model.type = e_model::MODEL_3B; break;
2107
+ case 4096: model.type = e_model::MODEL_7B; break;
2108
+ } break;
2109
+ }
2110
+ } break;
2111
+ case LLM_ARCH_MPT:
2112
+ {
2113
+ hparams.f_clamp_kqv = 0.0f;
2114
+
2115
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2116
+ GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2117
+ GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2118
+
2119
+ switch (hparams.n_layer) {
2120
+ case 32: model.type = e_model::MODEL_7B; break;
2121
+ case 48: model.type = e_model::MODEL_30B; break;
2122
+ default: model.type = e_model::MODEL_UNKNOWN;
2123
+ }
2124
+ } break;
1990
2125
  default: (void)0;
1991
2126
  }
1992
2127
 
@@ -1994,7 +2129,7 @@ static void llm_load_hparams(
1994
2129
  }
1995
2130
 
1996
2131
  // TODO: This should probably be in llama.h
1997
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
2132
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
1998
2133
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
1999
2134
 
2000
2135
  static void llm_load_vocab(
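
The forward declaration above gains a special flag (default false), presumably so callers can opt in to matching the entries collected in vocab.special_tokens_cache below; the default keeps the old plain-text behavior. A sketch of the two call shapes implied by the declaration (illustration only; vocab comes from a loaded model and "<s>" is just an example string):

    // bos=true, "<s>" treated as plain text:
    std::vector<llama_vocab::id> ids_plain   = llama_tokenize_internal(vocab, "<s> hello", true);
    // special=true, "<s>" may be matched against the special tokens cache:
    std::vector<llama_vocab::id> ids_special = llama_tokenize_internal(vocab, "<s> hello", true, true);
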
@@ -2110,6 +2245,101 @@ static void llm_load_vocab(
2110
2245
  GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
2111
2246
  GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
2112
2247
  GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
2248
+
2249
+ // build special tokens cache
2250
+ {
2251
+ // TODO: It is unclear (to me) at this point whether special tokens are guaranteed to be of a deterministic type,
2252
+ // and will always be correctly labeled in 'added_tokens.json' etc.
2253
+ // The assumption is that, since special tokens aren't meant to be exposed to the end user, they are designed
2254
+ // to be unmatchable by the tokenizer; therefore, tokens from the vocab which are unmatchable by the tokenizer
2255
+ // are special tokens.
2256
+ // From testing, this appears to correlate 1:1 with special tokens.
2257
+ //
2258
+
2259
+ // Counting special tokens and verifying in only one direction
2260
+ // is sufficient to detect a difference between those two sets.
2261
+ //
2262
+ uint32_t special_tokens_count_by_type = 0;
2263
+ uint32_t special_tokens_count_from_verification = 0;
2264
+
2265
+ bool special_tokens_definition_mismatch = false;
2266
+
2267
+ for (const auto & t : vocab.token_to_id) {
2268
+ const auto & token = t.first;
2269
+ const auto & id = t.second;
2270
+
2271
+ // Count all non-normal tokens in the vocab while iterating
2272
+ if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
2273
+ special_tokens_count_by_type++;
2274
+ }
2275
+
2276
+ // Skip single character tokens
2277
+ if (token.length() > 1) {
2278
+ bool is_tokenizable = false;
2279
+
2280
+ // Split token string representation in two, in all possible ways
2281
+ // and check if both halves can be matched to a valid token
2282
+ for (unsigned i = 1; i < token.length();) {
2283
+ const auto left = token.substr(0, i);
2284
+ const auto right = token.substr(i);
2285
+
2286
+ // check that we didn't partition in the middle of a UTF-8 sequence
2287
+ auto utf = utf8_len(left.at(left.length() - 1));
2288
+
2289
+ if (utf == 1) {
2290
+ if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
2291
+ vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
2292
+ is_tokenizable = true;
2293
+ break;
2294
+ }
2295
+ i++;
2296
+ } else {
2297
+ // skip over the rest of multibyte utf sequence
2298
+ i += utf - 1;
2299
+ }
2300
+ }
2301
+
2302
+ if (!is_tokenizable) {
2303
+ // Some tokens are multibyte but encode a single character, so their UTF-8 text length is 1;
2304
+ // it's faster to re-filter them here, since there are far fewer candidates now
2305
+
2306
+ // Calculate a total "utf" length of a token string representation
2307
+ size_t utf8_str_len = 0;
2308
+ for (unsigned i = 0; i < token.length();) {
2309
+ utf8_str_len++;
2310
+ i += utf8_len(token.at(i));
2311
+ }
2312
+
2313
+ // And skip the ones which are one character
2314
+ if (utf8_str_len > 1) {
2315
+ // At this point what we have left are special tokens only
2316
+ vocab.special_tokens_cache[token] = id;
2317
+
2318
+ // Count manually found special tokens
2319
+ special_tokens_count_from_verification++;
2320
+
2321
+ // If this manually found special token is not marked as such, flag a mismatch
2322
+ if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
2323
+ special_tokens_definition_mismatch = true;
2324
+ }
2325
+ }
2326
+ }
2327
+ }
2328
+ }
2329
+
2330
+ if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
2331
+ LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
2332
+ __func__,
2333
+ special_tokens_count_from_verification, vocab.id_to_token.size(),
2334
+ special_tokens_count_by_type, vocab.id_to_token.size()
2335
+ );
2336
+ } else {
2337
+ LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
2338
+ __func__,
2339
+ special_tokens_count_from_verification, vocab.id_to_token.size()
2340
+ );
2341
+ }
2342
+ }
2113
2343
  }
2114
2344
 
2115
2345
  static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
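
The heuristic in the hunk above treats a vocab entry as special when its text is longer than one UTF-8 character and cannot be reassembled by splitting it into two strings that are both themselves vocab entries. A standalone sketch of that splittability test, assuming a plain map from token text to id and the same first-byte utf8_len convention used above:

    #include <string>
    #include <unordered_map>

    // UTF-8 sequence length from the first byte (same convention as utf8_len above).
    static size_t utf8_seq_len(unsigned char c) {
        if ((c & 0x80) == 0x00) return 1;
        if ((c & 0xE0) == 0xC0) return 2;
        if ((c & 0xF0) == 0xE0) return 3;
        if ((c & 0xF8) == 0xF0) return 4;
        return 1; // continuation or invalid byte: treat as length 1
    }

    // Sketch of the splittability test: can `token` be split at a UTF-8 boundary into
    // two strings that are both themselves in the vocabulary?
    static bool is_tokenizable(const std::unordered_map<std::string, int> & token_to_id, const std::string & token) {
        for (size_t i = 1; i < token.length();) {
            const std::string left  = token.substr(0, i);
            const std::string right = token.substr(i);

            const size_t utf = utf8_seq_len((unsigned char) left.back());
            if (utf == 1) {
                if (token_to_id.count(left) && token_to_id.count(right)) {
                    return true;
                }
                i++;
            } else {
                i += utf - 1; // don't split inside a multi-byte sequence
            }
        }
        return false;
    }
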
@@ -2131,6 +2361,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2131
2361
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2132
2362
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2133
2363
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2364
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
2365
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
2134
2366
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2135
2367
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2136
2368
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
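
The two new values printed above come from the MPT hyperparameters added earlier in this diff: f_clamp_kqv clamps the fused QKV projection to [-c, c] (the default of 0.0f set in llm_load_hparams disables it), and f_max_alibi_bias is the maximum bias used for ALiBi-style attention. A scalar-level sketch of what the clamp is understood to mean (the function name is hypothetical; the real graph builder applies the clamp tensor-wide):

    #include <algorithm>

    // Hypothetical scalar illustration of f_clamp_kqv: QKV activations are clamped
    // to [-c, c]; c == 0.0f (the default) leaves values untouched.
    static float clamp_kqv(float x, float f_clamp_kqv) {
        if (f_clamp_kqv <= 0.0f) {
            return x;
        }
        return std::max(-f_clamp_kqv, std::min(f_clamp_kqv, x));
    }
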
@@ -2230,8 +2462,8 @@ static void llm_load_tensors(
2230
2462
 
2231
2463
  // output
2232
2464
  {
2233
- ggml_backend backend_norm;
2234
- ggml_backend backend_output;
2465
+ ggml_backend_type backend_norm;
2466
+ ggml_backend_type backend_output;
2235
2467
 
2236
2468
  if (n_gpu_layers > int(n_layer)) {
2237
2469
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2266,8 +2498,8 @@ static void llm_load_tensors(
2266
2498
  model.layers.resize(n_layer);
2267
2499
 
2268
2500
  for (uint32_t i = 0; i < n_layer; ++i) {
2269
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2270
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2501
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2502
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2271
2503
 
2272
2504
  auto & layer = model.layers[i];
2273
2505
 
@@ -2296,8 +2528,8 @@ static void llm_load_tensors(
2296
2528
  {
2297
2529
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2298
2530
  {
2299
- ggml_backend backend_norm;
2300
- ggml_backend backend_output;
2531
+ ggml_backend_type backend_norm;
2532
+ ggml_backend_type backend_output;
2301
2533
 
2302
2534
  if (n_gpu_layers > int(n_layer)) {
2303
2535
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2332,8 +2564,8 @@ static void llm_load_tensors(
2332
2564
  model.layers.resize(n_layer);
2333
2565
 
2334
2566
  for (uint32_t i = 0; i < n_layer; ++i) {
2335
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2336
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2567
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2568
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2337
2569
 
2338
2570
  auto & layer = model.layers[i];
2339
2571
 
@@ -2366,8 +2598,8 @@ static void llm_load_tensors(
2366
2598
 
2367
2599
  // output
2368
2600
  {
2369
- ggml_backend backend_norm;
2370
- ggml_backend backend_output;
2601
+ ggml_backend_type backend_norm;
2602
+ ggml_backend_type backend_output;
2371
2603
 
2372
2604
  if (n_gpu_layers > int(n_layer)) {
2373
2605
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2404,8 +2636,8 @@ static void llm_load_tensors(
2404
2636
  model.layers.resize(n_layer);
2405
2637
 
2406
2638
  for (uint32_t i = 0; i < n_layer; ++i) {
2407
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2408
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2639
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2640
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2409
2641
 
2410
2642
  auto & layer = model.layers[i];
2411
2643
 
@@ -2443,8 +2675,8 @@ static void llm_load_tensors(
2443
2675
 
2444
2676
  // output
2445
2677
  {
2446
- ggml_backend backend_norm;
2447
- ggml_backend backend_output;
2678
+ ggml_backend_type backend_norm;
2679
+ ggml_backend_type backend_output;
2448
2680
 
2449
2681
  if (n_gpu_layers > int(n_layer)) {
2450
2682
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2481,8 +2713,8 @@ static void llm_load_tensors(
2481
2713
  model.layers.resize(n_layer);
2482
2714
 
2483
2715
  for (uint32_t i = 0; i < n_layer; ++i) {
2484
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2485
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2716
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2717
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2486
2718
 
2487
2719
  auto & layer = model.layers[i];
2488
2720
 
@@ -2515,117 +2747,327 @@ static void llm_load_tensors(
2515
2747
  }
2516
2748
  }
2517
2749
  } break;
2518
- default:
2519
- throw std::runtime_error("unknown architecture");
2520
- }
2521
- }
2750
+ case LLM_ARCH_PERSIMMON:
2751
+ {
2752
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2522
2753
 
2523
- ml.done_getting_tensors();
2754
+ {
2755
+ ggml_backend_type backend_norm;
2756
+ ggml_backend_type backend_output;
2524
2757
 
2525
- // print memory requirements
2526
- {
2527
- // this is the total memory required to run the inference
2528
- size_t mem_required =
2529
- ctx_size +
2530
- mmapped_size - vram_weights; // weights in VRAM not in memory
2758
+ if (n_gpu_layers > int(n_layer)) {
2759
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2760
+ // on Windows however this is detrimental unless everything is on the GPU
2761
+ #ifndef _WIN32
2762
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2763
+ #else
2764
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2765
+ #endif // _WIN32
2531
2766
 
2532
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2767
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2768
+ } else {
2769
+ backend_norm = GGML_BACKEND_CPU;
2770
+ backend_output = GGML_BACKEND_CPU;
2771
+ }
2533
2772
 
2534
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2535
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2773
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2774
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2775
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2536
2776
 
2537
- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2538
- if (n_gpu_layers > (int) hparams.n_layer) {
2539
- LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2540
- }
2777
+ if (backend_norm == GGML_BACKEND_GPU) {
2778
+ vram_weights += ggml_nbytes(model.output_norm);
2779
+ vram_weights += ggml_nbytes(model.output_norm_b);
2780
+ }
2781
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2782
+ vram_weights += ggml_nbytes(model.output);
2783
+ }
2784
+ }
2541
2785
 
2542
- #ifdef GGML_USE_CUBLAS
2543
- const int max_backend_supported_layers = hparams.n_layer + 3;
2544
- const int max_offloadable_layers = hparams.n_layer + 3;
2545
- #elif defined(GGML_USE_CLBLAST)
2546
- const int max_backend_supported_layers = hparams.n_layer + 1;
2547
- const int max_offloadable_layers = hparams.n_layer + 1;
2548
- #endif // GGML_USE_CUBLAS
2786
+ const uint32_t n_ff = hparams.n_ff;
2787
+ const int i_gpu_start = n_layer - n_gpu_layers;
2788
+ model.layers.resize(n_layer);
2789
+ for (uint32_t i = 0; i < n_layer; ++i) {
2790
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2791
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2792
+ auto & layer = model.layers[i];
2793
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2794
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2795
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2796
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2797
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2798
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2799
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2800
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2801
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2802
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2803
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2804
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2805
+ layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
2806
+ layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
2807
+ layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
2808
+ layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2809
+ }
2810
+ } break;
2811
+ case LLM_ARCH_BLOOM:
2812
+ {
2813
+ // TODO: CPU-only for now
2549
2814
 
2550
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2551
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2552
- #else
2553
- (void) n_gpu_layers;
2554
- #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2555
- }
2815
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2816
+ model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
2817
+ model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
2556
2818
 
2557
- // populate `tensors_by_name`
2558
- for (int i = 0; i < ml.n_tensors; ++i) {
2559
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2560
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2561
- }
2819
+ // output
2820
+ {
2821
+ ggml_backend_type backend_norm;
2822
+ ggml_backend_type backend_output;
2562
2823
 
2563
- (void) tensor_split;
2564
- #ifdef GGML_USE_CUBLAS
2565
- {
2566
- ggml_cuda_set_tensor_split(tensor_split);
2567
- }
2568
- #endif
2824
+ if (n_gpu_layers > int(n_layer)) {
2825
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2826
+ // on Windows however this is detrimental unless everything is on the GPU
2827
+ #ifndef _WIN32
2828
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2829
+ #else
2830
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2831
+ #endif // _WIN32
2569
2832
 
2570
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2833
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2834
+ } else {
2835
+ backend_norm = GGML_BACKEND_CPU;
2836
+ backend_output = GGML_BACKEND_CPU;
2837
+ }
2571
2838
 
2572
- if (progress_callback) {
2573
- progress_callback(1.0f, progress_callback_user_data);
2574
- }
2839
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2840
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2841
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2575
2842
 
2576
- model.mapping = std::move(ml.mapping);
2843
+ if (backend_norm == GGML_BACKEND_GPU) {
2844
+ vram_weights += ggml_nbytes(model.output_norm);
2845
+ vram_weights += ggml_nbytes(model.output_norm_b);
2846
+ }
2847
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2848
+ vram_weights += ggml_nbytes(model.output);
2849
+ }
2850
+ }
2577
2851
 
2578
- // loading time will be recalculate after the first eval, so
2579
- // we take page faults deferred by mmap() into consideration
2580
- model.t_load_us = ggml_time_us() - model.t_start_us;
2581
- }
2852
+ const uint32_t n_ff = hparams.n_ff;
2582
2853
 
2583
- static bool llama_model_load(
2584
- const std::string & fname,
2585
- llama_model & model,
2586
- int n_gpu_layers,
2587
- int main_gpu,
2588
- const float * tensor_split,
2589
- bool use_mmap,
2590
- bool use_mlock,
2591
- bool vocab_only,
2592
- llama_progress_callback progress_callback,
2593
- void *progress_callback_user_data) {
2594
- try {
2595
- llama_model_loader ml(fname, use_mmap);
2854
+ const int i_gpu_start = n_layer - n_gpu_layers;
2596
2855
 
2597
- model.hparams.vocab_only = vocab_only;
2856
+ model.layers.resize(n_layer);
2598
2857
 
2599
- llm_load_arch (ml, model);
2600
- llm_load_hparams(ml, model);
2601
- llm_load_vocab (ml, model);
2858
+ for (uint32_t i = 0; i < n_layer; ++i) {
2859
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2860
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2602
2861
 
2603
- llm_load_print_meta(ml, model);
2862
+ auto & layer = model.layers[i];
2604
2863
 
2605
- if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2606
- throw std::runtime_error("vocab size mismatch");
2607
- }
2864
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2865
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2608
2866
 
2609
- if (vocab_only) {
2610
- LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2611
- return true;
2612
- }
2867
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2868
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2613
2869
 
2614
- llm_load_tensors(
2615
- ml, model, n_gpu_layers,
2616
- main_gpu, tensor_split,
2617
- use_mlock, progress_callback, progress_callback_user_data);
2618
- } catch (const std::exception & err) {
2619
- LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
2620
- return false;
2621
- }
2870
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2871
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2622
2872
 
2623
- return true;
2624
- }
2873
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2874
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2875
+
2876
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2877
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2878
+
2879
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2880
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2881
+
2882
+ if (backend == GGML_BACKEND_GPU) {
2883
+ vram_weights +=
2884
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2885
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2886
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2887
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2888
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
2889
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
2890
+ }
2891
+ }
2892
+ } break;
2893
+ case LLM_ARCH_MPT:
2894
+ {
2895
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2896
+
2897
+ // output
2898
+ {
2899
+ ggml_backend_type backend_norm;
2900
+ ggml_backend_type backend_output;
2901
+
2902
+ if (n_gpu_layers > int(n_layer)) {
2903
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2904
+ // on Windows however this is detrimental unless everything is on the GPU
2905
+ #ifndef _WIN32
2906
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2907
+ #else
2908
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2909
+ #endif // _WIN32
2910
+
2911
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2912
+ } else {
2913
+ backend_norm = GGML_BACKEND_CPU;
2914
+ backend_output = GGML_BACKEND_CPU;
2915
+ }
2916
+
2917
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2918
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2919
+
2920
+ if (backend_norm == GGML_BACKEND_GPU) {
2921
+ vram_weights += ggml_nbytes(model.output_norm);
2922
+ }
2923
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2924
+ vram_weights += ggml_nbytes(model.output);
2925
+ }
2926
+ }
2927
+
2928
+ const uint32_t n_ff = hparams.n_ff;
2929
+
2930
+ const int i_gpu_start = n_layer - n_gpu_layers;
2931
+
2932
+ model.layers.resize(n_layer);
2933
+
2934
+ for (uint32_t i = 0; i < n_layer; ++i) {
2935
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2936
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2937
+
2938
+ auto & layer = model.layers[i];
2939
+
2940
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2941
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2942
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2943
+
2944
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2945
+
2946
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2947
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2948
+
2949
+ if (backend == GGML_BACKEND_GPU) {
2950
+ vram_weights +=
2951
+ ggml_nbytes(layer.attn_norm) +
2952
+ ggml_nbytes(layer.wqkv) +
2953
+ ggml_nbytes(layer.wo) +
2954
+ ggml_nbytes(layer.ffn_norm) +
2955
+ ggml_nbytes(layer.w2) +
2956
+ ggml_nbytes(layer.w3);
2957
+ }
2958
+ }
2959
+ } break;
2960
+ default:
2961
+ throw std::runtime_error("unknown architecture");
2962
+ }
2963
+ }
2964
+
2965
+ ml.done_getting_tensors();
2966
+
2967
+ // print memory requirements
2968
+ {
2969
+ // this is the total memory required to run the inference
2970
+ size_t mem_required =
2971
+ ctx_size +
2972
+ mmapped_size - vram_weights; // weights in VRAM not in memory
2973
+
2974
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2975
+
2976
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2977
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2978
+
2979
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2980
+ if (n_gpu_layers > (int) hparams.n_layer) {
2981
+ LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2982
+ }
2983
+
2984
+ #ifdef GGML_USE_CUBLAS
2985
+ const int max_backend_supported_layers = hparams.n_layer + 3;
2986
+ const int max_offloadable_layers = hparams.n_layer + 3;
2987
+ #elif defined(GGML_USE_CLBLAST)
2988
+ const int max_backend_supported_layers = hparams.n_layer + 1;
2989
+ const int max_offloadable_layers = hparams.n_layer + 1;
2990
+ #endif // GGML_USE_CUBLAS
2991
+
2992
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2993
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2994
+ #else
2995
+ (void) n_gpu_layers;
2996
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2997
+ }
2998
+
2999
+ // populate `tensors_by_name`
3000
+ for (int i = 0; i < ml.n_tensors; ++i) {
3001
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
3002
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3003
+ }
3004
+
3005
+ (void) tensor_split;
3006
+ #ifdef GGML_USE_CUBLAS
3007
+ {
3008
+ ggml_cuda_set_tensor_split(tensor_split);
3009
+ }
3010
+ #endif
3011
+
3012
+ ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
3013
+
3014
+ if (progress_callback) {
3015
+ progress_callback(1.0f, progress_callback_user_data);
3016
+ }
3017
+
3018
+ model.mapping = std::move(ml.mapping);
3019
+
3020
+ // loading time will be recalculate after the first eval, so
3021
+ // we take page faults deferred by mmap() into consideration
3022
+ model.t_load_us = ggml_time_us() - model.t_start_us;
3023
+ }
3024
+
3025
+ static bool llama_model_load(
3026
+ const std::string & fname,
3027
+ llama_model & model,
3028
+ int n_gpu_layers,
3029
+ int main_gpu,
3030
+ const float * tensor_split,
3031
+ bool use_mmap,
3032
+ bool use_mlock,
3033
+ bool vocab_only,
3034
+ llama_progress_callback progress_callback,
3035
+ void *progress_callback_user_data) {
3036
+ try {
3037
+ llama_model_loader ml(fname, use_mmap);
3038
+
3039
+ model.hparams.vocab_only = vocab_only;
3040
+
3041
+ llm_load_arch (ml, model);
3042
+ llm_load_hparams(ml, model);
3043
+ llm_load_vocab (ml, model);
3044
+
3045
+ llm_load_print_meta(ml, model);
3046
+
3047
+ if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
3048
+ throw std::runtime_error("vocab size mismatch");
3049
+ }
3050
+
3051
+ if (vocab_only) {
3052
+ LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
3053
+ return true;
3054
+ }
3055
+
3056
+ llm_load_tensors(
3057
+ ml, model, n_gpu_layers,
3058
+ main_gpu, tensor_split,
3059
+ use_mlock, progress_callback, progress_callback_user_data);
3060
+ } catch (const std::exception & err) {
3061
+ LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
3062
+ return false;
3063
+ }
3064
+
3065
+ return true;
3066
+ }
2625
3067
 
2626
3068
  static struct ggml_cgraph * llm_build_llama(
2627
- llama_context & lctx,
2628
- const llama_batch & batch) {
3069
+ llama_context & lctx,
3070
+ const llama_batch & batch) {
2629
3071
  const auto & model = lctx.model;
2630
3072
  const auto & hparams = model.hparams;
2631
3073
  const auto & cparams = lctx.cparams;
@@ -2663,11 +3105,9 @@ static struct ggml_cgraph * llm_build_llama(
2663
3105
  struct ggml_init_params params = {
2664
3106
  /*.mem_size =*/ buf_compute.size,
2665
3107
  /*.mem_buffer =*/ buf_compute.data,
2666
- /*.no_alloc =*/ false,
3108
+ /*.no_alloc =*/ true,
2667
3109
  };
2668
3110
 
2669
- params.no_alloc = true;
2670
-
2671
3111
  struct ggml_context * ctx0 = ggml_init(params);
2672
3112
 
2673
3113
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -2739,7 +3179,7 @@ static struct ggml_cgraph * llm_build_llama(
2739
3179
  for (int h = 0; h < 1; ++h) {
2740
3180
  for (int j = 0; j < n_tokens; ++j) {
2741
3181
  const llama_pos pos = batch.pos[j];
2742
- const llama_seq_id seq_id = batch.seq_id[j];
3182
+ const llama_seq_id seq_id = batch.seq_id[j][0];
2743
3183
 
2744
3184
  for (int i = 0; i < n_kv; ++i) {
2745
3185
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
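
The change above follows from batch.seq_id becoming a per-token array (paired with batch.n_seq_id, as in the llama_kv_cache_find_slot hunk earlier): the KQ_mask is built against the first sequence id of each token. A minimal sketch of the implied layout for two tokens that each belong to sequence 0 (field names follow this diff; the types are declared locally for the sketch, and real code would use the library's batch struct and helpers):

    #include <cstdint>

    int main() {
        typedef int32_t llama_pos;
        typedef int32_t llama_seq_id;
        typedef int32_t llama_token;

        llama_token    token[2]    = { 1, 2 };     // example token ids
        llama_pos      pos[2]      = { 0, 1 };     // positions within the sequence
        int32_t        n_seq_id[2] = { 1, 1 };     // each token belongs to one sequence
        llama_seq_id   seq0[1]     = { 0 };
        llama_seq_id   seq1[1]     = { 0 };
        llama_seq_id * seq_id[2]   = { seq0, seq1 };

        // A batch built from these arrays is read as batch.seq_id[j][0] in the mask
        // loop above, while llama_kv_cache_find_slot inserts all of
        // batch.seq_id[i][0 .. n_seq_id[i]) into the corresponding cache cell.
        (void) token; (void) pos; (void) n_seq_id; (void) seq_id;
        return 0;
    }
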
@@ -3051,11 +3491,9 @@ static struct ggml_cgraph * llm_build_baichaun(
3051
3491
  struct ggml_init_params params = {
3052
3492
  /*.mem_size =*/ buf_compute.size,
3053
3493
  /*.mem_buffer =*/ buf_compute.data,
3054
- /*.no_alloc =*/ false,
3494
+ /*.no_alloc =*/ true,
3055
3495
  };
3056
3496
 
3057
- params.no_alloc = true;
3058
-
3059
3497
  struct ggml_context * ctx0 = ggml_init(params);
3060
3498
 
3061
3499
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3127,7 +3565,7 @@ static struct ggml_cgraph * llm_build_baichaun(
3127
3565
  for (int h = 0; h < 1; ++h) {
3128
3566
  for (int j = 0; j < n_tokens; ++j) {
3129
3567
  const llama_pos pos = batch.pos[j];
3130
- const llama_seq_id seq_id = batch.seq_id[j];
3568
+ const llama_seq_id seq_id = batch.seq_id[j][0];
3131
3569
 
3132
3570
  for (int i = 0; i < n_kv; ++i) {
3133
3571
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3452,11 +3890,9 @@ static struct ggml_cgraph * llm_build_refact(
3452
3890
  struct ggml_init_params params = {
3453
3891
  /*.mem_size =*/ buf_compute.size,
3454
3892
  /*.mem_buffer =*/ buf_compute.data,
3455
- /*.no_alloc =*/ false,
3893
+ /*.no_alloc =*/ true,
3456
3894
  };
3457
3895
 
3458
- params.no_alloc = true;
3459
-
3460
3896
  struct ggml_context * ctx0 = ggml_init(params);
3461
3897
 
3462
3898
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3528,7 +3964,7 @@ static struct ggml_cgraph * llm_build_refact(
3528
3964
  for (int h = 0; h < 1; ++h) {
3529
3965
  for (int j = 0; j < n_tokens; ++j) {
3530
3966
  const llama_pos pos = batch.pos[j];
3531
- const llama_seq_id seq_id = batch.seq_id[j];
3967
+ const llama_seq_id seq_id = batch.seq_id[j][0];
3532
3968
 
3533
3969
  for (int i = 0; i < n_kv; ++i) {
3534
3970
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3806,11 +4242,9 @@ static struct ggml_cgraph * llm_build_falcon(
3806
4242
  struct ggml_init_params params = {
3807
4243
  /*.mem_size =*/ buf_compute.size,
3808
4244
  /*.mem_buffer =*/ buf_compute.data,
3809
- /*.no_alloc =*/ false,
4245
+ /*.no_alloc =*/ true,
3810
4246
  };
3811
4247
 
3812
- params.no_alloc = true;
3813
-
3814
4248
  struct ggml_context * ctx0 = ggml_init(params);
3815
4249
 
3816
4250
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3882,7 +4316,7 @@ static struct ggml_cgraph * llm_build_falcon(
3882
4316
  for (int h = 0; h < 1; ++h) {
3883
4317
  for (int j = 0; j < n_tokens; ++j) {
3884
4318
  const llama_pos pos = batch.pos[j];
3885
- const llama_seq_id seq_id = batch.seq_id[j];
4319
+ const llama_seq_id seq_id = batch.seq_id[j][0];
3886
4320
 
3887
4321
  for (int i = 0; i < n_kv; ++i) {
3888
4322
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4166,11 +4600,9 @@ static struct ggml_cgraph * llm_build_starcoder(
4166
4600
  struct ggml_init_params params = {
4167
4601
  /*.mem_size =*/ buf_compute.size,
4168
4602
  /*.mem_buffer =*/ buf_compute.data,
4169
- /*.no_alloc =*/ false,
4603
+ /*.no_alloc =*/ true,
4170
4604
  };
4171
4605
 
4172
- params.no_alloc = true;
4173
-
4174
4606
  struct ggml_context * ctx0 = ggml_init(params);
4175
4607
 
4176
4608
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4199,23 +4631,919 @@ static struct ggml_cgraph * llm_build_starcoder(
4199
4631
 
4200
4632
  ggml_allocr_alloc(lctx.alloc, token);
4201
4633
  if (!ggml_allocr_is_measure(lctx.alloc)) {
4202
- memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
4634
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
4635
+ }
4636
+ }
4637
+
4638
+ {
4639
+ // Compute position embeddings.
4640
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4641
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
4642
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4643
+ for (int i = 0; i < n_tokens; ++i) {
4644
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
4645
+ }
4646
+ }
4647
+ ggml_set_name(inp_positions, "inp_positions");
4648
+
4649
+ position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
4650
+ }
4651
+
4652
+ // KQ_scale
4653
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4654
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4655
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4656
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4657
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
4658
+ }
4659
+
4660
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4661
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4662
+ ggml_set_name(KQ_mask, "KQ_mask");
4663
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4664
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4665
+ float * data = (float *) KQ_mask->data;
4666
+ memset(data, 0, ggml_nbytes(KQ_mask));
4667
+
4668
+ for (int h = 0; h < 1; ++h) {
4669
+ for (int j = 0; j < n_tokens; ++j) {
4670
+ const llama_pos pos = batch.pos[j];
4671
+ const llama_seq_id seq_id = batch.seq_id[j][0];
4672
+
4673
+ for (int i = 0; i < n_kv; ++i) {
4674
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4675
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4676
+ }
4677
+ }
4678
+ }
4679
+ }
4680
+ }
4681
+
4682
+ inpL = ggml_add(ctx0, token, position);
4683
+ ggml_set_name(inpL, "inpL");
4684
+
4685
+ for (int il = 0; il < n_layer; ++il) {
4686
+ {
4687
+ // Norm
4688
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4689
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
4690
+ }
4691
+
4692
+ {
4693
+ // Self Attention
4694
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
4695
+
4696
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
4697
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
4698
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
4699
+
4700
+ struct ggml_tensor * Qcur = tmpq;
4701
+ struct ggml_tensor * Kcur = tmpk;
4702
+
4703
+ {
4704
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
4705
+ ggml_set_name(Vcur, "Vcur");
4706
+
4707
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
4708
+ ggml_set_name(k, "k");
4709
+
4710
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4711
+ ( n_ctx)*ggml_element_size(kv_self.v),
4712
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4713
+
4714
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4715
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4716
+ }
4717
+
4718
+ struct ggml_tensor * Q =
4719
+ ggml_permute(ctx0,
4720
+ ggml_cpy(ctx0,
4721
+ Qcur,
4722
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
4723
+ 0, 2, 1, 3);
4724
+ ggml_set_name(Q, "Q");
4725
+
4726
+ struct ggml_tensor * K =
4727
+ ggml_view_3d(ctx0, kv_self.k,
4728
+ n_embd_head, n_kv, n_head_kv,
4729
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4730
+ ggml_element_size(kv_self.k)*n_embd_head,
4731
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
4732
+ ggml_set_name(K, "K");
4733
+
4734
+ // K * Q
4735
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
4736
+ ggml_set_name(KQ, "KQ");
4737
+
4738
+ // KQ_scaled = KQ / sqrt(n_embd_head)
4739
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
4740
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
4741
+ ggml_set_name(KQ_scaled, "KQ_scaled");
4742
+
4743
+ // KQ_masked = mask_past(KQ_scaled)
4744
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
4745
+ ggml_set_name(KQ_masked, "KQ_masked");
4746
+
4747
+ // KQ = soft_max(KQ_masked)
4748
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
4749
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
4750
+
4751
+ // split cached V into n_head heads
4752
+ struct ggml_tensor * V =
4753
+ ggml_view_3d(ctx0, kv_self.v,
4754
+ n_kv, n_embd_head, n_head_kv,
4755
+ ggml_element_size(kv_self.v)*n_ctx,
4756
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
4757
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
4758
+ ggml_set_name(V, "V");
4759
+
4760
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
4761
+ ggml_set_name(KQV, "KQV");
4762
+
4763
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
4764
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
4765
+ ggml_set_name(KQV_merged, "KQV_merged");
4766
+
4767
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
4768
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
4769
+ ggml_set_name(cur, "KQV_merged_contiguous");
4770
+ }
4771
+
4772
+ // Projection
4773
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
4774
+
4775
+ // Add the input
4776
+ cur = ggml_add(ctx0, cur, inpL);
4777
+
4778
+ struct ggml_tensor * inpFF = cur;
4779
+
4780
+ // FF
4781
+ {
4782
+ // Norm
4783
+ {
4784
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
4785
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
4786
+ }
4787
+
4788
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
4789
+
4790
+ // GELU activation
4791
+ cur = ggml_gelu(ctx0, cur);
4792
+
4793
+ // Projection
4794
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
4795
+ }
4796
+
4797
+ inpL = ggml_add(ctx0, cur, inpFF);
4798
+ }
4799
+
4800
+ // Output Norm
4801
+ {
4802
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4803
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
4804
+ }
4805
+ ggml_set_name(cur, "result_norm");
4806
+
4807
+ cur = ggml_mul_mat(ctx0, model.output, cur);
4808
+ ggml_set_name(cur, "result_output");
4809
+
4810
+ ggml_build_forward_expand(gf, cur);
4811
+ ggml_free(ctx0);
4812
+
4813
+ return gf;
4814
+ }
4815
+
4816
+ static struct ggml_cgraph * llm_build_persimmon(
4817
+ llama_context & lctx,
4818
+ const llama_batch & batch) {
4819
+ const auto & model = lctx.model;
4820
+ const auto & hparams = model.hparams;
4821
+
4822
+ const auto & kv_self = lctx.kv_self;
4823
+
4824
+ GGML_ASSERT(!!kv_self.ctx);
4825
+
4826
+ const auto & cparams = lctx.cparams;
4827
+ const int64_t n_embd = hparams.n_embd;
4828
+ const int64_t n_layer = hparams.n_layer;
4829
+ const int64_t n_ctx = cparams.n_ctx;
4830
+ const int64_t n_head_kv = hparams.n_head_kv;
4831
+ const int64_t n_head = hparams.n_head;
4832
+ const int64_t n_embd_head = hparams.n_embd_head();
4833
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
4834
+ const size_t n_rot = n_embd_head / 2;
4835
+
4836
+ const float freq_base = cparams.rope_freq_base;
4837
+ const float freq_scale = cparams.rope_freq_scale;
4838
+ const float norm_eps = hparams.f_norm_eps;
4839
+
4840
+ const int n_gpu_layers = model.n_gpu_layers;
4841
+
4842
+
4843
+ const int32_t n_tokens = batch.n_tokens;
4844
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4845
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
4846
+
4847
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
4848
+
4849
+ auto & buf_compute = lctx.buf_compute;
4850
+ struct ggml_init_params params = {
4851
+ /*.mem_size =*/ buf_compute.size,
4852
+ /*.mem_buffer =*/ buf_compute.data,
4853
+ /*.no_alloc =*/ true,
4854
+ };
4855
+
4856
+ struct ggml_context * ctx0 = ggml_init(params);
4857
+
4858
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
4859
+
4860
+ struct ggml_tensor * cur;
4861
+ struct ggml_tensor * inpL;
4862
+
4863
+ if (batch.token) {
4864
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4865
+
4866
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
4867
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4868
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
4869
+ }
4870
+ ggml_set_name(inp_tokens, "inp_tokens");
4871
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4872
+ } else {
4873
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4874
+ ggml_allocr_alloc(lctx.alloc, inpL);
4875
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4876
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
4877
+ }
4878
+ }
4879
+ const int i_gpu_start = n_layer - n_gpu_layers;
4880
+ (void) i_gpu_start;
4881
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
4882
+ offload_func_t offload_func_kq = llama_nop;
4883
+ offload_func_t offload_func_v = llama_nop;
4884
+ // KQ_scale
4885
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4886
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4887
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4888
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
4889
+ }
4890
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4891
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4892
+ offload_func_kq(KQ_mask);
4893
+ ggml_set_name(KQ_mask, "KQ_mask");
4894
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4895
+
4896
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4897
+ float * data = (float *) KQ_mask->data;
4898
+ memset(data, 0, ggml_nbytes(KQ_mask));
4899
+ for (int h = 0; h < 1; ++h) {
4900
+ for (int j = 0; j < n_tokens; ++j) {
4901
+ const llama_pos pos = batch.pos[j];
4902
+ const llama_seq_id seq_id = batch.seq_id[j][0];
4903
+ for (int i = 0; i < n_kv; ++i) {
4904
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4905
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4906
+ }
4907
+ }
4908
+ }
4909
+ }
4910
+ }
4911
+
4912
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4913
+ offload_func_kq(KQ_pos);
4914
+ ggml_set_name(KQ_pos, "KQ_pos");
4915
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
4916
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4917
+ int * data = (int *) KQ_pos->data;
4918
+ for (int i = 0; i < n_tokens; ++i) {
4919
+ data[i] = batch.pos[i];
4920
+ }
4921
+ }
4922
+ if (do_rope_shift) {
4923
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4924
+ offload_func_kq(K_shift);
4925
+ ggml_set_name(K_shift, "K_shift");
4926
+ ggml_allocr_alloc(lctx.alloc, K_shift);
4927
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4928
+ int * data = (int *) K_shift->data;
4929
+ for (int i = 0; i < n_ctx; ++i) {
4930
+ data[i] = kv_self.cells[i].delta;
4931
+ }
4932
+ }
4933
+ for (int il = 0; il < n_layer; ++il) {
4934
+ struct ggml_tensor * tmp =
4935
+ // we rotate only the first n_rot dimensions.
4936
+ ggml_rope_custom_inplace(ctx0,
4937
+ ggml_view_3d(ctx0, kv_self.k,
4938
+ n_rot, n_head, n_ctx,
4939
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4940
+ ggml_element_size(kv_self.k)*n_embd_head,
4941
+ ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
4942
+ ),
4943
+ K_shift, n_rot, 2, 0, freq_base, freq_scale);
4944
+ offload_func_kq(tmp);
4945
+ ggml_build_forward_expand(gf, tmp);
4946
+ }
4947
+ }
4948
+ for (int il=0; il < n_layer; ++il) {
4949
+ struct ggml_tensor * residual = inpL;
4950
+ offload_func_t offload_func = llama_nop;
4951
+ {
4952
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4953
+ offload_func(cur);
4954
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
4955
+ offload_func(cur);
4956
+ cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
4957
+ offload_func(cur);
4958
+ ggml_format_name(cur, "input_layernorm_%d", il);
4959
+ }
4960
+ // self attention
4961
+ {
4962
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
4963
+ offload_func_kq(cur);
4964
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
4965
+ offload_func_kq(cur);
4966
+
4967
+ // split qkv
4968
+ GGML_ASSERT(n_head_kv == n_head);
4969
+ ggml_set_name(cur, format("qkv_%d", il).c_str());
4970
+ struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
4971
+ offload_func_kq(tmpqkv);
4972
+ struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
4973
+ offload_func_kq(tmpqkv_perm);
4974
+ ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
4975
+ struct ggml_tensor * tmpq = ggml_view_3d(
4976
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4977
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4978
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4979
+ 0
4980
+ );
4981
+ offload_func_kq(tmpq);
4982
+ struct ggml_tensor * tmpk = ggml_view_3d(
4983
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4984
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4985
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4986
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
4987
+ );
4988
+ offload_func_kq(tmpk);
4989
+ // Q/K Layernorm
4990
+ tmpq = ggml_norm(ctx0, tmpq, norm_eps);
4991
+ offload_func_kq(tmpq);
4992
+ tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
4993
+ offload_func_kq(tmpq);
4994
+ tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
4995
+ offload_func_kq(tmpq);
4996
+
4997
+ tmpk = ggml_norm(ctx0, tmpk, norm_eps);
4998
+ offload_func_v(tmpk);
4999
+ tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
5000
+ offload_func_v(tmpk);
5001
+ tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
5002
+ offload_func_v(tmpk);
5003
+
5004
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
5005
+ struct ggml_tensor * qrot = ggml_view_3d(
5006
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5007
+ ggml_element_size(tmpq) * n_embd_head,
5008
+ ggml_element_size(tmpq) * n_embd_head * n_head,
5009
+ 0
5010
+ );
5011
+ offload_func_kq(qrot);
5012
+ ggml_format_name(qrot, "qrot_%d", il);
5013
+ struct ggml_tensor * krot = ggml_view_3d(
5014
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5015
+ ggml_element_size(tmpk) * n_embd_head,
5016
+ ggml_element_size(tmpk) * n_embd_head * n_head,
5017
+ 0
5018
+ );
5019
+ offload_func_kq(krot);
5020
+ ggml_format_name(krot, "krot_%d", il);
5021
+
5022
+ // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
5023
+ struct ggml_tensor * qpass = ggml_view_3d(
5024
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5025
+ ggml_element_size(tmpq) * n_embd_head,
5026
+ ggml_element_size(tmpq) * n_embd_head * n_head,
5027
+ ggml_element_size(tmpq) * n_rot
5028
+ );
5029
+ offload_func_kq(qpass);
5030
+ ggml_format_name(qpass, "qpass_%d", il);
5031
+ struct ggml_tensor * kpass = ggml_view_3d(
5032
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5033
+ ggml_element_size(tmpk) * n_embd_head,
5034
+ ggml_element_size(tmpk) * n_embd_head * n_head,
5035
+ ggml_element_size(tmpk) * n_rot
5036
+ );
5037
+ offload_func_kq(kpass);
5038
+ ggml_format_name(kpass, "kpass_%d", il);
5039
+
5040
+ struct ggml_tensor * qrotated = ggml_rope_custom(
5041
+ ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
5042
+ );
5043
+ offload_func_kq(qrotated);
5044
+ struct ggml_tensor * krotated = ggml_rope_custom(
5045
+ ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
5046
+ );
5047
+ offload_func_kq(krotated);
5048
+ // ggml currently only supports concatenation on dim=2
5049
+ // so we need to permute qrot, qpass, concat, then permute back.
5050
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
5051
+ offload_func_kq(qrotated);
5052
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
5053
+ offload_func_kq(krotated);
5054
+
5055
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
5056
+ offload_func_kq(qpass);
5057
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
5058
+ offload_func_kq(kpass);
5059
+
5060
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
5061
+ offload_func_kq(Qcur);
5062
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
5063
+ offload_func_kq(Kcur);
5064
+
5065
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
5066
+ offload_func_kq(Q);
5067
+
5068
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
5069
+ offload_func_kq(Kcur);
5070
+ {
5071
+ struct ggml_tensor * tmpv = ggml_view_3d(
5072
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
5073
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
5074
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
5075
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
5076
+ );
5077
+ offload_func_v(tmpv);
5078
+ // store K, V in cache
5079
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
5080
+ offload_func_v(Vcur);
5081
+ ggml_set_name(Vcur, "Vcur");
5082
+
5083
+ struct ggml_tensor * k = ggml_view_1d(
5084
+ ctx0, kv_self.k, n_tokens*n_embd_gqa,
5085
+ (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
5086
+ );
5087
+ offload_func_kq(k);
5088
+ ggml_set_name(k, "k");
5089
+
5090
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5091
+ ( n_ctx)*ggml_element_size(kv_self.v),
5092
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5093
+ offload_func_v(v);
5094
+ ggml_set_name(v, "v");
5095
+
5096
+ // important: storing RoPE-ed version of K in the KV cache!
5097
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5098
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5099
+ }
5100
+ struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
5101
+ n_embd_head, n_kv, n_head_kv,
5102
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5103
+ ggml_element_size(kv_self.k)*n_embd_head,
5104
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5105
+
5106
+ offload_func_kq(K);
5107
+ ggml_format_name(K, "K_%d", il);
5108
+
5109
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5110
+ offload_func_kq(KQ);
5111
+ ggml_set_name(KQ, "KQ");
5112
+
5113
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5114
+ offload_func_kq(KQ_scaled);
5115
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5116
+
5117
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5118
+ offload_func_kq(KQ_masked);
5119
+ ggml_set_name(KQ_masked, "KQ_masked");
5120
+
5121
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5122
+ offload_func_kq(KQ_soft_max);
5123
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5124
+
5125
+ struct ggml_tensor * V =
5126
+ ggml_view_3d(ctx0, kv_self.v,
5127
+ n_kv, n_embd_head, n_head_kv,
5128
+ ggml_element_size(kv_self.v)*n_ctx,
5129
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5130
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5131
+ offload_func_v(V);
5132
+ ggml_set_name(V, "V");
5133
+
5134
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5135
+ offload_func_v(KQV);
5136
+ ggml_set_name(KQV, "KQV");
5137
+
5138
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5139
+ offload_func_v(KQV_merged);
5140
+ ggml_set_name(KQV_merged, "KQV_merged");
5141
+
5142
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5143
+ offload_func_v(cur);
5144
+ ggml_set_name(cur, "KQV_merged_contiguous");
5145
+
5146
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5147
+ offload_func(cur);
5148
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
5149
+ offload_func(cur);
5150
+ ggml_set_name(cur, "result_wo");
5151
+ }
5152
+
5153
+ struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
5154
+ offload_func(inpFF);
5155
+ ggml_set_name(inpFF, "inpFF");
5156
+ {
5157
+ // MLP
5158
+ {
5159
+ // Norm
5160
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5161
+ offload_func(cur);
5162
+ cur = ggml_add(ctx0,
5163
+ ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
5164
+ model.layers[il].ffn_norm_b
5165
+ );
5166
+ ggml_set_name(cur, "ffn_norm");
5167
+ offload_func(cur);
5168
+ }
5169
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5170
+ offload_func(cur);
5171
+
5172
+ cur = ggml_add(ctx0, cur, model.layers[il].b3);
5173
+ offload_func(cur);
5174
+ ggml_set_name(cur, "result_ffn_up");
5175
+
5176
+ cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
5177
+ ggml_set_name(cur, "result_ffn_act");
5178
+ offload_func(cur);
5179
+ offload_func(cur->src[0]);
5180
+
5181
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5182
+ offload_func(cur);
5183
+ cur = ggml_add(ctx0,
5184
+ cur,
5185
+ model.layers[il].b2);
5186
+ offload_func(cur);
5187
+ ggml_set_name(cur, "outFF");
5188
+ }
5189
+ cur = ggml_add(ctx0, cur, inpFF);
5190
+ offload_func(cur);
5191
+ ggml_set_name(cur, "inpFF_+_outFF");
5192
+ inpL = cur;
5193
+ }
5194
+ cur = inpL;
5195
+ {
5196
+ cur = ggml_norm(ctx0, cur, norm_eps);
5197
+ offload_func_nr(cur);
5198
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5199
+ offload_func_nr(cur);
5200
+
5201
+ cur = ggml_add(ctx0, cur, model.output_norm_b);
5202
+ // offload_func_nr(cur);
5203
+
5204
+ ggml_set_name(cur, "result_norm");
5205
+ }
5206
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5207
+ ggml_set_name(cur, "result_output");
5208
+ ggml_build_forward_expand(gf, cur);
5209
+ ggml_free(ctx0);
5210
+ return gf;
5211
+ }
5212
+
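Worth noting from the persimmon builder above: only the first n_rot dimensions of each query/key head are rotated, while the remaining dimensions are passed through untouched and concatenated back on (the qrot/qpass split). Below is a minimal scalar sketch of that idea for a single head vector, assuming a NeoX-style pairing of dimension i with i + n_rot/2; the exact pairing, strides and batching are handled inside ggml's rope kernel, and this helper is illustrative only.

    #include <cmath>
    #include <vector>

    // Illustrative only: apply rotary embedding to the first n_rot dimensions of
    // one head vector of size n_embd_head, leaving the rest unchanged (the "pass"
    // half that the builder concatenates back on). Assumes NeoX-style pairing
    // (i, i + n_rot/2); not the library's implementation.
    static void rope_partial(std::vector<float> & head, int n_rot, int pos, float freq_base) {
        for (int i = 0; i < n_rot/2; ++i) {
            const float theta = pos * std::pow(freq_base, -2.0f*i/n_rot);
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            const float x0 = head[i];
            const float x1 = head[i + n_rot/2];
            head[i]           = x0*c - x1*s;
            head[i + n_rot/2] = x0*s + x1*c;
        }
        // dimensions [n_rot, n_embd_head) are passed through unchanged
    }
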
5213
+ static struct ggml_cgraph * llm_build_bloom(
5214
+ llama_context & lctx,
5215
+ const llama_batch & batch) {
5216
+ const auto & model = lctx.model;
5217
+ const auto & hparams = model.hparams;
5218
+ const auto & cparams = lctx.cparams;
5219
+
5220
+ const auto & kv_self = lctx.kv_self;
5221
+
5222
+ GGML_ASSERT(!!kv_self.ctx);
5223
+
5224
+ const int64_t n_embd = hparams.n_embd;
5225
+ const int64_t n_layer = hparams.n_layer;
5226
+ const int64_t n_ctx = cparams.n_ctx;
5227
+ const int64_t n_head = hparams.n_head;
5228
+ const int64_t n_head_kv = hparams.n_head_kv;
5229
+ const int64_t n_embd_head = hparams.n_embd_head();
5230
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5231
+
5232
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5233
+
5234
+ const float norm_eps = hparams.f_norm_eps;
5235
+
5236
+ const int32_t n_tokens = batch.n_tokens;
5237
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5238
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5239
+
5240
+ auto & buf_compute = lctx.buf_compute;
5241
+
5242
+ struct ggml_init_params params = {
5243
+ /*.mem_size =*/ buf_compute.size,
5244
+ /*.mem_buffer =*/ buf_compute.data,
5245
+ /*.no_alloc =*/ false,
5246
+ };
5247
+
5248
+ params.no_alloc = true;
5249
+
5250
+ struct ggml_context * ctx0 = ggml_init(params);
5251
+
5252
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5253
+
5254
+ struct ggml_tensor * cur;
5255
+ struct ggml_tensor * token;
5256
+ struct ggml_tensor * inpL;
5257
+
5258
+ if (batch.token) {
5259
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5260
+
5261
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5262
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5263
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5264
+ }
5265
+ ggml_set_name(inp_tokens, "inp_tokens");
5266
+
5267
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5268
+ } else {
5269
+ #ifdef GGML_USE_MPI
5270
+ GGML_ASSERT(false && "not implemented");
5271
+ #endif
5272
+
5273
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5274
+
5275
+ ggml_allocr_alloc(lctx.alloc, token);
5276
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5277
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5278
+ }
5279
+ }
5280
+
5281
+ // KQ_scale
5282
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5283
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5284
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5285
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5286
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5287
+ }
5288
+
5289
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5290
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5291
+ ggml_set_name(KQ_mask, "KQ_mask");
5292
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5293
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5294
+ float * data = (float *) KQ_mask->data;
5295
+ memset(data, 0, ggml_nbytes(KQ_mask));
5296
+
5297
+ for (int h = 0; h < 1; ++h) {
5298
+ for (int j = 0; j < n_tokens; ++j) {
5299
+ const llama_pos pos = batch.pos[j];
5300
+ const llama_seq_id seq_id = batch.seq_id[j][0];
5301
+
5302
+ for (int i = 0; i < n_kv; ++i) {
5303
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5304
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5305
+ }
5306
+ }
5307
+ }
5308
+ }
5309
+ }
5310
+
5311
+ // norm
5312
+ {
5313
+ inpL = ggml_norm(ctx0, token, norm_eps);
5314
+ inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5315
+ }
5316
+
5317
+ ggml_set_name(inpL, "inpL");
5318
+
5319
+ for (int il = 0; il < n_layer; ++il) {
5320
+ {
5321
+ // Norm
5322
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5323
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5324
+ }
5325
+
5326
+ {
5327
+ // Self Attention
5328
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5329
+
5330
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5331
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5332
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5333
+
5334
+ struct ggml_tensor * Qcur = tmpq;
5335
+ struct ggml_tensor * Kcur = tmpk;
5336
+
5337
+ // store key and value to memory
5338
+ {
5339
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5340
+ ggml_set_name(Vcur, "Vcur");
5341
+
5342
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5343
+ ggml_set_name(k, "k");
5344
+
5345
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5346
+ ( n_ctx)*ggml_element_size(kv_self.v),
5347
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5348
+
5349
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5350
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5351
+ }
5352
+
5353
+ struct ggml_tensor * Q =
5354
+ ggml_permute(ctx0,
5355
+ ggml_cpy(ctx0,
5356
+ Qcur,
5357
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5358
+ 0, 2, 1, 3);
5359
+ ggml_set_name(Q, "Q");
5360
+
5361
+ struct ggml_tensor * K =
5362
+ ggml_view_3d(ctx0, kv_self.k,
5363
+ n_embd_head, n_kv, n_head_kv,
5364
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5365
+ ggml_element_size(kv_self.k)*n_embd_head,
5366
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5367
+ ggml_set_name(K, "K");
5368
+
5369
+ // K * Q
5370
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5371
+ ggml_set_name(KQ, "KQ");
5372
+
5373
+ // KQ_scaled = KQ / sqrt(n_embd_head)
5374
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5375
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5376
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5377
+
5378
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5379
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5380
+
5381
+ // KQ_masked = mask_past(KQ_scaled)
5382
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5383
+ ggml_set_name(KQ_masked, "KQ_masked");
5384
+
5385
+ // KQ = soft_max(KQ_masked)
5386
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5387
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5388
+
5389
+ // split cached V into n_head heads
5390
+ struct ggml_tensor * V =
5391
+ ggml_view_3d(ctx0, kv_self.v,
5392
+ n_kv, n_embd_head, n_head_kv,
5393
+ ggml_element_size(kv_self.v)*n_ctx,
5394
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5395
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5396
+ ggml_set_name(V, "V");
5397
+
5398
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5399
+ ggml_set_name(KQV, "KQV");
5400
+
5401
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
5402
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5403
+ ggml_set_name(KQV_merged, "KQV_merged");
5404
+
5405
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5406
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5407
+ ggml_set_name(cur, "KQV_merged_contiguous");
5408
+ }
5409
+
5410
+ // Projection
5411
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5412
+
5413
+ // Add the input
5414
+ cur = ggml_add(ctx0, cur, inpL);
5415
+
5416
+ struct ggml_tensor * inpFF = cur;
5417
+
5418
+ // FF
5419
+ {
5420
+ // Norm
5421
+ {
5422
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5423
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5424
+ }
5425
+
5426
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5427
+
5428
+ // GELU activation
5429
+ cur = ggml_gelu(ctx0, cur);
5430
+
5431
+ // Projection
5432
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5433
+ }
5434
+
5435
+ inpL = ggml_add(ctx0, cur, inpFF);
5436
+ }
5437
+
5438
+ // Output Norm
5439
+ {
5440
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5441
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5442
+ }
5443
+ ggml_set_name(cur, "result_norm");
5444
+
5445
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5446
+ ggml_set_name(cur, "result_output");
5447
+
5448
+ ggml_build_forward_expand(gf, cur);
5449
+
5450
+ ggml_free(ctx0);
5451
+
5452
+ return gf;
5453
+ }
5454
+
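The BLOOM builder relies on ggml_alibi instead of rotary embeddings: each head gets a fixed linear bias over query/key distance added to the scaled scores before masking (a maximum bias of 8 is passed above). A hedged sketch of the standard ALiBi bias, assuming n_head is a power of two; ggml's kernel applies this in place over the whole score tensor.

    #include <cmath>

    // Illustrative only: standard ALiBi bias for head h.
    // slope(h) = 2^(-max_bias * (h+1) / n_head); the bias added to the score of
    // query position i attending to key position j (j <= i) is -slope * (i - j).
    static float alibi_bias(int h, int n_head, int i, int j, float max_bias /* = 8.0f */) {
        const float slope = std::pow(2.0f, -max_bias * (h + 1) / (float) n_head);
        return -slope * (i - j);
    }
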
5455
+ static struct ggml_cgraph * llm_build_mpt(
5456
+ llama_context & lctx,
5457
+ const llama_batch & batch) {
5458
+ const auto & model = lctx.model;
5459
+ const auto & hparams = model.hparams;
5460
+ const auto & cparams = lctx.cparams;
5461
+
5462
+ const auto & kv_self = lctx.kv_self;
5463
+
5464
+ GGML_ASSERT(!!kv_self.ctx);
5465
+
5466
+ const int64_t n_embd = hparams.n_embd;
5467
+ const int64_t n_layer = hparams.n_layer;
5468
+ const int64_t n_ctx = cparams.n_ctx;
5469
+ const int64_t n_head = hparams.n_head;
5470
+ const int64_t n_head_kv = hparams.n_head_kv;
5471
+ const int64_t n_embd_head = hparams.n_embd_head();
5472
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5473
+
5474
+ const float norm_eps = hparams.f_norm_eps;
5475
+ const float clamp_kqv = hparams.f_clamp_kqv;
5476
+ const float max_alibi_bias = hparams.f_max_alibi_bias;
5477
+
5478
+ const int n_gpu_layers = model.n_gpu_layers;
5479
+
5480
+ const int32_t n_tokens = batch.n_tokens;
5481
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5482
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5483
+
5484
+ auto & buf_compute = lctx.buf_compute;
5485
+
5486
+ struct ggml_init_params params = {
5487
+ /*.mem_size =*/ buf_compute.size,
5488
+ /*.mem_buffer =*/ buf_compute.data,
5489
+ /*.no_alloc =*/ false,
5490
+ };
5491
+
5492
+ params.no_alloc = true;
5493
+
5494
+ struct ggml_context * ctx0 = ggml_init(params);
5495
+
5496
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5497
+
5498
+ struct ggml_tensor * cur;
5499
+ struct ggml_tensor * inpL;
5500
+
5501
+ //int warmup = 0;
5502
+ if (batch.token) {
5503
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5504
+
5505
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5506
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5507
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5508
+ //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
5509
+ }
5510
+
5511
+ ggml_set_name(inp_tokens, "inp_tokens");
5512
+
5513
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5514
+ } else {
5515
+ #ifdef GGML_USE_MPI
5516
+ GGML_ASSERT(false && "not implemented");
5517
+ #endif
5518
+
5519
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5520
+
5521
+ ggml_allocr_alloc(lctx.alloc, inpL);
5522
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5523
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
4203
5524
  }
4204
5525
  }
4205
5526
 
4206
- {
4207
- // Compute position embeddings.
4208
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4209
- ggml_allocr_alloc(lctx.alloc, inp_positions);
4210
- if (!ggml_allocr_is_measure(lctx.alloc)) {
4211
- for (int i = 0; i < n_tokens; ++i) {
4212
- ((int32_t *) inp_positions->data)[i] = batch.pos[i];
4213
- }
4214
- }
4215
- ggml_set_name(inp_positions, "inp_positions");
5527
+ const int i_gpu_start = n_layer - n_gpu_layers;
5528
+ (void) i_gpu_start;
4216
5529
 
4217
- position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
5530
+ // offload functions set the tensor output backend to GPU
5531
+ // tensors are GPU-accelerated if any input or the output has been offloaded
5532
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
5533
+ offload_func_t offload_func_kq = llama_nop;
5534
+ offload_func_t offload_func_v = llama_nop;
5535
+
5536
+ #ifdef GGML_USE_CUBLAS
5537
+ if (n_gpu_layers > n_layer) {
5538
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
4218
5539
  }
5540
+ if (n_gpu_layers > n_layer + 1) {
5541
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
5542
+ }
5543
+ if (n_gpu_layers > n_layer + 2) {
5544
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
5545
+ }
5546
+ #endif // GGML_USE_CUBLAS
4219
5547
 
4220
5548
  // KQ_scale
4221
5549
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
@@ -4227,6 +5555,7 @@ static struct ggml_cgraph * llm_build_starcoder(
4227
5555
 
4228
5556
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4229
5557
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5558
+ offload_func_kq(KQ_mask);
4230
5559
  ggml_set_name(KQ_mask, "KQ_mask");
4231
5560
  ggml_allocr_alloc(lctx.alloc, KQ_mask);
4232
5561
  if (!ggml_allocr_is_measure(lctx.alloc)) {
@@ -4236,7 +5565,7 @@ static struct ggml_cgraph * llm_build_starcoder(
4236
5565
  for (int h = 0; h < 1; ++h) {
4237
5566
  for (int j = 0; j < n_tokens; ++j) {
4238
5567
  const llama_pos pos = batch.pos[j];
4239
- const llama_seq_id seq_id = batch.seq_id[j];
5568
+ const llama_seq_id seq_id = batch.seq_id[j][0];
4240
5569
 
4241
5570
  for (int i = 0; i < n_kv; ++i) {
4242
5571
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4247,48 +5576,87 @@ static struct ggml_cgraph * llm_build_starcoder(
4247
5576
  }
4248
5577
  }
4249
5578
 
4250
- inpL = ggml_add(ctx0, token, position);
4251
- ggml_set_name(inpL, "inpL");
4252
-
4253
5579
  for (int il = 0; il < n_layer; ++il) {
4254
- {
4255
- // Norm
4256
- cur = ggml_norm(ctx0, inpL, norm_eps);
4257
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5580
+ struct ggml_tensor * attn_norm;
5581
+
5582
+ offload_func_t offload_func = llama_nop;
5583
+
5584
+ #ifdef GGML_USE_CUBLAS
5585
+ if (il >= i_gpu_start) {
5586
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
4258
5587
  }
5588
+ #endif // GGML_USE_CUBLAS
4259
5589
 
5590
+ // self-attention
5591
+ // TODO: refactor into common function (shared with LLaMA)
4260
5592
  {
4261
- // Self Attention
4262
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5593
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
5594
+ offload_func(attn_norm);
4263
5595
 
4264
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
4265
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
4266
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5596
+ attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
5597
+ offload_func(attn_norm);
4267
5598
 
4268
- struct ggml_tensor * Qcur = tmpq;
4269
- struct ggml_tensor * Kcur = tmpk;
5599
+ if (1) {
5600
+ cur = attn_norm;
5601
+ }
5602
+
5603
+ // compute QKV
5604
+
5605
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5606
+ offload_func_kq(cur);
5607
+
5608
+ if (clamp_kqv > 0.0f) {
5609
+ cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
5610
+ offload_func_kq(cur);
5611
+ }
5612
+
5613
+ const size_t wsize = ggml_type_size(cur->type);
5614
+
5615
+ struct ggml_tensor * Qcur = ggml_view_3d(
5616
+ ctx0, cur, n_embd_head, n_head, n_tokens,
5617
+ wsize * n_embd_head,
5618
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5619
+ 0);
5620
+ offload_func_kq(Qcur);
5621
+
5622
+ struct ggml_tensor * Kcur = ggml_view_3d(
5623
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5624
+ wsize * n_embd_head,
5625
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5626
+ wsize * n_embd_head * n_head);
5627
+ offload_func_kq(Kcur);
5628
+
5629
+ struct ggml_tensor * tmpv = ggml_view_3d(
5630
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5631
+ wsize * n_embd_head,
5632
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5633
+ wsize * n_embd_head * (n_head + n_head_kv));
5634
+ offload_func_kq(Kcur);
5635
+
5636
+ ggml_set_name(Qcur, "Qcur");
5637
+ ggml_set_name(Kcur, "Kcur");
4270
5638
 
4271
5639
  {
4272
5640
  struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5641
+ offload_func_v(Vcur);
5642
+ offload_func_v(Vcur->src[0]->src[0]);
4273
5643
  ggml_set_name(Vcur, "Vcur");
4274
5644
 
4275
5645
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5646
+ offload_func_kq(k);
4276
5647
  ggml_set_name(k, "k");
4277
5648
 
4278
5649
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4279
5650
  ( n_ctx)*ggml_element_size(kv_self.v),
4280
5651
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5652
+ offload_func_v(v);
4281
5653
 
4282
5654
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4283
5655
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4284
5656
  }
4285
5657
 
4286
- struct ggml_tensor * Q =
4287
- ggml_permute(ctx0,
4288
- ggml_cpy(ctx0,
4289
- Qcur,
4290
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
4291
- 0, 2, 1, 3);
5658
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
5659
+ offload_func_kq(Q);
4292
5660
  ggml_set_name(Q, "Q");
4293
5661
 
4294
5662
  struct ggml_tensor * K =
@@ -4297,85 +5665,105 @@ static struct ggml_cgraph * llm_build_starcoder(
4297
5665
  ggml_element_size(kv_self.k)*n_embd_gqa,
4298
5666
  ggml_element_size(kv_self.k)*n_embd_head,
4299
5667
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5668
+ offload_func_kq(K);
4300
5669
  ggml_set_name(K, "K");
4301
5670
 
4302
- // K * Q
4303
5671
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5672
+ offload_func_kq(KQ);
4304
5673
  ggml_set_name(KQ, "KQ");
4305
5674
 
4306
- // KQ_scaled = KQ / sqrt(n_embd_head)
4307
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
4308
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5675
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5676
+ offload_func_kq(KQ_scaled);
4309
5677
  ggml_set_name(KQ_scaled, "KQ_scaled");
4310
5678
 
4311
- // KQ_masked = mask_past(KQ_scaled)
4312
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5679
+ // TODO: replace with ggml_add()
5680
+ struct ggml_tensor * KQ_scaled_alibi =
5681
+ ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
5682
+ offload_func_kq(KQ_scaled_alibi);
5683
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5684
+
5685
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5686
+ offload_func_kq(KQ_masked);
4313
5687
  ggml_set_name(KQ_masked, "KQ_masked");
4314
5688
 
4315
- // KQ = soft_max(KQ_masked)
4316
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5689
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
5690
+ offload_func_v(KQ_soft_max);
4317
5691
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
4318
5692
 
4319
- // split cached V into n_head heads
4320
5693
  struct ggml_tensor * V =
4321
5694
  ggml_view_3d(ctx0, kv_self.v,
4322
5695
  n_kv, n_embd_head, n_head_kv,
4323
5696
  ggml_element_size(kv_self.v)*n_ctx,
4324
5697
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
4325
5698
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5699
+ offload_func_v(V);
4326
5700
  ggml_set_name(V, "V");
4327
5701
 
4328
5702
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5703
+ offload_func_v(KQV);
4329
5704
  ggml_set_name(KQV, "KQV");
4330
5705
 
4331
- // KQV_merged = KQV.permute(0, 2, 1, 3)
4332
5706
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5707
+ offload_func_v(KQV_merged);
4333
5708
  ggml_set_name(KQV_merged, "KQV_merged");
4334
5709
 
4335
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
4336
5710
  cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5711
+ offload_func_v(cur);
4337
5712
  ggml_set_name(cur, "KQV_merged_contiguous");
4338
- }
4339
5713
 
4340
- // Projection
4341
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5714
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5715
+ offload_func(cur);
5716
+ ggml_set_name(cur, "result_wo");
5717
+ }
4342
5718
 
4343
5719
  // Add the input
4344
5720
  cur = ggml_add(ctx0, cur, inpL);
5721
+ offload_func(cur);
4345
5722
 
4346
- struct ggml_tensor * inpFF = cur;
5723
+ struct ggml_tensor * attn_out = cur;
4347
5724
 
4348
- // FF
5725
+ // feed forward
4349
5726
  {
4350
5727
  // Norm
4351
5728
  {
4352
- cur = ggml_norm(ctx0, inpFF, norm_eps);
4353
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5729
+ cur = ggml_norm(ctx0, attn_out, norm_eps);
5730
+ offload_func(cur);
5731
+
5732
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
5733
+ offload_func(cur);
4354
5734
  }
4355
5735
 
4356
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5736
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5737
+ offload_func(cur);
4357
5738
 
4358
- // GELU activation
4359
5739
  cur = ggml_gelu(ctx0, cur);
4360
-
4361
- // Projection
4362
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5740
+ offload_func(cur);
5741
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5742
+ offload_func(cur);
4363
5743
  }
4364
5744
 
4365
- inpL = ggml_add(ctx0, cur, inpFF);
5745
+ cur = ggml_add(ctx0, cur, attn_out);
5746
+ offload_func(cur);
5747
+ // input for next layer
5748
+ inpL = cur;
4366
5749
  }
4367
5750
 
4368
- // Output Norm
5751
+ cur = inpL;
5752
+
5753
+ // norm
4369
5754
  {
4370
- cur = ggml_norm(ctx0, inpL, norm_eps);
4371
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5755
+ cur = ggml_norm(ctx0, cur, norm_eps);
5756
+ offload_func_nr(cur);
5757
+
5758
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5759
+ ggml_set_name(cur, "result_norm");
4372
5760
  }
4373
- ggml_set_name(cur, "result_norm");
4374
5761
 
4375
5762
  cur = ggml_mul_mat(ctx0, model.output, cur);
4376
5763
  ggml_set_name(cur, "result_output");
4377
5764
 
4378
5765
  ggml_build_forward_expand(gf, cur);
5766
+
4379
5767
  ggml_free(ctx0);
4380
5768
 
4381
5769
  return gf;
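In the MPT builder above, the fused attn_qkv output is split into Q, K and V purely with strided views: each token row stores n_head query heads, then n_head_kv key heads, then n_head_kv value heads, each n_embd_head wide. A plain-index sketch of that same layout (illustrative names, no ggml), mirroring the ggml_view_3d offsets used above:

    #include <cstddef>

    // Illustrative only: offsets into one token's fused QKV row of
    // (n_head + 2*n_head_kv) * n_embd_head values (Q first, then K, then V).
    struct qkv_layout {
        int n_head, n_head_kv, n_embd_head;

        size_t q(int h, int i) const { return (size_t)  h                       * n_embd_head + i; }
        size_t k(int h, int i) const { return (size_t) (n_head             + h) * n_embd_head + i; }
        size_t v(int h, int i) const { return (size_t) (n_head + n_head_kv + h) * n_embd_head + i; }
    };
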
@@ -4405,10 +5793,22 @@ static struct ggml_cgraph * llama_build_graph(
4405
5793
  {
4406
5794
  result = llm_build_starcoder(lctx, batch);
4407
5795
  } break;
5796
+ case LLM_ARCH_PERSIMMON:
5797
+ {
5798
+ result = llm_build_persimmon(lctx, batch);
5799
+ } break;
4408
5800
  case LLM_ARCH_REFACT:
4409
5801
  {
4410
5802
  result = llm_build_refact(lctx, batch);
4411
5803
  } break;
5804
+ case LLM_ARCH_BLOOM:
5805
+ {
5806
+ result = llm_build_bloom(lctx, batch);
5807
+ } break;
5808
+ case LLM_ARCH_MPT:
5809
+ {
5810
+ result = llm_build_mpt(lctx, batch);
5811
+ } break;
4412
5812
  default:
4413
5813
  GGML_ASSERT(false);
4414
5814
  }
@@ -4420,7 +5820,6 @@ static struct ggml_cgraph * llama_build_graph(
4420
5820
  //
4421
5821
  // - lctx: llama context
4422
5822
  // - batch: batch to evaluate
4423
- // - n_threads: number of threads to use
4424
5823
  //
4425
5824
  // return 0 on success
4426
5825
  // return positive int on warning
@@ -4466,8 +5865,11 @@ static int llama_decode_internal(
4466
5865
 
4467
5866
  // helpers for smoother batch API transistion
4468
5867
  // after deprecating the llama_eval calls, these will be removed
4469
- std::vector<llama_pos> pos;
4470
- std::vector<llama_seq_id> seq_id;
5868
+ std::vector<llama_pos> pos;
5869
+
5870
+ std::vector<int32_t> n_seq_id;
5871
+ std::vector<llama_seq_id *> seq_id_arr;
5872
+ std::vector<std::vector<llama_seq_id>> seq_id;
4471
5873
 
4472
5874
  if (batch.pos == nullptr) {
4473
5875
  pos.resize(n_tokens);
@@ -4479,18 +5881,20 @@ static int llama_decode_internal(
4479
5881
  }
4480
5882
 
4481
5883
  if (batch.seq_id == nullptr) {
5884
+ n_seq_id.resize(n_tokens);
4482
5885
  seq_id.resize(n_tokens);
5886
+ seq_id_arr.resize(n_tokens);
4483
5887
  for (uint32_t i = 0; i < n_tokens; i++) {
4484
- seq_id[i] = batch.all_seq_id;
5888
+ n_seq_id[i] = 1;
5889
+ seq_id[i].resize(1);
5890
+ seq_id[i][0] = batch.all_seq_id;
5891
+ seq_id_arr[i] = seq_id[i].data();
4485
5892
  }
4486
5893
 
4487
- batch.seq_id = seq_id.data();
5894
+ batch.n_seq_id = n_seq_id.data();
5895
+ batch.seq_id = seq_id_arr.data();
4488
5896
  }
4489
5897
 
4490
- // we always start to search for a free slot from the start of the cache
4491
- // TODO: better strategies can be implemented
4492
- kv_self.head = 0;
4493
-
4494
5898
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
4495
5899
  return 1;
4496
5900
  }
@@ -4509,6 +5913,13 @@ static int llama_decode_internal(
4509
5913
 
4510
5914
  ggml_allocr_alloc_graph(lctx.alloc, gf);
4511
5915
 
5916
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
5917
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
5918
+
5919
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0);
5920
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
5921
+
5922
+
4512
5923
  #ifdef GGML_USE_CUBLAS
4513
5924
  for (int i = 0; i < gf->n_leafs; i++) {
4514
5925
  ggml_tensor * node = gf->leafs[i];
@@ -4526,6 +5937,12 @@ static int llama_decode_internal(
4526
5937
  }
4527
5938
 
4528
5939
  ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
5940
+
5941
+ // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
5942
+ if (!lctx.embedding.empty()) {
5943
+ embeddings->backend = GGML_BACKEND_CPU;
5944
+ }
5945
+ res->backend = GGML_BACKEND_CPU;
4529
5946
  #endif
4530
5947
 
4531
5948
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -4543,18 +5960,13 @@ static int llama_decode_internal(
4543
5960
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
4544
5961
  model.arch == LLM_ARCH_BAICHUAN ||
4545
5962
  model.arch == LLM_ARCH_FALCON ||
4546
- model.arch == LLM_ARCH_REFACT;
5963
+ model.arch == LLM_ARCH_REFACT ||
5964
+ model.arch == LLM_ARCH_MPT;
4547
5965
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
4548
5966
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
4549
5967
  n_threads = 1;
4550
5968
  }
4551
5969
 
4552
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
4553
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
4554
-
4555
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
4556
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
4557
-
4558
5970
  #if GGML_USE_MPI
4559
5971
  const int64_t n_layer = hparams.n_layer;
4560
5972
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -4576,8 +5988,12 @@ static int llama_decode_internal(
4576
5988
  #endif
4577
5989
 
4578
5990
  // update the kv ring buffer
4579
- lctx.kv_self.head += n_tokens;
4580
5991
  lctx.kv_self.has_shift = false;
5992
+ lctx.kv_self.head += n_tokens;
5993
+ // Ensure kv cache head points to a valid index.
5994
+ if (lctx.kv_self.head >= lctx.kv_self.size) {
5995
+ lctx.kv_self.head = 0;
5996
+ }
4581
5997
 
4582
5998
  #ifdef GGML_PERF
4583
5999
  // print timing information per ggml operation (for debugging purposes)
@@ -4903,7 +6319,6 @@ struct llm_tokenizer_bpe {
4903
6319
  llm_symbol sym;
4904
6320
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
4905
6321
  sym.text = word.c_str() + offset;
4906
- sym.n = 1;
4907
6322
  sym.n = char_len;
4908
6323
  offset += sym.n;
4909
6324
  sym.prev = index - 1;
@@ -5040,7 +6455,6 @@ private:
5040
6455
  for (int i = 0; i < (int)text_utf.size(); i++) {
5041
6456
  const std::string & utf_char = text_utf[i];
5042
6457
  bool split_condition = false;
5043
- // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5044
6458
  int bytes_remain = text_utf.size() - i;
5045
6459
  // forward backward lookups
5046
6460
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5066,9 +6480,9 @@ private:
5066
6480
  if (!split_condition && bytes_remain >= 3) {
5067
6481
  // 're|'ve|'ll
5068
6482
  if (utf_char == "\'" && (
5069
- (utf_char_next == "r" || utf_char_next_next == "e") ||
5070
- (utf_char_next == "v" || utf_char_next_next == "e") ||
5071
- (utf_char_next == "l" || utf_char_next_next == "l"))
6483
+ (utf_char_next == "r" && utf_char_next_next == "e") ||
6484
+ (utf_char_next == "v" && utf_char_next_next == "e") ||
6485
+ (utf_char_next == "l" && utf_char_next_next == "l"))
5072
6486
  ) {
5073
6487
  split_condition = true;
5074
6488
  }
@@ -5119,7 +6533,7 @@ private:
5119
6533
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5120
6534
  split_condition = true;
5121
6535
  }
5122
- else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
6536
+ else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
5123
6537
  split_condition = true;
5124
6538
  }
5125
6539
  }
@@ -5164,7 +6578,137 @@ private:
5164
6578
  llm_bigram_bpe::queue work_queue;
5165
6579
  };
5166
6580
 
5167
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
6581
+ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
6582
+ FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
6583
+ FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
6584
+ } FRAGMENT_BUFFER_VARIANT_TYPE;
6585
+
6586
+ struct fragment_buffer_variant{
6587
+ fragment_buffer_variant(llama_vocab::id _token)
6588
+ :
6589
+ type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
6590
+ token(_token),
6591
+ raw_text(_dummy),
6592
+ offset(0),
6593
+ length(0){}
6594
+ fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
6595
+ :
6596
+ type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
6597
+ token((llama_vocab::id)-1),
6598
+ raw_text(_raw_text),
6599
+ offset(_offset),
6600
+ length(_length){
6601
+ GGML_ASSERT( _offset >= 0 );
6602
+ GGML_ASSERT( _length >= 1 );
6603
+ GGML_ASSERT( offset + length <= raw_text.length() );
6604
+ }
6605
+
6606
+ const FRAGMENT_BUFFER_VARIANT_TYPE type;
6607
+ const llama_vocab::id token;
6608
+ const std::string _dummy;
6609
+ const std::string & raw_text;
6610
+ const uint64_t offset;
6611
+ const uint64_t length;
6612
+ };
6613
+
6614
+ // #define PRETOKENIZERDEBUG
6615
+
6616
+ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
6617
+ {
6618
+ // for each special token
6619
+ for (const auto & st: vocab.special_tokens_cache) {
6620
+ const auto & special_token = st.first;
6621
+ const auto & special_id = st.second;
6622
+
6623
+ // for each text fragment
6624
+ std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
6625
+ while (it != buffer.end()) {
6626
+ auto & fragment = (*it);
6627
+
6628
+ // if a fragment is text ( not yet processed )
6629
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
6630
+ auto * raw_text = &(fragment.raw_text);
6631
+
6632
+ auto raw_text_base_offset = fragment.offset;
6633
+ auto raw_text_base_length = fragment.length;
6634
+
6635
+ // loop over the text
6636
+ while (true) {
6637
+ // find the first occurrence of a given special token in this fragment
6638
+ // passing offset argument only limit the "search area" but match coordinates
6639
+ // are still relative to the source full raw_text
6640
+ auto match = raw_text->find(special_token, raw_text_base_offset);
6641
+
6642
+ // no occurrences found, stop processing this fragment for a given special token
6643
+ if (match == std::string::npos) break;
6644
+
6645
+ // check if match is within bounds of offset <-> length
6646
+ if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
6647
+
6648
+ #ifdef PRETOKENIZERDEBUG
6649
+ fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
6650
+ #endif
6651
+ auto source = std::distance(buffer.begin(), it);
6652
+
6653
+ // if match is further than base offset
6654
+ // then we have some text to the left of it
6655
+ if (match > raw_text_base_offset) {
6656
+ // left
6657
+ const int64_t left_reminder_offset = raw_text_base_offset + 0;
6658
+ const int64_t left_reminder_length = match - raw_text_base_offset;
6659
+ buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
6660
+
6661
+ #ifdef PRETOKENIZERDEBUG
6662
+ fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
6663
+ #endif
6664
+ it++;
6665
+ }
6666
+
6667
+ // special token
6668
+ buffer.emplace_after(it, special_id);
6669
+ it++;
6670
+
6671
+ // right
6672
+ if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
6673
+ const int64_t right_reminder_offset = match + special_token.length();
6674
+ const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
6675
+ buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
6676
+
6677
+ #ifdef PRETOKENIZERDEBUG
6678
+ fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
6679
+ #endif
6680
+
6681
+ it++;
6682
+
6683
+ if (source == 0) {
6684
+ buffer.erase_after(buffer.before_begin());
6685
+ } else {
6686
+ buffer.erase_after(std::next(buffer.begin(), (source-1)));
6687
+ }
6688
+
6689
+ // repeat for the right side
6690
+ raw_text_base_offset = right_reminder_offset;
6691
+ raw_text_base_length = right_reminder_length;
6692
+
6693
+ #ifdef PRETOKENIZERDEBUG
6694
+ fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
6695
+ #endif
6696
+ } else {
6697
+ if (source == 0) {
6698
+ buffer.erase_after(buffer.before_begin());
6699
+ } else {
6700
+ buffer.erase_after(std::next(buffer.begin(), (source-1)));
6701
+ }
6702
+ break;
6703
+ }
6704
+ }
6705
+ }
6706
+ it++;
6707
+ }
6708
+ }
6709
+ }
6710
+
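tokenizer_st_partition above scans every cached special token and slices each raw-text fragment around its occurrences, so the special token's id is emitted directly and only the surrounding text reaches the SPM/BPE tokenizers. A simplified, self-contained sketch of the same splitting for a single special token (the piece/split_on_special names are illustrative, not part of the library):

    #include <cstdint>
    #include <string>
    #include <vector>

    // Illustrative only: split `text` around every occurrence of `special`,
    // producing raw-text pieces and special-token ids in order.
    struct piece {
        bool        is_token;
        int32_t     token;  // valid when is_token
        std::string text;   // valid when !is_token
    };

    static std::vector<piece> split_on_special(const std::string & text, const std::string & special, int32_t special_id) {
        std::vector<piece> out;
        size_t pos = 0;
        while (true) {
            const size_t match = text.find(special, pos);
            if (match == std::string::npos) {
                if (pos < text.size()) out.push_back({false, 0, text.substr(pos)});
                break;
            }
            if (match > pos) out.push_back({false, 0, text.substr(pos, match - pos)});
            out.push_back({true, special_id, ""});
            pos = match + special.size();
        }
        return out;
    }
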
6711
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
5168
6712
  std::vector<llama_vocab::id> output;
5169
6713
 
5170
6714
  // OG tokenizer behavior:
@@ -5180,20 +6724,58 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
5180
6724
  return output;
5181
6725
  }
5182
6726
 
6727
+ std::forward_list<fragment_buffer_variant> fragment_buffer;
6728
+ fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
6729
+
6730
+ if (special) tokenizer_st_partition( vocab, fragment_buffer );
6731
+
5183
6732
  switch (vocab.type) {
5184
6733
  case LLAMA_VOCAB_TYPE_SPM:
5185
6734
  {
5186
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
5187
- raw_text = " " + raw_text;
6735
+ for (const auto & fragment: fragment_buffer)
6736
+ {
6737
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
6738
+ {
6739
+ // without adding this leading whitespace, we do not get the same results as the original tokenizer
5188
6740
 
5189
- llm_tokenizer_spm tokenizer(vocab);
5190
- llama_escape_whitespace(raw_text);
5191
- tokenizer.tokenize(raw_text, output);
6741
+ // TODO: It's likely possible to get rid of this string copy entirely
6742
+ // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
6743
+ // and passing 'add space prefix' as bool argument
6744
+ //
6745
+ auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
6746
+
6747
+ #ifdef PRETOKENIZERDEBUG
6748
+ fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
6749
+ #endif
6750
+ llm_tokenizer_spm tokenizer(vocab);
6751
+ llama_escape_whitespace(raw_text);
6752
+ tokenizer.tokenize(raw_text, output);
6753
+ }
6754
+ else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
6755
+ {
6756
+ output.push_back(fragment.token);
6757
+ }
6758
+ }
5192
6759
  } break;
5193
6760
  case LLAMA_VOCAB_TYPE_BPE:
5194
6761
  {
5195
- llm_tokenizer_bpe tokenizer(vocab);
5196
- tokenizer.tokenize(raw_text, output);
6762
+ for (const auto & fragment: fragment_buffer)
6763
+ {
6764
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
6765
+ {
6766
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
6767
+
6768
+ #ifdef PRETOKENIZERDEBUG
6769
+ fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
6770
+ #endif
6771
+ llm_tokenizer_bpe tokenizer(vocab);
6772
+ tokenizer.tokenize(raw_text, output);
6773
+ }
6774
+ else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
6775
+ {
6776
+ output.push_back(fragment.token);
6777
+ }
6778
+ }
5197
6779
  } break;
5198
6780
  }
5199
6781
 
@@ -5466,7 +7048,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
5466
7048
  std::vector<llama_grammar_candidate> rejects;
5467
7049
 
5468
7050
  if (stack.empty()) {
5469
- for (auto tok : candidates) {
7051
+ for (const auto & tok : candidates) {
5470
7052
  if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
5471
7053
  rejects.push_back(tok);
5472
7054
  }
@@ -5477,7 +7059,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
5477
7059
  const llama_grammar_element * stack_pos = stack.back();
5478
7060
 
5479
7061
  std::vector<llama_grammar_candidate> next_candidates;
5480
- for (auto tok : candidates) {
7062
+ for (const auto & tok : candidates) {
5481
7063
  if (*tok.code_points == 0) {
5482
7064
  // reached end of full codepoints in token, reject iff it ended in a partial sequence
5483
7065
  // that cannot satisfy this position in grammar
@@ -5503,7 +7085,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
5503
7085
  llama_grammar_advance_stack(rules, stack_after, next_stacks);
5504
7086
 
5505
7087
  auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
5506
- for (auto tok : next_rejects) {
7088
+ for (const auto & tok : next_rejects) {
5507
7089
  rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
5508
7090
  }
5509
7091
 
@@ -6635,7 +8217,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6635
8217
  const std::string name = ggml_get_name(meta);
6636
8218
 
6637
8219
  // TODO: avoid hardcoded tensor names - use the TN_* constants
6638
- if (name.find("attn_v.weight") != std::string::npos) {
8220
+ if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
6639
8221
  ++n_attention_wv;
6640
8222
  }
6641
8223
  else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6672,6 +8254,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6672
8254
  }
6673
8255
 
6674
8256
  std::ofstream fout(fname_out, std::ios::binary);
8257
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
6675
8258
 
6676
8259
  const size_t meta_size = gguf_get_meta_size(ctx_out);
6677
8260
 
@@ -7535,6 +9118,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam
7535
9118
  }
7536
9119
 
7537
9120
  void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
9121
+ if (seq_id_src == seq_id_dst) {
9122
+ return;
9123
+ }
7538
9124
  llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
7539
9125
  }
7540
9126
 
@@ -7987,7 +9573,7 @@ int llama_eval_embd(
7987
9573
  int n_past) {
7988
9574
  llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
7989
9575
 
7990
- llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
9576
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
7991
9577
 
7992
9578
  const int ret = llama_decode_internal(*ctx, batch);
7993
9579
  if (ret < 0) {
@@ -8008,20 +9594,21 @@ struct llama_batch llama_batch_get_one(
8008
9594
  llama_pos pos_0,
8009
9595
  llama_seq_id seq_id) {
8010
9596
  return {
8011
- /*n_tokens =*/ n_tokens,
8012
- /*tokens =*/ tokens,
8013
- /*embd =*/ nullptr,
8014
- /*pos =*/ nullptr,
8015
- /*seq_id =*/ nullptr,
8016
- /*logits =*/ nullptr,
8017
- /*all_pos_0 =*/ pos_0,
8018
- /*all_pos_1 =*/ 1,
8019
- /*all_seq_id =*/ seq_id,
9597
+ /*n_tokens =*/ n_tokens,
9598
+ /*tokens =*/ tokens,
9599
+ /*embd =*/ nullptr,
9600
+ /*pos =*/ nullptr,
9601
+ /*n_seq_id =*/ nullptr,
9602
+ /*seq_id =*/ nullptr,
9603
+ /*logits =*/ nullptr,
9604
+ /*all_pos_0 =*/ pos_0,
9605
+ /*all_pos_1 =*/ 1,
9606
+ /*all_seq_id =*/ seq_id,
8020
9607
  };
8021
9608
  }
8022
9609
 
8023
- struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
8024
- llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
9610
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
9611
+ llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
8025
9612
 
8026
9613
  if (embd) {
8027
9614
  batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@@ -8029,19 +9616,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
8029
9616
  batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
8030
9617
  }
8031
9618
 
8032
- batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
8033
- batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
8034
- batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
9619
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
9620
+ batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
9621
+ batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
9622
+ for (int i = 0; i < n_tokens; ++i) {
9623
+ batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
9624
+ }
9625
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
8035
9626
 
8036
9627
  return batch;
8037
9628
  }
8038
9629
 
8039
9630
  void llama_batch_free(struct llama_batch batch) {
8040
- if (batch.token) free(batch.token);
8041
- if (batch.embd) free(batch.embd);
8042
- if (batch.pos) free(batch.pos);
8043
- if (batch.seq_id) free(batch.seq_id);
8044
- if (batch.logits) free(batch.logits);
9631
+ if (batch.token) free(batch.token);
9632
+ if (batch.embd) free(batch.embd);
9633
+ if (batch.pos) free(batch.pos);
9634
+ if (batch.n_seq_id) free(batch.n_seq_id);
9635
+ if (batch.seq_id) {
9636
+ for (int i = 0; i < batch.n_tokens; ++i) {
9637
+ free(batch.seq_id[i]);
9638
+ }
9639
+ free(batch.seq_id);
9640
+ }
9641
+ if (batch.logits) free(batch.logits);
8045
9642
  }
8046
9643
 
8047
9644
  int llama_decode(
@@ -8106,15 +9703,15 @@ llama_token llama_token_eot(const struct llama_context * ctx) {
8106
9703
  return ctx->model.vocab.special_eot_id;
8107
9704
  }
8108
9705
 
8109
-
8110
9706
  int llama_tokenize(
8111
9707
  const struct llama_model * model,
8112
9708
  const char * text,
8113
9709
  int text_len,
8114
9710
  llama_token * tokens,
8115
9711
  int n_max_tokens,
8116
- bool add_bos) {
8117
- auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
9712
+ bool add_bos,
9713
+ bool special) {
9714
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
8118
9715
 
8119
9716
  if (n_max_tokens < (int) res.size()) {
8120
9717
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
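llama_tokenize gains the `special` flag, which enables the special-token partitioning above so that control strings inside the prompt map to their token ids rather than being tokenized as literal text. A hedged call sketch, assuming a loaded `model` and a std::string `prompt`; the buffer sizing and the negative-return retry convention here are assumptions, not guarantees from this diff:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Illustrative only: tokenize with special-token handling enabled.
    static std::vector<llama_token> tokenize(const llama_model * model, const std::string & prompt) {
        std::vector<llama_token> toks(prompt.size() + 8); // generous upper bound (assumption)
        int n = llama_tokenize(model, prompt.c_str(), (int) prompt.size(),
                               toks.data(), (int) toks.size(),
                               /*add_bos=*/ true, /*special=*/ true);
        if (n < 0) {                 // buffer too small: |n| tokens needed (assumed convention)
            toks.resize(-n);
            n = llama_tokenize(model, prompt.c_str(), (int) prompt.size(),
                               toks.data(), (int) toks.size(),
                               /*add_bos=*/ true, /*special=*/ true);
        }
        toks.resize(n);
        return toks;
    }
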
@@ -8166,7 +9763,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
8166
9763
  buf[0] = llama_token_to_byte(model->vocab, token);
8167
9764
  return 1;
8168
9765
  } else {
8169
- GGML_ASSERT(false);
9766
+ // TODO: for now we accept all unsupported token types,
9767
+ // suppressing them like CONTROL tokens.
9768
+ // GGML_ASSERT(false);
8170
9769
  }
8171
9770
  break;
8172
9771
  }
@@ -8182,7 +9781,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
8182
9781
  } else if (llama_is_control_token(model->vocab, token)) {
8183
9782
  ;
8184
9783
  } else {
8185
- GGML_ASSERT(false);
9784
+ // TODO: for now we accept all unsupported token types,
9785
+ // suppressing them like CONTROL tokens.
9786
+ // GGML_ASSERT(false);
8186
9787
  }
8187
9788
  break;
8188
9789
  }