llama_cpp 0.7.0 → 0.8.0

@@ -75,6 +75,7 @@
75
75
  #include <thread>
76
76
  #include <unordered_map>
77
77
  #include <set>
78
+ #include <forward_list>
78
79
 
79
80
  #if defined(_MSC_VER)
80
81
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -186,7 +187,9 @@ enum llm_arch {
186
187
  LLM_ARCH_GPTNEOX,
187
188
  LLM_ARCH_MPT,
188
189
  LLM_ARCH_STARCODER,
190
+ LLM_ARCH_PERSIMMON,
189
191
  LLM_ARCH_REFACT,
192
+ LLM_ARCH_BLOOM,
190
193
  LLM_ARCH_UNKNOWN,
191
194
  };
192
195
 
@@ -199,7 +202,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
199
202
  { LLM_ARCH_MPT, "mpt" },
200
203
  { LLM_ARCH_BAICHUAN, "baichuan" },
201
204
  { LLM_ARCH_STARCODER, "starcoder" },
202
- { LLM_ARCH_REFACT, "refact" },
205
+ { LLM_ARCH_PERSIMMON, "persimmon" },
206
+ { LLM_ARCH_REFACT, "refact" },
207
+ { LLM_ARCH_BLOOM, "bloom" },
203
208
  };
204
209
 
205
210
  enum llm_kv {
@@ -302,6 +307,7 @@ struct LLM_KV {
302
307
 
303
308
  enum llm_tensor {
304
309
  LLM_TENSOR_TOKEN_EMBD,
310
+ LLM_TENSOR_TOKEN_EMBD_NORM,
305
311
  LLM_TENSOR_POS_EMBD,
306
312
  LLM_TENSOR_OUTPUT,
307
313
  LLM_TENSOR_OUTPUT_NORM,
@@ -318,6 +324,8 @@ enum llm_tensor {
318
324
  LLM_TENSOR_FFN_DOWN,
319
325
  LLM_TENSOR_FFN_UP,
320
326
  LLM_TENSOR_FFN_NORM,
327
+ LLM_TENSOR_ATTN_Q_NORM,
328
+ LLM_TENSOR_ATTN_K_NORM,
321
329
  };
322
330
 
323
331
  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -399,10 +407,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
399
407
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
400
408
  },
401
409
  },
410
+ {
411
+ LLM_ARCH_PERSIMMON,
412
+ {
413
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
414
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
415
+ { LLM_TENSOR_OUTPUT, "output"},
416
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
417
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
418
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
419
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
420
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
421
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
422
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
423
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
424
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
425
+ },
426
+ },
402
427
  {
403
428
  LLM_ARCH_MPT,
404
429
  {
405
430
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
431
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
432
+ { LLM_TENSOR_OUTPUT, "output" },
433
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
434
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
435
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
436
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
437
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
438
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
406
439
  },
407
440
  },
408
441
  {
@@ -437,6 +470,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
437
470
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
438
471
  },
439
472
  },
473
+ {
474
+ LLM_ARCH_BLOOM,
475
+ {
476
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
477
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
478
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
479
+ { LLM_TENSOR_OUTPUT, "output" },
480
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
481
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
482
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
483
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
484
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
485
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
486
+ },
487
+ },
440
488
  {
441
489
  LLM_ARCH_UNKNOWN,
442
490
  {
@@ -954,6 +1002,7 @@ enum e_model {
954
1002
  MODEL_1B,
955
1003
  MODEL_3B,
956
1004
  MODEL_7B,
1005
+ MODEL_8B,
957
1006
  MODEL_13B,
958
1007
  MODEL_15B,
959
1008
  MODEL_30B,
@@ -984,6 +1033,9 @@ struct llama_hparams {
984
1033
  float rope_freq_base_train;
985
1034
  float rope_freq_scale_train;
986
1035
 
1036
+ float f_clamp_kqv;
1037
+ float f_max_alibi_bias;
1038
+
987
1039
  bool operator!=(const llama_hparams & other) const {
988
1040
  if (this->vocab_only != other.vocab_only) return true;
989
1041
  if (this->n_vocab != other.n_vocab) return true;
@@ -1036,6 +1088,10 @@ struct llama_layer {
1036
1088
  struct ggml_tensor * attn_norm_b;
1037
1089
  struct ggml_tensor * attn_norm_2;
1038
1090
  struct ggml_tensor * attn_norm_2_b;
1091
+ struct ggml_tensor * attn_q_norm;
1092
+ struct ggml_tensor * attn_q_norm_b;
1093
+ struct ggml_tensor * attn_k_norm;
1094
+ struct ggml_tensor * attn_k_norm_b;
1039
1095
 
1040
1096
  // attention
1041
1097
  struct ggml_tensor * wq;
@@ -1077,6 +1133,9 @@ struct llama_kv_cell {
1077
1133
  struct llama_kv_cache {
1078
1134
  bool has_shift = false;
1079
1135
 
1136
+ // Note: The value of head isn't only used to optimize searching
1137
+ // for a free KV slot. llama_decode_internal also uses it, so it
1138
+ // cannot be freely changed after a slot has been allocated.
1080
1139
  uint32_t head = 0;
1081
1140
  uint32_t size = 0;
1082
1141
 
@@ -1120,6 +1179,8 @@ struct llama_vocab {
1120
1179
  std::unordered_map<token, id> token_to_id;
1121
1180
  std::vector<token_data> id_to_token;
1122
1181
 
1182
+ std::unordered_map<token, id> special_tokens_cache;
1183
+
1123
1184
  std::map<std::pair<std::string, std::string>, int> bpe_ranks;
1124
1185
 
1125
1186
  // default LLaMA special tokens
@@ -1162,6 +1223,8 @@ struct llama_model {
1162
1223
 
1163
1224
  struct ggml_tensor * tok_embeddings;
1164
1225
  struct ggml_tensor * pos_embeddings;
1226
+ struct ggml_tensor * tok_norm;
1227
+ struct ggml_tensor * tok_norm_b;
1165
1228
 
1166
1229
  struct ggml_tensor * output_norm;
1167
1230
  struct ggml_tensor * output_norm_b;
@@ -1291,7 +1354,11 @@ static bool llama_kv_cache_init(
1291
1354
  cache.cells.clear();
1292
1355
  cache.cells.resize(n_ctx);
1293
1356
 
1357
+ // TODO: this should be:
1358
+ // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1359
+ // change it and test that it works
1294
1360
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1361
+ memset(cache.buf.data, 0, cache.buf.size);
1295
1362
 
1296
1363
  struct ggml_init_params params;
1297
1364
  params.mem_size = cache.buf.size;
@@ -1334,6 +1401,8 @@ static bool llama_kv_cache_init(
1334
1401
 
1335
1402
  // find an empty slot of size "n_tokens" in the cache
1336
1403
  // updates the cache head
1404
+ // Note: On success, it's important that cache.head points
1405
+ // to the first cell of the slot.
1337
1406
  static bool llama_kv_cache_find_slot(
1338
1407
  struct llama_kv_cache & cache,
1339
1408
  const struct llama_batch & batch) {
@@ -1349,8 +1418,8 @@ static bool llama_kv_cache_find_slot(
1349
1418
 
1350
1419
  while (true) {
1351
1420
  if (cache.head + n_tokens > n_ctx) {
1421
+ n_tested += n_ctx - cache.head;
1352
1422
  cache.head = 0;
1353
- n_tested += n_ctx - cache.head;
1354
1423
  continue;
1355
1424
  }
1356
1425
 
@@ -1376,7 +1445,10 @@ static bool llama_kv_cache_find_slot(
1376
1445
 
1377
1446
  for (uint32_t i = 0; i < n_tokens; i++) {
1378
1447
  cache.cells[cache.head + i].pos = batch.pos[i];
1379
- cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
1448
+
1449
+ for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
1450
+ cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
1451
+ }
1380
1452
  }
1381
1453
 
1382
1454
  return true;
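
The slot search in llama_kv_cache_find_slot above treats the cells as a ring buffer: when the requested span would run past n_ctx it first counts the skipped tail cells into n_tested (the ordering bug fixed in this hunk) and then wraps head back to zero, and on success it now records every sequence id of each token into its cell. A minimal standalone sketch of that search logic, using simplified stand-in types rather than the real llama.cpp structures:

#include <cstdint>
#include <set>
#include <vector>

// Simplified stand-in for the llama.cpp KV-cache cell (illustrative only).
struct kv_cell { int32_t pos = -1; std::set<int32_t> seq_id; };

// Returns the index of the first cell of a free run of n_tokens cells, or -1 if the
// cache is full. Mirrors the wrap-around search used by llama_kv_cache_find_slot.
static int32_t kv_find_slot(std::vector<kv_cell> & cells, uint32_t & head, uint32_t n_tokens) {
    const uint32_t n_ctx = (uint32_t) cells.size();
    if (n_tokens > n_ctx) {
        return -1; // the batch cannot fit at all
    }

    uint32_t n_tested = 0;

    while (true) {
        if (head + n_tokens > n_ctx) {
            n_tested += n_ctx - head; // count the skipped tail cells *before* wrapping
            head = 0;
            continue;
        }

        bool found = true;
        for (uint32_t i = 0; i < n_tokens; i++) {
            if (cells[head + i].pos >= 0) { // occupied cell: restart just past it
                found = false;
                head     += i + 1;
                n_tested += i + 1;
                break;
            }
        }

        if (found) {
            return (int32_t) head; // head is left pointing at the first cell of the slot
        }
        if (n_tested >= n_ctx) {
            return -1; // every cell has been inspected: the cache is full
        }
    }
}
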
@@ -1401,6 +1473,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
1401
1473
  cache.cells[i].pos = -1;
1402
1474
  cache.cells[i].seq_id.clear();
1403
1475
  }
1476
+
1477
+ // Searching for a free slot can start here since we know it will be empty.
1478
+ cache.head = uint32_t(c0);
1404
1479
  }
1405
1480
 
1406
1481
  static void llama_kv_cache_seq_rm(
@@ -1408,6 +1483,8 @@ static void llama_kv_cache_seq_rm(
1408
1483
  llama_seq_id seq_id,
1409
1484
  llama_pos p0,
1410
1485
  llama_pos p1) {
1486
+ uint32_t new_head = cache.size;
1487
+
1411
1488
  if (p0 < 0) p0 = 0;
1412
1489
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1413
1490
 
@@ -1416,9 +1493,13 @@ static void llama_kv_cache_seq_rm(
1416
1493
  cache.cells[i].seq_id.erase(seq_id);
1417
1494
  if (cache.cells[i].seq_id.empty()) {
1418
1495
  cache.cells[i].pos = -1;
1496
+ if (new_head == cache.size) new_head = i;
1419
1497
  }
1420
1498
  }
1421
1499
  }
1500
+
1501
+ // If we freed up a slot, set head to it so searching can start there.
1502
+ if (new_head != cache.size) cache.head = new_head;
1422
1503
  }
1423
1504
 
1424
1505
  static void llama_kv_cache_seq_cp(
@@ -1430,6 +1511,8 @@ static void llama_kv_cache_seq_cp(
1430
1511
  if (p0 < 0) p0 = 0;
1431
1512
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1432
1513
 
1514
+ cache.head = 0;
1515
+
1433
1516
  for (uint32_t i = 0; i < cache.size; ++i) {
1434
1517
  if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1435
1518
  cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1438,12 +1521,21 @@ static void llama_kv_cache_seq_cp(
1438
1521
  }
1439
1522
 
1440
1523
  static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
1524
+ uint32_t new_head = cache.size;
1525
+
1441
1526
  for (uint32_t i = 0; i < cache.size; ++i) {
1442
1527
  if (!cache.cells[i].has_seq_id(seq_id)) {
1443
1528
  cache.cells[i].pos = -1;
1444
1529
  cache.cells[i].seq_id.clear();
1530
+ if (new_head == cache.size) new_head = i;
1531
+ } else {
1532
+ cache.cells[i].seq_id.clear();
1533
+ cache.cells[i].seq_id.insert(seq_id);
1445
1534
  }
1446
1535
  }
1536
+
1537
+ // If we freed up a slot, set head to it so searching can start there.
1538
+ if (new_head != cache.size) cache.head = new_head;
1447
1539
  }
1448
1540
 
1449
1541
  static void llama_kv_cache_seq_shift(
@@ -1452,6 +1544,8 @@ static void llama_kv_cache_seq_shift(
1452
1544
  llama_pos p0,
1453
1545
  llama_pos p1,
1454
1546
  llama_pos delta) {
1547
+ uint32_t new_head = cache.size;
1548
+
1455
1549
  if (p0 < 0) p0 = 0;
1456
1550
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1457
1551
 
@@ -1461,12 +1555,17 @@ static void llama_kv_cache_seq_shift(
1461
1555
  if (cache.cells[i].pos < 0) {
1462
1556
  cache.cells[i].pos = -1;
1463
1557
  cache.cells[i].seq_id.clear();
1558
+ if (new_head == cache.size) new_head = i;
1464
1559
  } else {
1465
1560
  cache.has_shift = true;
1466
1561
  cache.cells[i].delta = delta;
1467
1562
  }
1468
1563
  }
1469
1564
  }
1565
+
1566
+ // If we freed up a slot, set head to it so searching can start there.
1567
+ // Otherwise we just start the next search from the beginning.
1568
+ cache.head = new_head != cache.size ? new_head : 0;
1470
1569
  }
1471
1570
 
1472
1571
  //
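
The llama_kv_cache_seq_rm / _seq_keep / _seq_shift changes above share one bookkeeping pattern: new_head starts at cache.size as a "nothing freed" sentinel, the first cell that becomes empty during the scan is remembered, and cache.head is only moved when a cell was actually freed, so the next slot search can begin at the earliest hole. A hedged sketch of that pattern in isolation (the types and function name are illustrative, not the real API):

#include <cstdint>
#include <set>
#include <vector>

struct kv_cell { int32_t pos = -1; std::set<int32_t> seq_id; }; // simplified stand-in

// Remove one sequence from every cell; cells left without any sequence become free.
static void kv_erase_seq(std::vector<kv_cell> & cells, uint32_t & head, int32_t seq_id) {
    uint32_t new_head = (uint32_t) cells.size(); // sentinel: no cell freed yet

    for (uint32_t i = 0; i < cells.size(); ++i) {
        if (cells[i].pos < 0) {
            continue; // already free
        }
        cells[i].seq_id.erase(seq_id);
        if (cells[i].seq_id.empty()) {
            cells[i].pos = -1;                          // the cell is now free
            if (new_head == cells.size()) new_head = i; // remember the first hole
        }
    }

    // If we freed up a slot, set head to it so searching can start there.
    if (new_head != cells.size()) head = new_head;
}
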
@@ -1670,7 +1769,7 @@ struct llama_model_loader {
1670
1769
  }
1671
1770
  }
1672
1771
 
1673
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
1772
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
1674
1773
  if (backend != GGML_BACKEND_CPU) {
1675
1774
  ggml_set_no_alloc(ctx, true);
1676
1775
  }
@@ -1688,7 +1787,7 @@ struct llama_model_loader {
1688
1787
  return tensor;
1689
1788
  }
1690
1789
 
1691
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
1790
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
1692
1791
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1693
1792
 
1694
1793
  if (cur == NULL) {
@@ -1867,6 +1966,7 @@ static const char * llama_model_type_name(e_model type) {
1867
1966
  case MODEL_1B: return "1B";
1868
1967
  case MODEL_3B: return "3B";
1869
1968
  case MODEL_7B: return "7B";
1969
+ case MODEL_8B: return "8B";
1870
1970
  case MODEL_13B: return "13B";
1871
1971
  case MODEL_15B: return "15B";
1872
1972
  case MODEL_30B: return "30B";
@@ -1979,6 +2079,14 @@ static void llm_load_hparams(
1979
2079
  default: model.type = e_model::MODEL_UNKNOWN;
1980
2080
  }
1981
2081
  } break;
2082
+ case LLM_ARCH_PERSIMMON:
2083
+ {
2084
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2085
+ switch (hparams.n_layer) {
2086
+ case 36: model.type = e_model::MODEL_8B; break;
2087
+ default: model.type = e_model::MODEL_UNKNOWN;
2088
+ }
2089
+ } break;
1982
2090
  case LLM_ARCH_REFACT:
1983
2091
  {
1984
2092
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -1987,6 +2095,33 @@ static void llm_load_hparams(
1987
2095
  default: model.type = e_model::MODEL_UNKNOWN;
1988
2096
  }
1989
2097
  } break;
2098
+ case LLM_ARCH_BLOOM:
2099
+ {
2100
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2101
+
2102
+ switch (hparams.n_layer) {
2103
+ case 24: model.type = e_model::MODEL_1B; break;
2104
+ case 30:
2105
+ switch (hparams.n_embd) {
2106
+ case 2560: model.type = e_model::MODEL_3B; break;
2107
+ case 4096: model.type = e_model::MODEL_7B; break;
2108
+ } break;
2109
+ }
2110
+ } break;
2111
+ case LLM_ARCH_MPT:
2112
+ {
2113
+ hparams.f_clamp_kqv = 0.0f;
2114
+
2115
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2116
+ GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2117
+ GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2118
+
2119
+ switch (hparams.n_layer) {
2120
+ case 32: model.type = e_model::MODEL_7B; break;
2121
+ case 48: model.type = e_model::MODEL_30B; break;
2122
+ default: model.type = e_model::MODEL_UNKNOWN;
2123
+ }
2124
+ } break;
1990
2125
  default: (void)0;
1991
2126
  }
1992
2127
 
@@ -1994,7 +2129,7 @@ static void llm_load_hparams(
1994
2129
  }
1995
2130
 
1996
2131
  // TODO: This should probably be in llama.h
1997
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
2132
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
1998
2133
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
1999
2134
 
2000
2135
  static void llm_load_vocab(
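
The MPT branch a few hunks above illustrates the convention for optional metadata: hparams.f_clamp_kqv is first given a default of 0.0f, and the corresponding GGUF key is then read with the required flag set to false, so a missing key keeps the default, while required keys (such as the layer-norm epsilon) abort loading. A rough standalone illustration of the same default-then-optional-override idea, using a toy key/value store instead of the real GGUF API:

#include <map>
#include <stdexcept>
#include <string>

// Toy metadata store standing in for a GGUF context (illustrative only).
using kv_store = std::map<std::string, float>;

// Mirrors the required/optional behaviour of the GGUF_GET_KEY pattern for floats.
static void get_key_f32(const kv_store & kv, const std::string & key, float & dst, bool required) {
    const auto it = kv.find(key);
    if (it != kv.end()) {
        dst = it->second;        // key present: overwrite the caller's value
    } else if (required) {
        throw std::runtime_error("key not found in model file: " + key);
    }                            // optional and missing: silently keep the default
}

// Usage, following the MPT branch above (key names abbreviated and hypothetical here):
//     float f_clamp_kqv = 0.0f;  // default, kept if the model file has no such key
//     get_key_f32(kv, "attention.clamp_kqv",          f_clamp_kqv,      /*required =*/ false);
//     get_key_f32(kv, "attention.layer_norm_epsilon", f_norm_eps,       /*required =*/ true);
//     get_key_f32(kv, "attention.max_alibi_bias",     f_max_alibi_bias, /*required =*/ true);
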
@@ -2110,6 +2245,101 @@ static void llm_load_vocab(
2110
2245
  GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
2111
2246
  GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
2112
2247
  GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
2248
+
2249
+ // build special tokens cache
2250
+ {
2251
+ // TODO: It is unclear (to me) at this point whether special tokens are guaranteed to be of a deterministic type,
2252
+ // and will always be correctly labeled in 'added_tokens.json', etc.
2253
+ // The assumption is that, since special tokens aren't meant to be exposed to the end user, they are designed
2254
+ // to be unmatchable by the tokenizer; therefore, tokens from the vocab which are unmatchable by the tokenizer
2255
+ // are special tokens.
2256
+ // From testing, this appears to correlate 1:1 with special tokens.
2257
+ //
2258
+
2259
+ // Counting special tokens and verifying in only one direction
2260
+ // is sufficient to detect a difference between the two sets.
2261
+ //
2262
+ uint32_t special_tokens_count_by_type = 0;
2263
+ uint32_t special_tokens_count_from_verification = 0;
2264
+
2265
+ bool special_tokens_definition_mismatch = false;
2266
+
2267
+ for (const auto & t : vocab.token_to_id) {
2268
+ const auto & token = t.first;
2269
+ const auto & id = t.second;
2270
+
2271
+ // Count all non-normal tokens in the vocab while iterating
2272
+ if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
2273
+ special_tokens_count_by_type++;
2274
+ }
2275
+
2276
+ // Skip single character tokens
2277
+ if (token.length() > 1) {
2278
+ bool is_tokenizable = false;
2279
+
2280
+ // Split token string representation in two, in all possible ways
2281
+ // and check if both halves can be matched to a valid token
2282
+ for (unsigned i = 1; i < token.length();) {
2283
+ const auto left = token.substr(0, i);
2284
+ const auto right = token.substr(i);
2285
+
2286
+ // check that we didn't split in the middle of a UTF-8 sequence
2287
+ auto utf = utf8_len(left.at(left.length() - 1));
2288
+
2289
+ if (utf == 1) {
2290
+ if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
2291
+ vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
2292
+ is_tokenizable = true;
2293
+ break;
2294
+ }
2295
+ i++;
2296
+ } else {
2297
+ // skip over the rest of multibyte utf sequence
2298
+ i += utf - 1;
2299
+ }
2300
+ }
2301
+
2302
+ if (!is_tokenizable) {
2303
+ // Some tokens are multi-byte, but their UTF-8 text is only one character long;
2304
+ // it's faster to re-filter them here, since there are far fewer candidates now
2305
+
2306
+ // Calculate the total UTF-8 character length of the token's string representation
2307
+ size_t utf8_str_len = 0;
2308
+ for (unsigned i = 0; i < token.length();) {
2309
+ utf8_str_len++;
2310
+ i += utf8_len(token.at(i));
2311
+ }
2312
+
2313
+ // And skip the ones which are one character
2314
+ if (utf8_str_len > 1) {
2315
+ // At this point what we have left are special tokens only
2316
+ vocab.special_tokens_cache[token] = id;
2317
+
2318
+ // Count manually found special tokens
2319
+ special_tokens_count_from_verification++;
2320
+
2321
+ // If this manually found special token is not marked as such, flag a mismatch
2322
+ if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
2323
+ special_tokens_definition_mismatch = true;
2324
+ }
2325
+ }
2326
+ }
2327
+ }
2328
+ }
2329
+
2330
+ if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
2331
+ LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
2332
+ __func__,
2333
+ special_tokens_count_from_verification, vocab.id_to_token.size(),
2334
+ special_tokens_count_by_type, vocab.id_to_token.size()
2335
+ );
2336
+ } else {
2337
+ LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
2338
+ __func__,
2339
+ special_tokens_count_from_verification, vocab.id_to_token.size()
2340
+ );
2341
+ }
2342
+ }
2113
2343
  }
2114
2344
 
2115
2345
  static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
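
The cache built above classifies a vocab entry as special when the tokenizer itself could never reassemble it: the string is split at every UTF-8 boundary, and only if no split produces two halves that are both vocab entries (and the token is longer than one character, in bytes and in code points) is it treated as special. A compact sketch of that check on its own, assuming a plain std::unordered_map<std::string, int> vocabulary and a utf8_len helper with the same contract as the one in llama.cpp:

#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_map>

// Number of bytes in the UTF-8 sequence that starts with byte c.
static size_t utf8_len(char c) {
    static const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    return lookup[static_cast<uint8_t>(c) >> 4];
}

// True if the token cannot be formed from two shorter vocab entries, which is the
// heuristic used above to put it into the special-tokens cache.
static bool looks_special(const std::unordered_map<std::string, int> & vocab, const std::string & token) {
    if (token.length() <= 1) {
        return false; // single-byte tokens are skipped
    }

    for (size_t i = 1; i < token.length();) {
        const std::string left  = token.substr(0, i);
        const std::string right = token.substr(i);

        if (utf8_len(left.back()) == 1) {        // split lies on a UTF-8 boundary
            if (vocab.count(left) && vocab.count(right)) {
                return false;                    // the tokenizer could rebuild it: not special
            }
            i++;
        } else {
            i += utf8_len(left.back()) - 1;      // jump over the rest of the multi-byte sequence
        }
    }

    // Tokens whose text is a single (possibly multi-byte) character are also skipped.
    size_t n_chars = 0;
    for (size_t i = 0; i < token.length(); i += utf8_len(token[i])) {
        n_chars++;
    }
    return n_chars > 1;
}
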
@@ -2131,6 +2361,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2131
2361
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2132
2362
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2133
2363
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2364
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
2365
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
2134
2366
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2135
2367
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2136
2368
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2230,8 +2462,8 @@ static void llm_load_tensors(
2230
2462
 
2231
2463
  // output
2232
2464
  {
2233
- ggml_backend backend_norm;
2234
- ggml_backend backend_output;
2465
+ ggml_backend_type backend_norm;
2466
+ ggml_backend_type backend_output;
2235
2467
 
2236
2468
  if (n_gpu_layers > int(n_layer)) {
2237
2469
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2266,8 +2498,8 @@ static void llm_load_tensors(
2266
2498
  model.layers.resize(n_layer);
2267
2499
 
2268
2500
  for (uint32_t i = 0; i < n_layer; ++i) {
2269
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2270
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2501
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2502
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2271
2503
 
2272
2504
  auto & layer = model.layers[i];
2273
2505
 
@@ -2296,8 +2528,8 @@ static void llm_load_tensors(
2296
2528
  {
2297
2529
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2298
2530
  {
2299
- ggml_backend backend_norm;
2300
- ggml_backend backend_output;
2531
+ ggml_backend_type backend_norm;
2532
+ ggml_backend_type backend_output;
2301
2533
 
2302
2534
  if (n_gpu_layers > int(n_layer)) {
2303
2535
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2332,8 +2564,8 @@ static void llm_load_tensors(
2332
2564
  model.layers.resize(n_layer);
2333
2565
 
2334
2566
  for (uint32_t i = 0; i < n_layer; ++i) {
2335
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2336
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2567
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2568
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2337
2569
 
2338
2570
  auto & layer = model.layers[i];
2339
2571
 
@@ -2366,8 +2598,8 @@ static void llm_load_tensors(
2366
2598
 
2367
2599
  // output
2368
2600
  {
2369
- ggml_backend backend_norm;
2370
- ggml_backend backend_output;
2601
+ ggml_backend_type backend_norm;
2602
+ ggml_backend_type backend_output;
2371
2603
 
2372
2604
  if (n_gpu_layers > int(n_layer)) {
2373
2605
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2404,8 +2636,8 @@ static void llm_load_tensors(
2404
2636
  model.layers.resize(n_layer);
2405
2637
 
2406
2638
  for (uint32_t i = 0; i < n_layer; ++i) {
2407
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2408
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2639
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2640
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2409
2641
 
2410
2642
  auto & layer = model.layers[i];
2411
2643
 
@@ -2443,8 +2675,8 @@ static void llm_load_tensors(
2443
2675
 
2444
2676
  // output
2445
2677
  {
2446
- ggml_backend backend_norm;
2447
- ggml_backend backend_output;
2678
+ ggml_backend_type backend_norm;
2679
+ ggml_backend_type backend_output;
2448
2680
 
2449
2681
  if (n_gpu_layers > int(n_layer)) {
2450
2682
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2481,8 +2713,8 @@ static void llm_load_tensors(
2481
2713
  model.layers.resize(n_layer);
2482
2714
 
2483
2715
  for (uint32_t i = 0; i < n_layer; ++i) {
2484
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2485
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2716
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2717
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2486
2718
 
2487
2719
  auto & layer = model.layers[i];
2488
2720
 
@@ -2515,117 +2747,327 @@ static void llm_load_tensors(
2515
2747
  }
2516
2748
  }
2517
2749
  } break;
2518
- default:
2519
- throw std::runtime_error("unknown architecture");
2520
- }
2521
- }
2750
+ case LLM_ARCH_PERSIMMON:
2751
+ {
2752
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2522
2753
 
2523
- ml.done_getting_tensors();
2754
+ {
2755
+ ggml_backend_type backend_norm;
2756
+ ggml_backend_type backend_output;
2524
2757
 
2525
- // print memory requirements
2526
- {
2527
- // this is the total memory required to run the inference
2528
- size_t mem_required =
2529
- ctx_size +
2530
- mmapped_size - vram_weights; // weights in VRAM not in memory
2758
+ if (n_gpu_layers > int(n_layer)) {
2759
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2760
+ // on Windows however this is detrimental unless everything is on the GPU
2761
+ #ifndef _WIN32
2762
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2763
+ #else
2764
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2765
+ #endif // _WIN32
2531
2766
 
2532
- LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2767
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2768
+ } else {
2769
+ backend_norm = GGML_BACKEND_CPU;
2770
+ backend_output = GGML_BACKEND_CPU;
2771
+ }
2533
2772
 
2534
- #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2535
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2773
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2774
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2775
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2536
2776
 
2537
- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2538
- if (n_gpu_layers > (int) hparams.n_layer) {
2539
- LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2540
- }
2777
+ if (backend_norm == GGML_BACKEND_GPU) {
2778
+ vram_weights += ggml_nbytes(model.output_norm);
2779
+ vram_weights += ggml_nbytes(model.output_norm_b);
2780
+ }
2781
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2782
+ vram_weights += ggml_nbytes(model.output);
2783
+ }
2784
+ }
2541
2785
 
2542
- #ifdef GGML_USE_CUBLAS
2543
- const int max_backend_supported_layers = hparams.n_layer + 3;
2544
- const int max_offloadable_layers = hparams.n_layer + 3;
2545
- #elif defined(GGML_USE_CLBLAST)
2546
- const int max_backend_supported_layers = hparams.n_layer + 1;
2547
- const int max_offloadable_layers = hparams.n_layer + 1;
2548
- #endif // GGML_USE_CUBLAS
2786
+ const uint32_t n_ff = hparams.n_ff;
2787
+ const int i_gpu_start = n_layer - n_gpu_layers;
2788
+ model.layers.resize(n_layer);
2789
+ for (uint32_t i = 0; i < n_layer; ++i) {
2790
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2791
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2792
+ auto & layer = model.layers[i];
2793
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2794
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2795
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2796
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2797
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2798
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2799
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2800
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2801
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2802
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2803
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2804
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2805
+ layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
2806
+ layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
2807
+ layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
2808
+ layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2809
+ }
2810
+ } break;
2811
+ case LLM_ARCH_BLOOM:
2812
+ {
2813
+ // TODO: CPU-only for now
2549
2814
 
2550
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2551
- LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2552
- #else
2553
- (void) n_gpu_layers;
2554
- #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2555
- }
2815
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2816
+ model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
2817
+ model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
2556
2818
 
2557
- // populate `tensors_by_name`
2558
- for (int i = 0; i < ml.n_tensors; ++i) {
2559
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2560
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2561
- }
2819
+ // output
2820
+ {
2821
+ ggml_backend_type backend_norm;
2822
+ ggml_backend_type backend_output;
2562
2823
 
2563
- (void) tensor_split;
2564
- #ifdef GGML_USE_CUBLAS
2565
- {
2566
- ggml_cuda_set_tensor_split(tensor_split);
2567
- }
2568
- #endif
2824
+ if (n_gpu_layers > int(n_layer)) {
2825
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2826
+ // on Windows however this is detrimental unless everything is on the GPU
2827
+ #ifndef _WIN32
2828
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2829
+ #else
2830
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2831
+ #endif // _WIN32
2569
2832
 
2570
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2833
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2834
+ } else {
2835
+ backend_norm = GGML_BACKEND_CPU;
2836
+ backend_output = GGML_BACKEND_CPU;
2837
+ }
2571
2838
 
2572
- if (progress_callback) {
2573
- progress_callback(1.0f, progress_callback_user_data);
2574
- }
2839
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2840
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2841
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2575
2842
 
2576
- model.mapping = std::move(ml.mapping);
2843
+ if (backend_norm == GGML_BACKEND_GPU) {
2844
+ vram_weights += ggml_nbytes(model.output_norm);
2845
+ vram_weights += ggml_nbytes(model.output_norm_b);
2846
+ }
2847
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2848
+ vram_weights += ggml_nbytes(model.output);
2849
+ }
2850
+ }
2577
2851
 
2578
- // loading time will be recalculate after the first eval, so
2579
- // we take page faults deferred by mmap() into consideration
2580
- model.t_load_us = ggml_time_us() - model.t_start_us;
2581
- }
2852
+ const uint32_t n_ff = hparams.n_ff;
2582
2853
 
2583
- static bool llama_model_load(
2584
- const std::string & fname,
2585
- llama_model & model,
2586
- int n_gpu_layers,
2587
- int main_gpu,
2588
- const float * tensor_split,
2589
- bool use_mmap,
2590
- bool use_mlock,
2591
- bool vocab_only,
2592
- llama_progress_callback progress_callback,
2593
- void *progress_callback_user_data) {
2594
- try {
2595
- llama_model_loader ml(fname, use_mmap);
2854
+ const int i_gpu_start = n_layer - n_gpu_layers;
2596
2855
 
2597
- model.hparams.vocab_only = vocab_only;
2856
+ model.layers.resize(n_layer);
2598
2857
 
2599
- llm_load_arch (ml, model);
2600
- llm_load_hparams(ml, model);
2601
- llm_load_vocab (ml, model);
2858
+ for (uint32_t i = 0; i < n_layer; ++i) {
2859
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2860
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2602
2861
 
2603
- llm_load_print_meta(ml, model);
2862
+ auto & layer = model.layers[i];
2604
2863
 
2605
- if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2606
- throw std::runtime_error("vocab size mismatch");
2607
- }
2864
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2865
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2608
2866
 
2609
- if (vocab_only) {
2610
- LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2611
- return true;
2612
- }
2867
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2868
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2613
2869
 
2614
- llm_load_tensors(
2615
- ml, model, n_gpu_layers,
2616
- main_gpu, tensor_split,
2617
- use_mlock, progress_callback, progress_callback_user_data);
2618
- } catch (const std::exception & err) {
2619
- LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
2620
- return false;
2621
- }
2870
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2871
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2622
2872
 
2623
- return true;
2624
- }
2873
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2874
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2875
+
2876
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2877
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2878
+
2879
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2880
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2881
+
2882
+ if (backend == GGML_BACKEND_GPU) {
2883
+ vram_weights +=
2884
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2885
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2886
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2887
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2888
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
2889
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
2890
+ }
2891
+ }
2892
+ } break;
2893
+ case LLM_ARCH_MPT:
2894
+ {
2895
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2896
+
2897
+ // output
2898
+ {
2899
+ ggml_backend_type backend_norm;
2900
+ ggml_backend_type backend_output;
2901
+
2902
+ if (n_gpu_layers > int(n_layer)) {
2903
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2904
+ // on Windows however this is detrimental unless everything is on the GPU
2905
+ #ifndef _WIN32
2906
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2907
+ #else
2908
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2909
+ #endif // _WIN32
2910
+
2911
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2912
+ } else {
2913
+ backend_norm = GGML_BACKEND_CPU;
2914
+ backend_output = GGML_BACKEND_CPU;
2915
+ }
2916
+
2917
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2918
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2919
+
2920
+ if (backend_norm == GGML_BACKEND_GPU) {
2921
+ vram_weights += ggml_nbytes(model.output_norm);
2922
+ }
2923
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2924
+ vram_weights += ggml_nbytes(model.output);
2925
+ }
2926
+ }
2927
+
2928
+ const uint32_t n_ff = hparams.n_ff;
2929
+
2930
+ const int i_gpu_start = n_layer - n_gpu_layers;
2931
+
2932
+ model.layers.resize(n_layer);
2933
+
2934
+ for (uint32_t i = 0; i < n_layer; ++i) {
2935
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2936
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2937
+
2938
+ auto & layer = model.layers[i];
2939
+
2940
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2941
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2942
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2943
+
2944
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2945
+
2946
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2947
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2948
+
2949
+ if (backend == GGML_BACKEND_GPU) {
2950
+ vram_weights +=
2951
+ ggml_nbytes(layer.attn_norm) +
2952
+ ggml_nbytes(layer.wqkv) +
2953
+ ggml_nbytes(layer.wo) +
2954
+ ggml_nbytes(layer.ffn_norm) +
2955
+ ggml_nbytes(layer.w2) +
2956
+ ggml_nbytes(layer.w3);
2957
+ }
2958
+ }
2959
+ } break;
2960
+ default:
2961
+ throw std::runtime_error("unknown architecture");
2962
+ }
2963
+ }
2964
+
2965
+ ml.done_getting_tensors();
2966
+
2967
+ // print memory requirements
2968
+ {
2969
+ // this is the total memory required to run the inference
2970
+ size_t mem_required =
2971
+ ctx_size +
2972
+ mmapped_size - vram_weights; // weights in VRAM not in memory
2973
+
2974
+ LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
2975
+
2976
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2977
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
2978
+
2979
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
2980
+ if (n_gpu_layers > (int) hparams.n_layer) {
2981
+ LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
2982
+ }
2983
+
2984
+ #ifdef GGML_USE_CUBLAS
2985
+ const int max_backend_supported_layers = hparams.n_layer + 3;
2986
+ const int max_offloadable_layers = hparams.n_layer + 3;
2987
+ #elif defined(GGML_USE_CLBLAST)
2988
+ const int max_backend_supported_layers = hparams.n_layer + 1;
2989
+ const int max_offloadable_layers = hparams.n_layer + 1;
2990
+ #endif // GGML_USE_CUBLAS
2991
+
2992
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2993
+ LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
2994
+ #else
2995
+ (void) n_gpu_layers;
2996
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2997
+ }
2998
+
2999
+ // populate `tensors_by_name`
3000
+ for (int i = 0; i < ml.n_tensors; ++i) {
3001
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
3002
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3003
+ }
3004
+
3005
+ (void) tensor_split;
3006
+ #ifdef GGML_USE_CUBLAS
3007
+ {
3008
+ ggml_cuda_set_tensor_split(tensor_split);
3009
+ }
3010
+ #endif
3011
+
3012
+ ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
3013
+
3014
+ if (progress_callback) {
3015
+ progress_callback(1.0f, progress_callback_user_data);
3016
+ }
3017
+
3018
+ model.mapping = std::move(ml.mapping);
3019
+
3020
+ // loading time will be recalculated after the first eval, so
3021
+ // we take page faults deferred by mmap() into consideration
3022
+ model.t_load_us = ggml_time_us() - model.t_start_us;
3023
+ }
3024
+
3025
+ static bool llama_model_load(
3026
+ const std::string & fname,
3027
+ llama_model & model,
3028
+ int n_gpu_layers,
3029
+ int main_gpu,
3030
+ const float * tensor_split,
3031
+ bool use_mmap,
3032
+ bool use_mlock,
3033
+ bool vocab_only,
3034
+ llama_progress_callback progress_callback,
3035
+ void *progress_callback_user_data) {
3036
+ try {
3037
+ llama_model_loader ml(fname, use_mmap);
3038
+
3039
+ model.hparams.vocab_only = vocab_only;
3040
+
3041
+ llm_load_arch (ml, model);
3042
+ llm_load_hparams(ml, model);
3043
+ llm_load_vocab (ml, model);
3044
+
3045
+ llm_load_print_meta(ml, model);
3046
+
3047
+ if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
3048
+ throw std::runtime_error("vocab size mismatch");
3049
+ }
3050
+
3051
+ if (vocab_only) {
3052
+ LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
3053
+ return true;
3054
+ }
3055
+
3056
+ llm_load_tensors(
3057
+ ml, model, n_gpu_layers,
3058
+ main_gpu, tensor_split,
3059
+ use_mlock, progress_callback, progress_callback_user_data);
3060
+ } catch (const std::exception & err) {
3061
+ LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
3062
+ return false;
3063
+ }
3064
+
3065
+ return true;
3066
+ }
2625
3067
 
2626
3068
  static struct ggml_cgraph * llm_build_llama(
2627
- llama_context & lctx,
2628
- const llama_batch & batch) {
3069
+ llama_context & lctx,
3070
+ const llama_batch & batch) {
2629
3071
  const auto & model = lctx.model;
2630
3072
  const auto & hparams = model.hparams;
2631
3073
  const auto & cparams = lctx.cparams;
@@ -2663,11 +3105,9 @@ static struct ggml_cgraph * llm_build_llama(
2663
3105
  struct ggml_init_params params = {
2664
3106
  /*.mem_size =*/ buf_compute.size,
2665
3107
  /*.mem_buffer =*/ buf_compute.data,
2666
- /*.no_alloc =*/ false,
3108
+ /*.no_alloc =*/ true,
2667
3109
  };
2668
3110
 
2669
- params.no_alloc = true;
2670
-
2671
3111
  struct ggml_context * ctx0 = ggml_init(params);
2672
3112
 
2673
3113
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -2739,7 +3179,7 @@ static struct ggml_cgraph * llm_build_llama(
2739
3179
  for (int h = 0; h < 1; ++h) {
2740
3180
  for (int j = 0; j < n_tokens; ++j) {
2741
3181
  const llama_pos pos = batch.pos[j];
2742
- const llama_seq_id seq_id = batch.seq_id[j];
3182
+ const llama_seq_id seq_id = batch.seq_id[j][0];
2743
3183
 
2744
3184
  for (int i = 0; i < n_kv; ++i) {
2745
3185
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3051,11 +3491,9 @@ static struct ggml_cgraph * llm_build_baichaun(
3051
3491
  struct ggml_init_params params = {
3052
3492
  /*.mem_size =*/ buf_compute.size,
3053
3493
  /*.mem_buffer =*/ buf_compute.data,
3054
- /*.no_alloc =*/ false,
3494
+ /*.no_alloc =*/ true,
3055
3495
  };
3056
3496
 
3057
- params.no_alloc = true;
3058
-
3059
3497
  struct ggml_context * ctx0 = ggml_init(params);
3060
3498
 
3061
3499
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3127,7 +3565,7 @@ static struct ggml_cgraph * llm_build_baichaun(
3127
3565
  for (int h = 0; h < 1; ++h) {
3128
3566
  for (int j = 0; j < n_tokens; ++j) {
3129
3567
  const llama_pos pos = batch.pos[j];
3130
- const llama_seq_id seq_id = batch.seq_id[j];
3568
+ const llama_seq_id seq_id = batch.seq_id[j][0];
3131
3569
 
3132
3570
  for (int i = 0; i < n_kv; ++i) {
3133
3571
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3452,11 +3890,9 @@ static struct ggml_cgraph * llm_build_refact(
3452
3890
  struct ggml_init_params params = {
3453
3891
  /*.mem_size =*/ buf_compute.size,
3454
3892
  /*.mem_buffer =*/ buf_compute.data,
3455
- /*.no_alloc =*/ false,
3893
+ /*.no_alloc =*/ true,
3456
3894
  };
3457
3895
 
3458
- params.no_alloc = true;
3459
-
3460
3896
  struct ggml_context * ctx0 = ggml_init(params);
3461
3897
 
3462
3898
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3528,7 +3964,7 @@ static struct ggml_cgraph * llm_build_refact(
3528
3964
  for (int h = 0; h < 1; ++h) {
3529
3965
  for (int j = 0; j < n_tokens; ++j) {
3530
3966
  const llama_pos pos = batch.pos[j];
3531
- const llama_seq_id seq_id = batch.seq_id[j];
3967
+ const llama_seq_id seq_id = batch.seq_id[j][0];
3532
3968
 
3533
3969
  for (int i = 0; i < n_kv; ++i) {
3534
3970
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3806,11 +4242,9 @@ static struct ggml_cgraph * llm_build_falcon(
3806
4242
  struct ggml_init_params params = {
3807
4243
  /*.mem_size =*/ buf_compute.size,
3808
4244
  /*.mem_buffer =*/ buf_compute.data,
3809
- /*.no_alloc =*/ false,
4245
+ /*.no_alloc =*/ true,
3810
4246
  };
3811
4247
 
3812
- params.no_alloc = true;
3813
-
3814
4248
  struct ggml_context * ctx0 = ggml_init(params);
3815
4249
 
3816
4250
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3882,7 +4316,7 @@ static struct ggml_cgraph * llm_build_falcon(
3882
4316
  for (int h = 0; h < 1; ++h) {
3883
4317
  for (int j = 0; j < n_tokens; ++j) {
3884
4318
  const llama_pos pos = batch.pos[j];
3885
- const llama_seq_id seq_id = batch.seq_id[j];
4319
+ const llama_seq_id seq_id = batch.seq_id[j][0];
3886
4320
 
3887
4321
  for (int i = 0; i < n_kv; ++i) {
3888
4322
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4166,11 +4600,9 @@ static struct ggml_cgraph * llm_build_starcoder(
4166
4600
  struct ggml_init_params params = {
4167
4601
  /*.mem_size =*/ buf_compute.size,
4168
4602
  /*.mem_buffer =*/ buf_compute.data,
4169
- /*.no_alloc =*/ false,
4603
+ /*.no_alloc =*/ true,
4170
4604
  };
4171
4605
 
4172
- params.no_alloc = true;
4173
-
4174
4606
  struct ggml_context * ctx0 = ggml_init(params);
4175
4607
 
4176
4608
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4199,23 +4631,919 @@ static struct ggml_cgraph * llm_build_starcoder(
4199
4631
 
4200
4632
  ggml_allocr_alloc(lctx.alloc, token);
4201
4633
  if (!ggml_allocr_is_measure(lctx.alloc)) {
4202
- memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
4634
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
4635
+ }
4636
+ }
4637
+
4638
+ {
4639
+ // Compute position embeddings.
4640
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4641
+ ggml_allocr_alloc(lctx.alloc, inp_positions);
4642
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4643
+ for (int i = 0; i < n_tokens; ++i) {
4644
+ ((int32_t *) inp_positions->data)[i] = batch.pos[i];
4645
+ }
4646
+ }
4647
+ ggml_set_name(inp_positions, "inp_positions");
4648
+
4649
+ position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
4650
+ }
4651
+
4652
+ // KQ_scale
4653
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4654
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4655
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4656
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4657
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
4658
+ }
4659
+
4660
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4661
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4662
+ ggml_set_name(KQ_mask, "KQ_mask");
4663
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4664
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4665
+ float * data = (float *) KQ_mask->data;
4666
+ memset(data, 0, ggml_nbytes(KQ_mask));
4667
+
4668
+ for (int h = 0; h < 1; ++h) {
4669
+ for (int j = 0; j < n_tokens; ++j) {
4670
+ const llama_pos pos = batch.pos[j];
4671
+ const llama_seq_id seq_id = batch.seq_id[j][0];
4672
+
4673
+ for (int i = 0; i < n_kv; ++i) {
4674
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4675
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4676
+ }
4677
+ }
4678
+ }
4679
+ }
4680
+ }
4681
+
4682
+ inpL = ggml_add(ctx0, token, position);
4683
+ ggml_set_name(inpL, "inpL");
4684
+
4685
+ for (int il = 0; il < n_layer; ++il) {
4686
+ {
4687
+ // Norm
4688
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4689
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
4690
+ }
4691
+
4692
+ {
4693
+ // Self Attention
4694
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
4695
+
4696
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
4697
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
4698
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
4699
+
4700
+ struct ggml_tensor * Qcur = tmpq;
4701
+ struct ggml_tensor * Kcur = tmpk;
4702
+
4703
+ {
4704
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
4705
+ ggml_set_name(Vcur, "Vcur");
4706
+
4707
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
4708
+ ggml_set_name(k, "k");
4709
+
4710
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4711
+ ( n_ctx)*ggml_element_size(kv_self.v),
4712
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4713
+
4714
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4715
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4716
+ }
4717
+
4718
+ struct ggml_tensor * Q =
4719
+ ggml_permute(ctx0,
4720
+ ggml_cpy(ctx0,
4721
+ Qcur,
4722
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
4723
+ 0, 2, 1, 3);
4724
+ ggml_set_name(Q, "Q");
4725
+
4726
+ struct ggml_tensor * K =
4727
+ ggml_view_3d(ctx0, kv_self.k,
4728
+ n_embd_head, n_kv, n_head_kv,
4729
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4730
+ ggml_element_size(kv_self.k)*n_embd_head,
4731
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
4732
+ ggml_set_name(K, "K");
4733
+
4734
+ // K * Q
4735
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
4736
+ ggml_set_name(KQ, "KQ");
4737
+
4738
+ // KQ_scaled = KQ / sqrt(n_embd_head)
4739
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
4740
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
4741
+ ggml_set_name(KQ_scaled, "KQ_scaled");
4742
+
4743
+ // KQ_masked = mask_past(KQ_scaled)
4744
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
4745
+ ggml_set_name(KQ_masked, "KQ_masked");
4746
+
4747
+ // KQ = soft_max(KQ_masked)
4748
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
4749
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
4750
+
4751
+ // split cached V into n_head heads
4752
+ struct ggml_tensor * V =
4753
+ ggml_view_3d(ctx0, kv_self.v,
4754
+ n_kv, n_embd_head, n_head_kv,
4755
+ ggml_element_size(kv_self.v)*n_ctx,
4756
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
4757
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
4758
+ ggml_set_name(V, "V");
4759
+
4760
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
4761
+ ggml_set_name(KQV, "KQV");
4762
+
4763
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
4764
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
4765
+ ggml_set_name(KQV_merged, "KQV_merged");
4766
+
4767
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
4768
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
4769
+ ggml_set_name(cur, "KQV_merged_contiguous");
4770
+ }
4771
+
4772
+ // Projection
4773
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
4774
+
4775
+ // Add the input
4776
+ cur = ggml_add(ctx0, cur, inpL);
4777
+
4778
+ struct ggml_tensor * inpFF = cur;
4779
+
4780
+ // FF
4781
+ {
4782
+ // Norm
4783
+ {
4784
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
4785
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
4786
+ }
4787
+
4788
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
4789
+
4790
+ // GELU activation
4791
+ cur = ggml_gelu(ctx0, cur);
4792
+
4793
+ // Projection
4794
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
4795
+ }
4796
+
4797
+ inpL = ggml_add(ctx0, cur, inpFF);
4798
+ }
4799
+
4800
+ // Output Norm
4801
+ {
4802
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4803
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
4804
+ }
4805
+ ggml_set_name(cur, "result_norm");
4806
+
4807
+ cur = ggml_mul_mat(ctx0, model.output, cur);
4808
+ ggml_set_name(cur, "result_output");
4809
+
4810
+ ggml_build_forward_expand(gf, cur);
4811
+ ggml_free(ctx0);
4812
+
4813
+ return gf;
4814
+ }
4815
+
4816
+ static struct ggml_cgraph * llm_build_persimmon(
4817
+ llama_context & lctx,
4818
+ const llama_batch & batch) {
4819
+ const auto & model = lctx.model;
4820
+ const auto & hparams = model.hparams;
4821
+
4822
+ const auto & kv_self = lctx.kv_self;
4823
+
4824
+ GGML_ASSERT(!!kv_self.ctx);
4825
+
4826
+ const auto & cparams = lctx.cparams;
4827
+ const int64_t n_embd = hparams.n_embd;
4828
+ const int64_t n_layer = hparams.n_layer;
4829
+ const int64_t n_ctx = cparams.n_ctx;
4830
+ const int64_t n_head_kv = hparams.n_head_kv;
4831
+ const int64_t n_head = hparams.n_head;
4832
+ const int64_t n_embd_head = hparams.n_embd_head();
4833
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
4834
+ const size_t n_rot = n_embd_head / 2;
4835
+
4836
+ const float freq_base = cparams.rope_freq_base;
4837
+ const float freq_scale = cparams.rope_freq_scale;
4838
+ const float norm_eps = hparams.f_norm_eps;
4839
+
4840
+ const int n_gpu_layers = model.n_gpu_layers;
4841
+
4842
+
4843
+ const int32_t n_tokens = batch.n_tokens;
4844
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4845
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
4846
+
4847
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
4848
+
4849
+ auto & buf_compute = lctx.buf_compute;
4850
+ struct ggml_init_params params = {
4851
+ /*.mem_size =*/ buf_compute.size,
4852
+ /*.mem_buffer =*/ buf_compute.data,
4853
+ /*.no_alloc =*/ true,
4854
+ };
4855
+
4856
+ struct ggml_context * ctx0 = ggml_init(params);
4857
+
4858
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
4859
+
4860
+ struct ggml_tensor * cur;
4861
+ struct ggml_tensor * inpL;
4862
+
4863
+ if (batch.token) {
4864
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4865
+
4866
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
4867
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4868
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
4869
+ }
4870
+ ggml_set_name(inp_tokens, "inp_tokens");
4871
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4872
+ } else {
4873
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4874
+ ggml_allocr_alloc(lctx.alloc, inpL);
4875
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4876
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
4877
+ }
4878
+ }
4879
+ const int i_gpu_start = n_layer - n_gpu_layers;
4880
+ (void) i_gpu_start;
4881
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
4882
+ offload_func_t offload_func_kq = llama_nop;
4883
+ offload_func_t offload_func_v = llama_nop;
4884
+ // KQ_scale
4885
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4886
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4887
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4888
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
4889
+ }
4890
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4891
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4892
+ offload_func_kq(KQ_mask);
4893
+ ggml_set_name(KQ_mask, "KQ_mask");
4894
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4895
+
4896
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4897
+ float * data = (float *) KQ_mask->data;
4898
+ memset(data, 0, ggml_nbytes(KQ_mask));
4899
+ for (int h = 0; h < 1; ++h) {
4900
+ for (int j = 0; j < n_tokens; ++j) {
4901
+ const llama_pos pos = batch.pos[j];
4902
+ const llama_seq_id seq_id = batch.seq_id[j][0];
4903
+ for (int i = 0; i < n_kv; ++i) {
4904
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4905
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4906
+ }
4907
+ }
4908
+ }
4909
+ }
4910
+ }
4911
+
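The KQ_mask filled in above is a plain additive mask: for each query token, KV cells that belong to a different sequence or that sit at a later position get -INFINITY so they drop out of the softmax. A minimal standalone sketch of the same rule, with hypothetical pos/seq arrays standing in for batch.pos/seq_id and kv_self.cells (illustrative only, not the ggml path):

    // sketch: additive causal + per-sequence attention mask
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_kv = 4, n_tokens = 2;
        std::vector<int> kv_pos  = {0, 1, 2, 3};
        std::vector<int> kv_seq  = {0, 0, 0, 1};  // last cell belongs to another sequence
        std::vector<int> tok_pos = {2, 3};        // positions of the incoming tokens
        std::vector<int> tok_seq = {0, 0};

        std::vector<float> mask(n_kv * n_tokens, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                // mask out cells of other sequences and cells in the future
                if (kv_seq[i] != tok_seq[j] || kv_pos[i] > tok_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                printf("%8.1f", mask[j*n_kv + i]);
            }
            printf("\n");
        }
        return 0;
    }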
4912
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4913
+ offload_func_kq(KQ_pos);
4914
+ ggml_set_name(KQ_pos, "KQ_pos");
4915
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
4916
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4917
+ int * data = (int *) KQ_pos->data;
4918
+ for (int i = 0; i < n_tokens; ++i) {
4919
+ data[i] = batch.pos[i];
4920
+ }
4921
+ }
4922
+ if (do_rope_shift) {
4923
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4924
+ offload_func_kq(K_shift);
4925
+ ggml_set_name(K_shift, "K_shift");
4926
+ ggml_allocr_alloc(lctx.alloc, K_shift);
4927
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4928
+ int * data = (int *) K_shift->data;
4929
+ for (int i = 0; i < n_ctx; ++i) {
4930
+ data[i] = kv_self.cells[i].delta;
4931
+ }
4932
+ }
4933
+ for (int il = 0; il < n_layer; ++il) {
4934
+ struct ggml_tensor * tmp =
4935
+ // we rotate only the first n_rot dimensions.
4936
+ ggml_rope_custom_inplace(ctx0,
4937
+ ggml_view_3d(ctx0, kv_self.k,
4938
+ n_rot, n_head, n_ctx,
4939
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4940
+ ggml_element_size(kv_self.k)*n_embd_head,
4941
+ ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
4942
+ ),
4943
+ K_shift, n_rot, 2, 0, freq_base, freq_scale);
4944
+ offload_func_kq(tmp);
4945
+ ggml_build_forward_expand(gf, tmp);
4946
+ }
4947
+ }
4948
+ for (int il=0; il < n_layer; ++il) {
4949
+ struct ggml_tensor * residual = inpL;
4950
+ offload_func_t offload_func = llama_nop;
4951
+ {
4952
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4953
+ offload_func(cur);
4954
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
4955
+ offload_func(cur);
4956
+ cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
4957
+ offload_func(cur);
4958
+ ggml_format_name(cur, "input_layernorm_%d", il);
4959
+ }
4960
+ // self attention
4961
+ {
4962
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
4963
+ offload_func_kq(cur);
4964
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
4965
+ offload_func_kq(cur);
4966
+
4967
+ // split qkv
4968
+ GGML_ASSERT(n_head_kv == n_head);
4969
+ ggml_set_name(cur, format("qkv_%d", il).c_str());
4970
+ struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
4971
+ offload_func_kq(tmpqkv);
4972
+ struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
4973
+ offload_func_kq(tmpqkv_perm);
4974
+ ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
4975
+ struct ggml_tensor * tmpq = ggml_view_3d(
4976
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4977
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4978
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4979
+ 0
4980
+ );
4981
+ offload_func_kq(tmpq);
4982
+ struct ggml_tensor * tmpk = ggml_view_3d(
4983
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4984
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4985
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4986
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
4987
+ );
4988
+ offload_func_kq(tmpk);
4989
+ // Q/K Layernorm
4990
+ tmpq = ggml_norm(ctx0, tmpq, norm_eps);
4991
+ offload_func_kq(tmpq);
4992
+ tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
4993
+ offload_func_kq(tmpq);
4994
+ tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
4995
+ offload_func_kq(tmpq);
4996
+
4997
+ tmpk = ggml_norm(ctx0, tmpk, norm_eps);
4998
+ offload_func_v(tmpk);
4999
+ tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
5000
+ offload_func_v(tmpk);
5001
+ tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
5002
+ offload_func_v(tmpk);
5003
+
5004
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
5005
+ struct ggml_tensor * qrot = ggml_view_3d(
5006
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5007
+ ggml_element_size(tmpq) * n_embd_head,
5008
+ ggml_element_size(tmpq) * n_embd_head * n_head,
5009
+ 0
5010
+ );
5011
+ offload_func_kq(qrot);
5012
+ ggml_format_name(qrot, "qrot_%d", il);
5013
+ struct ggml_tensor * krot = ggml_view_3d(
5014
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5015
+ ggml_element_size(tmpk) * n_embd_head,
5016
+ ggml_element_size(tmpk) * n_embd_head * n_head,
5017
+ 0
5018
+ );
5019
+ offload_func_kq(krot);
5020
+ ggml_format_name(krot, "krot_%d", il);
5021
+
5022
+ // get the second half of tmpq, e.g. tmpq[n_rot:, :, :]
5023
+ struct ggml_tensor * qpass = ggml_view_3d(
5024
+ ctx0, tmpq, n_rot, n_head, n_tokens,
5025
+ ggml_element_size(tmpq) * n_embd_head,
5026
+ ggml_element_size(tmpq) * n_embd_head * n_head,
5027
+ ggml_element_size(tmpq) * n_rot
5028
+ );
5029
+ offload_func_kq(qpass);
5030
+ ggml_format_name(qpass, "qpass_%d", il);
5031
+ struct ggml_tensor * kpass = ggml_view_3d(
5032
+ ctx0, tmpk, n_rot, n_head, n_tokens,
5033
+ ggml_element_size(tmpk) * n_embd_head,
5034
+ ggml_element_size(tmpk) * n_embd_head * n_head,
5035
+ ggml_element_size(tmpk) * n_rot
5036
+ );
5037
+ offload_func_kq(kpass);
5038
+ ggml_format_name(kpass, "kpass_%d", il);
5039
+
5040
+ struct ggml_tensor * qrotated = ggml_rope_custom(
5041
+ ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
5042
+ );
5043
+ offload_func_kq(qrotated);
5044
+ struct ggml_tensor * krotated = ggml_rope_custom(
5045
+ ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
5046
+ );
5047
+ offload_func_kq(krotated);
5048
+ // ggml currently only supports concatenation on dim=2
5049
+ // so we need to permute qrot, qpass, concat, then permute back.
5050
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
5051
+ offload_func_kq(qrotated);
5052
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
5053
+ offload_func_kq(krotated);
5054
+
5055
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
5056
+ offload_func_kq(qpass);
5057
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
5058
+ offload_func_kq(kpass);
5059
+
5060
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
5061
+ offload_func_kq(Qcur);
5062
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
5063
+ offload_func_kq(Kcur);
5064
+
5065
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
5066
+ offload_func_kq(Q);
5067
+
5068
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
5069
+ offload_func_kq(Kcur);
5070
+ {
5071
+ struct ggml_tensor * tmpv = ggml_view_3d(
5072
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
5073
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
5074
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
5075
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
5076
+ );
5077
+ offload_func_v(tmpv);
5078
+ // store K, V in cache
5079
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
5080
+ offload_func_v(Vcur);
5081
+ ggml_set_name(Vcur, "Vcur");
5082
+
5083
+ struct ggml_tensor * k = ggml_view_1d(
5084
+ ctx0, kv_self.k, n_tokens*n_embd_gqa,
5085
+ (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
5086
+ );
5087
+ offload_func_kq(k);
5088
+ ggml_set_name(k, "k");
5089
+
5090
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5091
+ ( n_ctx)*ggml_element_size(kv_self.v),
5092
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5093
+ offload_func_v(v);
5094
+ ggml_set_name(v, "v");
5095
+
5096
+ // important: storing RoPE-ed version of K in the KV cache!
5097
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5098
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5099
+ }
5100
+ struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
5101
+ n_embd_head, n_kv, n_head_kv,
5102
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5103
+ ggml_element_size(kv_self.k)*n_embd_head,
5104
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5105
+
5106
+ offload_func_kq(K);
5107
+ ggml_format_name(K, "K_%d", il);
5108
+
5109
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5110
+ offload_func_kq(KQ);
5111
+ ggml_set_name(KQ, "KQ");
5112
+
5113
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5114
+ offload_func_kq(KQ_scaled);
5115
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5116
+
5117
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5118
+ offload_func_kq(KQ_masked);
5119
+ ggml_set_name(KQ_masked, "KQ_masked");
5120
+
5121
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5122
+ offload_func_kq(KQ_soft_max);
5123
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5124
+
5125
+ struct ggml_tensor * V =
5126
+ ggml_view_3d(ctx0, kv_self.v,
5127
+ n_kv, n_embd_head, n_head_kv,
5128
+ ggml_element_size(kv_self.v)*n_ctx,
5129
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5130
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5131
+ offload_func_v(V);
5132
+ ggml_set_name(V, "V");
5133
+
5134
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5135
+ offload_func_v(KQV);
5136
+ ggml_set_name(KQV, "KQV");
5137
+
5138
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5139
+ offload_func_v(KQV_merged);
5140
+ ggml_set_name(KQV_merged, "KQV_merged");
5141
+
5142
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5143
+ offload_func_v(cur);
5144
+ ggml_set_name(cur, "KQV_merged_contiguous");
5145
+
5146
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5147
+ offload_func(cur);
5148
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
5149
+ offload_func(cur);
5150
+ ggml_set_name(cur, "result_wo");
5151
+ }
5152
+
5153
+ struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
5154
+ offload_func(inpFF);
5155
+ ggml_set_name(inpFF, "inpFF");
5156
+ {
5157
+ // MLP
5158
+ {
5159
+ // Norm
5160
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5161
+ offload_func(cur);
5162
+ cur = ggml_add(ctx0,
5163
+ ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
5164
+ model.layers[il].ffn_norm_b
5165
+ );
5166
+ ggml_set_name(cur, "ffn_norm");
5167
+ offload_func(cur);
5168
+ }
5169
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5170
+ offload_func(cur);
5171
+
5172
+ cur = ggml_add(ctx0, cur, model.layers[il].b3);
5173
+ offload_func(cur);
5174
+ ggml_set_name(cur, "result_ffn_up");
5175
+
5176
+ cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
5177
+ ggml_set_name(cur, "result_ffn_act");
5178
+ offload_func(cur);
5179
+ offload_func(cur->src[0]);
5180
+
5181
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5182
+ offload_func(cur);
5183
+ cur = ggml_add(ctx0,
5184
+ cur,
5185
+ model.layers[il].b2);
5186
+ offload_func(cur);
5187
+ ggml_set_name(cur, "outFF");
5188
+ }
5189
+ cur = ggml_add(ctx0, cur, inpFF);
5190
+ offload_func(cur);
5191
+ ggml_set_name(cur, "inpFF_+_outFF");
5192
+ inpL = cur;
5193
+ }
5194
+ cur = inpL;
5195
+ {
5196
+ cur = ggml_norm(ctx0, cur, norm_eps);
5197
+ offload_func_nr(cur);
5198
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5199
+ offload_func_nr(cur);
5200
+
5201
+ cur = ggml_add(ctx0, cur, model.output_norm_b);
5202
+ // offload_func_nr(cur);
5203
+
5204
+ ggml_set_name(cur, "result_norm");
5205
+ }
5206
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5207
+ ggml_set_name(cur, "result_output");
5208
+ ggml_build_forward_expand(gf, cur);
5209
+ ggml_free(ctx0);
5210
+ return gf;
5211
+ }
5212
+
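The distinctive part of llm_build_persimmon is partial RoPE: only the first n_rot = n_embd_head/2 dimensions of each Q/K head are rotated, the remaining half is passed through unchanged, and the two parts are concatenated (the permutes exist only because ggml_concat works along dim 2). A CPU-side sketch of partial rotary embedding on one head, assuming the usual theta = freq_base^(-i/n_rot) schedule (illustrative only, not the exact ggml rope mode used above):

    // sketch: rotate the first n_rot dims of a head, leave the rest untouched
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static void rope_partial(std::vector<float> & head, int n_rot, int pos, float freq_base) {
        for (int i = 0; i < n_rot; i += 2) {
            const float theta = pos * std::pow(freq_base, -(float) i / n_rot);
            const float c = std::cos(theta), s = std::sin(theta);
            const float x0 = head[i], x1 = head[i + 1];
            head[i]     = x0*c - x1*s;   // rotated pair
            head[i + 1] = x0*s + x1*c;
        }
        // dims [n_rot, n_embd_head) form the "pass" half and are not modified
    }

    int main() {
        const int n_embd_head = 8;
        std::vector<float> q(n_embd_head, 1.0f);
        rope_partial(q, n_embd_head/2, /*pos=*/3, /*freq_base=*/10000.0f);
        for (float v : q) printf("%.3f ", v);
        printf("\n");
        return 0;
    }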
5213
+ static struct ggml_cgraph * llm_build_bloom(
5214
+ llama_context & lctx,
5215
+ const llama_batch & batch) {
5216
+ const auto & model = lctx.model;
5217
+ const auto & hparams = model.hparams;
5218
+ const auto & cparams = lctx.cparams;
5219
+
5220
+ const auto & kv_self = lctx.kv_self;
5221
+
5222
+ GGML_ASSERT(!!kv_self.ctx);
5223
+
5224
+ const int64_t n_embd = hparams.n_embd;
5225
+ const int64_t n_layer = hparams.n_layer;
5226
+ const int64_t n_ctx = cparams.n_ctx;
5227
+ const int64_t n_head = hparams.n_head;
5228
+ const int64_t n_head_kv = hparams.n_head_kv;
5229
+ const int64_t n_embd_head = hparams.n_embd_head();
5230
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5231
+
5232
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5233
+
5234
+ const float norm_eps = hparams.f_norm_eps;
5235
+
5236
+ const int32_t n_tokens = batch.n_tokens;
5237
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5238
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5239
+
5240
+ auto & buf_compute = lctx.buf_compute;
5241
+
5242
+ struct ggml_init_params params = {
5243
+ /*.mem_size =*/ buf_compute.size,
5244
+ /*.mem_buffer =*/ buf_compute.data,
5245
+ /*.no_alloc =*/ false,
5246
+ };
5247
+
5248
+ params.no_alloc = true;
5249
+
5250
+ struct ggml_context * ctx0 = ggml_init(params);
5251
+
5252
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5253
+
5254
+ struct ggml_tensor * cur;
5255
+ struct ggml_tensor * token;
5256
+ struct ggml_tensor * inpL;
5257
+
5258
+ if (batch.token) {
5259
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5260
+
5261
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5262
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5263
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5264
+ }
5265
+ ggml_set_name(inp_tokens, "inp_tokens");
5266
+
5267
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5268
+ } else {
5269
+ #ifdef GGML_USE_MPI
5270
+ GGML_ASSERT(false && "not implemented");
5271
+ #endif
5272
+
5273
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5274
+
5275
+ ggml_allocr_alloc(lctx.alloc, token);
5276
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5277
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5278
+ }
5279
+ }
5280
+
5281
+ // KQ_scale
5282
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5283
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5284
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5285
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5286
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5287
+ }
5288
+
5289
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5290
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5291
+ ggml_set_name(KQ_mask, "KQ_mask");
5292
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5293
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5294
+ float * data = (float *) KQ_mask->data;
5295
+ memset(data, 0, ggml_nbytes(KQ_mask));
5296
+
5297
+ for (int h = 0; h < 1; ++h) {
5298
+ for (int j = 0; j < n_tokens; ++j) {
5299
+ const llama_pos pos = batch.pos[j];
5300
+ const llama_seq_id seq_id = batch.seq_id[j][0];
5301
+
5302
+ for (int i = 0; i < n_kv; ++i) {
5303
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5304
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5305
+ }
5306
+ }
5307
+ }
5308
+ }
5309
+ }
5310
+
5311
+ // norm
5312
+ {
5313
+ inpL = ggml_norm(ctx0, token, norm_eps);
5314
+ inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5315
+ }
5316
+
5317
+ ggml_set_name(inpL, "inpL");
5318
+
5319
+ for (int il = 0; il < n_layer; ++il) {
5320
+ {
5321
+ // Norm
5322
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5323
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5324
+ }
5325
+
5326
+ {
5327
+ // Self Attention
5328
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5329
+
5330
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5331
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5332
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5333
+
5334
+ struct ggml_tensor * Qcur = tmpq;
5335
+ struct ggml_tensor * Kcur = tmpk;
5336
+
5337
+ // store key and value to memory
5338
+ {
5339
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5340
+ ggml_set_name(Vcur, "Vcur");
5341
+
5342
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5343
+ ggml_set_name(k, "k");
5344
+
5345
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5346
+ ( n_ctx)*ggml_element_size(kv_self.v),
5347
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5348
+
5349
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5350
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5351
+ }
5352
+
5353
+ struct ggml_tensor * Q =
5354
+ ggml_permute(ctx0,
5355
+ ggml_cpy(ctx0,
5356
+ Qcur,
5357
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5358
+ 0, 2, 1, 3);
5359
+ ggml_set_name(Q, "Q");
5360
+
5361
+ struct ggml_tensor * K =
5362
+ ggml_view_3d(ctx0, kv_self.k,
5363
+ n_embd_head, n_kv, n_head_kv,
5364
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5365
+ ggml_element_size(kv_self.k)*n_embd_head,
5366
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5367
+ ggml_set_name(K, "K");
5368
+
5369
+ // K * Q
5370
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5371
+ ggml_set_name(KQ, "KQ");
5372
+
5373
+ // KQ_scaled = KQ / sqrt(n_embd_head)
5374
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5375
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5376
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5377
+
5378
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5379
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5380
+
5381
+ // KQ_masked = mask_past(KQ_scaled)
5382
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5383
+ ggml_set_name(KQ_masked, "KQ_masked");
5384
+
5385
+ // KQ = soft_max(KQ_masked)
5386
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5387
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5388
+
5389
+ // split cached V into n_head heads
5390
+ struct ggml_tensor * V =
5391
+ ggml_view_3d(ctx0, kv_self.v,
5392
+ n_kv, n_embd_head, n_head_kv,
5393
+ ggml_element_size(kv_self.v)*n_ctx,
5394
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5395
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5396
+ ggml_set_name(V, "V");
5397
+
5398
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5399
+ ggml_set_name(KQV, "KQV");
5400
+
5401
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
5402
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5403
+ ggml_set_name(KQV_merged, "KQV_merged");
5404
+
5405
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5406
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5407
+ ggml_set_name(cur, "KQV_merged_contiguous");
5408
+ }
5409
+
5410
+ // Projection
5411
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5412
+
5413
+ // Add the input
5414
+ cur = ggml_add(ctx0, cur, inpL);
5415
+
5416
+ struct ggml_tensor * inpFF = cur;
5417
+
5418
+ // FF
5419
+ {
5420
+ // Norm
5421
+ {
5422
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5423
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5424
+ }
5425
+
5426
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5427
+
5428
+ // GELU activation
5429
+ cur = ggml_gelu(ctx0, cur);
5430
+
5431
+ // Projection
5432
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5433
+ }
5434
+
5435
+ inpL = ggml_add(ctx0, cur, inpFF);
5436
+ }
5437
+
5438
+ // Output Norm
5439
+ {
5440
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5441
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5442
+ }
5443
+ ggml_set_name(cur, "result_norm");
5444
+
5445
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5446
+ ggml_set_name(cur, "result_output");
5447
+
5448
+ ggml_build_forward_expand(gf, cur);
5449
+
5450
+ ggml_free(ctx0);
5451
+
5452
+ return gf;
5453
+ }
5454
+
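llm_build_bloom carries position information through ggml_alibi rather than RoPE: every head gets a fixed negative slope, and attention scores are biased linearly with key distance before the softmax (the call above passes a max bias of 8). A small sketch of the standard ALiBi slope schedule for a power-of-two head count; this is a simplification of what ggml_alibi computes internally:

    // sketch: ALiBi head slopes and the linear bias they add to attention scores
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 8;
        const float max_bias = 8.0f;                          // same constant passed to ggml_alibi
        const float m0       = std::pow(2.0f, -max_bias / n_head);

        const int q_pos = 5;                                  // query position
        for (int h = 0; h < n_head; ++h) {
            const float slope = std::pow(m0, (float)(h + 1)); // head-specific slope
            printf("head %d slope %.5f biases:", h, slope);
            for (int k = 0; k <= q_pos; ++k) {
                printf(" %.3f", slope * (k - q_pos));         // further keys -> larger penalty
            }
            printf("\n");
        }
        return 0;
    }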
5455
+ static struct ggml_cgraph * llm_build_mpt(
5456
+ llama_context & lctx,
5457
+ const llama_batch & batch) {
5458
+ const auto & model = lctx.model;
5459
+ const auto & hparams = model.hparams;
5460
+ const auto & cparams = lctx.cparams;
5461
+
5462
+ const auto & kv_self = lctx.kv_self;
5463
+
5464
+ GGML_ASSERT(!!kv_self.ctx);
5465
+
5466
+ const int64_t n_embd = hparams.n_embd;
5467
+ const int64_t n_layer = hparams.n_layer;
5468
+ const int64_t n_ctx = cparams.n_ctx;
5469
+ const int64_t n_head = hparams.n_head;
5470
+ const int64_t n_head_kv = hparams.n_head_kv;
5471
+ const int64_t n_embd_head = hparams.n_embd_head();
5472
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5473
+
5474
+ const float norm_eps = hparams.f_norm_eps;
5475
+ const float clamp_kqv = hparams.f_clamp_kqv;
5476
+ const float max_alibi_bias = hparams.f_max_alibi_bias;
5477
+
5478
+ const int n_gpu_layers = model.n_gpu_layers;
5479
+
5480
+ const int32_t n_tokens = batch.n_tokens;
5481
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5482
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5483
+
5484
+ auto & buf_compute = lctx.buf_compute;
5485
+
5486
+ struct ggml_init_params params = {
5487
+ /*.mem_size =*/ buf_compute.size,
5488
+ /*.mem_buffer =*/ buf_compute.data,
5489
+ /*.no_alloc =*/ false,
5490
+ };
5491
+
5492
+ params.no_alloc = true;
5493
+
5494
+ struct ggml_context * ctx0 = ggml_init(params);
5495
+
5496
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5497
+
5498
+ struct ggml_tensor * cur;
5499
+ struct ggml_tensor * inpL;
5500
+
5501
+ //int warmup = 0;
5502
+ if (batch.token) {
5503
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5504
+
5505
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5506
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5507
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5508
+ //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
5509
+ }
5510
+
5511
+ ggml_set_name(inp_tokens, "inp_tokens");
5512
+
5513
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5514
+ } else {
5515
+ #ifdef GGML_USE_MPI
5516
+ GGML_ASSERT(false && "not implemented");
5517
+ #endif
5518
+
5519
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5520
+
5521
+ ggml_allocr_alloc(lctx.alloc, inpL);
5522
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5523
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
4203
5524
  }
4204
5525
  }
4205
5526
 
4206
- {
4207
- // Compute position embeddings.
4208
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4209
- ggml_allocr_alloc(lctx.alloc, inp_positions);
4210
- if (!ggml_allocr_is_measure(lctx.alloc)) {
4211
- for (int i = 0; i < n_tokens; ++i) {
4212
- ((int32_t *) inp_positions->data)[i] = batch.pos[i];
4213
- }
4214
- }
4215
- ggml_set_name(inp_positions, "inp_positions");
5527
+ const int i_gpu_start = n_layer - n_gpu_layers;
5528
+ (void) i_gpu_start;
4216
5529
 
4217
- position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
5530
+ // offload functions set the tensor output backend to GPU
5531
+ // tensors are GPU-accelerated if any input or the output has been offloaded
5532
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
5533
+ offload_func_t offload_func_kq = llama_nop;
5534
+ offload_func_t offload_func_v = llama_nop;
5535
+
5536
+ #ifdef GGML_USE_CUBLAS
5537
+ if (n_gpu_layers > n_layer) {
5538
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
4218
5539
  }
5540
+ if (n_gpu_layers > n_layer + 1) {
5541
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
5542
+ }
5543
+ if (n_gpu_layers > n_layer + 2) {
5544
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
5545
+ }
5546
+ #endif // GGML_USE_CUBLAS
4219
5547
 
4220
5548
  // KQ_scale
4221
5549
  struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
@@ -4227,6 +5555,7 @@ static struct ggml_cgraph * llm_build_starcoder(
4227
5555
 
4228
5556
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
4229
5557
  struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5558
+ offload_func_kq(KQ_mask);
4230
5559
  ggml_set_name(KQ_mask, "KQ_mask");
4231
5560
  ggml_allocr_alloc(lctx.alloc, KQ_mask);
4232
5561
  if (!ggml_allocr_is_measure(lctx.alloc)) {
@@ -4236,7 +5565,7 @@ static struct ggml_cgraph * llm_build_starcoder(
4236
5565
  for (int h = 0; h < 1; ++h) {
4237
5566
  for (int j = 0; j < n_tokens; ++j) {
4238
5567
  const llama_pos pos = batch.pos[j];
4239
- const llama_seq_id seq_id = batch.seq_id[j];
5568
+ const llama_seq_id seq_id = batch.seq_id[j][0];
4240
5569
 
4241
5570
  for (int i = 0; i < n_kv; ++i) {
4242
5571
  if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4247,48 +5576,87 @@ static struct ggml_cgraph * llm_build_starcoder(
4247
5576
  }
4248
5577
  }
4249
5578
 
4250
- inpL = ggml_add(ctx0, token, position);
4251
- ggml_set_name(inpL, "inpL");
4252
-
4253
5579
  for (int il = 0; il < n_layer; ++il) {
4254
- {
4255
- // Norm
4256
- cur = ggml_norm(ctx0, inpL, norm_eps);
4257
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5580
+ struct ggml_tensor * attn_norm;
5581
+
5582
+ offload_func_t offload_func = llama_nop;
5583
+
5584
+ #ifdef GGML_USE_CUBLAS
5585
+ if (il >= i_gpu_start) {
5586
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
4258
5587
  }
5588
+ #endif // GGML_USE_CUBLAS
4259
5589
 
5590
+ // self-attention
5591
+ // TODO: refactor into common function (shared with LLaMA)
4260
5592
  {
4261
- // Self Attention
4262
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5593
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
5594
+ offload_func(attn_norm);
4263
5595
 
4264
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
4265
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
4266
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5596
+ attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
5597
+ offload_func(attn_norm);
4267
5598
 
4268
- struct ggml_tensor * Qcur = tmpq;
4269
- struct ggml_tensor * Kcur = tmpk;
5599
+ if (1) {
5600
+ cur = attn_norm;
5601
+ }
5602
+
5603
+ // compute QKV
5604
+
5605
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5606
+ offload_func_kq(cur);
5607
+
5608
+ if (clamp_kqv > 0.0f) {
5609
+ cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
5610
+ offload_func_kq(cur);
5611
+ }
5612
+
5613
+ const size_t wsize = ggml_type_size(cur->type);
5614
+
5615
+ struct ggml_tensor * Qcur = ggml_view_3d(
5616
+ ctx0, cur, n_embd_head, n_head, n_tokens,
5617
+ wsize * n_embd_head,
5618
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5619
+ 0);
5620
+ offload_func_kq(Qcur);
5621
+
5622
+ struct ggml_tensor * Kcur = ggml_view_3d(
5623
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5624
+ wsize * n_embd_head,
5625
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5626
+ wsize * n_embd_head * n_head);
5627
+ offload_func_kq(Kcur);
5628
+
5629
+ struct ggml_tensor * tmpv = ggml_view_3d(
5630
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5631
+ wsize * n_embd_head,
5632
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5633
+ wsize * n_embd_head * (n_head + n_head_kv));
5634
+ offload_func_kq(Kcur);
5635
+
5636
+ ggml_set_name(Qcur, "Qcur");
5637
+ ggml_set_name(Kcur, "Kcur");
4270
5638
 
4271
5639
  {
4272
5640
  struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5641
+ offload_func_v(Vcur);
5642
+ offload_func_v(Vcur->src[0]->src[0]);
4273
5643
  ggml_set_name(Vcur, "Vcur");
4274
5644
 
4275
5645
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5646
+ offload_func_kq(k);
4276
5647
  ggml_set_name(k, "k");
4277
5648
 
4278
5649
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4279
5650
  ( n_ctx)*ggml_element_size(kv_self.v),
4280
5651
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5652
+ offload_func_v(v);
4281
5653
 
4282
5654
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4283
5655
  ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4284
5656
  }
4285
5657
 
4286
- struct ggml_tensor * Q =
4287
- ggml_permute(ctx0,
4288
- ggml_cpy(ctx0,
4289
- Qcur,
4290
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
4291
- 0, 2, 1, 3);
5658
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
5659
+ offload_func_kq(Q);
4292
5660
  ggml_set_name(Q, "Q");
4293
5661
 
4294
5662
  struct ggml_tensor * K =
@@ -4297,85 +5665,105 @@ static struct ggml_cgraph * llm_build_starcoder(
4297
5665
  ggml_element_size(kv_self.k)*n_embd_gqa,
4298
5666
  ggml_element_size(kv_self.k)*n_embd_head,
4299
5667
  ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5668
+ offload_func_kq(K);
4300
5669
  ggml_set_name(K, "K");
4301
5670
 
4302
- // K * Q
4303
5671
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5672
+ offload_func_kq(KQ);
4304
5673
  ggml_set_name(KQ, "KQ");
4305
5674
 
4306
- // KQ_scaled = KQ / sqrt(n_embd_head)
4307
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
4308
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5675
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5676
+ offload_func_kq(KQ_scaled);
4309
5677
  ggml_set_name(KQ_scaled, "KQ_scaled");
4310
5678
 
4311
- // KQ_masked = mask_past(KQ_scaled)
4312
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5679
+ // TODO: replace with ggml_add()
5680
+ struct ggml_tensor * KQ_scaled_alibi =
5681
+ ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
5682
+ offload_func_kq(KQ_scaled_alibi);
5683
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5684
+
5685
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5686
+ offload_func_kq(KQ_masked);
4313
5687
  ggml_set_name(KQ_masked, "KQ_masked");
4314
5688
 
4315
- // KQ = soft_max(KQ_masked)
4316
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5689
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
5690
+ offload_func_v(KQ_soft_max);
4317
5691
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
4318
5692
 
4319
- // split cached V into n_head heads
4320
5693
  struct ggml_tensor * V =
4321
5694
  ggml_view_3d(ctx0, kv_self.v,
4322
5695
  n_kv, n_embd_head, n_head_kv,
4323
5696
  ggml_element_size(kv_self.v)*n_ctx,
4324
5697
  ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
4325
5698
  ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5699
+ offload_func_v(V);
4326
5700
  ggml_set_name(V, "V");
4327
5701
 
4328
5702
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5703
+ offload_func_v(KQV);
4329
5704
  ggml_set_name(KQV, "KQV");
4330
5705
 
4331
- // KQV_merged = KQV.permute(0, 2, 1, 3)
4332
5706
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5707
+ offload_func_v(KQV_merged);
4333
5708
  ggml_set_name(KQV_merged, "KQV_merged");
4334
5709
 
4335
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
4336
5710
  cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5711
+ offload_func_v(cur);
4337
5712
  ggml_set_name(cur, "KQV_merged_contiguous");
4338
- }
4339
5713
 
4340
- // Projection
4341
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5714
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5715
+ offload_func(cur);
5716
+ ggml_set_name(cur, "result_wo");
5717
+ }
4342
5718
 
4343
5719
  // Add the input
4344
5720
  cur = ggml_add(ctx0, cur, inpL);
5721
+ offload_func(cur);
4345
5722
 
4346
- struct ggml_tensor * inpFF = cur;
5723
+ struct ggml_tensor * attn_out = cur;
4347
5724
 
4348
- // FF
5725
+ // feed forward
4349
5726
  {
4350
5727
  // Norm
4351
5728
  {
4352
- cur = ggml_norm(ctx0, inpFF, norm_eps);
4353
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5729
+ cur = ggml_norm(ctx0, attn_out, norm_eps);
5730
+ offload_func(cur);
5731
+
5732
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
5733
+ offload_func(cur);
4354
5734
  }
4355
5735
 
4356
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5736
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5737
+ offload_func(cur);
4357
5738
 
4358
- // GELU activation
4359
5739
  cur = ggml_gelu(ctx0, cur);
4360
-
4361
- // Projection
4362
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5740
+ offload_func(cur);
5741
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5742
+ offload_func(cur);
4363
5743
  }
4364
5744
 
4365
- inpL = ggml_add(ctx0, cur, inpFF);
5745
+ cur = ggml_add(ctx0, cur, attn_out);
5746
+ offload_func(cur);
5747
+ // input for next layer
5748
+ inpL = cur;
4366
5749
  }
4367
5750
 
4368
- // Output Norm
5751
+ cur = inpL;
5752
+
5753
+ // norm
4369
5754
  {
4370
- cur = ggml_norm(ctx0, inpL, norm_eps);
4371
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5755
+ cur = ggml_norm(ctx0, cur, norm_eps);
5756
+ offload_func_nr(cur);
5757
+
5758
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5759
+ ggml_set_name(cur, "result_norm");
4372
5760
  }
4373
- ggml_set_name(cur, "result_norm");
4374
5761
 
4375
5762
  cur = ggml_mul_mat(ctx0, model.output, cur);
4376
5763
  ggml_set_name(cur, "result_output");
4377
5764
 
4378
5765
  ggml_build_forward_expand(gf, cur);
5766
+
4379
5767
  ggml_free(ctx0);
4380
5768
 
4381
5769
  return gf;
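In the MPT block above, Q, K and V are not separate projections: they are strided ggml_view_3d slices into the single wqkv output, whose per-token width is n_embd_head*(n_head + 2*n_head_kv). A minimal sketch of the same slicing on a plain row-major buffer (hypothetical sizes; the ggml views additionally encode the byte strides shown in the diff):

    // sketch: splitting one fused QKV row into Q, K, V by element offsets
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd_head = 4, n_head = 2, n_head_kv = 2;
        const int row = n_embd_head * (n_head + 2*n_head_kv);  // fused width per token

        std::vector<float> qkv(row);
        for (int i = 0; i < row; ++i) qkv[i] = (float) i;       // dummy projection output

        const float * q = qkv.data();                           // offset 0
        const float * k = q + n_embd_head * n_head;             // after all Q heads
        const float * v = k + n_embd_head * n_head_kv;          // after all K heads

        printf("q[0]=%.0f k[0]=%.0f v[0]=%.0f\n", q[0], k[0], v[0]);
        return 0;
    }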
@@ -4405,10 +5793,22 @@ static struct ggml_cgraph * llama_build_graph(
4405
5793
  {
4406
5794
  result = llm_build_starcoder(lctx, batch);
4407
5795
  } break;
5796
+ case LLM_ARCH_PERSIMMON:
5797
+ {
5798
+ result = llm_build_persimmon(lctx, batch);
5799
+ } break;
4408
5800
  case LLM_ARCH_REFACT:
4409
5801
  {
4410
5802
  result = llm_build_refact(lctx, batch);
4411
5803
  } break;
5804
+ case LLM_ARCH_BLOOM:
5805
+ {
5806
+ result = llm_build_bloom(lctx, batch);
5807
+ } break;
5808
+ case LLM_ARCH_MPT:
5809
+ {
5810
+ result = llm_build_mpt(lctx, batch);
5811
+ } break;
4412
5812
  default:
4413
5813
  GGML_ASSERT(false);
4414
5814
  }
@@ -4420,7 +5820,6 @@ static struct ggml_cgraph * llama_build_graph(
4420
5820
  //
4421
5821
  // - lctx: llama context
4422
5822
  // - batch: batch to evaluate
4423
- // - n_threads: number of threads to use
4424
5823
  //
4425
5824
  // return 0 on success
4426
5825
  // return positive int on warning
@@ -4466,8 +5865,11 @@ static int llama_decode_internal(
4466
5865
 
4467
5866
  // helpers for smoother batch API transition
4468
5867
  // after deprecating the llama_eval calls, these will be removed
4469
- std::vector<llama_pos> pos;
4470
- std::vector<llama_seq_id> seq_id;
5868
+ std::vector<llama_pos> pos;
5869
+
5870
+ std::vector<int32_t> n_seq_id;
5871
+ std::vector<llama_seq_id *> seq_id_arr;
5872
+ std::vector<std::vector<llama_seq_id>> seq_id;
4471
5873
 
4472
5874
  if (batch.pos == nullptr) {
4473
5875
  pos.resize(n_tokens);
@@ -4479,18 +5881,20 @@ static int llama_decode_internal(
4479
5881
  }
4480
5882
 
4481
5883
  if (batch.seq_id == nullptr) {
5884
+ n_seq_id.resize(n_tokens);
4482
5885
  seq_id.resize(n_tokens);
5886
+ seq_id_arr.resize(n_tokens);
4483
5887
  for (uint32_t i = 0; i < n_tokens; i++) {
4484
- seq_id[i] = batch.all_seq_id;
5888
+ n_seq_id[i] = 1;
5889
+ seq_id[i].resize(1);
5890
+ seq_id[i][0] = batch.all_seq_id;
5891
+ seq_id_arr[i] = seq_id[i].data();
4485
5892
  }
4486
5893
 
4487
- batch.seq_id = seq_id.data();
5894
+ batch.n_seq_id = n_seq_id.data();
5895
+ batch.seq_id = seq_id_arr.data();
4488
5896
  }
4489
5897
 
4490
- // we always start to search for a free slot from the start of the cache
4491
- // TODO: better strategies can be implemented
4492
- kv_self.head = 0;
4493
-
4494
5898
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
4495
5899
  return 1;
4496
5900
  }
@@ -4509,6 +5913,13 @@ static int llama_decode_internal(
4509
5913
 
4510
5914
  ggml_allocr_alloc_graph(lctx.alloc, gf);
4511
5915
 
5916
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
5917
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
5918
+
5919
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0);
5920
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
5921
+
5922
+
4512
5923
  #ifdef GGML_USE_CUBLAS
4513
5924
  for (int i = 0; i < gf->n_leafs; i++) {
4514
5925
  ggml_tensor * node = gf->leafs[i];
@@ -4526,6 +5937,12 @@ static int llama_decode_internal(
4526
5937
  }
4527
5938
 
4528
5939
  ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
5940
+
5941
+ // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
5942
+ if (!lctx.embedding.empty()) {
5943
+ embeddings->backend = GGML_BACKEND_CPU;
5944
+ }
5945
+ res->backend = GGML_BACKEND_CPU;
4529
5946
  #endif
4530
5947
 
4531
5948
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -4543,18 +5960,13 @@ static int llama_decode_internal(
4543
5960
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
4544
5961
  model.arch == LLM_ARCH_BAICHUAN ||
4545
5962
  model.arch == LLM_ARCH_FALCON ||
4546
- model.arch == LLM_ARCH_REFACT;
5963
+ model.arch == LLM_ARCH_REFACT ||
5964
+ model.arch == LLM_ARCH_MPT;
4547
5965
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
4548
5966
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
4549
5967
  n_threads = 1;
4550
5968
  }
4551
5969
 
4552
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
4553
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
4554
-
4555
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
4556
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
4557
-
4558
5970
  #if GGML_USE_MPI
4559
5971
  const int64_t n_layer = hparams.n_layer;
4560
5972
  ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -4576,8 +5988,12 @@ static int llama_decode_internal(
4576
5988
  #endif
4577
5989
 
4578
5990
  // update the kv ring buffer
4579
- lctx.kv_self.head += n_tokens;
4580
5991
  lctx.kv_self.has_shift = false;
5992
+ lctx.kv_self.head += n_tokens;
5993
+ // Ensure kv cache head points to a valid index.
5994
+ if (lctx.kv_self.head >= lctx.kv_self.size) {
5995
+ lctx.kv_self.head = 0;
5996
+ }
4581
5997
 
4582
5998
  #ifdef GGML_PERF
4583
5999
  // print timing information per ggml operation (for debugging purposes)
@@ -4903,7 +6319,6 @@ struct llm_tokenizer_bpe {
4903
6319
  llm_symbol sym;
4904
6320
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
4905
6321
  sym.text = word.c_str() + offset;
4906
- sym.n = 1;
4907
6322
  sym.n = char_len;
4908
6323
  offset += sym.n;
4909
6324
  sym.prev = index - 1;
@@ -5040,7 +6455,6 @@ private:
5040
6455
  for (int i = 0; i < (int)text_utf.size(); i++) {
5041
6456
  const std::string & utf_char = text_utf[i];
5042
6457
  bool split_condition = false;
5043
- // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5044
6458
  int bytes_remain = text_utf.size() - i;
5045
6459
  // forward backward lookups
5046
6460
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5066,9 +6480,9 @@ private:
5066
6480
  if (!split_condition && bytes_remain >= 3) {
5067
6481
  // 're|'ve|'ll
5068
6482
  if (utf_char == "\'" && (
5069
- (utf_char_next == "r" || utf_char_next_next == "e") ||
5070
- (utf_char_next == "v" || utf_char_next_next == "e") ||
5071
- (utf_char_next == "l" || utf_char_next_next == "l"))
6483
+ (utf_char_next == "r" && utf_char_next_next == "e") ||
6484
+ (utf_char_next == "v" && utf_char_next_next == "e") ||
6485
+ (utf_char_next == "l" && utf_char_next_next == "l"))
5072
6486
  ) {
5073
6487
  split_condition = true;
5074
6488
  }
@@ -5119,7 +6533,7 @@ private:
5119
6533
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5120
6534
  split_condition = true;
5121
6535
  }
5122
- else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
6536
+ else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
5123
6537
  split_condition = true;
5124
6538
  }
5125
6539
  }
@@ -5164,7 +6578,137 @@ private:
5164
6578
  llm_bigram_bpe::queue work_queue;
5165
6579
  };
5166
6580
 
5167
- static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
6581
+ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
6582
+ FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
6583
+ FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
6584
+ } FRAGMENT_BUFFER_VARIANT_TYPE;
6585
+
6586
+ struct fragment_buffer_variant{
6587
+ fragment_buffer_variant(llama_vocab::id _token)
6588
+ :
6589
+ type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
6590
+ token(_token),
6591
+ raw_text(_dummy),
6592
+ offset(0),
6593
+ length(0){}
6594
+ fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
6595
+ :
6596
+ type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
6597
+ token((llama_vocab::id)-1),
6598
+ raw_text(_raw_text),
6599
+ offset(_offset),
6600
+ length(_length){
6601
+ GGML_ASSERT( _offset >= 0 );
6602
+ GGML_ASSERT( _length >= 1 );
6603
+ GGML_ASSERT( offset + length <= raw_text.length() );
6604
+ }
6605
+
6606
+ const FRAGMENT_BUFFER_VARIANT_TYPE type;
6607
+ const llama_vocab::id token;
6608
+ const std::string _dummy;
6609
+ const std::string & raw_text;
6610
+ const uint64_t offset;
6611
+ const uint64_t length;
6612
+ };
6613
+
6614
+ // #define PRETOKENIZERDEBUG
6615
+
6616
+ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
6617
+ {
6618
+ // for each special token
6619
+ for (const auto & st: vocab.special_tokens_cache) {
6620
+ const auto & special_token = st.first;
6621
+ const auto & special_id = st.second;
6622
+
6623
+ // for each text fragment
6624
+ std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
6625
+ while (it != buffer.end()) {
6626
+ auto & fragment = (*it);
6627
+
6628
+ // if a fragment is text ( not yet processed )
6629
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
6630
+ auto * raw_text = &(fragment.raw_text);
6631
+
6632
+ auto raw_text_base_offset = fragment.offset;
6633
+ auto raw_text_base_length = fragment.length;
6634
+
6635
+ // loop over the text
6636
+ while (true) {
6637
+ // find the first occurrence of a given special token in this fragment
6638
+ // passing the offset argument only limits the "search area", but match coordinates
6639
+ // are still relative to the source full raw_text
6640
+ auto match = raw_text->find(special_token, raw_text_base_offset);
6641
+
6642
+ // no occurrences found, stop processing this fragment for a given special token
6643
+ if (match == std::string::npos) break;
6644
+
6645
+ // check if match is within bounds of offset <-> length
6646
+ if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
6647
+
6648
+ #ifdef PRETOKENIZERDEBUG
6649
+ fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
6650
+ #endif
6651
+ auto source = std::distance(buffer.begin(), it);
6652
+
6653
+ // if match is further than base offset
6654
+ // then we have some text to the left of it
6655
+ if (match > raw_text_base_offset) {
6656
+ // left
6657
+ const int64_t left_reminder_offset = raw_text_base_offset + 0;
6658
+ const int64_t left_reminder_length = match - raw_text_base_offset;
6659
+ buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
6660
+
6661
+ #ifdef PRETOKENIZERDEBUG
6662
+ fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
6663
+ #endif
6664
+ it++;
6665
+ }
6666
+
6667
+ // special token
6668
+ buffer.emplace_after(it, special_id);
6669
+ it++;
6670
+
6671
+ // right
6672
+ if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
6673
+ const int64_t right_reminder_offset = match + special_token.length();
6674
+ const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
6675
+ buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
6676
+
6677
+ #ifdef PRETOKENIZERDEBUG
6678
+ fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
6679
+ #endif
6680
+
6681
+ it++;
6682
+
6683
+ if (source == 0) {
6684
+ buffer.erase_after(buffer.before_begin());
6685
+ } else {
6686
+ buffer.erase_after(std::next(buffer.begin(), (source-1)));
6687
+ }
6688
+
6689
+ // repeat for the right side
6690
+ raw_text_base_offset = right_reminder_offset;
6691
+ raw_text_base_length = right_reminder_length;
6692
+
6693
+ #ifdef PRETOKENIZERDEBUG
6694
+ fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
6695
+ #endif
6696
+ } else {
6697
+ if (source == 0) {
6698
+ buffer.erase_after(buffer.before_begin());
6699
+ } else {
6700
+ buffer.erase_after(std::next(buffer.begin(), (source-1)));
6701
+ }
6702
+ break;
6703
+ }
6704
+ }
6705
+ }
6706
+ it++;
6707
+ }
6708
+ }
6709
+ }
6710
+
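tokenizer_st_partition walks the fragment list and, for each user-defined special token, splits raw-text fragments around exact substring matches: the matched span becomes a ready-made token id, while the text on either side stays a raw-text fragment for normal tokenization. The same splitting reduced to a standalone sketch over std::string, with a hypothetical special token and id (copies instead of the offset/length bookkeeping used above):

    // sketch: partitioning raw text around exact special-token matches
    #include <cstdio>
    #include <string>
    #include <vector>

    struct fragment { bool is_token; int token_id; std::string text; };

    int main() {
        const std::string special    = "<|endoftext|>";  // hypothetical special token
        const int         special_id = 3;                // hypothetical token id

        const std::string raw = "hello<|endoftext|>world";
        std::vector<fragment> out;

        size_t base = 0;
        while (true) {
            const size_t match = raw.find(special, base);
            if (match == std::string::npos) break;
            if (match > base) {
                out.push_back({false, -1, raw.substr(base, match - base)}); // text on the left
            }
            out.push_back({true, special_id, special});                     // the special token
            base = match + special.length();
        }
        if (base < raw.length()) {
            out.push_back({false, -1, raw.substr(base)});                   // trailing text
        }

        for (const auto & f : out) {
            if (f.is_token) printf("[token %d]\n", f.token_id);
            else            printf("[text '%s']\n", f.text.c_str());
        }
        return 0;
    }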
6711
+ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
5168
6712
  std::vector<llama_vocab::id> output;
5169
6713
 
5170
6714
  // OG tokenizer behavior:
@@ -5180,20 +6724,58 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
5180
6724
  return output;
5181
6725
  }
5182
6726
 
6727
+ std::forward_list<fragment_buffer_variant> fragment_buffer;
6728
+ fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
6729
+
6730
+ if (special) tokenizer_st_partition( vocab, fragment_buffer );
6731
+
5183
6732
  switch (vocab.type) {
5184
6733
  case LLAMA_VOCAB_TYPE_SPM:
5185
6734
  {
5186
- // without adding this leading whitespace, we do not get the same results as the original tokenizer
5187
- raw_text = " " + raw_text;
6735
+ for (const auto & fragment: fragment_buffer)
6736
+ {
6737
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
6738
+ {
6739
+ // without adding this leading whitespace, we do not get the same results as the original tokenizer
5188
6740
 
5189
- llm_tokenizer_spm tokenizer(vocab);
5190
- llama_escape_whitespace(raw_text);
5191
- tokenizer.tokenize(raw_text, output);
6741
+ // TODO: It's likely possible to get rid of this string copy entirely
6742
+ // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
6743
+ // and passing 'add space prefix' as bool argument
6744
+ //
6745
+ auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
6746
+
6747
+ #ifdef PRETOKENIZERDEBUG
6748
+ fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
6749
+ #endif
6750
+ llm_tokenizer_spm tokenizer(vocab);
6751
+ llama_escape_whitespace(raw_text);
6752
+ tokenizer.tokenize(raw_text, output);
6753
+ }
6754
+ else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
6755
+ {
6756
+ output.push_back(fragment.token);
6757
+ }
6758
+ }
5192
6759
  } break;
5193
6760
  case LLAMA_VOCAB_TYPE_BPE:
5194
6761
  {
5195
- llm_tokenizer_bpe tokenizer(vocab);
5196
- tokenizer.tokenize(raw_text, output);
6762
+ for (const auto & fragment: fragment_buffer)
6763
+ {
6764
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
6765
+ {
6766
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
6767
+
6768
+ #ifdef PRETOKENIZERDEBUG
6769
+ fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
6770
+ #endif
6771
+ llm_tokenizer_bpe tokenizer(vocab);
6772
+ tokenizer.tokenize(raw_text, output);
6773
+ }
6774
+ else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
6775
+ {
6776
+ output.push_back(fragment.token);
6777
+ }
6778
+ }
5197
6779
  } break;
5198
6780
  }
5199
6781
 
@@ -5466,7 +7048,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
5466
7048
  std::vector<llama_grammar_candidate> rejects;
5467
7049
 
5468
7050
  if (stack.empty()) {
5469
- for (auto tok : candidates) {
7051
+ for (const auto & tok : candidates) {
5470
7052
  if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
5471
7053
  rejects.push_back(tok);
5472
7054
  }
@@ -5477,7 +7059,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
5477
7059
  const llama_grammar_element * stack_pos = stack.back();
5478
7060
 
5479
7061
  std::vector<llama_grammar_candidate> next_candidates;
5480
- for (auto tok : candidates) {
7062
+ for (const auto & tok : candidates) {
5481
7063
  if (*tok.code_points == 0) {
5482
7064
  // reached end of full codepoints in token, reject iff it ended in a partial sequence
5483
7065
  // that cannot satisfy this position in grammar
@@ -5503,7 +7085,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
5503
7085
  llama_grammar_advance_stack(rules, stack_after, next_stacks);
5504
7086
 
5505
7087
  auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
5506
- for (auto tok : next_rejects) {
7088
+ for (const auto & tok : next_rejects) {
5507
7089
  rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
5508
7090
  }
5509
7091
 
@@ -6635,7 +8217,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6635
8217
  const std::string name = ggml_get_name(meta);
6636
8218
 
6637
8219
  // TODO: avoid hardcoded tensor names - use the TN_* constants
6638
- if (name.find("attn_v.weight") != std::string::npos) {
8220
+ if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
6639
8221
  ++n_attention_wv;
6640
8222
  }
6641
8223
  else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6672,6 +8254,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6672
8254
  }
6673
8255
 
6674
8256
  std::ofstream fout(fname_out, std::ios::binary);
8257
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
6675
8258
 
6676
8259
  const size_t meta_size = gguf_get_meta_size(ctx_out);
6677
8260
 
@@ -7535,6 +9118,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam
7535
9118
  }
7536
9119
 
7537
9120
  void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
9121
+ if (seq_id_src == seq_id_dst) {
9122
+ return;
9123
+ }
7538
9124
  llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
7539
9125
  }
7540
9126
 
@@ -7987,7 +9573,7 @@ int llama_eval_embd(
7987
9573
  int n_past) {
7988
9574
  llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
7989
9575
 
7990
- llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
9576
+ llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
7991
9577
 
7992
9578
  const int ret = llama_decode_internal(*ctx, batch);
7993
9579
  if (ret < 0) {
@@ -8008,20 +9594,21 @@ struct llama_batch llama_batch_get_one(
8008
9594
  llama_pos pos_0,
8009
9595
  llama_seq_id seq_id) {
8010
9596
  return {
8011
- /*n_tokens =*/ n_tokens,
8012
- /*tokens =*/ tokens,
8013
- /*embd =*/ nullptr,
8014
- /*pos =*/ nullptr,
8015
- /*seq_id =*/ nullptr,
8016
- /*logits =*/ nullptr,
8017
- /*all_pos_0 =*/ pos_0,
8018
- /*all_pos_1 =*/ 1,
8019
- /*all_seq_id =*/ seq_id,
9597
+ /*n_tokens =*/ n_tokens,
9598
+ /*tokens =*/ tokens,
9599
+ /*embd =*/ nullptr,
9600
+ /*pos =*/ nullptr,
9601
+ /*n_seq_id =*/ nullptr,
9602
+ /*seq_id =*/ nullptr,
9603
+ /*logits =*/ nullptr,
9604
+ /*all_pos_0 =*/ pos_0,
9605
+ /*all_pos_1 =*/ 1,
9606
+ /*all_seq_id =*/ seq_id,
8020
9607
  };
8021
9608
  }
8022
9609
 
8023
- struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
8024
- llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
9610
+ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
9611
+ llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
8025
9612
 
8026
9613
  if (embd) {
8027
9614
  batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@@ -8029,19 +9616,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
8029
9616
  batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
8030
9617
  }
8031
9618
 
8032
- batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
8033
- batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
8034
- batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
9619
+ batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
9620
+ batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
9621
+ batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
9622
+ for (int i = 0; i < n_tokens; ++i) {
9623
+ batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
9624
+ }
9625
+ batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
8035
9626
 
8036
9627
  return batch;
8037
9628
  }
8038
9629
 
8039
9630
  void llama_batch_free(struct llama_batch batch) {
8040
- if (batch.token) free(batch.token);
8041
- if (batch.embd) free(batch.embd);
8042
- if (batch.pos) free(batch.pos);
8043
- if (batch.seq_id) free(batch.seq_id);
8044
- if (batch.logits) free(batch.logits);
9631
+ if (batch.token) free(batch.token);
9632
+ if (batch.embd) free(batch.embd);
9633
+ if (batch.pos) free(batch.pos);
9634
+ if (batch.n_seq_id) free(batch.n_seq_id);
9635
+ if (batch.seq_id) {
9636
+ for (int i = 0; i < batch.n_tokens; ++i) {
9637
+ free(batch.seq_id[i]);
9638
+ }
9639
+ free(batch.seq_id);
9640
+ }
9641
+ if (batch.logits) free(batch.logits);
8045
9642
  }
8046
9643
 
8047
9644
  int llama_decode(
@@ -8106,15 +9703,15 @@ llama_token llama_token_eot(const struct llama_context * ctx) {
8106
9703
  return ctx->model.vocab.special_eot_id;
8107
9704
  }
8108
9705
 
8109
-
8110
9706
  int llama_tokenize(
8111
9707
  const struct llama_model * model,
8112
9708
  const char * text,
8113
9709
  int text_len,
8114
9710
  llama_token * tokens,
8115
9711
  int n_max_tokens,
8116
- bool add_bos) {
8117
- auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
9712
+ bool add_bos,
9713
+ bool special) {
9714
+ auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
8118
9715
 
8119
9716
  if (n_max_tokens < (int) res.size()) {
8120
9717
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -8166,7 +9763,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
8166
9763
  buf[0] = llama_token_to_byte(model->vocab, token);
8167
9764
  return 1;
8168
9765
  } else {
8169
- GGML_ASSERT(false);
9766
+ // TODO: for now we accept all unsupported token types,
9767
+ // suppressing them like CONTROL tokens.
9768
+ // GGML_ASSERT(false);
8170
9769
  }
8171
9770
  break;
8172
9771
  }
@@ -8182,7 +9781,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
8182
9781
  } else if (llama_is_control_token(model->vocab, token)) {
8183
9782
  ;
8184
9783
  } else {
8185
- GGML_ASSERT(false);
9784
+ // TODO: for now we accept all unsupported token types,
9785
+ // suppressing them like CONTROL tokens.
9786
+ // GGML_ASSERT(false);
8186
9787
  }
8187
9788
  break;
8188
9789
  }