llama_cpp 0.7.0 → 0.7.1

@@ -186,7 +186,9 @@ enum llm_arch {
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
  LLM_ARCH_STARCODER,
+ LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
+ LLM_ARCH_BLOOM,
  LLM_ARCH_UNKNOWN,
  };

@@ -199,7 +201,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_MPT, "mpt" },
  { LLM_ARCH_BAICHUAN, "baichuan" },
  { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_PERSIMMON, "persimmon" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BLOOM, "bloom" },
  };

  enum llm_kv {
@@ -302,6 +306,7 @@ struct LLM_KV {

  enum llm_tensor {
  LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
  LLM_TENSOR_POS_EMBD,
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
@@ -318,6 +323,8 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
  LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
  };

  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -399,10 +406,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PERSIMMON,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
+ { LLM_TENSOR_OUTPUT, "output"},
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
+ },
+ },
  {
  LLM_ARCH_MPT,
  {
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  {
@@ -437,6 +469,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_BLOOM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -954,6 +1001,7 @@ enum e_model {
  MODEL_1B,
  MODEL_3B,
  MODEL_7B,
+ MODEL_8B,
  MODEL_13B,
  MODEL_15B,
  MODEL_30B,
@@ -984,6 +1032,9 @@ struct llama_hparams {
  float rope_freq_base_train;
  float rope_freq_scale_train;

+ float f_clamp_kqv;
+ float f_max_alibi_bias;
+
  bool operator!=(const llama_hparams & other) const {
  if (this->vocab_only != other.vocab_only) return true;
  if (this->n_vocab != other.n_vocab) return true;
@@ -1036,6 +1087,10 @@ struct llama_layer {
  struct ggml_tensor * attn_norm_b;
  struct ggml_tensor * attn_norm_2;
  struct ggml_tensor * attn_norm_2_b;
+ struct ggml_tensor * attn_q_norm;
+ struct ggml_tensor * attn_q_norm_b;
+ struct ggml_tensor * attn_k_norm;
+ struct ggml_tensor * attn_k_norm_b;

  // attention
  struct ggml_tensor * wq;
@@ -1077,6 +1132,9 @@ struct llama_kv_cell {
  struct llama_kv_cache {
  bool has_shift = false;

+ // Note: The value of head isn't only used to optimize searching
+ // for a free KV slot. llama_decode_internal also uses it, so it
+ // cannot be freely changed after a slot has been allocated.
  uint32_t head = 0;
  uint32_t size = 0;

@@ -1162,6 +1220,8 @@ struct llama_model {

  struct ggml_tensor * tok_embeddings;
  struct ggml_tensor * pos_embeddings;
+ struct ggml_tensor * tok_norm;
+ struct ggml_tensor * tok_norm_b;

  struct ggml_tensor * output_norm;
  struct ggml_tensor * output_norm_b;
@@ -1291,7 +1351,11 @@ static bool llama_kv_cache_init(
  cache.cells.clear();
  cache.cells.resize(n_ctx);

+ // TODO: this should be:
+ // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+ // change it and test that it works
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+ memset(cache.buf.data, 0, cache.buf.size);

  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
@@ -1334,6 +1398,8 @@ static bool llama_kv_cache_init(

  // find an empty slot of size "n_tokens" in the cache
  // updates the cache head
+ // Note: On success, it's important that cache.head points
+ // to the first cell of the slot.
  static bool llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_batch & batch) {
@@ -1349,8 +1415,8 @@ static bool llama_kv_cache_find_slot(

  while (true) {
  if (cache.head + n_tokens > n_ctx) {
+ n_tested += n_ctx - cache.head;
  cache.head = 0;
- n_tested += n_ctx - cache.head;
  continue;
  }

@@ -1401,6 +1467,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  }
+
+ // Searching for a free slot can start here since we know it will be empty.
+ cache.head = uint32_t(c0);
  }

  static void llama_kv_cache_seq_rm(
@@ -1408,6 +1477,8 @@ static void llama_kv_cache_seq_rm(
  llama_seq_id seq_id,
  llama_pos p0,
  llama_pos p1) {
+ uint32_t new_head = cache.size;
+
  if (p0 < 0) p0 = 0;
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -1416,9 +1487,13 @@ static void llama_kv_cache_seq_rm(
  cache.cells[i].seq_id.erase(seq_id);
  if (cache.cells[i].seq_id.empty()) {
  cache.cells[i].pos = -1;
+ if (new_head == cache.size) new_head = i;
  }
  }
  }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cache.size) cache.head = new_head;
  }

  static void llama_kv_cache_seq_cp(
@@ -1430,6 +1505,8 @@ static void llama_kv_cache_seq_cp(
  if (p0 < 0) p0 = 0;
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

+ cache.head = 0;
+
  for (uint32_t i = 0; i < cache.size; ++i) {
  if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1438,12 +1515,18 @@ static void llama_kv_cache_seq_cp(
  }

  static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+ uint32_t new_head = cache.size;
+
  for (uint32_t i = 0; i < cache.size; ++i) {
  if (!cache.cells[i].has_seq_id(seq_id)) {
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
+ if (new_head == cache.size) new_head = i;
  }
  }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cache.size) cache.head = new_head;
  }

  static void llama_kv_cache_seq_shift(
@@ -1452,6 +1535,8 @@ static void llama_kv_cache_seq_shift(
  llama_pos p0,
  llama_pos p1,
  llama_pos delta) {
+ uint32_t new_head = cache.size;
+
  if (p0 < 0) p0 = 0;
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -1461,12 +1546,17 @@ static void llama_kv_cache_seq_shift(
  if (cache.cells[i].pos < 0) {
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
+ if (new_head == cache.size) new_head = i;
  } else {
  cache.has_shift = true;
  cache.cells[i].delta = delta;
  }
  }
  }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ // Otherwise we just start the next search from the beginning.
+ cache.head = new_head != cache.size ? new_head : 0;
  }

  //
@@ -1670,7 +1760,7 @@ struct llama_model_loader {
  }
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
  if (backend != GGML_BACKEND_CPU) {
  ggml_set_no_alloc(ctx, true);
  }
@@ -1688,7 +1778,7 @@ struct llama_model_loader {
  return tensor;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

  if (cur == NULL) {
@@ -1867,6 +1957,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_1B: return "1B";
  case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
  case MODEL_13B: return "13B";
  case MODEL_15B: return "15B";
  case MODEL_30B: return "30B";
@@ -1979,6 +2070,14 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_PERSIMMON:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+ switch (hparams.n_layer) {
+ case 36: model.type = e_model::MODEL_8B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_REFACT:
  {
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -1987,6 +2086,33 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_BLOOM:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 30:
+ switch (hparams.n_embd) {
+ case 2560: model.type = e_model::MODEL_3B; break;
+ case 4096: model.type = e_model::MODEL_7B; break;
+ } break;
+ }
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ hparams.f_clamp_kqv = 0.0f;
+
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+ GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
+ GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 48: model.type = e_model::MODEL_30B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }

@@ -2131,6 +2257,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2230,8 +2358,8 @@ static void llm_load_tensors(

  // output
  {
- ggml_backend backend_norm;
- ggml_backend backend_output;
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;

  if (n_gpu_layers > int(n_layer)) {
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2266,8 +2394,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

  auto & layer = model.layers[i];

@@ -2296,8 +2424,8 @@ static void llm_load_tensors(
  {
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  {
- ggml_backend backend_norm;
- ggml_backend backend_output;
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;

  if (n_gpu_layers > int(n_layer)) {
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2332,8 +2460,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

  auto & layer = model.layers[i];

@@ -2366,8 +2494,8 @@ static void llm_load_tensors(

  // output
  {
- ggml_backend backend_norm;
- ggml_backend backend_output;
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;

  if (n_gpu_layers > int(n_layer)) {
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2404,8 +2532,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

  auto & layer = model.layers[i];

@@ -2443,8 +2571,8 @@ static void llm_load_tensors(

  // output
  {
- ggml_backend backend_norm;
- ggml_backend backend_output;
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;

  if (n_gpu_layers > int(n_layer)) {
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2481,8 +2609,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);

  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

  auto & layer = model.layers[i];

@@ -2515,6 +2643,216 @@ static void llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_PERSIMMON:
+ {
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ vram_weights += ggml_nbytes(model.output_norm_b);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+ const int i_gpu_start = n_layer - n_gpu_layers;
+ model.layers.resize(n_layer);
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+ auto & layer = model.layers[i];
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+ layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
+ layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
+ layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
+ layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
+ }
+ } break;
+ case LLM_ARCH_BLOOM:
+ {
+ // TODO: CPU-only for now
+
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
+ model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ vram_weights += ggml_nbytes(model.output_norm_b);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
+ }
+ }
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+ #ifndef _WIN32
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
+ #else
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ #endif // _WIN32
+
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) +
+ ggml_nbytes(layer.wqkv) +
+ ggml_nbytes(layer.wo) +
+ ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.w2) +
+ ggml_nbytes(layer.w3);
+ }
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -2624,8 +2962,8 @@ static bool llama_model_load(
  }

  static struct ggml_cgraph * llm_build_llama(
- llama_context & lctx,
- const llama_batch & batch) {
+ llama_context & lctx,
+ const llama_batch & batch) {
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;
  const auto & cparams = lctx.cparams;
@@ -2663,11 +3001,9 @@ static struct ggml_cgraph * llm_build_llama(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };

- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);

  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3051,11 +3387,9 @@ static struct ggml_cgraph * llm_build_baichaun(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };

- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);

  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3452,11 +3786,9 @@ static struct ggml_cgraph * llm_build_refact(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };

- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);

  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3806,11 +4138,9 @@ static struct ggml_cgraph * llm_build_falcon(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };

- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);

  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4166,11 +4496,9 @@ static struct ggml_cgraph * llm_build_starcoder(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };

- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);

  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4381,19 +4709,975 @@ static struct ggml_cgraph * llm_build_starcoder(
4381
4709
  return gf;
4382
4710
  }
4383
4711
 
4384
- static struct ggml_cgraph * llama_build_graph(
4712
+ static struct ggml_cgraph * llm_build_persimmon(
4385
4713
  llama_context & lctx,
4386
4714
  const llama_batch & batch) {
4387
4715
  const auto & model = lctx.model;
4716
+ const auto & hparams = model.hparams;
4388
4717
 
4389
- struct ggml_cgraph * result = NULL;
4718
+ const auto & kv_self = lctx.kv_self;
4390
4719
 
4391
- switch (model.arch) {
4392
- case LLM_ARCH_LLAMA:
4393
- {
4394
- result = llm_build_llama(lctx, batch);
4395
- } break;
4396
- case LLM_ARCH_BAICHUAN:
4720
+ GGML_ASSERT(!!kv_self.ctx);
4721
+
4722
+ const auto & cparams = lctx.cparams;
4723
+ const int64_t n_embd = hparams.n_embd;
4724
+ const int64_t n_layer = hparams.n_layer;
4725
+ const int64_t n_ctx = cparams.n_ctx;
4726
+ const int64_t n_head_kv = hparams.n_head_kv;
4727
+ const int64_t n_head = hparams.n_head;
4728
+ const int64_t n_embd_head = hparams.n_embd_head();
4729
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
4730
+ const size_t n_rot = n_embd_head / 2;
4731
+
4732
+ const float freq_base = cparams.rope_freq_base;
4733
+ const float freq_scale = cparams.rope_freq_scale;
4734
+ const float norm_eps = hparams.f_norm_eps;
4735
+
4736
+ const int n_gpu_layers = model.n_gpu_layers;
4737
+
4738
+
4739
+ const int32_t n_tokens = batch.n_tokens;
4740
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4741
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
4742
+
4743
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
4744
+
4745
+ auto & buf_compute = lctx.buf_compute;
4746
+ struct ggml_init_params params = {
4747
+ /*.mem_size =*/ buf_compute.size,
4748
+ /*.mem_buffer =*/ buf_compute.data,
4749
+ /*.no_alloc =*/ true,
4750
+ };
4751
+
4752
+ struct ggml_context * ctx0 = ggml_init(params);
4753
+
4754
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
4755
+
4756
+ struct ggml_tensor * cur;
4757
+ struct ggml_tensor * inpL;
4758
+
4759
+ if (batch.token) {
4760
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4761
+
4762
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
4763
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4764
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
4765
+ }
4766
+ ggml_set_name(inp_tokens, "inp_tokens");
4767
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4768
+ } else {
4769
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4770
+ ggml_allocr_alloc(lctx.alloc, inpL);
4771
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4772
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
4773
+ }
4774
+ }
4775
+ const int i_gpu_start = n_layer - n_gpu_layers;
4776
+ (void) i_gpu_start;
4777
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
4778
+ offload_func_t offload_func_kq = llama_nop;
4779
+ offload_func_t offload_func_v = llama_nop;
4780
+ // KQ_scale
4781
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4782
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4783
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4784
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
4785
+ }
4786
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4787
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4788
+ offload_func_kq(KQ_mask);
4789
+ ggml_set_name(KQ_mask, "KQ_mask");
4790
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4791
+
4792
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4793
+ float * data = (float *) KQ_mask->data;
4794
+ memset(data, 0, ggml_nbytes(KQ_mask));
4795
+ for (int h = 0; h < 1; ++h) {
4796
+ for (int j = 0; j < n_tokens; ++j) {
4797
+ const llama_pos pos = batch.pos[j];
4798
+ const llama_seq_id seq_id = batch.seq_id[j];
4799
+ for (int i = 0; i < n_kv; ++i) {
4800
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4801
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4802
+ }
4803
+ }
4804
+ }
4805
+ }
4806
+ }
4807
+
4808
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4809
+ offload_func_kq(KQ_pos);
4810
+ ggml_set_name(KQ_pos, "KQ_pos");
4811
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
4812
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4813
+ int * data = (int *) KQ_pos->data;
4814
+ for (int i = 0; i < n_tokens; ++i) {
4815
+ data[i] = batch.pos[i];
4816
+ }
4817
+ }
4818
+ if (do_rope_shift) {
4819
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4820
+ offload_func_kq(K_shift);
4821
+ ggml_set_name(K_shift, "K_shift");
4822
+ ggml_allocr_alloc(lctx.alloc, K_shift);
4823
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4824
+ int * data = (int *) K_shift->data;
4825
+ for (int i = 0; i < n_ctx; ++i) {
4826
+ data[i] = kv_self.cells[i].delta;
4827
+ }
4828
+ }
4829
+ for (int il = 0; il < n_layer; ++il) {
4830
+ struct ggml_tensor * tmp =
4831
+ // we rotate only the first n_rot dimensions.
4832
+ ggml_rope_custom_inplace(ctx0,
4833
+ ggml_view_3d(ctx0, kv_self.k,
4834
+ n_rot, n_head, n_ctx,
4835
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4836
+ ggml_element_size(kv_self.k)*n_embd_head,
4837
+ ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
4838
+ ),
4839
+ K_shift, n_rot, 2, 0, freq_base, freq_scale);
4840
+ offload_func_kq(tmp);
4841
+ ggml_build_forward_expand(gf, tmp);
4842
+ }
4843
+ }
4844
+ for (int il=0; il < n_layer; ++il) {
4845
+ struct ggml_tensor * residual = inpL;
4846
+ offload_func_t offload_func = llama_nop;
4847
+ {
4848
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4849
+ offload_func(cur);
4850
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
4851
+ offload_func(cur);
4852
+ cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
4853
+ offload_func(cur);
4854
+ ggml_format_name(cur, "input_layernorm_%d", il);
4855
+ }
4856
+ // self attention
4857
+ {
4858
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
4859
+ offload_func_kq(cur);
4860
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
4861
+ offload_func_kq(cur);
4862
+
4863
+ // split qkv
4864
+ GGML_ASSERT(n_head_kv == n_head);
4865
+ ggml_set_name(cur, format("qkv_%d", il).c_str());
4866
+ struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
4867
+ offload_func_kq(tmpqkv);
4868
+ struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
4869
+ offload_func_kq(tmpqkv_perm);
4870
+ ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
4871
+ struct ggml_tensor * tmpq = ggml_view_3d(
4872
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4873
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4874
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4875
+ 0
4876
+ );
4877
+ offload_func_kq(tmpq);
4878
+ struct ggml_tensor * tmpk = ggml_view_3d(
4879
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4880
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4881
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4882
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
4883
+ );
4884
+ offload_func_kq(tmpk);
4885
+ // Q/K Layernorm
4886
+ tmpq = ggml_norm(ctx0, tmpq, norm_eps);
4887
+ offload_func_kq(tmpq);
4888
+ tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
4889
+ offload_func_kq(tmpq);
4890
+ tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
4891
+ offload_func_kq(tmpq);
4892
+
4893
+ tmpk = ggml_norm(ctx0, tmpk, norm_eps);
4894
+ offload_func_v(tmpk);
4895
+ tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
4896
+ offload_func_v(tmpk);
4897
+ tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
4898
+ offload_func_v(tmpk);
4899
+
4900
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
4901
+ struct ggml_tensor * qrot = ggml_view_3d(
4902
+ ctx0, tmpq, n_rot, n_head, n_tokens,
4903
+ ggml_element_size(tmpq) * n_embd_head,
4904
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4905
+ 0
4906
+ );
4907
+ offload_func_kq(qrot);
4908
+ ggml_format_name(qrot, "qrot_%d", il);
4909
+ struct ggml_tensor * krot = ggml_view_3d(
4910
+ ctx0, tmpk, n_rot, n_head, n_tokens,
4911
+ ggml_element_size(tmpk) * n_embd_head,
4912
+ ggml_element_size(tmpk) * n_embd_head * n_head,
4913
+ 0
4914
+ );
4915
+ offload_func_kq(krot);
4916
+ ggml_format_name(krot, "krot_%d", il);
4917
+
4918
+ // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
4919
+ struct ggml_tensor * qpass = ggml_view_3d(
4920
+ ctx0, tmpq, n_rot, n_head, n_tokens,
4921
+ ggml_element_size(tmpq) * n_embd_head,
4922
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4923
+ ggml_element_size(tmpq) * n_rot
4924
+ );
4925
+ offload_func_kq(qpass);
4926
+ ggml_format_name(qpass, "qpass_%d", il);
4927
+ struct ggml_tensor * kpass = ggml_view_3d(
4928
+ ctx0, tmpk, n_rot, n_head, n_tokens,
4929
+ ggml_element_size(tmpk) * n_embd_head,
4930
+ ggml_element_size(tmpk) * n_embd_head * n_head,
4931
+ ggml_element_size(tmpk) * n_rot
4932
+ );
4933
+ offload_func_kq(kpass);
4934
+ ggml_format_name(kpass, "kpass_%d", il);
4935
+
4936
+ struct ggml_tensor * qrotated = ggml_rope_custom(
4937
+ ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
4938
+ );
4939
+ offload_func_kq(qrotated);
4940
+ struct ggml_tensor * krotated = ggml_rope_custom(
4941
+ ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
4942
+ );
4943
+ offload_func_kq(krotated);
4944
+ // ggml currently only supports concatenation on dim=2
4945
+ // so we need to permute qrot, qpass, concat, then permute back.
4946
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
4947
+ offload_func_kq(qrotated);
4948
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
4949
+ offload_func_kq(krotated);
4950
+
4951
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
4952
+ offload_func_kq(qpass);
4953
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
4954
+ offload_func_kq(kpass);
4955
+
4956
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
4957
+ offload_func_kq(Qcur);
4958
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
4959
+ offload_func_kq(Kcur);
4960
+
4961
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
4962
+ offload_func_kq(Q);
4963
+
4964
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
4965
+ offload_func_kq(Kcur);
4966
+ {
4967
+ struct ggml_tensor * tmpv = ggml_view_3d(
4968
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4969
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4970
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4971
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
4972
+ );
4973
+ offload_func_v(tmpv);
4974
+ // store K, V in cache
4975
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
4976
+ offload_func_v(Vcur);
4977
+ ggml_set_name(Vcur, "Vcur");
4978
+
4979
+ struct ggml_tensor * k = ggml_view_1d(
4980
+ ctx0, kv_self.k, n_tokens*n_embd_gqa,
4981
+ (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
4982
+ );
4983
+ offload_func_kq(k);
4984
+ ggml_set_name(k, "k");
4985
+
4986
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4987
+ ( n_ctx)*ggml_element_size(kv_self.v),
4988
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4989
+ offload_func_v(v);
4990
+ ggml_set_name(v, "v");
4991
+
4992
+ // important: storing RoPE-ed version of K in the KV cache!
4993
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4994
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4995
+ }
4996
+ struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
4997
+ n_embd_head, n_kv, n_head_kv,
4998
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4999
+ ggml_element_size(kv_self.k)*n_embd_head,
5000
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5001
+
5002
+ offload_func_kq(K);
5003
+ ggml_format_name(K, "K_%d", il);
5004
+
5005
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5006
+ offload_func_kq(KQ);
5007
+ ggml_set_name(KQ, "KQ");
5008
+
5009
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5010
+ offload_func_kq(KQ_scaled);
5011
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5012
+
5013
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5014
+ offload_func_kq(KQ_masked);
5015
+ ggml_set_name(KQ_masked, "KQ_masked");
5016
+
5017
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5018
+ offload_func_kq(KQ_soft_max);
5019
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5020
+
5021
+ struct ggml_tensor * V =
5022
+ ggml_view_3d(ctx0, kv_self.v,
5023
+ n_kv, n_embd_head, n_head_kv,
5024
+ ggml_element_size(kv_self.v)*n_ctx,
5025
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5026
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5027
+ offload_func_v(V);
5028
+ ggml_set_name(V, "V");
5029
+
5030
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5031
+ offload_func_v(KQV);
5032
+ ggml_set_name(KQV, "KQV");
5033
+
5034
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5035
+ offload_func_v(KQV_merged);
5036
+ ggml_set_name(KQV_merged, "KQV_merged");
5037
+
5038
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5039
+ offload_func_v(cur);
5040
+ ggml_set_name(cur, "KQV_merged_contiguous");
5041
+
5042
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5043
+ offload_func(cur);
5044
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
5045
+ offload_func(cur);
5046
+ ggml_set_name(cur, "result_wo");
5047
+ }
5048
+
5049
+ struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
5050
+ offload_func(inpFF);
5051
+ ggml_set_name(inpFF, "inpFF");
5052
+ {
5053
+ // MLP
5054
+ {
5055
+ // Norm
5056
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5057
+ offload_func(cur);
5058
+ cur = ggml_add(ctx0,
5059
+ ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
5060
+ model.layers[il].ffn_norm_b
5061
+ );
5062
+ ggml_set_name(cur, "ffn_norm");
5063
+ offload_func(cur);
5064
+ }
5065
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5066
+ offload_func(cur);
5067
+
5068
+ cur = ggml_add(ctx0, cur, model.layers[il].b3);
5069
+ offload_func(cur);
5070
+ ggml_set_name(cur, "result_ffn_up");
5071
+
5072
+ cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
5073
+ ggml_set_name(cur, "result_ffn_act");
5074
+ offload_func(cur);
5075
+ offload_func(cur->src[0]);
5076
+
5077
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5078
+ offload_func(cur);
5079
+ cur = ggml_add(ctx0,
5080
+ cur,
5081
+ model.layers[il].b2);
5082
+ offload_func(cur);
5083
+ ggml_set_name(cur, "outFF");
5084
+ }
5085
+ cur = ggml_add(ctx0, cur, inpFF);
5086
+ offload_func(cur);
5087
+ ggml_set_name(cur, "inpFF_+_outFF");
5088
+ inpL = cur;
5089
+ }
5090
+ cur = inpL;
5091
+ {
5092
+ cur = ggml_norm(ctx0, cur, norm_eps);
5093
+ offload_func_nr(cur);
5094
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5095
+ offload_func_nr(cur);
5096
+
5097
+ cur = ggml_add(ctx0, cur, model.output_norm_b);
5098
+ // offload_func_nr(cur);
5099
+
5100
+ ggml_set_name(cur, "result_norm");
5101
+ }
5102
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5103
+ ggml_set_name(cur, "result_output");
5104
+ ggml_build_forward_expand(gf, cur);
5105
+ ggml_free(ctx0);
5106
+ return gf;
5107
+ }
5108
+
5109
+ static struct ggml_cgraph * llm_build_bloom(
5110
+ llama_context & lctx,
5111
+ const llama_batch & batch) {
5112
+ const auto & model = lctx.model;
5113
+ const auto & hparams = model.hparams;
5114
+ const auto & cparams = lctx.cparams;
5115
+
5116
+ const auto & kv_self = lctx.kv_self;
5117
+
5118
+ GGML_ASSERT(!!kv_self.ctx);
5119
+
5120
+ const int64_t n_embd = hparams.n_embd;
5121
+ const int64_t n_layer = hparams.n_layer;
5122
+ const int64_t n_ctx = cparams.n_ctx;
5123
+ const int64_t n_head = hparams.n_head;
5124
+ const int64_t n_head_kv = hparams.n_head_kv;
5125
+ const int64_t n_embd_head = hparams.n_embd_head();
5126
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5127
+
5128
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5129
+
5130
+ const float norm_eps = hparams.f_norm_eps;
5131
+
5132
+ const int32_t n_tokens = batch.n_tokens;
5133
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5134
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5135
+
5136
+ auto & buf_compute = lctx.buf_compute;
5137
+
5138
+ struct ggml_init_params params = {
5139
+ /*.mem_size =*/ buf_compute.size,
5140
+ /*.mem_buffer =*/ buf_compute.data,
5141
+ /*.no_alloc =*/ false,
5142
+ };
5143
+
5144
+ params.no_alloc = true;
5145
+
5146
+ struct ggml_context * ctx0 = ggml_init(params);
5147
+
5148
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5149
+
5150
+ struct ggml_tensor * cur;
5151
+ struct ggml_tensor * token;
5152
+ struct ggml_tensor * inpL;
5153
+
5154
+ if (batch.token) {
5155
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5156
+
5157
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5158
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5159
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5160
+ }
5161
+ ggml_set_name(inp_tokens, "inp_tokens");
5162
+
5163
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5164
+ } else {
5165
+ #ifdef GGML_USE_MPI
5166
+ GGML_ASSERT(false && "not implemented");
5167
+ #endif
5168
+
5169
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5170
+
5171
+ ggml_allocr_alloc(lctx.alloc, token);
5172
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5173
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5174
+ }
5175
+ }
5176
+
5177
+ // KQ_scale
5178
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5179
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5180
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5181
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5182
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5183
+ }
5184
+
5185
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5186
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5187
+ ggml_set_name(KQ_mask, "KQ_mask");
5188
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5189
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5190
+ float * data = (float *) KQ_mask->data;
5191
+ memset(data, 0, ggml_nbytes(KQ_mask));
5192
+
5193
+ for (int h = 0; h < 1; ++h) {
5194
+ for (int j = 0; j < n_tokens; ++j) {
5195
+ const llama_pos pos = batch.pos[j];
5196
+ const llama_seq_id seq_id = batch.seq_id[j];
5197
+
5198
+ for (int i = 0; i < n_kv; ++i) {
5199
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5200
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5201
+ }
5202
+ }
5203
+ }
5204
+ }
5205
+ }
5206
+
5207
+ // norm
5208
+ {
5209
+ inpL = ggml_norm(ctx0, token, norm_eps);
5210
+ inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5211
+ }
5212
+
5213
+ ggml_set_name(inpL, "inpL");
5214
+
5215
+ for (int il = 0; il < n_layer; ++il) {
5216
+ {
5217
+ // Norm
5218
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5219
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5220
+ }
5221
+
5222
+ {
5223
+ // Self Attention
5224
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5225
+
5226
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5227
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5228
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5229
+
5230
+ struct ggml_tensor * Qcur = tmpq;
5231
+ struct ggml_tensor * Kcur = tmpk;
5232
+
5233
+ // store key and value to memory
5234
+ {
5235
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5236
+ ggml_set_name(Vcur, "Vcur");
5237
+
5238
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5239
+ ggml_set_name(k, "k");
5240
+
5241
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5242
+ ( n_ctx)*ggml_element_size(kv_self.v),
5243
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5244
+
5245
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5246
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5247
+ }
5248
+
5249
+ struct ggml_tensor * Q =
5250
+ ggml_permute(ctx0,
5251
+ ggml_cpy(ctx0,
5252
+ Qcur,
5253
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5254
+ 0, 2, 1, 3);
5255
+ ggml_set_name(Q, "Q");
5256
+
5257
+ struct ggml_tensor * K =
5258
+ ggml_view_3d(ctx0, kv_self.k,
5259
+ n_embd_head, n_kv, n_head_kv,
5260
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5261
+ ggml_element_size(kv_self.k)*n_embd_head,
5262
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5263
+ ggml_set_name(K, "K");
5264
+
5265
+ // K * Q
5266
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5267
+ ggml_set_name(KQ, "KQ");
5268
+
5269
+ // KQ_scaled = KQ / sqrt(n_embd_head)
5270
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5271
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5272
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5273
+
5274
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5275
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5276
+
5277
+ // KQ_masked = mask_past(KQ_scaled)
5278
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5279
+ ggml_set_name(KQ_masked, "KQ_masked");
5280
+
5281
+ // KQ = soft_max(KQ_masked)
5282
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5283
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5284
+
5285
+ // split cached V into n_head heads
5286
+ struct ggml_tensor * V =
5287
+ ggml_view_3d(ctx0, kv_self.v,
5288
+ n_kv, n_embd_head, n_head_kv,
5289
+ ggml_element_size(kv_self.v)*n_ctx,
5290
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5291
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5292
+ ggml_set_name(V, "V");
5293
+
5294
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5295
+ ggml_set_name(KQV, "KQV");
5296
+
5297
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
5298
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5299
+ ggml_set_name(KQV_merged, "KQV_merged");
5300
+
5301
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5302
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5303
+ ggml_set_name(cur, "KQV_merged_contiguous");
5304
+ }
5305
+
5306
+ // Projection
5307
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5308
+
5309
+ // Add the input
5310
+ cur = ggml_add(ctx0, cur, inpL);
5311
+
5312
+ struct ggml_tensor * inpFF = cur;
5313
+
5314
+ // FF
5315
+ {
5316
+ // Norm
5317
+ {
5318
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5319
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5320
+ }
5321
+
5322
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5323
+
5324
+ // GELU activation
5325
+ cur = ggml_gelu(ctx0, cur);
5326
+
5327
+ // Projection
5328
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5329
+ }
5330
+
5331
+ inpL = ggml_add(ctx0, cur, inpFF);
5332
+ }
5333
+
5334
+ // Output Norm
5335
+ {
5336
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5337
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5338
+ }
5339
+ ggml_set_name(cur, "result_norm");
5340
+
5341
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5342
+ ggml_set_name(cur, "result_output");
5343
+
5344
+ ggml_build_forward_expand(gf, cur);
5345
+
5346
+ ggml_free(ctx0);
5347
+
5348
+ return gf;
5349
+ }
5350
+
+ static struct ggml_cgraph * llm_build_mpt(
+ llama_context & lctx,
+ const llama_batch & batch) {
+ const auto & model = lctx.model;
+ const auto & hparams = model.hparams;
+ const auto & cparams = lctx.cparams;
+
+ const auto & kv_self = lctx.kv_self;
+
+ GGML_ASSERT(!!kv_self.ctx);
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_ctx = cparams.n_ctx;
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
+ const int64_t n_embd_head = hparams.n_embd_head();
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+ const float norm_eps = hparams.f_norm_eps;
+ const float clamp_kqv = hparams.f_clamp_kqv;
+ const float max_alibi_bias = hparams.f_max_alibi_bias;
+
+ const int n_gpu_layers = model.n_gpu_layers;
+
+ const int32_t n_tokens = batch.n_tokens;
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+ auto & buf_compute = lctx.buf_compute;
+
+ struct ggml_init_params params = {
+ /*.mem_size =*/ buf_compute.size,
+ /*.mem_buffer =*/ buf_compute.data,
+ /*.no_alloc =*/ false,
+ };
+
+ params.no_alloc = true;
+
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ //int warmup = 0;
+ if (batch.token) {
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+ //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
+ }
+
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+ } else {
+ #ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+ #endif
+
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+ ggml_allocr_alloc(lctx.alloc, inpL);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+ }
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+ (void) i_gpu_start;
+
+ // offload functions set the tensor output backend to GPU
+ // tensors are GPU-accelerated if any input or the output has been offloaded
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+ offload_func_t offload_func_kq = llama_nop;
+ offload_func_t offload_func_v = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+ }
+ #endif // GGML_USE_CUBLAS
+
+ // KQ_scale
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ }
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ offload_func_kq(KQ_mask);
+ ggml_set_name(KQ_mask, "KQ_mask");
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ float * data = (float *) KQ_mask->data;
+ memset(data, 0, ggml_nbytes(KQ_mask));
+
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ const llama_pos pos = batch.pos[j];
+ const llama_seq_id seq_id = batch.seq_id[j];
+
+ for (int i = 0; i < n_kv; ++i) {
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+ }
+ }
+ }
+ }
+ }
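The loop above fills KQ_mask once per graph for a single head (it is broadcast to all heads): a key cell is blocked with -INFINITY when it belongs to a different sequence or sits at a later position than the query token. Below is a standalone sketch of the same rule, using a simplified kv_cell stand-in rather than the real llama.cpp cache types.

    // Illustrative sketch (not from the package): building the additive attention
    // mask the same way the loop above does.
    #include <cmath>
    #include <set>
    #include <vector>

    struct kv_cell {
        int pos = -1;
        std::set<int> seq_ids;
        bool has_seq_id(int id) const { return seq_ids.count(id) > 0; }
    };

    // mask is n_tokens x n_kv, row-major; 0 = visible, -INFINITY = blocked
    std::vector<float> build_kq_mask(const std::vector<kv_cell> & cells,
                                     const std::vector<int> & pos,      // batch.pos
                                     const std::vector<int> & seq_id) { // batch.seq_id
        const size_t n_kv     = cells.size();
        const size_t n_tokens = pos.size();
        std::vector<float> mask(n_tokens*n_kv, 0.0f);
        for (size_t j = 0; j < n_tokens; ++j) {
            for (size_t i = 0; i < n_kv; ++i) {
                // block cells from other sequences and cells "in the future"
                if (!cells[i].has_seq_id(seq_id[j]) || cells[i].pos > pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }
        return mask;
    }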
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * attn_norm;
+
+ offload_func_t offload_func = llama_nop;
+
+ #ifdef GGML_USE_CUBLAS
+ if (il >= i_gpu_start) {
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
+ }
+ #endif // GGML_USE_CUBLAS
+
+ // self-attention
+ // TODO: refactor into common function (shared with LLaMA)
+ {
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+ offload_func(attn_norm);
+
+ attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
+ offload_func(attn_norm);
+
+ if (1) {
+ cur = attn_norm;
+ }
+
+ // compute QKV
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ offload_func_kq(cur);
+
+ if (clamp_kqv > 0.0f) {
+ cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+ offload_func_kq(cur);
+ }
+
+ const size_t wsize = ggml_type_size(cur->type);
+
+ struct ggml_tensor * Qcur = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head, n_tokens,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ 0);
+ offload_func_kq(Qcur);
+
+ struct ggml_tensor * Kcur = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ wsize * n_embd_head * n_head);
+ offload_func_kq(Kcur);
+
+ struct ggml_tensor * tmpv = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ wsize * n_embd_head * (n_head + n_head_kv));
+ offload_func_kq(Kcur);
+
+ ggml_set_name(Qcur, "Qcur");
+ ggml_set_name(Kcur, "Kcur");
+
+ {
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+ offload_func_v(Vcur);
+ offload_func_v(Vcur->src[0]->src[0]);
+ ggml_set_name(Vcur, "Vcur");
+
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+ offload_func_kq(k);
+ ggml_set_name(k, "k");
+
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+ offload_func_v(v);
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ }
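Above, the fused wqkv product is never copied apart: Qcur, Kcur, and tmpv are strided views into the same rows, offset by whole head groups. A small sketch of those element offsets follows; the struct and helper names are hypothetical, only the arithmetic matches the ggml_view_3d strides.

    // Illustrative sketch (not from the package): how the fused QKV row for one
    // token is split by offset, matching the view strides above. Offsets are in
    // elements, not bytes.
    #include <cstddef>

    struct qkv_offsets {
        size_t q;   // start of the Q heads
        size_t k;   // start of the K heads
        size_t v;   // start of the V heads
        size_t row; // elements per token row
    };

    qkv_offsets split_fused_qkv(size_t n_embd_head, size_t n_head, size_t n_head_kv) {
        qkv_offsets o;
        o.q   = 0;                                    // Qcur view starts at offset 0
        o.k   = n_embd_head * n_head;                 // Kcur view starts after the Q heads
        o.v   = n_embd_head * (n_head + n_head_kv);   // tmpv view starts after Q + K heads
        o.row = n_embd_head * (n_head + 2*n_head_kv); // row stride shared by all three views
        return o;
    }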
+
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ offload_func_kq(Q);
+ ggml_set_name(Q, "Q");
+
+ struct ggml_tensor * K =
+ ggml_view_3d(ctx0, kv_self.k,
+ n_embd_head, n_kv, n_head_kv,
+ ggml_element_size(kv_self.k)*n_embd_gqa,
+ ggml_element_size(kv_self.k)*n_embd_head,
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+ offload_func_kq(K);
+ ggml_set_name(K, "K");
+
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ offload_func_kq(KQ);
+ ggml_set_name(KQ, "KQ");
+
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ offload_func_kq(KQ_scaled);
+ ggml_set_name(KQ_scaled, "KQ_scaled");
+
+ // TODO: replace with ggml_add()
+ struct ggml_tensor * KQ_scaled_alibi =
+ ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
+ offload_func_kq(KQ_scaled_alibi);
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+ offload_func_kq(KQ_masked);
+ ggml_set_name(KQ_masked, "KQ_masked");
+
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ offload_func_v(KQ_soft_max);
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+ struct ggml_tensor * V =
+ ggml_view_3d(ctx0, kv_self.v,
+ n_kv, n_embd_head, n_head_kv,
+ ggml_element_size(kv_self.v)*n_ctx,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+ offload_func_v(V);
+ ggml_set_name(V, "V");
+
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ offload_func_v(KQV);
+ ggml_set_name(KQV, "KQV");
+
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ offload_func_v(KQV_merged);
+ ggml_set_name(KQV_merged, "KQV_merged");
+
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+ offload_func_v(cur);
+ ggml_set_name(cur, "KQV_merged_contiguous");
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_wo");
+ }
+
+ // Add the input
+ cur = ggml_add(ctx0, cur, inpL);
+ offload_func(cur);
+
+ struct ggml_tensor * attn_out = cur;
+
+ // feed forward
+ {
+ // Norm
+ {
+ cur = ggml_norm(ctx0, attn_out, norm_eps);
+ offload_func(cur);
+
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+ offload_func(cur);
+ }
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+ offload_func(cur);
+
+ cur = ggml_gelu(ctx0, cur);
+ offload_func(cur);
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+ offload_func(cur);
+ }
+
+ cur = ggml_add(ctx0, cur, attn_out);
+ offload_func(cur);
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ // norm
+ {
+ cur = ggml_norm(ctx0, cur, norm_eps);
+ offload_func_nr(cur);
+
+ cur = ggml_mul(ctx0, cur, model.output_norm);
+ ggml_set_name(cur, "result_norm");
+ }
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ ggml_set_name(cur, "result_output");
+
+ ggml_build_forward_expand(gf, cur);
+
+ ggml_free(ctx0);
+
+ return gf;
+ }
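llm_build_mpt relies on ggml_alibi with f_max_alibi_bias instead of rotary embeddings: each head gets a fixed negative slope applied to key positions, so distant keys are penalized more. Below is a sketch of the per-head slopes for a power-of-two head count; ggml_alibi handles other head counts with an extra interleaved slope sequence, which is omitted here.

    // Illustrative sketch (not from the package): ALiBi per-head slopes, with
    // max_bias playing the role of f_max_alibi_bias above (8 for MPT-style models).
    #include <cmath>
    #include <cstdio>
    #include <vector>

    std::vector<float> alibi_slopes(int n_head, float max_bias) {
        std::vector<float> slopes(n_head);
        for (int h = 0; h < n_head; ++h) {
            // head h gets slope 2^(-max_bias*(h+1)/n_head); the bias added to a score
            // is slope * key_position, so nearby keys are penalized less
            slopes[h] = std::pow(2.0f, -max_bias * (float)(h + 1) / (float) n_head);
        }
        return slopes;
    }

    int main() {
        // for 8 heads and max_bias = 8 this prints 1/2, 1/4, ..., 1/256
        for (float s : alibi_slopes(8, 8.0f)) std::printf("%g\n", s);
    }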
+
+ static struct ggml_cgraph * llama_build_graph(
+ llama_context & lctx,
+ const llama_batch & batch) {
+ const auto & model = lctx.model;
+
+ struct ggml_cgraph * result = NULL;
+
+ switch (model.arch) {
+ case LLM_ARCH_LLAMA:
+ {
+ result = llm_build_llama(lctx, batch);
+ } break;
+ case LLM_ARCH_BAICHUAN:
  {
  result = llm_build_baichaun(lctx, batch);
  } break;
@@ -4405,10 +5689,22 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm_build_starcoder(lctx, batch);
  } break;
+ case LLM_ARCH_PERSIMMON:
+ {
+ result = llm_build_persimmon(lctx, batch);
+ } break;
  case LLM_ARCH_REFACT:
  {
  result = llm_build_refact(lctx, batch);
  } break;
+ case LLM_ARCH_BLOOM:
+ {
+ result = llm_build_bloom(lctx, batch);
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ result = llm_build_mpt(lctx, batch);
+ } break;
  default:
  GGML_ASSERT(false);
  }
@@ -4420,7 +5716,6 @@ static struct ggml_cgraph * llama_build_graph(
  //
  // - lctx: llama context
  // - batch: batch to evaluate
- // - n_threads: number of threads to use
  //
  // return 0 on success
  // return positive int on warning
@@ -4487,10 +5782,6 @@ static int llama_decode_internal(
  batch.seq_id = seq_id.data();
  }

- // we always start to search for a free slot from the start of the cache
- // TODO: better strategies can be implemented
- kv_self.head = 0;
-
  if (!llama_kv_cache_find_slot(kv_self, batch)) {
  return 1;
  }
@@ -4543,7 +5834,8 @@ static int llama_decode_internal(
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
  model.arch == LLM_ARCH_BAICHUAN ||
  model.arch == LLM_ARCH_FALCON ||
- model.arch == LLM_ARCH_REFACT;
+ model.arch == LLM_ARCH_REFACT ||
+ model.arch == LLM_ARCH_MPT;
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
  n_threads = 1;
@@ -4576,8 +5868,12 @@ static int llama_decode_internal(
  #endif

  // update the kv ring buffer
- lctx.kv_self.head += n_tokens;
  lctx.kv_self.has_shift = false;
+ lctx.kv_self.head += n_tokens;
+ // Ensure kv cache head points to a valid index.
+ if (lctx.kv_self.head >= lctx.kv_self.size) {
+ lctx.kv_self.head = 0;
+ }

  #ifdef GGML_PERF
  // print timing information per ggml operation (for debugging purposes)
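The decode changes above replace the old "always rewind to slot 0 before searching" behaviour with a ring-buffer style head: after a batch is written, kv_self.head advances by n_tokens and wraps back to 0 once it reaches the cache size. Here is a sketch of that update on a simplified cache struct (kv_cache_lite is a stand-in, not the llama.cpp type).

    // Illustrative sketch (not from the package): the ring-buffer head update the
    // hunk above introduces.
    #include <cstdint>

    struct kv_cache_lite {
        uint32_t head = 0; // where the next free-slot search starts
        uint32_t size = 0; // total number of cells in the cache
    };

    // advance the search head past the tokens just written, wrapping so it always
    // stays a valid index into the cache
    void advance_kv_head(kv_cache_lite & kv, uint32_t n_tokens) {
        kv.head += n_tokens;
        if (kv.head >= kv.size) {
            kv.head = 0;
        }
    }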
@@ -5040,7 +6336,6 @@ private:
  for (int i = 0; i < (int)text_utf.size(); i++) {
  const std::string & utf_char = text_utf[i];
  bool split_condition = false;
- // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
  int bytes_remain = text_utf.size() - i;
  // forward backward lookups
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5066,9 +6361,9 @@ private:
  if (!split_condition && bytes_remain >= 3) {
  // 're|'ve|'ll
  if (utf_char == "\'" && (
- (utf_char_next == "r" || utf_char_next_next == "e") ||
- (utf_char_next == "v" || utf_char_next_next == "e") ||
- (utf_char_next == "l" || utf_char_next_next == "l"))
+ (utf_char_next == "r" && utf_char_next_next == "e") ||
+ (utf_char_next == "v" && utf_char_next_next == "e") ||
+ (utf_char_next == "l" && utf_char_next_next == "l"))
  ) {
  split_condition = true;
  }
@@ -5119,7 +6414,7 @@ private:
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
  split_condition = true;
  }
- else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
+ else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
  split_condition = true;
  }
  }
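The tokenizer hunk above fixes the English-contraction lookahead: the old || tests let a split fire on partial matches, while the new && tests require the full 're / 've / 'll suffix. The corrected predicate in isolation looks like this (the helper name is hypothetical):

    // Illustrative sketch (not from the package): the corrected two-character
    // contraction check. A lone 'r or 'e no longer triggers a split; only the
    // complete suffixes 're, 've, and 'll do.
    #include <string>

    bool is_contraction_suffix(const std::string & next, const std::string & next_next) {
        return (next == "r" && next_next == "e") ||
               (next == "v" && next_next == "e") ||
               (next == "l" && next_next == "l");
    }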
@@ -6635,7 +7930,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  const std::string name = ggml_get_name(meta);

  // TODO: avoid hardcoded tensor names - use the TN_* constants
- if (name.find("attn_v.weight") != std::string::npos) {
+ if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
  ++n_attention_wv;
  }
  else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6672,6 +7967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  }

  std::ofstream fout(fname_out, std::ios::binary);
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors

  const size_t meta_size = gguf_get_meta_size(ctx_out);
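The quantizer now also counts fused attn_qkv.weight tensors and, more importantly, enables failbit exceptions on the output stream, so a failed write aborts quantization instead of silently producing a truncated file. A minimal demonstration of that failure mode (the path below is hypothetical):

    // Illustrative sketch (not from the package): with failbit exceptions enabled,
    // open and write errors surface as std::ios_base::failure.
    #include <fstream>
    #include <iostream>

    int main() {
        std::ofstream fout("/nonexistent-dir/out.gguf", std::ios::binary); // hypothetical path
        try {
            // if the open already failed, failbit is set and this call throws at once;
            // otherwise any later failed write throws instead of being ignored
            fout.exceptions(std::ofstream::failbit);
            fout.write("GGUF", 4);
        } catch (const std::ios_base::failure & err) {
            std::cerr << "write failed: " << err.what() << "\n";
            return 1;
        }
        return 0;
    }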
@@ -8166,7 +9462,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
  buf[0] = llama_token_to_byte(model->vocab, token);
  return 1;
  } else {
- GGML_ASSERT(false);
+ // TODO: for now we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ // GGML_ASSERT(false);
  }
  break;
  }
@@ -8182,7 +9480,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
  } else if (llama_is_control_token(model->vocab, token)) {
  ;
  } else {
- GGML_ASSERT(false);
+ // TODO: for now we accept all unsupported token types,
+ // suppressing them like CONTROL tokens.
+ // GGML_ASSERT(false);
  }
  break;
  }
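Both hunks above relax llama_token_to_piece: token types the converter does not understand are now rendered as an empty piece, the same way CONTROL tokens are, rather than tripping GGML_ASSERT. A simplified sketch of that behaviour (the enum and helper below are illustrative, not the llama.cpp API):

    // Illustrative sketch (not from the package): tolerant token-to-piece handling.
    #include <string>

    enum token_type_lite { TT_NORMAL, TT_CONTROL, TT_BYTE, TT_UNSUPPORTED };

    std::string token_to_piece_lite(token_type_lite type, const std::string & text) {
        switch (type) {
            case TT_NORMAL:  return text; // regular pieces are copied out
            case TT_BYTE:    return text; // byte tokens decode to their single byte
            case TT_CONTROL: return "";   // control tokens render as nothing
            default:         return "";   // previously an assert; now treated like CONTROL
        }
    }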