llama_cpp 0.7.0 → 0.7.1

@@ -186,7 +186,9 @@ enum llm_arch {
  LLM_ARCH_GPTNEOX,
  LLM_ARCH_MPT,
  LLM_ARCH_STARCODER,
+ LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
+ LLM_ARCH_BLOOM,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -199,7 +201,9 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_MPT, "mpt" },
  { LLM_ARCH_BAICHUAN, "baichuan" },
  { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_PERSIMMON, "persimmon" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BLOOM, "bloom" },
  };
 
  enum llm_kv {
@@ -302,6 +306,7 @@ struct LLM_KV {
 
  enum llm_tensor {
  LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
  LLM_TENSOR_POS_EMBD,
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
@@ -318,6 +323,8 @@ enum llm_tensor {
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
  LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
  };
 
  static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -399,10 +406,35 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PERSIMMON,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
+ { LLM_TENSOR_OUTPUT, "output"},
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
+ },
+ },
  {
  LLM_ARCH_MPT,
  {
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  {
@@ -437,6 +469,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_BLOOM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
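The per-architecture tables above map each llm_tensor enum value to a printf-style name template; the layer index is substituted for "%d" and a "weight" or "bias" suffix is appended when the loader looks a tensor up (see the tn(...) calls further down in this diff). A minimal sketch of that expansion, using a hypothetical helper name (llama.cpp does this through its internal tn() helper):

    #include <cstdio>
    #include <string>

    // Hypothetical illustration of how "blk.%d.attn_qkv" plus ("weight", 3)
    // becomes the GGUF tensor name "blk.3.attn_qkv.weight".
    static std::string make_tensor_name(const char * tmpl, const char * suffix, int il) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), tmpl, il);
        return std::string(buf) + "." + suffix;
    }
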
@@ -954,6 +1001,7 @@ enum e_model {
  MODEL_1B,
  MODEL_3B,
  MODEL_7B,
+ MODEL_8B,
  MODEL_13B,
  MODEL_15B,
  MODEL_30B,
@@ -984,6 +1032,9 @@ struct llama_hparams {
  float rope_freq_base_train;
  float rope_freq_scale_train;
 
+ float f_clamp_kqv;
+ float f_max_alibi_bias;
+
  bool operator!=(const llama_hparams & other) const {
  if (this->vocab_only != other.vocab_only) return true;
  if (this->n_vocab != other.n_vocab) return true;
@@ -1036,6 +1087,10 @@ struct llama_layer {
  struct ggml_tensor * attn_norm_b;
  struct ggml_tensor * attn_norm_2;
  struct ggml_tensor * attn_norm_2_b;
+ struct ggml_tensor * attn_q_norm;
+ struct ggml_tensor * attn_q_norm_b;
+ struct ggml_tensor * attn_k_norm;
+ struct ggml_tensor * attn_k_norm_b;
 
  // attention
  struct ggml_tensor * wq;
@@ -1077,6 +1132,9 @@ struct llama_kv_cell {
  struct llama_kv_cache {
  bool has_shift = false;
 
+ // Note: The value of head isn't only used to optimize searching
+ // for a free KV slot. llama_decode_internal also uses it, so it
+ // cannot be freely changed after a slot has been allocated.
  uint32_t head = 0;
  uint32_t size = 0;
 
@@ -1162,6 +1220,8 @@ struct llama_model {
 
  struct ggml_tensor * tok_embeddings;
  struct ggml_tensor * pos_embeddings;
+ struct ggml_tensor * tok_norm;
+ struct ggml_tensor * tok_norm_b;
 
  struct ggml_tensor * output_norm;
  struct ggml_tensor * output_norm_b;
@@ -1291,7 +1351,11 @@ static bool llama_kv_cache_init(
  cache.cells.clear();
  cache.cells.resize(n_ctx);
 
+ // TODO: this should be:
+ // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+ // change it and test that it works
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+ memset(cache.buf.data, 0, cache.buf.size);
 
  struct ggml_init_params params;
  params.mem_size = cache.buf.size;
@@ -1334,6 +1398,8 @@ static bool llama_kv_cache_init(
 
  // find an empty slot of size "n_tokens" in the cache
  // updates the cache head
+ // Note: On success, it's important that cache.head points
+ // to the first cell of the slot.
  static bool llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_batch & batch) {
@@ -1349,8 +1415,8 @@ static bool llama_kv_cache_find_slot(
 
  while (true) {
  if (cache.head + n_tokens > n_ctx) {
+ n_tested += n_ctx - cache.head;
  cache.head = 0;
- n_tested += n_ctx - cache.head;
  continue;
  }
 
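The reordering in the hunk above fixes the wrap-around bookkeeping in llama_kv_cache_find_slot: the untested tail of the buffer has to be added to n_tested before cache.head is reset to 0, otherwise n_ctx - cache.head always evaluates to zero and a full cache may never be detected. A simplified, self-contained sketch of the search loop under that assumption (cell bookkeeping is reduced to a pos vector; this is not the full implementation):

    #include <cstdint>
    #include <vector>

    // Find n_tokens consecutive free cells (pos < 0), starting the scan at head.
    // On success, head is left pointing at the first cell of the slot.
    static bool find_slot_sketch(const std::vector<int> & pos, uint32_t & head, uint32_t n_tokens) {
        const uint32_t n_ctx = (uint32_t) pos.size();
        if (n_tokens > n_ctx) return false;
        uint32_t n_tested = 0;
        while (true) {
            if (head + n_tokens > n_ctx) {
                n_tested += n_ctx - head; // count the skipped tail *before* resetting head
                head = 0;
                continue;
            }
            bool found = true;
            for (uint32_t i = 0; i < n_tokens; ++i) {
                if (pos[head + i] >= 0) {  // cell is occupied
                    found = false;
                    head     += i + 1;
                    n_tested += i + 1;
                    break;
                }
            }
            if (found) return true;
            if (n_tested >= n_ctx) return false; // every cell was tested; cache is full
        }
    }
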
@@ -1401,6 +1467,9 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
  }
+
+ // Searching for a free slot can start here since we know it will be empty.
+ cache.head = uint32_t(c0);
  }
 
  static void llama_kv_cache_seq_rm(
@@ -1408,6 +1477,8 @@ static void llama_kv_cache_seq_rm(
  llama_seq_id seq_id,
  llama_pos p0,
  llama_pos p1) {
+ uint32_t new_head = cache.size;
+
  if (p0 < 0) p0 = 0;
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
@@ -1416,9 +1487,13 @@ static void llama_kv_cache_seq_rm(
  cache.cells[i].seq_id.erase(seq_id);
  if (cache.cells[i].seq_id.empty()) {
  cache.cells[i].pos = -1;
+ if (new_head == cache.size) new_head = i;
  }
  }
  }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cache.size) cache.head = new_head;
  }
 
  static void llama_kv_cache_seq_cp(
@@ -1430,6 +1505,8 @@ static void llama_kv_cache_seq_cp(
  if (p0 < 0) p0 = 0;
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
+ cache.head = 0;
+
  for (uint32_t i = 0; i < cache.size; ++i) {
  if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1438,12 +1515,18 @@ static void llama_kv_cache_seq_cp(
  }
 
  static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+ uint32_t new_head = cache.size;
+
  for (uint32_t i = 0; i < cache.size; ++i) {
  if (!cache.cells[i].has_seq_id(seq_id)) {
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
+ if (new_head == cache.size) new_head = i;
  }
  }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != cache.size) cache.head = new_head;
  }
 
  static void llama_kv_cache_seq_shift(
@@ -1452,6 +1535,8 @@ static void llama_kv_cache_seq_shift(
  llama_pos p0,
  llama_pos p1,
  llama_pos delta) {
+ uint32_t new_head = cache.size;
+
  if (p0 < 0) p0 = 0;
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
@@ -1461,12 +1546,17 @@ static void llama_kv_cache_seq_shift(
  if (cache.cells[i].pos < 0) {
  cache.cells[i].pos = -1;
  cache.cells[i].seq_id.clear();
+ if (new_head == cache.size) new_head = i;
  } else {
  cache.has_shift = true;
  cache.cells[i].delta = delta;
  }
  }
  }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ // Otherwise we just start the next search from the beginning.
+ cache.head = new_head != cache.size ? new_head : 0;
  }
 
  //
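The seq_rm, seq_keep, and seq_shift changes above all apply the same pattern: remember the first cell index that becomes free during the scan and, if anything was freed, move cache.head back to it so the next llama_kv_cache_find_slot starts searching there. Condensed from the hunks above (the freeing condition differs per function):

    uint32_t new_head = cache.size;                      // sentinel meaning "nothing freed"
    for (uint32_t i = 0; i < cache.size; ++i) {
        if (/* this operation frees cell i */) {
            cache.cells[i].pos = -1;
            cache.cells[i].seq_id.clear();
            if (new_head == cache.size) new_head = i;    // remember the first freed cell
        }
    }
    if (new_head != cache.size) cache.head = new_head;   // next slot search starts here
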
@@ -1670,7 +1760,7 @@ struct llama_model_loader {
  }
  }
 
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
  if (backend != GGML_BACKEND_CPU) {
  ggml_set_no_alloc(ctx, true);
  }
@@ -1688,7 +1778,7 @@ struct llama_model_loader {
  return tensor;
  }
 
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
  if (cur == NULL) {
@@ -1867,6 +1957,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_1B: return "1B";
  case MODEL_3B: return "3B";
  case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
  case MODEL_13B: return "13B";
  case MODEL_15B: return "15B";
  case MODEL_30B: return "30B";
@@ -1979,6 +2070,14 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_PERSIMMON:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+ switch (hparams.n_layer) {
+ case 36: model.type = e_model::MODEL_8B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_REFACT:
  {
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -1987,6 +2086,33 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_BLOOM:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1B; break;
+ case 30:
+ switch (hparams.n_embd) {
+ case 2560: model.type = e_model::MODEL_3B; break;
+ case 4096: model.type = e_model::MODEL_7B; break;
+ } break;
+ }
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ hparams.f_clamp_kqv = 0.0f;
+
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+ GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
+ GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 48: model.type = e_model::MODEL_30B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }
 
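For MPT, the two new hyperparameters read above feed the attention path: f_clamp_kqv (left at 0.0f when the key is absent, meaning disabled) is used to clamp the fused QKV projection into [-clamp_kqv, clamp_kqv], and f_max_alibi_bias controls the strength of the ALiBi position bias that MPT uses instead of RoPE. A hedged sketch of how per-head ALiBi slopes are commonly derived from such a maximum bias (a max bias of 8 reproduces the original ALiBi formulation for power-of-two head counts; the exact ggml_alibi internals may differ):

    #include <cmath>
    #include <vector>

    // Per-head ALiBi slopes: a geometric sequence whose range is set by max_alibi_bias.
    static std::vector<float> alibi_slopes(int n_head, float max_alibi_bias) {
        std::vector<float> m(n_head);
        for (int h = 0; h < n_head; ++h) {
            m[h] = std::pow(2.0f, -max_alibi_bias * float(h + 1) / float(n_head));
        }
        return m;
    }
    // The attention score of head h for a query/key pair at distance d is then
    // biased by -m[h] * d before the softmax, so distant positions are attenuated.
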
@@ -2131,6 +2257,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2230,8 +2358,8 @@ static void llm_load_tensors(
 
  // output
  {
- ggml_backend backend_norm;
- ggml_backend backend_output;
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
 
  if (n_gpu_layers > int(n_layer)) {
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2266,8 +2394,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);
 
  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
  auto & layer = model.layers[i];
 
@@ -2296,8 +2424,8 @@ static void llm_load_tensors(
  {
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
  {
- ggml_backend backend_norm;
- ggml_backend backend_output;
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
 
  if (n_gpu_layers > int(n_layer)) {
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2332,8 +2460,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);
 
  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
  auto & layer = model.layers[i];
 
@@ -2366,8 +2494,8 @@ static void llm_load_tensors(
 
  // output
  {
- ggml_backend backend_norm;
- ggml_backend backend_output;
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
 
  if (n_gpu_layers > int(n_layer)) {
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2404,8 +2532,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);
 
  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
  auto & layer = model.layers[i];
 
@@ -2443,8 +2571,8 @@ static void llm_load_tensors(
 
  // output
  {
- ggml_backend backend_norm;
- ggml_backend backend_output;
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
 
  if (n_gpu_layers > int(n_layer)) {
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2481,8 +2609,8 @@ static void llm_load_tensors(
  model.layers.resize(n_layer);
 
  for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
  auto & layer = model.layers[i];
 
@@ -2515,6 +2643,216 @@ static void llm_load_tensors(
2515
2643
  }
2516
2644
  }
2517
2645
  } break;
2646
+ case LLM_ARCH_PERSIMMON:
2647
+ {
2648
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2649
+
2650
+ {
2651
+ ggml_backend_type backend_norm;
2652
+ ggml_backend_type backend_output;
2653
+
2654
+ if (n_gpu_layers > int(n_layer)) {
2655
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2656
+ // on Windows however this is detrimental unless everything is on the GPU
2657
+ #ifndef _WIN32
2658
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2659
+ #else
2660
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2661
+ #endif // _WIN32
2662
+
2663
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2664
+ } else {
2665
+ backend_norm = GGML_BACKEND_CPU;
2666
+ backend_output = GGML_BACKEND_CPU;
2667
+ }
2668
+
2669
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2670
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2671
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2672
+
2673
+ if (backend_norm == GGML_BACKEND_GPU) {
2674
+ vram_weights += ggml_nbytes(model.output_norm);
2675
+ vram_weights += ggml_nbytes(model.output_norm_b);
2676
+ }
2677
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2678
+ vram_weights += ggml_nbytes(model.output);
2679
+ }
2680
+ }
2681
+
2682
+ const uint32_t n_ff = hparams.n_ff;
2683
+ const int i_gpu_start = n_layer - n_gpu_layers;
2684
+ model.layers.resize(n_layer);
2685
+ for (uint32_t i = 0; i < n_layer; ++i) {
2686
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2687
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2688
+ auto & layer = model.layers[i];
2689
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2690
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2691
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2692
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2693
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2694
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2695
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2696
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2697
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2698
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2699
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2700
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2701
+ layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
2702
+ layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
2703
+ layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
2704
+ layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2705
+ }
2706
+ } break;
2707
+ case LLM_ARCH_BLOOM:
2708
+ {
2709
+ // TODO: CPU-only for now
2710
+
2711
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2712
+ model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
2713
+ model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
2714
+
2715
+ // output
2716
+ {
2717
+ ggml_backend_type backend_norm;
2718
+ ggml_backend_type backend_output;
2719
+
2720
+ if (n_gpu_layers > int(n_layer)) {
2721
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2722
+ // on Windows however this is detrimental unless everything is on the GPU
2723
+ #ifndef _WIN32
2724
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2725
+ #else
2726
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2727
+ #endif // _WIN32
2728
+
2729
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2730
+ } else {
2731
+ backend_norm = GGML_BACKEND_CPU;
2732
+ backend_output = GGML_BACKEND_CPU;
2733
+ }
2734
+
2735
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2736
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2737
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2738
+
2739
+ if (backend_norm == GGML_BACKEND_GPU) {
2740
+ vram_weights += ggml_nbytes(model.output_norm);
2741
+ vram_weights += ggml_nbytes(model.output_norm_b);
2742
+ }
2743
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2744
+ vram_weights += ggml_nbytes(model.output);
2745
+ }
2746
+ }
2747
+
2748
+ const uint32_t n_ff = hparams.n_ff;
2749
+
2750
+ const int i_gpu_start = n_layer - n_gpu_layers;
2751
+
2752
+ model.layers.resize(n_layer);
2753
+
2754
+ for (uint32_t i = 0; i < n_layer; ++i) {
2755
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2756
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2757
+
2758
+ auto & layer = model.layers[i];
2759
+
2760
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2761
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2762
+
2763
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2764
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2765
+
2766
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2767
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2768
+
2769
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2770
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2771
+
2772
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2773
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2774
+
2775
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2776
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2777
+
2778
+ if (backend == GGML_BACKEND_GPU) {
2779
+ vram_weights +=
2780
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2781
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2782
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2783
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2784
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
2785
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
2786
+ }
2787
+ }
2788
+ } break;
2789
+ case LLM_ARCH_MPT:
2790
+ {
2791
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2792
+
2793
+ // output
2794
+ {
2795
+ ggml_backend_type backend_norm;
2796
+ ggml_backend_type backend_output;
2797
+
2798
+ if (n_gpu_layers > int(n_layer)) {
2799
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2800
+ // on Windows however this is detrimental unless everything is on the GPU
2801
+ #ifndef _WIN32
2802
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2803
+ #else
2804
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2805
+ #endif // _WIN32
2806
+
2807
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2808
+ } else {
2809
+ backend_norm = GGML_BACKEND_CPU;
2810
+ backend_output = GGML_BACKEND_CPU;
2811
+ }
2812
+
2813
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2814
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2815
+
2816
+ if (backend_norm == GGML_BACKEND_GPU) {
2817
+ vram_weights += ggml_nbytes(model.output_norm);
2818
+ }
2819
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2820
+ vram_weights += ggml_nbytes(model.output);
2821
+ }
2822
+ }
2823
+
2824
+ const uint32_t n_ff = hparams.n_ff;
2825
+
2826
+ const int i_gpu_start = n_layer - n_gpu_layers;
2827
+
2828
+ model.layers.resize(n_layer);
2829
+
2830
+ for (uint32_t i = 0; i < n_layer; ++i) {
2831
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2832
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2833
+
2834
+ auto & layer = model.layers[i];
2835
+
2836
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2837
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
2838
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2839
+
2840
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2841
+
2842
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2843
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2844
+
2845
+ if (backend == GGML_BACKEND_GPU) {
2846
+ vram_weights +=
2847
+ ggml_nbytes(layer.attn_norm) +
2848
+ ggml_nbytes(layer.wqkv) +
2849
+ ggml_nbytes(layer.wo) +
2850
+ ggml_nbytes(layer.ffn_norm) +
2851
+ ggml_nbytes(layer.w2) +
2852
+ ggml_nbytes(layer.w3);
2853
+ }
2854
+ }
2855
+ } break;
2518
2856
  default:
2519
2857
  throw std::runtime_error("unknown architecture");
2520
2858
  }
@@ -2624,8 +2962,8 @@ static bool llama_model_load(
  }
 
  static struct ggml_cgraph * llm_build_llama(
- llama_context & lctx,
- const llama_batch & batch) {
+ llama_context & lctx,
+ const llama_batch & batch) {
  const auto & model = lctx.model;
  const auto & hparams = model.hparams;
  const auto & cparams = lctx.cparams;
@@ -2663,11 +3001,9 @@ static struct ggml_cgraph * llm_build_llama(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };
 
- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);
 
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3051,11 +3387,9 @@ static struct ggml_cgraph * llm_build_baichaun(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };
 
- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);
 
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3452,11 +3786,9 @@ static struct ggml_cgraph * llm_build_refact(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };
 
- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);
 
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -3806,11 +4138,9 @@ static struct ggml_cgraph * llm_build_falcon(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };
 
- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);
 
  ggml_cgraph * gf = ggml_new_graph(ctx0);
@@ -4166,11 +4496,9 @@ static struct ggml_cgraph * llm_build_starcoder(
  struct ggml_init_params params = {
  /*.mem_size =*/ buf_compute.size,
  /*.mem_buffer =*/ buf_compute.data,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
  };
 
- params.no_alloc = true;
-
  struct ggml_context * ctx0 = ggml_init(params);
 
  ggml_cgraph * gf = ggml_new_graph(ctx0);
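Each of these hunks only folds the former params.no_alloc = true assignment into the designated-initializer list; the behaviour of the graph builders is unchanged. With no_alloc set, ggml_init only tracks tensor metadata inside buf_compute and the graph allocator assigns the actual data afterwards. A minimal sketch of that pattern, using only calls that already appear in the builders in this diff (buf_compute, lctx, batch, and n_tokens are the surrounding locals):

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
        /*.no_alloc   =*/ true,              // metadata only; no tensor data yet
    };
    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    ggml_allocr_alloc(lctx.alloc, inp_tokens);       // allocator assigns inp_tokens->data
    if (!ggml_allocr_is_measure(lctx.alloc)) {
        // only copy real data outside of the measuring pass
        memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
    }
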
@@ -4381,19 +4709,975 @@ static struct ggml_cgraph * llm_build_starcoder(
4381
4709
  return gf;
4382
4710
  }
4383
4711
 
4384
- static struct ggml_cgraph * llama_build_graph(
4712
+ static struct ggml_cgraph * llm_build_persimmon(
4385
4713
  llama_context & lctx,
4386
4714
  const llama_batch & batch) {
4387
4715
  const auto & model = lctx.model;
4716
+ const auto & hparams = model.hparams;
4388
4717
 
4389
- struct ggml_cgraph * result = NULL;
4718
+ const auto & kv_self = lctx.kv_self;
4390
4719
 
4391
- switch (model.arch) {
4392
- case LLM_ARCH_LLAMA:
4393
- {
4394
- result = llm_build_llama(lctx, batch);
4395
- } break;
4396
- case LLM_ARCH_BAICHUAN:
4720
+ GGML_ASSERT(!!kv_self.ctx);
4721
+
4722
+ const auto & cparams = lctx.cparams;
4723
+ const int64_t n_embd = hparams.n_embd;
4724
+ const int64_t n_layer = hparams.n_layer;
4725
+ const int64_t n_ctx = cparams.n_ctx;
4726
+ const int64_t n_head_kv = hparams.n_head_kv;
4727
+ const int64_t n_head = hparams.n_head;
4728
+ const int64_t n_embd_head = hparams.n_embd_head();
4729
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
4730
+ const size_t n_rot = n_embd_head / 2;
4731
+
4732
+ const float freq_base = cparams.rope_freq_base;
4733
+ const float freq_scale = cparams.rope_freq_scale;
4734
+ const float norm_eps = hparams.f_norm_eps;
4735
+
4736
+ const int n_gpu_layers = model.n_gpu_layers;
4737
+
4738
+
4739
+ const int32_t n_tokens = batch.n_tokens;
4740
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
4741
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
4742
+
4743
+ const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
4744
+
4745
+ auto & buf_compute = lctx.buf_compute;
4746
+ struct ggml_init_params params = {
4747
+ /*.mem_size =*/ buf_compute.size,
4748
+ /*.mem_buffer =*/ buf_compute.data,
4749
+ /*.no_alloc =*/ true,
4750
+ };
4751
+
4752
+ struct ggml_context * ctx0 = ggml_init(params);
4753
+
4754
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
4755
+
4756
+ struct ggml_tensor * cur;
4757
+ struct ggml_tensor * inpL;
4758
+
4759
+ if (batch.token) {
4760
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4761
+
4762
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
4763
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4764
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
4765
+ }
4766
+ ggml_set_name(inp_tokens, "inp_tokens");
4767
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
4768
+ } else {
4769
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
4770
+ ggml_allocr_alloc(lctx.alloc, inpL);
4771
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4772
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
4773
+ }
4774
+ }
4775
+ const int i_gpu_start = n_layer - n_gpu_layers;
4776
+ (void) i_gpu_start;
4777
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
4778
+ offload_func_t offload_func_kq = llama_nop;
4779
+ offload_func_t offload_func_v = llama_nop;
4780
+ // KQ_scale
4781
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
4782
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
4783
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4784
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
4785
+ }
4786
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
4787
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
4788
+ offload_func_kq(KQ_mask);
4789
+ ggml_set_name(KQ_mask, "KQ_mask");
4790
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
4791
+
4792
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4793
+ float * data = (float *) KQ_mask->data;
4794
+ memset(data, 0, ggml_nbytes(KQ_mask));
4795
+ for (int h = 0; h < 1; ++h) {
4796
+ for (int j = 0; j < n_tokens; ++j) {
4797
+ const llama_pos pos = batch.pos[j];
4798
+ const llama_seq_id seq_id = batch.seq_id[j];
4799
+ for (int i = 0; i < n_kv; ++i) {
4800
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
4801
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
4802
+ }
4803
+ }
4804
+ }
4805
+ }
4806
+ }
4807
+
4808
+ struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
4809
+ offload_func_kq(KQ_pos);
4810
+ ggml_set_name(KQ_pos, "KQ_pos");
4811
+ ggml_allocr_alloc(lctx.alloc, KQ_pos);
4812
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4813
+ int * data = (int *) KQ_pos->data;
4814
+ for (int i = 0; i < n_tokens; ++i) {
4815
+ data[i] = batch.pos[i];
4816
+ }
4817
+ }
4818
+ if (do_rope_shift) {
4819
+ struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
4820
+ offload_func_kq(K_shift);
4821
+ ggml_set_name(K_shift, "K_shift");
4822
+ ggml_allocr_alloc(lctx.alloc, K_shift);
4823
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
4824
+ int * data = (int *) K_shift->data;
4825
+ for (int i = 0; i < n_ctx; ++i) {
4826
+ data[i] = kv_self.cells[i].delta;
4827
+ }
4828
+ }
4829
+ for (int il = 0; il < n_layer; ++il) {
4830
+ struct ggml_tensor * tmp =
4831
+ // we rotate only the first n_rot dimensions.
4832
+ ggml_rope_custom_inplace(ctx0,
4833
+ ggml_view_3d(ctx0, kv_self.k,
4834
+ n_rot, n_head, n_ctx,
4835
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4836
+ ggml_element_size(kv_self.k)*n_embd_head,
4837
+ ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
4838
+ ),
4839
+ K_shift, n_rot, 2, 0, freq_base, freq_scale);
4840
+ offload_func_kq(tmp);
4841
+ ggml_build_forward_expand(gf, tmp);
4842
+ }
4843
+ }
4844
+ for (int il=0; il < n_layer; ++il) {
4845
+ struct ggml_tensor * residual = inpL;
4846
+ offload_func_t offload_func = llama_nop;
4847
+ {
4848
+ cur = ggml_norm(ctx0, inpL, norm_eps);
4849
+ offload_func(cur);
4850
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
4851
+ offload_func(cur);
4852
+ cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
4853
+ offload_func(cur);
4854
+ ggml_format_name(cur, "input_layernorm_%d", il);
4855
+ }
4856
+ // self attention
4857
+ {
4858
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
4859
+ offload_func_kq(cur);
4860
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
4861
+ offload_func_kq(cur);
4862
+
4863
+ // split qkv
4864
+ GGML_ASSERT(n_head_kv == n_head);
4865
+ ggml_set_name(cur, format("qkv_%d", il).c_str());
4866
+ struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
4867
+ offload_func_kq(tmpqkv);
4868
+ struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
4869
+ offload_func_kq(tmpqkv_perm);
4870
+ ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
4871
+ struct ggml_tensor * tmpq = ggml_view_3d(
4872
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4873
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4874
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4875
+ 0
4876
+ );
4877
+ offload_func_kq(tmpq);
4878
+ struct ggml_tensor * tmpk = ggml_view_3d(
4879
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4880
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4881
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4882
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
4883
+ );
4884
+ offload_func_kq(tmpk);
4885
+ // Q/K Layernorm
4886
+ tmpq = ggml_norm(ctx0, tmpq, norm_eps);
4887
+ offload_func_kq(tmpq);
4888
+ tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
4889
+ offload_func_kq(tmpq);
4890
+ tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
4891
+ offload_func_kq(tmpq);
4892
+
4893
+ tmpk = ggml_norm(ctx0, tmpk, norm_eps);
4894
+ offload_func_v(tmpk);
4895
+ tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
4896
+ offload_func_v(tmpk);
4897
+ tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
4898
+ offload_func_v(tmpk);
4899
+
4900
+ // RoPE the first n_rot of q/k, pass the other half, and concat.
4901
+ struct ggml_tensor * qrot = ggml_view_3d(
4902
+ ctx0, tmpq, n_rot, n_head, n_tokens,
4903
+ ggml_element_size(tmpq) * n_embd_head,
4904
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4905
+ 0
4906
+ );
4907
+ offload_func_kq(qrot);
4908
+ ggml_format_name(qrot, "qrot_%d", il);
4909
+ struct ggml_tensor * krot = ggml_view_3d(
4910
+ ctx0, tmpk, n_rot, n_head, n_tokens,
4911
+ ggml_element_size(tmpk) * n_embd_head,
4912
+ ggml_element_size(tmpk) * n_embd_head * n_head,
4913
+ 0
4914
+ );
4915
+ offload_func_kq(krot);
4916
+ ggml_format_name(krot, "krot_%d", il);
4917
+
4918
+ // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
4919
+ struct ggml_tensor * qpass = ggml_view_3d(
4920
+ ctx0, tmpq, n_rot, n_head, n_tokens,
4921
+ ggml_element_size(tmpq) * n_embd_head,
4922
+ ggml_element_size(tmpq) * n_embd_head * n_head,
4923
+ ggml_element_size(tmpq) * n_rot
4924
+ );
4925
+ offload_func_kq(qpass);
4926
+ ggml_format_name(qpass, "qpass_%d", il);
4927
+ struct ggml_tensor * kpass = ggml_view_3d(
4928
+ ctx0, tmpk, n_rot, n_head, n_tokens,
4929
+ ggml_element_size(tmpk) * n_embd_head,
4930
+ ggml_element_size(tmpk) * n_embd_head * n_head,
4931
+ ggml_element_size(tmpk) * n_rot
4932
+ );
4933
+ offload_func_kq(kpass);
4934
+ ggml_format_name(kpass, "kpass_%d", il);
4935
+
4936
+ struct ggml_tensor * qrotated = ggml_rope_custom(
4937
+ ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
4938
+ );
4939
+ offload_func_kq(qrotated);
4940
+ struct ggml_tensor * krotated = ggml_rope_custom(
4941
+ ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
4942
+ );
4943
+ offload_func_kq(krotated);
4944
+ // ggml currently only supports concatenation on dim=2
4945
+ // so we need to permute qrot, qpass, concat, then permute back.
4946
+ qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
4947
+ offload_func_kq(qrotated);
4948
+ krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
4949
+ offload_func_kq(krotated);
4950
+
4951
+ qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
4952
+ offload_func_kq(qpass);
4953
+ kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
4954
+ offload_func_kq(kpass);
4955
+
4956
+ struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
4957
+ offload_func_kq(Qcur);
4958
+ struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
4959
+ offload_func_kq(Kcur);
4960
+
4961
+ struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
4962
+ offload_func_kq(Q);
4963
+
4964
+ Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
4965
+ offload_func_kq(Kcur);
4966
+ {
4967
+ struct ggml_tensor * tmpv = ggml_view_3d(
4968
+ ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
4969
+ ggml_element_size(tmpqkv_perm) * n_embd_head,
4970
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
4971
+ ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
4972
+ );
4973
+ offload_func_v(tmpv);
4974
+ // store K, V in cache
4975
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
4976
+ offload_func_v(Vcur);
4977
+ ggml_set_name(Vcur, "Vcur");
4978
+
4979
+ struct ggml_tensor * k = ggml_view_1d(
4980
+ ctx0, kv_self.k, n_tokens*n_embd_gqa,
4981
+ (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
4982
+ );
4983
+ offload_func_kq(k);
4984
+ ggml_set_name(k, "k");
4985
+
4986
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
4987
+ ( n_ctx)*ggml_element_size(kv_self.v),
4988
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
4989
+ offload_func_v(v);
4990
+ ggml_set_name(v, "v");
4991
+
4992
+ // important: storing RoPE-ed version of K in the KV cache!
4993
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
4994
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
4995
+ }
4996
+ struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
4997
+ n_embd_head, n_kv, n_head_kv,
4998
+ ggml_element_size(kv_self.k)*n_embd_gqa,
4999
+ ggml_element_size(kv_self.k)*n_embd_head,
5000
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5001
+
5002
+ offload_func_kq(K);
5003
+ ggml_format_name(K, "K_%d", il);
5004
+
5005
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5006
+ offload_func_kq(KQ);
5007
+ ggml_set_name(KQ, "KQ");
5008
+
5009
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5010
+ offload_func_kq(KQ_scaled);
5011
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5012
+
5013
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
5014
+ offload_func_kq(KQ_masked);
5015
+ ggml_set_name(KQ_masked, "KQ_masked");
5016
+
5017
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5018
+ offload_func_kq(KQ_soft_max);
5019
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5020
+
5021
+ struct ggml_tensor * V =
5022
+ ggml_view_3d(ctx0, kv_self.v,
5023
+ n_kv, n_embd_head, n_head_kv,
5024
+ ggml_element_size(kv_self.v)*n_ctx,
5025
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5026
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5027
+ offload_func_v(V);
5028
+ ggml_set_name(V, "V");
5029
+
5030
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5031
+ offload_func_v(KQV);
5032
+ ggml_set_name(KQV, "KQV");
5033
+
5034
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5035
+ offload_func_v(KQV_merged);
5036
+ ggml_set_name(KQV_merged, "KQV_merged");
5037
+
5038
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5039
+ offload_func_v(cur);
5040
+ ggml_set_name(cur, "KQV_merged_contiguous");
5041
+
5042
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5043
+ offload_func(cur);
5044
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
5045
+ offload_func(cur);
5046
+ ggml_set_name(cur, "result_wo");
5047
+ }
5048
+
5049
+ struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
5050
+ offload_func(inpFF);
5051
+ ggml_set_name(inpFF, "inpFF");
5052
+ {
5053
+ // MLP
5054
+ {
5055
+ // Norm
5056
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5057
+ offload_func(cur);
5058
+ cur = ggml_add(ctx0,
5059
+ ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
5060
+ model.layers[il].ffn_norm_b
5061
+ );
5062
+ ggml_set_name(cur, "ffn_norm");
5063
+ offload_func(cur);
5064
+ }
5065
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5066
+ offload_func(cur);
5067
+
5068
+ cur = ggml_add(ctx0, cur, model.layers[il].b3);
5069
+ offload_func(cur);
5070
+ ggml_set_name(cur, "result_ffn_up");
5071
+
5072
+ cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
5073
+ ggml_set_name(cur, "result_ffn_act");
5074
+ offload_func(cur);
5075
+ offload_func(cur->src[0]);
5076
+
5077
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5078
+ offload_func(cur);
5079
+ cur = ggml_add(ctx0,
5080
+ cur,
5081
+ model.layers[il].b2);
5082
+ offload_func(cur);
5083
+ ggml_set_name(cur, "outFF");
5084
+ }
5085
+ cur = ggml_add(ctx0, cur, inpFF);
5086
+ offload_func(cur);
5087
+ ggml_set_name(cur, "inpFF_+_outFF");
5088
+ inpL = cur;
5089
+ }
5090
+ cur = inpL;
5091
+ {
5092
+ cur = ggml_norm(ctx0, cur, norm_eps);
5093
+ offload_func_nr(cur);
5094
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5095
+ offload_func_nr(cur);
5096
+
5097
+ cur = ggml_add(ctx0, cur, model.output_norm_b);
5098
+ // offload_func_nr(cur);
5099
+
5100
+ ggml_set_name(cur, "result_norm");
5101
+ }
5102
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5103
+ ggml_set_name(cur, "result_output");
5104
+ ggml_build_forward_expand(gf, cur);
5105
+ ggml_free(ctx0);
5106
+ return gf;
5107
+ }
5108
+
5109
+ static struct ggml_cgraph * llm_build_bloom(
5110
+ llama_context & lctx,
5111
+ const llama_batch & batch) {
5112
+ const auto & model = lctx.model;
5113
+ const auto & hparams = model.hparams;
5114
+ const auto & cparams = lctx.cparams;
5115
+
5116
+ const auto & kv_self = lctx.kv_self;
5117
+
5118
+ GGML_ASSERT(!!kv_self.ctx);
5119
+
5120
+ const int64_t n_embd = hparams.n_embd;
5121
+ const int64_t n_layer = hparams.n_layer;
5122
+ const int64_t n_ctx = cparams.n_ctx;
5123
+ const int64_t n_head = hparams.n_head;
5124
+ const int64_t n_head_kv = hparams.n_head_kv;
5125
+ const int64_t n_embd_head = hparams.n_embd_head();
5126
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5127
+
5128
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5129
+
5130
+ const float norm_eps = hparams.f_norm_eps;
5131
+
5132
+ const int32_t n_tokens = batch.n_tokens;
5133
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5134
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5135
+
5136
+ auto & buf_compute = lctx.buf_compute;
5137
+
5138
+ struct ggml_init_params params = {
5139
+ /*.mem_size =*/ buf_compute.size,
5140
+ /*.mem_buffer =*/ buf_compute.data,
5141
+ /*.no_alloc =*/ false,
5142
+ };
5143
+
5144
+ params.no_alloc = true;
5145
+
5146
+ struct ggml_context * ctx0 = ggml_init(params);
5147
+
5148
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5149
+
5150
+ struct ggml_tensor * cur;
5151
+ struct ggml_tensor * token;
5152
+ struct ggml_tensor * inpL;
5153
+
5154
+ if (batch.token) {
5155
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5156
+
5157
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5158
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5159
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5160
+ }
5161
+ ggml_set_name(inp_tokens, "inp_tokens");
5162
+
5163
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5164
+ } else {
5165
+ #ifdef GGML_USE_MPI
5166
+ GGML_ASSERT(false && "not implemented");
5167
+ #endif
5168
+
5169
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5170
+
5171
+ ggml_allocr_alloc(lctx.alloc, token);
5172
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5173
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5174
+ }
5175
+ }
5176
+
5177
+ // KQ_scale
5178
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5179
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5180
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5181
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5182
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5183
+ }
5184
+
5185
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5186
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5187
+ ggml_set_name(KQ_mask, "KQ_mask");
5188
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5189
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5190
+ float * data = (float *) KQ_mask->data;
5191
+ memset(data, 0, ggml_nbytes(KQ_mask));
5192
+
5193
+ for (int h = 0; h < 1; ++h) {
5194
+ for (int j = 0; j < n_tokens; ++j) {
5195
+ const llama_pos pos = batch.pos[j];
5196
+ const llama_seq_id seq_id = batch.seq_id[j];
5197
+
5198
+ for (int i = 0; i < n_kv; ++i) {
5199
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5200
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5201
+ }
5202
+ }
5203
+ }
5204
+ }
5205
+ }
5206
+
5207
+ // norm
5208
+ {
5209
+ inpL = ggml_norm(ctx0, token, norm_eps);
5210
+ inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5211
+ }
5212
+
5213
+ ggml_set_name(inpL, "inpL");
5214
+
5215
+ for (int il = 0; il < n_layer; ++il) {
5216
+ {
5217
+ // Norm
5218
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5219
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5220
+ }
5221
+
5222
+ {
5223
+ // Self Attention
5224
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5225
+
5226
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5227
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5228
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5229
+
5230
+ struct ggml_tensor * Qcur = tmpq;
5231
+ struct ggml_tensor * Kcur = tmpk;
5232
+
5233
+ // store key and value to memory
5234
+ {
5235
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5236
+ ggml_set_name(Vcur, "Vcur");
5237
+
5238
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5239
+ ggml_set_name(k, "k");
5240
+
5241
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5242
+ ( n_ctx)*ggml_element_size(kv_self.v),
5243
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5244
+
5245
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5246
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5247
+ }
5248
+
5249
+ struct ggml_tensor * Q =
5250
+ ggml_permute(ctx0,
5251
+ ggml_cpy(ctx0,
5252
+ Qcur,
5253
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5254
+ 0, 2, 1, 3);
5255
+ ggml_set_name(Q, "Q");
5256
+
5257
+ struct ggml_tensor * K =
5258
+ ggml_view_3d(ctx0, kv_self.k,
5259
+ n_embd_head, n_kv, n_head_kv,
5260
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5261
+ ggml_element_size(kv_self.k)*n_embd_head,
5262
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5263
+ ggml_set_name(K, "K");
5264
+
5265
+ // K * Q
5266
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5267
+ ggml_set_name(KQ, "KQ");
5268
+
5269
+ // KQ_scaled = KQ / sqrt(n_embd_head)
5270
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5271
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5272
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5273
+
5274
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5275
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5276
+
5277
+ // KQ_masked = mask_past(KQ_scaled)
5278
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5279
+ ggml_set_name(KQ_masked, "KQ_masked");
5280
+
5281
+ // KQ = soft_max(KQ_masked)
5282
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5283
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5284
+
5285
+ // split cached V into n_head heads
5286
+ struct ggml_tensor * V =
5287
+ ggml_view_3d(ctx0, kv_self.v,
5288
+ n_kv, n_embd_head, n_head_kv,
5289
+ ggml_element_size(kv_self.v)*n_ctx,
5290
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5291
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5292
+ ggml_set_name(V, "V");
5293
+
5294
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5295
+ ggml_set_name(KQV, "KQV");
5296
+
5297
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
5298
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5299
+ ggml_set_name(KQV_merged, "KQV_merged");
5300
+
5301
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5302
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5303
+ ggml_set_name(cur, "KQV_merged_contiguous");
5304
+ }
5305
+
5306
+ // Projection
5307
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5308
+
5309
+ // Add the input
5310
+ cur = ggml_add(ctx0, cur, inpL);
5311
+
5312
+ struct ggml_tensor * inpFF = cur;
5313
+
5314
+ // FF
5315
+ {
5316
+ // Norm
5317
+ {
5318
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5319
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5320
+ }
5321
+
5322
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5323
+
5324
+ // GELU activation
5325
+ cur = ggml_gelu(ctx0, cur);
5326
+
5327
+ // Projection
5328
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5329
+ }
5330
+
5331
+ inpL = ggml_add(ctx0, cur, inpFF);
5332
+ }
5333
+
5334
+ // Output Norm
5335
+ {
5336
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5337
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5338
+ }
5339
+ ggml_set_name(cur, "result_norm");
5340
+
5341
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5342
+ ggml_set_name(cur, "result_output");
5343
+
5344
+ ggml_build_forward_expand(gf, cur);
5345
+
5346
+ ggml_free(ctx0);
5347
+
5348
+ return gf;
5349
+ }
+
+static struct ggml_cgraph * llm_build_mpt(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    const float norm_eps       = hparams.f_norm_eps;
+    const float clamp_kqv      = hparams.f_clamp_kqv;
+    const float max_alibi_bias = hparams.f_max_alibi_bias;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    //int warmup = 0;
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+            //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
+        }
+
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * attn_norm;
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        // self-attention
+        // TODO: refactor into common function (shared with LLaMA)
+        {
+            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(attn_norm);
+
+            attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
+            offload_func(attn_norm);
+
+            if (1) {
+                cur = attn_norm;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+
+            if (clamp_kqv > 0.0f) {
+                cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
+                offload_func_kq(cur);
+            }
+
+            const size_t wsize = ggml_type_size(cur->type);
+
+            struct ggml_tensor * Qcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                0);
+            offload_func_kq(Qcur);
+
+            struct ggml_tensor * Kcur = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * n_head);
+            offload_func_kq(Kcur);
+
+            struct ggml_tensor * tmpv = ggml_view_3d(
+                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
+                wsize * n_embd_head,
+                wsize * n_embd_head * (n_head + 2 * n_head_kv),
+                wsize * n_embd_head * (n_head + n_head_kv));
+            offload_func_v(tmpv);
+
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                offload_func_v(Vcur->src[0]->src[0]);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // TODO: replace with ggml_add()
+            struct ggml_tensor * KQ_scaled_alibi =
+                ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
+            offload_func_kq(KQ_scaled_alibi);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
+
+        struct ggml_tensor * attn_out = cur;
+
+        // feed forward
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, attn_out, norm_eps);
+                offload_func(cur);
+
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+            }
+
+            cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
+            offload_func(cur);
+
+            cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
+            cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
+            offload_func(cur);
+        }
+
+        cur = ggml_add(ctx0, cur, attn_out);
+        offload_func(cur);
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_norm(ctx0, cur, norm_eps);
+        offload_func_nr(cur);
+
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        ggml_set_name(cur, "result_norm");
+    }
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
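
MPT stores the attention projections as a single fused attn_qkv matrix, so llm_build_mpt() above carves Q, K and V out of the projected tensor with ggml_view_3d() byte offsets rather than separate matrix multiplies. The standalone sketch below (not part of the diff) repeats that offset arithmetic with plain integers so the strides are easier to check; the numeric values are made-up examples.

// Standalone sketch (not part of the diff): the byte-offset arithmetic used to
// slice a fused QKV row into Q, K and V, mirroring the ggml_view_3d() calls in
// llm_build_mpt(). Plain sizes stand in for ggml tensor views.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t n_head      = 32;
    const size_t n_head_kv   = 32;              // == n_head for MPT (no MQA/GQA)
    const size_t n_embd_head = 128;
    const size_t wsize       = sizeof(float);   // ggml_type_size(cur->type)

    // one token's fused QKV row holds (n_head + 2*n_head_kv) head-sized chunks
    const size_t row_bytes = wsize * n_embd_head * (n_head + 2*n_head_kv);

    const size_t q_off = 0;                                          // Qcur view offset
    const size_t k_off = wsize * n_embd_head *  n_head;              // Kcur view offset
    const size_t v_off = wsize * n_embd_head * (n_head + n_head_kv); // tmpv view offset

    std::printf("row bytes: %zu, Q at %zu, K at %zu, V at %zu\n",
                row_bytes, q_off, k_off, v_off);
}
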
+
+static struct ggml_cgraph * llama_build_graph(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model = lctx.model;
+
+    struct ggml_cgraph * result = NULL;
+
+    switch (model.arch) {
+        case LLM_ARCH_LLAMA:
+            {
+                result = llm_build_llama(lctx, batch);
+            } break;
+        case LLM_ARCH_BAICHUAN:
             {
                 result = llm_build_baichaun(lctx, batch);
             } break;
@@ -4405,10 +5689,22 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_starcoder(lctx, batch);
             } break;
+        case LLM_ARCH_PERSIMMON:
+            {
+                result = llm_build_persimmon(lctx, batch);
+            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm_build_refact(lctx, batch);
             } break;
+        case LLM_ARCH_BLOOM:
+            {
+                result = llm_build_bloom(lctx, batch);
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                result = llm_build_mpt(lctx, batch);
+            } break;
         default:
             GGML_ASSERT(false);
     }
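
llama_build_graph() remains a plain switch from the architecture enum to the matching builder, so adding a backend (as PERSIMMON, BLOOM and MPT are added here) means one enum value, one builder function and one case. The self-contained sketch below (not part of the diff) shows the same tag-to-builder dispatch pattern; arch_t, build_bloom() and pick_builder() are illustrative names, not llama.cpp symbols.

// Standalone sketch (not part of the diff): an "enum tag -> builder function"
// dispatch reduced to a self-contained example of the pattern used above.
#include <cstdio>

enum arch_t { ARCH_BLOOM, ARCH_MPT, ARCH_UNKNOWN };

using build_fn = const char * (*)();

static const char * build_bloom() { return "bloom graph"; }
static const char * build_mpt()   { return "mpt graph";   }

static build_fn pick_builder(arch_t arch) {
    switch (arch) {
        case ARCH_BLOOM: return build_bloom;
        case ARCH_MPT:   return build_mpt;
        default:         return nullptr;   // llama_build_graph() asserts here instead
    }
}

int main() {
    if (build_fn fn = pick_builder(ARCH_MPT)) {
        std::printf("%s\n", fn()); // "mpt graph"
    }
}
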
@@ -4420,7 +5716,6 @@ static struct ggml_cgraph * llama_build_graph(
 //
 // - lctx:      llama context
 // - batch:     batch to evaluate
-// - n_threads: number of threads to use
 //
 // return 0 on success
 // return positive int on warning
@@ -4487,10 +5782,6 @@ static int llama_decode_internal(
         batch.seq_id = seq_id.data();
     }
 
-    // we always start to search for a free slot from the start of the cache
-    // TODO: better strategies can be implemented
-    kv_self.head = 0;
-
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -4543,7 +5834,8 @@ static int llama_decode_internal(
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
         model.arch == LLM_ARCH_FALCON ||
-        model.arch == LLM_ARCH_REFACT;
+        model.arch == LLM_ARCH_REFACT ||
+        model.arch == LLM_ARCH_MPT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -4576,8 +5868,12 @@ static int llama_decode_internal(
 #endif
 
     // update the kv ring buffer
-    lctx.kv_self.head += n_tokens;
     lctx.kv_self.has_shift = false;
+    lctx.kv_self.head += n_tokens;
+    // Ensure kv cache head points to a valid index.
+    if (lctx.kv_self.head >= lctx.kv_self.size) {
+        lctx.kv_self.head = 0;
+    }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
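
Together with the removal of the unconditional kv_self.head = 0 reset earlier in llama_decode_internal(), the hunk above makes the KV cache behave as a ring buffer: the head advances by n_tokens after each batch and wraps back to zero once it reaches the cache size. The standalone sketch below (not part of the diff) isolates that advance-and-wrap rule on a stripped-down struct with illustrative names.

// Standalone sketch (not part of the diff): the head-advance-and-wrap rule the
// hunk above adds to llama_decode_internal(), on a minimal stand-in struct.
#include <cstdint>
#include <cstdio>

struct kv_cache_sketch {
    uint32_t head = 0;   // next slot to search from
    uint32_t size = 0;   // total number of cells
};

static void advance_head(kv_cache_sketch & kv, uint32_t n_tokens) {
    kv.head += n_tokens;
    // ensure the head points to a valid index, same check as the diff
    if (kv.head >= kv.size) {
        kv.head = 0;
    }
}

int main() {
    kv_cache_sketch kv;
    kv.size = 512;
    kv.head = 508;
    advance_head(kv, 8);            // 516 >= 512, so the head wraps
    std::printf("%u\n", kv.head);   // prints 0
}
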
@@ -5040,7 +6336,6 @@ private:
         for (int i = 0; i < (int)text_utf.size(); i++) {
             const std::string & utf_char = text_utf[i];
             bool split_condition = false;
-            // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
             int bytes_remain = text_utf.size() - i;
             // forward backward lookups
             const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5066,9 +6361,9 @@ private:
             if (!split_condition && bytes_remain >= 3) {
                 // 're|'ve|'ll
                 if (utf_char == "\'" && (
-                    (utf_char_next == "r" || utf_char_next_next == "e") ||
-                    (utf_char_next == "v" || utf_char_next_next == "e") ||
-                    (utf_char_next == "l" || utf_char_next_next == "l"))
+                    (utf_char_next == "r" && utf_char_next_next == "e") ||
+                    (utf_char_next == "v" && utf_char_next_next == "e") ||
+                    (utf_char_next == "l" && utf_char_next_next == "l"))
                     ) {
                     split_condition = true;
                 }
@@ -5119,7 +6414,7 @@ private:
             else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                 split_condition = true;
             }
-            else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
+            else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                 split_condition = true;
             }
         }
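
Both tokenizer hunks tighten the hand-rolled GPT-2 style pre-tokenizer: the contraction check now requires both lookahead characters to match ('re, 've, 'll), and the whitespace lookahead only forces a split when the next codepoint is a letter or digit. The standalone sketch below (not part of the diff) expresses the corrected contraction test as a small predicate; is_re_ve_ll() is an illustrative helper, not a llama.cpp function.

// Standalone sketch (not part of the diff): the corrected contraction check
// from the hunk above, over three UTF-8 "characters" held in std::string,
// matching how the tokenizer code stores them.
#include <cassert>
#include <string>

static bool is_re_ve_ll(const std::string & c, const std::string & next, const std::string & next_next) {
    return c == "'" && (
        (next == "r" && next_next == "e") ||   // 're
        (next == "v" && next_next == "e") ||   // 've
        (next == "l" && next_next == "l"));    // 'll
}

int main() {
    assert( is_re_ve_ll("'", "l", "l"));   // we'll  -> split before 'll
    assert(!is_re_ve_ll("'", "r", "x"));   // the old "||" version wrongly accepted this
}
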
@@ -6635,7 +7930,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(meta);
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
            ++n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
@@ -6672,6 +7967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::ofstream fout(fname_out, std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
     const size_t meta_size = gguf_get_meta_size(ctx_out);
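
Arming failbit exceptions on the output stream makes a failed open or write during quantization throw std::ios_base::failure immediately instead of leaving the error to be discovered later. The standalone sketch below (not part of the diff) demonstrates the behavior on a path that cannot be opened.

// Standalone sketch (not part of the diff): what fout.exceptions(failbit) buys.
#include <fstream>
#include <iostream>

int main() {
    try {
        std::ofstream fout("/nonexistent-dir/out.gguf", std::ios::binary);
        // arming failbit exceptions throws right away if the open above failed,
        // and any later failed write throws as well
        fout.exceptions(std::ofstream::failbit);
        fout << "gguf data";
    } catch (const std::ios_base::failure & e) {
        std::cerr << "write failed: " << e.what() << '\n';
    }
}
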
@@ -8166,7 +9462,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
             } else {
-                GGML_ASSERT(false);
+                // TODO: for now we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                // GGML_ASSERT(false);
             }
             break;
         }
@@ -8182,7 +9480,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
             } else if (llama_is_control_token(model->vocab, token)) {
                 ;
             } else {
-                GGML_ASSERT(false);
+                // TODO: for now we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                // GGML_ASSERT(false);
             }
             break;
         }
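
With the two asserts commented out, llama_token_to_piece() now silently skips token types it does not recognize, the same way CONTROL tokens are skipped, instead of aborting. The sketch below (not part of the diff) shows a typical caller, under the assumption that the function keeps the common llama.cpp convention of returning the number of bytes written and a negative value when the buffer is too small; token_to_piece() is a hypothetical wrapper, not a library symbol.

// Standalone usage sketch (not part of the diff): converting a token id to its
// text piece with a retry when the first buffer is too small.
#include "llama.h"

#include <string>
#include <vector>

static std::string token_to_piece(const llama_model * model, llama_token token) {
    std::vector<char> buf(8);
    int n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
    if (n < 0) {                      // buffer too small: retry with the reported size
        buf.resize(-n);
        n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
    }
    return std::string(buf.data(), n > 0 ? n : 0);
}
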