llama_cpp 0.12.4 → 0.12.6

This diff shows the changes between the publicly released 0.12.4 and 0.12.6 versions of the llama_cpp package, as they appear in its public registry. It is provided for informational purposes only.
@@ -196,6 +196,7 @@ enum llm_arch {
196
196
  LLM_ARCH_STARCODER,
197
197
  LLM_ARCH_PERSIMMON,
198
198
  LLM_ARCH_REFACT,
199
+ LLM_ARCH_BERT,
199
200
  LLM_ARCH_BLOOM,
200
201
  LLM_ARCH_STABLELM,
201
202
  LLM_ARCH_QWEN,
@@ -205,10 +206,11 @@ enum llm_arch {
205
206
  LLM_ARCH_CODESHELL,
206
207
  LLM_ARCH_ORION,
207
208
  LLM_ARCH_INTERNLM2,
209
+ LLM_ARCH_MINICPM,
208
210
  LLM_ARCH_UNKNOWN,
209
211
  };
210
212
 
211
- static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
213
+ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
212
214
  { LLM_ARCH_LLAMA, "llama" },
213
215
  { LLM_ARCH_FALCON, "falcon" },
214
216
  { LLM_ARCH_GPT2, "gpt2" },
@@ -219,6 +221,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
219
221
  { LLM_ARCH_STARCODER, "starcoder" },
220
222
  { LLM_ARCH_PERSIMMON, "persimmon" },
221
223
  { LLM_ARCH_REFACT, "refact" },
224
+ { LLM_ARCH_BERT, "bert" },
222
225
  { LLM_ARCH_BLOOM, "bloom" },
223
226
  { LLM_ARCH_STABLELM, "stablelm" },
224
227
  { LLM_ARCH_QWEN, "qwen" },
@@ -228,6 +231,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
228
231
  { LLM_ARCH_CODESHELL, "codeshell" },
229
232
  { LLM_ARCH_ORION, "orion" },
230
233
  { LLM_ARCH_INTERNLM2, "internlm2" },
234
+ { LLM_ARCH_MINICPM, "minicpm" },
231
235
  };
232
236
 
233
237
  enum llm_kv {
@@ -250,6 +254,7 @@ enum llm_kv {
250
254
  LLM_KV_TENSOR_DATA_LAYOUT,
251
255
  LLM_KV_EXPERT_COUNT,
252
256
  LLM_KV_EXPERT_USED_COUNT,
257
+ LLM_KV_POOLING_LAYER,
253
258
 
254
259
  LLM_KV_ATTENTION_HEAD_COUNT,
255
260
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -259,6 +264,7 @@ enum llm_kv {
259
264
  LLM_KV_ATTENTION_VALUE_LENGTH,
260
265
  LLM_KV_ATTENTION_LAYERNORM_EPS,
261
266
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
267
+ LLM_KV_ATTENTION_CAUSAL,
262
268
 
263
269
  LLM_KV_ROPE_DIMENSION_COUNT,
264
270
  LLM_KV_ROPE_FREQ_BASE,
@@ -271,6 +277,7 @@ enum llm_kv {
271
277
  LLM_KV_TOKENIZER_MODEL,
272
278
  LLM_KV_TOKENIZER_LIST,
273
279
  LLM_KV_TOKENIZER_TOKEN_TYPE,
280
+ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
274
281
  LLM_KV_TOKENIZER_SCORES,
275
282
  LLM_KV_TOKENIZER_MERGES,
276
283
  LLM_KV_TOKENIZER_BOS_ID,
@@ -285,7 +292,7 @@ enum llm_kv {
285
292
  LLM_KV_TOKENIZER_RWKV,
286
293
  };
287
294
 
288
- static std::map<llm_kv, std::string> LLM_KV_NAMES = {
295
+ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
289
296
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
290
297
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
291
298
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -305,6 +312,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
305
312
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
306
313
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
307
314
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
315
+ { LLM_KV_POOLING_LAYER, "%s.pooling_layer" },
308
316
 
309
317
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
310
318
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -314,6 +322,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
314
322
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
315
323
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
316
324
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
325
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
317
326
 
318
327
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
319
328
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -326,6 +335,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
326
335
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
327
336
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
328
337
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
338
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
329
339
  { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
330
340
  { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
331
341
  { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
@@ -346,13 +356,14 @@ struct LLM_KV {
346
356
  llm_arch arch;
347
357
 
348
358
  std::string operator()(llm_kv kv) const {
349
- return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
359
+ return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
350
360
  }
351
361
  };
352
362
 
353
363
  enum llm_tensor {
354
364
  LLM_TENSOR_TOKEN_EMBD,
355
365
  LLM_TENSOR_TOKEN_EMBD_NORM,
366
+ LLM_TENSOR_TOKEN_TYPES,
356
367
  LLM_TENSOR_POS_EMBD,
357
368
  LLM_TENSOR_OUTPUT,
358
369
  LLM_TENSOR_OUTPUT_NORM,
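With `LLM_KV_NAMES` and `LLM_ARCH_NAMES` now holding plain `const char *` values, `LLM_KV::operator()` can hand them straight to `::format()` without the `.c_str()` calls removed above. A minimal standalone sketch of how a per-architecture GGUF key such as `bert.pooling_layer` is produced; the `format_key` helper below stands in for llama.cpp's `::format()` and is not part of the library:

```cpp
#include <cstdio>
#include <string>

// Stand-in for llama.cpp's ::format(): printf-style formatting into a std::string.
static std::string format_key(const char * fmt, const char * arch_name) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, arch_name);
    return std::string(buf);
}

int main() {
    // e.g. LLM_KV(LLM_ARCH_BERT)(LLM_KV_POOLING_LAYER)
    const char * kv_fmt    = "%s.pooling_layer"; // value stored in LLM_KV_NAMES
    const char * arch_name = "bert";             // value stored in LLM_ARCH_NAMES
    std::printf("%s\n", format_key(kv_fmt, arch_name).c_str()); // prints: bert.pooling_layer
    return 0;
}
```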
@@ -534,6 +545,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
534
545
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
535
546
  },
536
547
  },
548
+ {
549
+ LLM_ARCH_BERT,
550
+ {
551
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
552
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
553
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
554
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
555
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
556
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
557
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
558
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
559
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
560
+ { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
561
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
562
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
563
+ },
564
+ },
537
565
  {
538
566
  LLM_ARCH_BLOOM,
539
567
  {
@@ -690,6 +718,29 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
690
718
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
691
719
  },
692
720
  },
721
+ {
722
+ LLM_ARCH_MINICPM,
723
+ {
724
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
725
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
726
+ { LLM_TENSOR_OUTPUT, "output" },
727
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
728
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
729
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
730
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
731
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
732
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
733
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
734
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
735
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
736
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
737
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
738
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
739
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
740
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
741
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
742
+ },
743
+ },
693
744
  {
694
745
  LLM_ARCH_UNKNOWN,
695
746
  {
@@ -723,22 +774,37 @@ struct LLM_TN {
723
774
  llm_arch arch;
724
775
 
725
776
  std::string operator()(llm_tensor tensor) const {
777
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
778
+ return "__missing__";
779
+ }
726
780
  return LLM_TENSOR_NAMES[arch].at(tensor);
727
781
  }
728
782
 
729
783
  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
784
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
785
+ return "__missing__";
786
+ }
730
787
  return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
731
788
  }
732
789
 
733
790
  std::string operator()(llm_tensor tensor, int bid) const {
791
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
792
+ return "__missing__";
793
+ }
734
794
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
735
795
  }
736
796
 
737
797
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
798
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
799
+ return "__missing__";
800
+ }
738
801
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
739
802
  }
740
803
 
741
804
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
805
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
806
+ return "__missing__";
807
+ }
742
808
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
743
809
  }
744
810
  };
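The guard added to each `LLM_TN::operator()` overload above avoids the `std::out_of_range` that `map::at()` would otherwise throw for architectures, such as BERT, that define only a subset of tensor names. A self-contained sketch of the same find-or-sentinel pattern; the enum and names here are illustrative, not the library's:

```cpp
#include <map>
#include <string>

enum toy_tensor { TOY_TOKEN_EMBD, TOY_POS_EMBD };

static const std::map<toy_tensor, std::string> toy_names = {
    { TOY_TOKEN_EMBD, "token_embd" },
    // TOY_POS_EMBD intentionally absent for this "architecture"
};

// Look the tensor up first and return a sentinel instead of letting at() throw.
std::string toy_tensor_name(toy_tensor t) {
    auto it = toy_names.find(t);
    if (it == toy_names.end()) {
        return "__missing__"; // callers treat this as "tensor not defined"
    }
    return it->second;
}
```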
@@ -747,13 +813,13 @@ struct LLM_TN {
747
813
  // gguf helpers
748
814
  //
749
815
 
750
- static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
816
+ static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
751
817
  { LLAMA_ROPE_SCALING_NONE, "none" },
752
818
  { LLAMA_ROPE_SCALING_LINEAR, "linear" },
753
819
  { LLAMA_ROPE_SCALING_YARN, "yarn" },
754
820
  };
755
821
 
756
- static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
822
+ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
757
823
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
758
824
  if (kv.second == name) {
759
825
  return kv.first;
@@ -1330,7 +1396,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1330
1396
  #elif defined(GGML_USE_CUBLAS)
1331
1397
  buft = ggml_backend_cuda_buffer_type(gpu);
1332
1398
  #elif defined(GGML_USE_VULKAN)
1333
- buft = ggml_backend_vk_buffer_type();
1399
+ buft = ggml_backend_vk_buffer_type(gpu);
1334
1400
  #elif defined(GGML_USE_SYCL)
1335
1401
  buft = ggml_backend_sycl_buffer_type(gpu);
1336
1402
  #elif defined(GGML_USE_CLBLAST)
@@ -1367,6 +1433,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1367
1433
  GGML_UNUSED(tensor_split);
1368
1434
  }
1369
1435
 
1436
+ static size_t llama_get_device_count() {
1437
+ #if defined(GGML_USE_CUBLAS)
1438
+ return ggml_backend_cuda_get_device_count();
1439
+ #elif defined(GGML_USE_VULKAN)
1440
+ return ggml_backend_vk_get_device_count();
1441
+ #else
1442
+ return 1;
1443
+ #endif
1444
+ }
1445
+
1446
+ static size_t llama_get_device_memory(int device) {
1447
+ #if defined(GGML_USE_CUBLAS)
1448
+ size_t total;
1449
+ size_t free;
1450
+ ggml_backend_cuda_get_device_memory(device, &total, &free);
1451
+ return free;
1452
+ #elif defined(GGML_USE_VULKAN)
1453
+ size_t total;
1454
+ size_t free;
1455
+ ggml_backend_vk_get_device_memory(device, &total, &free);
1456
+ return free;
1457
+ #else
1458
+ return 1;
1459
+ GGML_UNUSED(device);
1460
+ #endif
1461
+ }
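The two helpers added above give the loader a backend-agnostic device count and free-memory figure, from which the default layer split is later derived in `llm_load_tensors`. A hedged, standalone sketch of that normalization step; `default_split` is an illustrative helper, not a llama.cpp function:

```cpp
#include <cstddef>
#include <vector>

// Turn per-device free memory into cumulative split points in [0, 1]:
// layer i is later assigned to the first device whose split point exceeds
// the fraction of offloaded layers processed so far.
std::vector<float> default_split(const std::vector<size_t> & free_mem) {
    std::vector<float> splits(free_mem.size());
    float total = 0.0f;
    for (std::size_t i = 0; i < free_mem.size(); ++i) {
        splits[i] = (float) free_mem[i];
        total    += splits[i];
    }
    if (total <= 0.0f) {
        return splits;              // nothing known about the devices
    }
    float acc = 0.0f;
    for (float & s : splits) {
        acc += s / total;           // running fraction of total free memory
        s    = acc;                 // cumulative split point
    }
    return splits;
}
```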
1462
+
1370
1463
  //
1371
1464
  // globals
1372
1465
  //
@@ -1388,8 +1481,14 @@ static llama_state g_state;
1388
1481
  // available llama models
1389
1482
  enum e_model {
1390
1483
  MODEL_UNKNOWN,
1484
+ MODEL_17M,
1485
+ MODEL_22M,
1486
+ MODEL_33M,
1487
+ MODEL_109M,
1488
+ MODEL_335M,
1391
1489
  MODEL_0_5B,
1392
1490
  MODEL_1B,
1491
+ MODEL_2B,
1393
1492
  MODEL_3B,
1394
1493
  MODEL_4B,
1395
1494
  MODEL_7B,
@@ -1415,6 +1514,7 @@ static const size_t GiB = 1024*MiB;
1415
1514
 
1416
1515
  struct llama_hparams {
1417
1516
  bool vocab_only;
1517
+ bool rope_finetuned;
1418
1518
  uint32_t n_vocab;
1419
1519
  uint32_t n_ctx_train; // context size the model was trained on
1420
1520
  uint32_t n_embd;
@@ -1427,6 +1527,7 @@ struct llama_hparams {
1427
1527
  uint32_t n_ff;
1428
1528
  uint32_t n_expert = 0;
1429
1529
  uint32_t n_expert_used = 0;
1530
+ uint32_t n_vocab_type = 0; // for BERT-style token types
1430
1531
 
1431
1532
  float f_norm_eps;
1432
1533
  float f_norm_rms_eps;
@@ -1434,12 +1535,14 @@ struct llama_hparams {
1434
1535
  float rope_freq_base_train;
1435
1536
  float rope_freq_scale_train;
1436
1537
  uint32_t n_yarn_orig_ctx;
1437
- int8_t rope_scaling_type_train : 3;
1438
- bool rope_finetuned : 1;
1538
+ int32_t rope_scaling_type_train;
1439
1539
 
1440
1540
  float f_clamp_kqv;
1441
1541
  float f_max_alibi_bias;
1442
1542
 
1543
+ bool causal_attn = true;
1544
+ bool pooling_layer = false;
1545
+
1443
1546
 
1444
1547
  bool operator!=(const llama_hparams & other) const {
1445
1548
  if (this->vocab_only != other.vocab_only) return true;
@@ -1501,6 +1604,7 @@ struct llama_cparams {
1501
1604
 
1502
1605
  bool mul_mat_q;
1503
1606
  bool offload_kqv;
1607
+ bool do_pooling;
1504
1608
 
1505
1609
  ggml_backend_sched_eval_callback cb_eval;
1506
1610
  void * cb_eval_user_data;
@@ -1667,6 +1771,7 @@ struct llama_model {
1667
1771
  llama_vocab vocab;
1668
1772
 
1669
1773
  struct ggml_tensor * tok_embd;
1774
+ struct ggml_tensor * type_embd;
1670
1775
  struct ggml_tensor * pos_embd;
1671
1776
  struct ggml_tensor * tok_norm;
1672
1777
  struct ggml_tensor * tok_norm_b;
@@ -1737,6 +1842,10 @@ struct llama_context {
1737
1842
  ggml_backend_free(backend);
1738
1843
  }
1739
1844
 
1845
+ #ifdef GGML_USE_VULKAN
1846
+ ggml_vk_free_cpu_assist();
1847
+ #endif
1848
+
1740
1849
  ggml_backend_buffer_free(buf_input);
1741
1850
  ggml_free(ctx_input);
1742
1851
  }
@@ -1782,8 +1891,6 @@ struct llama_context {
1782
1891
  // memory buffers used to evaluate the model
1783
1892
  std::vector<uint8_t> buf_compute_meta;
1784
1893
  ggml_backend_sched_t sched = nullptr;
1785
- // allocator for the input tensors
1786
- ggml_tallocr * alloc = nullptr;
1787
1894
 
1788
1895
  // input tensors
1789
1896
  ggml_backend_buffer_t buf_input = nullptr;
@@ -1793,6 +1900,7 @@ struct llama_context {
1793
1900
  struct ggml_tensor * inp_pos; // I32 [n_batch]
1794
1901
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
1795
1902
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
1903
+ struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]
1796
1904
 
1797
1905
  #ifdef GGML_USE_MPI
1798
1906
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2701,7 +2809,7 @@ struct llama_model_loader {
2701
2809
  // load LLaMA models
2702
2810
  //
2703
2811
 
2704
- static std::string llama_model_arch_name(llm_arch arch) {
2812
+ static const char * llama_model_arch_name(llm_arch arch) {
2705
2813
  auto it = LLM_ARCH_NAMES.find(arch);
2706
2814
  if (it == LLM_ARCH_NAMES.end()) {
2707
2815
  return "unknown";
@@ -2748,6 +2856,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2748
2856
  static const char * llama_model_type_name(e_model type) {
2749
2857
  switch (type) {
2750
2858
  case MODEL_1B: return "1B";
2859
+ case MODEL_2B: return "2B";
2751
2860
  case MODEL_3B: return "3B";
2752
2861
  case MODEL_7B: return "7B";
2753
2862
  case MODEL_8B: return "8B";
@@ -2771,6 +2880,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
2771
2880
  switch (type) {
2772
2881
  case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2773
2882
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2883
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2774
2884
  default: return "unknown";
2775
2885
  }
2776
2886
  }
@@ -2887,6 +2997,15 @@ static void llm_load_hparams(
2887
2997
  default: model.type = e_model::MODEL_UNKNOWN;
2888
2998
  }
2889
2999
  } break;
3000
+ case LLM_ARCH_MINICPM:
3001
+ {
3002
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
3003
+
3004
+ switch (hparams.n_layer) {
3005
+ case 40: model.type = e_model::MODEL_2B; break;
3006
+ default: model.type = e_model::MODEL_UNKNOWN;
3007
+ }
3008
+ } break;
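For reference, the 40-layer MiniCPM mapped to `MODEL_2B` here is built with muP-style scaling constants in `build_minicpm()` further down (see the arXiv:2203.03466 reference in that hunk). A worked example of those values; `n_embd = 2304` is an assumption for illustration only and is not read from this diff:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   n_layer     = 40;      // MiniCPM-2B per the switch above
    const int   n_embd_base = 256;     // hardcoded in build_minicpm()
    const int   n_embd      = 2304;    // assumed hidden size, illustration only
    const float scale_embd  = 12.0f;   // input embedding scale
    const float scale_depth = 1.4f;    // residual scale numerator

    const float scale_res    = scale_depth / std::sqrt((float) n_layer); // ~0.2214
    const float scale_lmhead = (float) n_embd_base / (float) n_embd;     // ~0.1111

    std::printf("scale_embd = %.1f, scale_res = %.4f, scale_lmhead = %.4f\n",
                scale_embd, scale_res, scale_lmhead);
    return 0;
}
```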
2890
3009
  case LLM_ARCH_FALCON:
2891
3010
  {
2892
3011
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2933,6 +3052,27 @@ static void llm_load_hparams(
2933
3052
  default: model.type = e_model::MODEL_UNKNOWN;
2934
3053
  }
2935
3054
  } break;
3055
+ case LLM_ARCH_BERT:
3056
+ {
3057
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3058
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3059
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3060
+ ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
3061
+
3062
+ switch (hparams.n_layer) {
3063
+ case 3:
3064
+ model.type = e_model::MODEL_17M; break; // bge-micro
3065
+ case 6:
3066
+ model.type = e_model::MODEL_22M; break; // MiniLM-L6
3067
+ case 12:
3068
+ switch (hparams.n_embd) {
3069
+ case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
3070
+ case 768: model.type = e_model::MODEL_109M; break; // bge-base
3071
+ } break;
3072
+ case 24:
3073
+ model.type = e_model::MODEL_335M; break; // bge-large
3074
+ }
3075
+ } break;
2936
3076
  case LLM_ARCH_BLOOM:
2937
3077
  {
2938
3078
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3137,6 +3277,16 @@ static void llm_load_vocab(
3137
3277
  vocab.special_unk_id = -1;
3138
3278
  vocab.special_sep_id = -1;
3139
3279
  vocab.special_pad_id = -1;
3280
+ } else if (tokenizer_name == "bert") {
3281
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
3282
+
3283
+ // default special tokens
3284
+ vocab.special_bos_id = 101;
3285
+ vocab.special_eos_id = 102;
3286
+ vocab.special_unk_id = 100;
3287
+ vocab.special_sep_id = -1;
3288
+ vocab.special_pad_id = -1;
3289
+ vocab.add_space_prefix = false;
3140
3290
  } else {
3141
3291
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
3142
3292
  LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3164,7 +3314,14 @@ static void llm_load_vocab(
3164
3314
 
3165
3315
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
3166
3316
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
3167
- vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
3317
+ try {
3318
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
3319
+ } catch (const std::exception & e) {
3320
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
3321
+ vocab.linefeed_id = vocab.special_pad_id;
3322
+ }
3323
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3324
+ vocab.linefeed_id = vocab.special_pad_id;
3168
3325
  } else {
3169
3326
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
3170
3327
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -3310,11 +3467,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3310
3467
  const auto & hparams = model.hparams;
3311
3468
  const auto & vocab = model.vocab;
3312
3469
 
3313
- const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
3470
+ const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
3314
3471
 
3315
3472
  // hparams
3316
3473
  LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
3317
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
3474
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
3318
3475
  LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
3319
3476
  LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
3320
3477
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
@@ -3336,7 +3493,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3336
3493
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
3337
3494
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
3338
3495
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
3339
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
3496
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
3340
3497
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3341
3498
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
3342
3499
  LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
@@ -3402,22 +3559,18 @@ static bool llm_load_tensors(
3402
3559
  model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3403
3560
  }
3404
3561
 
3405
- #ifdef GGML_USE_CUBLAS
3406
3562
  if (split_mode == LLAMA_SPLIT_LAYER) {
3407
3563
  // calculate the split points
3408
- int device_count = ggml_backend_cuda_get_device_count();
3564
+ int device_count = llama_get_device_count();
3409
3565
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
3410
- float splits[GGML_CUDA_MAX_DEVICES];
3566
+ std::vector<float> splits(device_count);
3411
3567
  if (all_zero) {
3412
3568
  // default split, by free memory
3413
3569
  for (int i = 0; i < device_count; ++i) {
3414
- size_t total;
3415
- size_t free;
3416
- ggml_backend_cuda_get_device_memory(i, &total, &free);
3417
- splits[i] = free;
3570
+ splits[i] = llama_get_device_memory(i);
3418
3571
  }
3419
3572
  } else {
3420
- std::copy(tensor_split, tensor_split + device_count, splits);
3573
+ std::copy(tensor_split, tensor_split + device_count, splits.begin());
3421
3574
  }
3422
3575
 
3423
3576
  // sum and normalize the splits to get the split points
@@ -3433,19 +3586,17 @@ static bool llm_load_tensors(
3433
3586
  // assign the repeating layers to the devices according to the splits
3434
3587
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
3435
3588
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
3436
- int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
3589
+ int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
3437
3590
  model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
3438
3591
  }
3439
3592
  // assign the output layer
3440
3593
  if (n_gpu_layers > n_layer) {
3441
- int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
3594
+ int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
3442
3595
  model.buft_output = llama_default_buffer_type_offload(layer_gpu);
3443
3596
  } else {
3444
3597
  model.buft_output = llama_default_buffer_type_cpu(true);
3445
3598
  }
3446
- } else
3447
- #endif
3448
- {
3599
+ } else {
3449
3600
  ggml_backend_buffer_type_t split_buft;
3450
3601
  if (split_mode == LLAMA_SPLIT_ROW) {
3451
3602
  split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
@@ -3508,6 +3659,7 @@ static bool llm_load_tensors(
3508
3659
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
3509
3660
  const int64_t n_embd_gqa = n_embd_v_gqa;
3510
3661
  const int64_t n_vocab = hparams.n_vocab;
3662
+ const int64_t n_vocab_type = hparams.n_vocab_type;
3511
3663
  const int64_t n_ff = hparams.n_ff;
3512
3664
 
3513
3665
  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
@@ -3524,13 +3676,16 @@ static bool llm_load_tensors(
3524
3676
  switch (model.arch) {
3525
3677
  case LLM_ARCH_LLAMA:
3526
3678
  case LLM_ARCH_REFACT:
3679
+ case LLM_ARCH_MINICPM:
3527
3680
  {
3528
3681
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3529
3682
 
3530
3683
  // output
3531
3684
  {
3532
3685
  model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3533
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3686
+ if (model.arch != LLM_ARCH_MINICPM){
3687
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3688
+ }
3534
3689
  }
3535
3690
 
3536
3691
  for (int i = 0; i < n_layer; ++i) {
@@ -3719,11 +3874,50 @@ static bool llm_load_tensors(
3719
3874
  layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
3720
3875
  }
3721
3876
  } break;
3722
- case LLM_ARCH_BLOOM:
3877
+ case LLM_ARCH_BERT:
3723
3878
  {
3724
3879
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3725
- model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3726
- model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3880
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
3881
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3882
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3883
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3884
+
3885
+ for (int i = 0; i < n_layer; ++i) {
3886
+ ggml_context * ctx_layer = ctx_for_layer(i);
3887
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3888
+
3889
+ auto & layer = model.layers[i];
3890
+
3891
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3892
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3893
+
3894
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3895
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3896
+
3897
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3898
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3899
+
3900
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3901
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3902
+
3903
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3904
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3905
+
3906
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3907
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3908
+
3909
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3910
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3911
+
3912
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3913
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3914
+ }
3915
+ } break;
3916
+ case LLM_ARCH_BLOOM:
3917
+ {
3918
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3919
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3920
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3727
3921
 
3728
3922
  // output
3729
3923
  {
@@ -4145,8 +4339,7 @@ static bool llm_load_tensors(
4145
4339
  ctx_bufs.emplace_back(ctx, buf);
4146
4340
  }
4147
4341
 
4148
- // print memory requirements
4149
- {
4342
+ if (llama_supports_gpu_offload()) {
4150
4343
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
4151
4344
 
4152
4345
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -4158,10 +4351,11 @@ static bool llm_load_tensors(
4158
4351
  const int max_offloadable_layers = hparams.n_layer + 1;
4159
4352
 
4160
4353
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
4354
+ }
4161
4355
 
4162
- for (ggml_backend_buffer_t buf : model.bufs) {
4163
- LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
4164
- }
4356
+ // print memory requirements
4357
+ for (ggml_backend_buffer_t buf : model.bufs) {
4358
+ LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
4165
4359
  }
4166
4360
 
4167
4361
  // populate tensors_by_name
@@ -4195,9 +4389,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
4195
4389
 
4196
4390
  model.hparams.vocab_only = params.vocab_only;
4197
4391
 
4198
- llm_load_arch (ml, model);
4199
- llm_load_hparams(ml, model);
4200
- llm_load_vocab (ml, model);
4392
+ try {
4393
+ llm_load_arch(ml, model);
4394
+ } catch(const std::exception & e) {
4395
+ throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
4396
+ }
4397
+ try {
4398
+ llm_load_hparams(ml, model);
4399
+ } catch(const std::exception & e) {
4400
+ throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
4401
+ }
4402
+ try {
4403
+ llm_load_vocab(ml, model);
4404
+ } catch(const std::exception & e) {
4405
+ throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
4406
+ }
4201
4407
 
4202
4408
  llm_load_print_meta(ml, model);
4203
4409
 
@@ -4675,6 +4881,7 @@ struct llm_build_context {
4675
4881
  const int32_t n_orig_ctx;
4676
4882
 
4677
4883
  const bool do_rope_shift;
4884
+ const bool do_pooling;
4678
4885
 
4679
4886
  const llm_build_cb & cb;
4680
4887
 
@@ -4718,6 +4925,7 @@ struct llm_build_context {
4718
4925
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
4719
4926
  n_orig_ctx (cparams.n_yarn_orig_ctx),
4720
4927
  do_rope_shift (worst_case || kv_self.has_shift),
4928
+ do_pooling (hparams.pooling_layer && cparams.do_pooling),
4721
4929
  cb (cb),
4722
4930
  buf_compute_meta (lctx.buf_compute_meta) {
4723
4931
  // all initializations should be done in init()
@@ -5561,6 +5769,103 @@ struct llm_build_context {
5561
5769
  return gf;
5562
5770
  }
5563
5771
 
5772
+ struct ggml_cgraph * build_bert() {
5773
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5774
+
5775
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5776
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5777
+
5778
+ struct ggml_tensor * cur;
5779
+ struct ggml_tensor * inpL;
5780
+
5781
+ // get input vectors with right size
5782
+ const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5783
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5784
+ struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
5785
+
5786
+ // construct input embeddings (token, type, position)
5787
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
5788
+
5789
+ // token types are hardcoded to zero ("Sentence A")
5790
+ struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5791
+ inpL = ggml_add(ctx0, inpL, type_row0);
5792
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5793
+ cb(inpL, "inp_embd", -1);
5794
+
5795
+ // embed layer norm
5796
+ inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
5797
+ cb(inpL, "inp_norm", -1);
5798
+
5799
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5800
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
5801
+ cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
5802
+
5803
+ // iterate layers
5804
+ for (int il = 0; il < n_layer; ++il) {
5805
+ struct ggml_tensor * cur = inpL;
5806
+
5807
+ // self-attention
5808
+ {
5809
+ struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
5810
+ cb(Qcur, "Qcur", il);
5811
+
5812
+ struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
5813
+ cb(Kcur, "Kcur", il);
5814
+
5815
+ struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
5816
+ cb(Vcur, "Vcur", il);
5817
+
5818
+ // seems like we just need to do this for Q?
5819
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5820
+
5821
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
5822
+ model.layers[il].wo, model.layers[il].bo,
5823
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
5824
+ cb(cur, "kqv_out", il);
5825
+ }
5826
+
5827
+ // re-add the layer input
5828
+ cur = ggml_add(ctx0, cur, inpL);
5829
+
5830
+ // attention layer norm
5831
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
5832
+
5833
+ struct ggml_tensor * ffn_inp = cur;
5834
+ cb(ffn_inp, "ffn_inp", il);
5835
+
5836
+ // feed-forward network
5837
+ cur = llm_build_ffn(ctx0, cur,
5838
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5839
+ NULL, NULL,
5840
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5841
+ NULL,
5842
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5843
+ cb(cur, "ffn_out", il);
5844
+
5845
+ // attentions bypass the intermediate layer
5846
+ cur = ggml_add(ctx0, cur, ffn_inp);
5847
+
5848
+ // output layer norm
5849
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
5850
+
5851
+ // input for next layer
5852
+ inpL = cur;
5853
+ }
5854
+
5855
+ // final output
5856
+ cur = inpL;
5857
+
5858
+ // pooling layer
5859
+ if (do_pooling) {
5860
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
5861
+ }
5862
+ cb(cur, "result_embd", -1);
5863
+
5864
+ ggml_build_forward_expand(gf, cur);
5865
+
5866
+ return gf;
5867
+ }
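The final `ggml_mul_mat` in `build_bert()` above implements mean pooling as a matrix product with `inp_sum`, whose entries are filled with `1/n_tokens` (or per-sequence indicators) in `llama_set_inputs`. A plain-C++ sketch of the same reduction, with layout and names chosen for illustration only:

```cpp
#include <vector>

// Average the per-token embeddings of one sequence into a single vector:
// out[e] = (1/n_tokens) * sum_t embd[t][e]
std::vector<float> mean_pool(const std::vector<float> & embd, int n_tokens, int n_embd) {
    std::vector<float> out(n_embd, 0.0f);
    for (int t = 0; t < n_tokens; ++t) {
        for (int e = 0; e < n_embd; ++e) {
            out[e] += embd[t * n_embd + e] / (float) n_tokens;
        }
    }
    return out;
}
```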
5868
+
5564
5869
  struct ggml_cgraph * build_bloom() {
5565
5870
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5566
5871
 
@@ -6781,96 +7086,180 @@ struct llm_build_context {
6781
7086
  return gf;
6782
7087
  }
6783
7088
 
6784
- };
7089
+ // ref: https://arxiv.org/abs/2203.03466
7090
+ // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
7091
+ // based on the original build_llama() function
7092
+ struct ggml_cgraph * build_minicpm() {
7093
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
6785
7094
 
6786
- static struct ggml_cgraph * llama_build_graph(
6787
- llama_context & lctx,
6788
- const llama_batch & batch) {
6789
- const auto & model = lctx.model;
7095
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7096
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7097
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6790
7098
 
6791
- // check if we should build the worst-case graph (for memory measurement)
6792
- const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
7099
+ const int64_t n_embd = hparams.n_embd;
7100
+ //TODO: if the model varies, these parameters need to be read from the model
7101
+ const int64_t n_embd_base = 256;
7102
+ const float scale_embd = 12.0f;
7103
+ const float scale_depth = 1.4f;
6793
7104
 
6794
- // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
6795
- llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
6796
- if (il >= 0) {
6797
- ggml_format_name(cur, "%s-%d", name, il);
6798
- } else {
6799
- ggml_set_name(cur, name);
7105
+ struct ggml_tensor * cur;
7106
+ struct ggml_tensor * inpL;
7107
+
7108
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7109
+ cb(inpL, "inp_embd", -1);
7110
+
7111
+ // scale the input embeddings
7112
+ inpL = ggml_scale(ctx0, inpL, scale_embd);
7113
+ cb(inpL, "inp_scaled", -1);
7114
+
7115
+ // inp_pos - contains the positions
7116
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7117
+ cb(inp_pos, "inp_pos", -1);
7118
+
7119
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7120
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7121
+ cb(KQ_mask, "KQ_mask", -1);
7122
+
7123
+ // shift the entire K-cache if needed
7124
+ if (do_rope_shift) {
7125
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
6800
7126
  }
6801
7127
 
6802
- if (!lctx.cparams.offload_kqv) {
6803
- if (strcmp(name, "kqv_merged_cont") == 0) {
6804
- // all nodes between the KV store and the attention output are run on the CPU
6805
- ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
7128
+ for (int il = 0; il < n_layer; ++il) {
7129
+ struct ggml_tensor * inpSA = inpL;
7130
+
7131
+ // norm
7132
+ cur = llm_build_norm(ctx0, inpL, hparams,
7133
+ model.layers[il].attn_norm, NULL,
7134
+ LLM_NORM_RMS, cb, il);
7135
+ cb(cur, "attn_norm", il);
7136
+
7137
+ // self-attention
7138
+ {
7139
+ // compute Q and K and RoPE them
7140
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
7141
+ cb(Qcur, "Qcur", il);
7142
+ if (model.layers[il].bq) {
7143
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
7144
+ cb(Qcur, "Qcur", il);
7145
+ }
7146
+
7147
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
7148
+ cb(Kcur, "Kcur", il);
7149
+ if (model.layers[il].bk) {
7150
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
7151
+ cb(Kcur, "Kcur", il);
7152
+ }
7153
+
7154
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7155
+ cb(Vcur, "Vcur", il);
7156
+ if (model.layers[il].bv) {
7157
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
7158
+ cb(Vcur, "Vcur", il);
7159
+ }
7160
+
7161
+ Qcur = ggml_rope_custom(
7162
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7163
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7164
+ ext_factor, attn_factor, beta_fast, beta_slow
7165
+ );
7166
+ cb(Qcur, "Qcur", il);
7167
+
7168
+ Kcur = ggml_rope_custom(
7169
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7170
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
7171
+ ext_factor, attn_factor, beta_fast, beta_slow
7172
+ );
7173
+ cb(Kcur, "Kcur", il);
7174
+
7175
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7176
+ model.layers[il].wo, model.layers[il].bo,
7177
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7178
+ cb(cur, "kqv_out", il);
6806
7179
  }
6807
- }
6808
- };
6809
7180
 
6810
- struct ggml_cgraph * result = NULL;
7181
+ // scale_res - scale the hidden states for residual connection
7182
+ const float scale_res = scale_depth/sqrtf(float(n_layer));
7183
+ cur = ggml_scale(ctx0, cur, scale_res);
7184
+ cb(cur, "hidden_scaled", -1);
6811
7185
 
6812
- struct llm_build_context llm(lctx, batch, cb, worst_case);
7186
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7187
+ cb(ffn_inp, "ffn_inp", il);
6813
7188
 
6814
- //
6815
- // set input data
6816
- //
7189
+ // feed-forward network
7190
+ {
7191
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7192
+ model.layers[il].ffn_norm, NULL,
7193
+ LLM_NORM_RMS, cb, il);
7194
+ cb(cur, "ffn_norm", il);
6817
7195
 
6818
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
6819
- if (batch.token) {
6820
- const int64_t n_tokens = batch.n_tokens;
7196
+ cur = llm_build_ffn(ctx0, cur,
7197
+ model.layers[il].ffn_up, NULL,
7198
+ model.layers[il].ffn_gate, NULL,
7199
+ model.layers[il].ffn_down, NULL,
7200
+ NULL,
7201
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
7202
+ cb(cur, "ffn_out", il);
7203
+ }
6821
7204
 
6822
- ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
6823
- }
7205
+ // scale the hidden states for residual connection
7206
+ cur = ggml_scale(ctx0, cur, scale_res);
7207
+ cb(cur, "hidden_scaled_ffn", -1);
6824
7208
 
6825
- if (batch.embd) {
6826
- const int64_t n_embd = llm.n_embd;
6827
- const int64_t n_tokens = batch.n_tokens;
7209
+ cur = ggml_add(ctx0, cur, ffn_inp);
7210
+ cb(cur, "l_out", il);
6828
7211
 
6829
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7212
+ // input for next layer
7213
+ inpL = cur;
6830
7214
  }
6831
7215
 
6832
- if (batch.pos) {
6833
- const int64_t n_tokens = batch.n_tokens;
7216
+ cur = inpL;
6834
7217
 
6835
- ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
6836
- }
7218
+ cur = llm_build_norm(ctx0, cur, hparams,
7219
+ model.output_norm, NULL,
7220
+ LLM_NORM_RMS, cb, -1);
7221
+ cb(cur, "result_norm", -1);
6837
7222
 
6838
- {
6839
- const int64_t n_kv = llm.n_kv;
6840
- const int64_t n_tokens = batch.n_tokens;
7223
+ // lm_head scaling
7224
+ const float scale_lmhead = float(n_embd_base)/float(n_embd);
7225
+ cur = ggml_scale(ctx0, cur, scale_lmhead);
7226
+ cb(cur, "lmhead_scaling", -1);
6841
7227
 
6842
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
6843
- float * data = (float *) lctx.inp_KQ_mask->data;
7228
+ // lm_head
7229
+ cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
7230
+ cb(cur, "result_output", -1);
6844
7231
 
6845
- for (int h = 0; h < 1; ++h) {
6846
- for (int j = 0; j < n_tokens; ++j) {
6847
- const llama_pos pos = batch.pos[j];
6848
- const llama_seq_id seq_id = batch.seq_id[j][0];
7232
+ ggml_build_forward_expand(gf, cur);
6849
7233
 
6850
- for (int i = 0; i < n_kv; ++i) {
6851
- float f;
6852
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
6853
- f = -INFINITY;
6854
- } else {
6855
- f = 0;
6856
- }
6857
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
6858
- }
6859
- }
6860
- }
6861
- }
7234
+ return gf;
7235
+ }
7236
+ };
6862
7237
 
6863
- if (llm.do_rope_shift) {
6864
- const int64_t n_ctx = llm.n_ctx;
7238
+ static struct ggml_cgraph * llama_build_graph(
7239
+ llama_context & lctx,
7240
+ const llama_batch & batch,
7241
+ bool worst_case) {
7242
+ const auto & model = lctx.model;
6865
7243
 
6866
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
6867
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7244
+ // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
7245
+ llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
7246
+ if (il >= 0) {
7247
+ ggml_format_name(cur, "%s-%d", name, il);
7248
+ } else {
7249
+ ggml_set_name(cur, name);
7250
+ }
6868
7251
 
6869
- for (int i = 0; i < n_ctx; ++i) {
6870
- data[i] = lctx.kv_self.cells[i].delta;
7252
+ if (!lctx.cparams.offload_kqv) {
7253
+ if (strcmp(name, "kqv_merged_cont") == 0) {
7254
+ // all nodes between the KV store and the attention output are run on the CPU
7255
+ ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
6871
7256
  }
6872
7257
  }
6873
- }
7258
+ };
7259
+
7260
+ struct ggml_cgraph * result = NULL;
7261
+
7262
+ struct llm_build_context llm(lctx, batch, cb, worst_case);
6874
7263
 
6875
7264
  llm.init();
6876
7265
 
@@ -6899,6 +7288,10 @@ static struct ggml_cgraph * llama_build_graph(
6899
7288
  {
6900
7289
  result = llm.build_refact();
6901
7290
  } break;
7291
+ case LLM_ARCH_BERT:
7292
+ {
7293
+ result = llm.build_bert();
7294
+ } break;
6902
7295
  case LLM_ARCH_BLOOM:
6903
7296
  {
6904
7297
  result = llm.build_bloom();
@@ -6943,6 +7336,10 @@ static struct ggml_cgraph * llama_build_graph(
6943
7336
  {
6944
7337
  result = llm.build_internlm2();
6945
7338
  } break;
7339
+ case LLM_ARCH_MINICPM:
7340
+ {
7341
+ result = llm.build_minicpm();
7342
+ } break;
6946
7343
  default:
6947
7344
  GGML_ASSERT(false);
6948
7345
  }
@@ -6952,6 +7349,97 @@ static struct ggml_cgraph * llama_build_graph(
6952
7349
  return result;
6953
7350
  }
6954
7351
 
7352
+ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7353
+ //
7354
+ // set input data
7355
+ //
7356
+
7357
+ const auto & hparams = lctx.model.hparams;
7358
+ const auto & cparams = lctx.cparams;
7359
+ const auto & kv_self = lctx.kv_self;
7360
+
7361
+ if (batch.token) {
7362
+ const int64_t n_tokens = batch.n_tokens;
7363
+
7364
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
7365
+ }
7366
+
7367
+ if (batch.embd) {
7368
+ const int64_t n_embd = hparams.n_embd;
7369
+ const int64_t n_tokens = batch.n_tokens;
7370
+
7371
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
7372
+ }
7373
+
7374
+ if (batch.pos) {
7375
+ const int64_t n_tokens = batch.n_tokens;
7376
+
7377
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7378
+ }
7379
+
7380
+ {
7381
+ const int64_t n_kv = kv_self.n;
7382
+ const int64_t n_tokens = batch.n_tokens;
7383
+
7384
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
7385
+
7386
+ float * data = (float *) lctx.inp_KQ_mask->data;
7387
+
7388
+ for (int h = 0; h < 1; ++h) {
7389
+ for (int j = 0; j < n_tokens; ++j) {
7390
+ const llama_pos pos = batch.pos[j];
7391
+ const llama_seq_id seq_id = batch.seq_id[j][0];
7392
+
7393
+ for (int i = 0; i < n_kv; ++i) {
7394
+ float f;
7395
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
7396
+ (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
7397
+ f = -INFINITY;
7398
+ } else {
7399
+ f = 0;
7400
+ }
7401
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7402
+ }
7403
+ }
7404
+ }
7405
+ }
7406
+
7407
+ {
7408
+ assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
7409
+ float * data = (float *) lctx.inp_sum->data;
7410
+
7411
+ for (int i = 0; i < batch.n_tokens; ++i) {
7412
+ data[i] = 1.0f/float(batch.n_tokens);
7413
+ }
7414
+ }
7415
+
7416
+ if (kv_self.has_shift) {
7417
+ const int64_t n_ctx = cparams.n_ctx;
7418
+
7419
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
7420
+
7421
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
7422
+
7423
+ for (int i = 0; i < n_ctx; ++i) {
7424
+ data[i] = lctx.kv_self.cells[i].delta;
7425
+ }
7426
+ }
7427
+
7428
+ if (hparams.pooling_layer && cparams.do_pooling) {
7429
+ const int64_t n_tokens = batch.n_tokens;
7430
+
7431
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
7432
+ float * data = (float *) lctx.inp_sum->data;
7433
+
7434
+ memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
7435
+
7436
+ for (int i = 0; i < n_tokens; ++i) {
7437
+ const llama_seq_id seq_id = batch.seq_id[i][0];
7438
+ data[seq_id*n_tokens + i] = 1.0f;
7439
+ }
7440
+ }
7441
+ }
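The mask loop in `llama_set_inputs()` above now honours `hparams.causal_attn`: a KV cache cell contributes only if it belongs to the token's sequence and, for causal models, is not in the future. A standalone restatement of that rule; the helper is illustrative, not library code:

```cpp
#include <cmath>

// Returns the additive attention-mask value for one (token, kv cell) pair.
float kq_mask_value(bool same_seq, int cell_pos, int token_pos, bool causal_attn) {
    if (!same_seq || (causal_attn && cell_pos > token_pos)) {
        return -INFINITY;   // logit pushed to -inf -> softmax weight 0
    }
    return 0.0f;            // cell is visible to this token
}
```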
7442
+
6955
7443
  // decode a batch of tokens by evaluating the transformer
6956
7444
  //
6957
7445
  // - lctx: llama context
@@ -7050,17 +7538,22 @@ static int llama_decode_internal(
7050
7538
  ggml_backend_sched_reset(lctx.sched);
7051
7539
  ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
7052
7540
 
7053
- ggml_cgraph * gf = llama_build_graph(lctx, batch);
7541
+ ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7054
7542
 
7055
7543
  // the output is always the last tensor in the graph
7056
7544
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7057
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
7058
-
7059
- // the embeddings could be the second to last tensor, or the third to last tensor
7060
7545
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7061
- if (strcmp(embeddings->name, "result_norm") != 0) {
7062
- embeddings = gf->nodes[gf->n_nodes - 3];
7063
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7546
+ if (strcmp(res->name, "result_output") == 0) {
7547
+ // the embeddings could be the second to last tensor, or the third to last tensor
7548
+ if (strcmp(embeddings->name, "result_norm") != 0) {
7549
+ embeddings = gf->nodes[gf->n_nodes - 3];
7550
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7551
+ }
7552
+ } else if (strcmp(res->name, "result_embd") == 0) {
7553
+ embeddings = res;
7554
+ res = nullptr;
7555
+ } else {
7556
+ GGML_ASSERT(false);
7064
7557
  }
7065
7558
 
7066
7559
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7070,7 +7563,9 @@ static int llama_decode_internal(
7070
7563
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
7071
7564
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
7072
7565
  // with the BLAS calls. need a better solution
7073
- if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
7566
+ // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
7567
+ // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
7568
+ if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
7074
7569
  n_threads = std::min(4, n_threads);
7075
7570
  }
7076
7571
 
@@ -7088,6 +7583,9 @@ static int llama_decode_internal(
7088
7583
  if (lctx.backend_cpu != nullptr) {
7089
7584
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
7090
7585
  }
7586
+
7587
+ llama_set_inputs(lctx, batch);
7588
+
7091
7589
  ggml_backend_sched_graph_compute(lctx.sched, gf);
7092
7590
 
7093
7591
  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
@@ -7127,7 +7625,7 @@ static int llama_decode_internal(
7127
7625
  // extract logits
7128
7626
  // TODO: do not compute and extract logits if only embeddings are needed
7129
7627
  // need to update the graphs to skip "result_output"
7130
- {
7628
+ if (res) {
7131
7629
  auto & logits_out = lctx.logits;
7132
7630
 
7133
7631
  #ifndef NDEBUG
@@ -7171,9 +7669,12 @@ static int llama_decode_internal(
7171
7669
  if (!lctx.embedding.empty()) {
7172
7670
  auto & embedding_out = lctx.embedding;
7173
7671
 
7174
- embedding_out.resize(n_embd);
7672
+ const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
7673
+ const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
7674
+
7675
+ embedding_out.resize(embd_size);
7175
7676
  ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
7176
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
7677
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
7177
7678
  ggml_backend_synchronize(embeddings_backend);
7178
7679
  }
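A small restatement of the new extraction arithmetic above: when the graph produced logits (`res != nullptr`) only the last token's embedding is copied out, whereas embedding-only graphs such as BERT's return all `n_tokens` vectors. The helper name is illustrative:

```cpp
#include <cstdint>

// Offset and length (in floats) of the embeddings to copy out of the graph.
void embd_extent(bool has_logits, int64_t n_embd, int64_t n_tokens,
                 int64_t & embd_pos, int64_t & embd_size) {
    embd_pos  = has_logits ? n_embd * (n_tokens - 1) : 0;
    embd_size = has_logits ? n_embd : n_embd * n_tokens;
}
```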
7179
7680
 
@@ -7237,6 +7738,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
7237
7738
  GGML_ASSERT(false);
7238
7739
  return unicode_to_bytes_bpe(token_data.text);
7239
7740
  }
7741
+ case LLAMA_VOCAB_TYPE_WPM: {
7742
+ GGML_ASSERT(false);
7743
+ }
7240
7744
  default:
7241
7745
  GGML_ASSERT(false);
7242
7746
  }
@@ -7247,8 +7751,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
7247
7751
  switch (llama_vocab_get_type(vocab)) {
7248
7752
  case LLAMA_VOCAB_TYPE_SPM: {
7249
7753
  const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
7250
- return vocab.token_to_id.at(buf);
7754
+ auto token = vocab.token_to_id.find(buf);
7755
+ if (token != vocab.token_to_id.end()) {
7756
+ return (*token).second;
7757
+ }
7758
+ // Try to fall back to just the byte as a string
7759
+ const char buf2[2] = { (char)ch, 0 };
7760
+ return vocab.token_to_id.at(buf2);
7251
7761
  }
7762
+ case LLAMA_VOCAB_TYPE_WPM:
7252
7763
  case LLAMA_VOCAB_TYPE_BPE: {
7253
7764
  return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
7254
7765
  }
@@ -7294,7 +7805,7 @@ struct llm_bigram_spm {
7294
7805
  };
7295
7806
 
7296
7807
  struct llm_tokenizer_spm {
7297
- llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
7808
+ llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
7298
7809
 
7299
7810
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
7300
7811
  // split string into utf8 chars
@@ -7369,6 +7880,7 @@ private:
7369
7880
 
7370
7881
  if (p == rev_merge.end()) {
7371
7882
  // output any symbols that did not form tokens as bytes.
7883
+ output.reserve(output.size() + symbol.n);
7372
7884
  for (int j = 0; j < (int)symbol.n; ++j) {
7373
7885
  llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
7374
7886
  output.push_back(token_id);
@@ -7719,29 +8231,230 @@ private:
7719
8231
  llm_bigram_bpe::queue work_queue;
7720
8232
  };
7721
8233
 
7722
- typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
8234
+ struct llm_tokenizer_wpm {
8235
+ llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
8236
+
8237
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
8238
+ auto * token_map = &vocab.token_to_id;
8239
+
8240
+ // normalize and split by whitespace
8241
+ std::vector<std::string> words = preprocess(text);
8242
+
8243
+ // bos token prepended already
8244
+
8245
+ // find the longest tokens that form the words
8246
+ for (const std::string &word : words) {
8247
+ // skip empty words
8248
+ if (word.size() == 0) {
8249
+ continue;
8250
+ }
8251
+
8252
+ // prepend phantom space
8253
+ std::string word1 = "\xe2\x96\x81" + word;
8254
+ int n = word1.size();
8255
+
8256
+ // we're at the start of a new word
8257
+ int i = 0;
8258
+ bool match_any = false;
8259
+
8260
+ // move through character position in word
8261
+ while (i < n) {
8262
+ // loop through possible match length
8263
+ bool match = false;
8264
+ for (int j = n; j > i; j--) {
8265
+ auto it = token_map->find(word1.substr(i, j - i));
8266
+ if (it != token_map->end()) {
8267
+ output.push_back(it->second);
8268
+ match = true;
8269
+ match_any = true;
8270
+ i = j;
8271
+ break;
8272
+ }
8273
+ }
8274
+
8275
+ // must be an unknown character
8276
+ if (!match) {
8277
+ i++;
8278
+ }
8279
+ }
8280
+
8281
+ // we didn't find any matches for this word
8282
+ if (!match_any) {
8283
+ output.push_back(vocab.special_unk_id);
8284
+ }
8285
+ }
8286
+
8287
+ // append eos token
8288
+ output.push_back(vocab.special_eos_id);
8289
+ }
8290
+
8291
+ std::vector<std::string> preprocess(const std::string & text) {
8292
+ std::string ori_str = normalize(text);
8293
+ uint64_t ori_size = ori_str.size();
8294
+
8295
+ // single punct / single symbol / single digit
8296
+ // baseline: add whitespace on the left and right of punct and chinese characters
8297
+ std::vector<std::string> words;
8298
+ std::string new_str = "";
8299
+ uint64_t i = 0;
8300
+ while (i < ori_size) {
8301
+ int utf_char_len = utf8_len(ori_str[i]);
8302
+ if ((utf_char_len == 1) && ispunct(ori_str[i])) {
8303
+ new_str += " ";
8304
+ new_str += ori_str[i];
8305
+ new_str += " ";
8306
+ i += 1;
8307
+ }
8308
+ else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
8309
+ new_str += " ";
8310
+ new_str += ori_str.substr(i, 3);
8311
+ new_str += " ";
8312
+ i += 3;
8313
+ }
8314
+ else {
8315
+ new_str += ori_str[i];
8316
+ i += 1;
8317
+ }
8318
+ }
8319
+
8320
+ // split by whitespace
8321
+ uint64_t l = 0;
8322
+ uint64_t r = 0;
8323
+ while (r < new_str.size()) {
8324
+ // if is whitespace
8325
+ if (isspace(new_str[r])) {
8326
+ if (r > l) words.push_back(new_str.substr(l, (r - l)));
8327
+ l = r + 1;
8328
+ r = l;
8329
+ }
8330
+ else {
8331
+ r += 1;
8332
+ }
8333
+ }
8334
+ if (r > l) {
8335
+ words.push_back(new_str.substr(l, (r - l)));
8336
+ }
8337
+ return words;
8338
+ }
8339
+
8340
+ std::string normalize(const std::string & text) {
8341
+ // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
8342
+ std::string text2 = strip_accents(text);
8343
+ for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
8344
+ char c = text2[i];
8345
+ if (c >= 'A' && c <= 'Z') {
8346
+ text2[i] = c - 'A' + 'a';
8347
+ }
8348
+ }
8349
+ return text2;
8350
+ }
8351
+
8352
+ bool is_chinese_char(const std::string & str) {
8353
+ int len = str.length();
8354
+ unsigned int codepoint = 0;
8355
+ int num_bytes = 0;
8356
+ int i = 0;
8357
+ unsigned char ch = static_cast<unsigned char>(str[i]);
8358
+ if (ch <= 0x7f) {
8359
+ codepoint = ch;
8360
+ num_bytes = 1;
8361
+ } else if ((ch >> 5) == 0x06) {
8362
+ codepoint = ch & 0x1f;
8363
+ num_bytes = 2;
8364
+ } else if ((ch >> 4) == 0x0e) {
8365
+ codepoint = ch & 0x0f;
8366
+ num_bytes = 3;
8367
+ } else if ((ch >> 3) == 0x1e) {
8368
+ codepoint = ch & 0x07;
8369
+ num_bytes = 4;
8370
+ }
8371
+ for (int j = 1; j < num_bytes; ++j) {
8372
+ if (i + j >= len) {
8373
+ return false; // incomplete UTF-8 character
8374
+ }
8375
+ unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
8376
+ if ((next_ch >> 6) != 0x02) {
8377
+ return false; // invalid trailing byte
8378
+ }
8379
+ codepoint = (codepoint << 6) | (next_ch & 0x3f);
8380
+ }
8381
+ if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
8382
+ (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
8383
+ (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
8384
+ (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
8385
+ (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
8386
+ (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
8387
+ (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
8388
+ (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
8389
+ (codepoint >= 0x3000 && codepoint <= 0x303F) ||
8390
+ (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
8391
+ return true; // NOLINT
8392
+ }
8393
+ return false;
8394
+ }
8395
+
8396
+ std::string strip_accents(const std::string & input_string) {
8397
+ std::string resultString;
8398
+ std::map<std::string, char> accent_map = {
8399
+ {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
8400
+ {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
8401
+ {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
8402
+ {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
8403
+ {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
8404
+ {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
8405
+ {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
8406
+ {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
8407
+ {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
8408
+ };
8409
+
8410
+ for (size_t i = 0; i < input_string.length();) {
8411
+ int len = utf8_len(input_string[i]);
8412
+ std::string curChar = input_string.substr(i, len);
8413
+ auto iter = accent_map.find(curChar);
8414
+ if (iter != accent_map.end()) {
8415
+ resultString += iter->second;
8416
+ } else {
8417
+ resultString += curChar;
8418
+ }
8419
+ i += len;
8420
+ }
8421
+
8422
+ return resultString;
8423
+ }
8424
+
8425
+ static size_t utf8_len(char src) {
8426
+ const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
8427
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
8428
+ return lookup[highbits];
8429
+ }
8430
+
8431
+ const llama_vocab & vocab;
8432
+ };
8433
+
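The llm_tokenizer_wpm block added above tokenizes each whitespace-delimited word by greedy longest match: the word is prefixed with the phantom-space marker "\xe2\x96\x81", then at every position the longest substring found in the vocabulary is emitted, falling back to the unknown token when nothing in the word matches at all. A minimal standalone sketch of that matching loop, using an illustrative std::unordered_map vocabulary and made-up token ids rather than a real llama_vocab:

    // Standalone sketch of the greedy longest-match lookup used above.
    // The vocabulary contents and token ids are illustrative only.
    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    static std::vector<int> wpm_match(const std::string & word,
                                      const std::unordered_map<std::string, int> & vocab,
                                      int unk_id) {
        std::string w = "\xe2\x96\x81" + word;     // prepend phantom space
        std::vector<int> out;
        bool match_any = false;
        const int n = (int) w.size();
        int i = 0;
        while (i < n) {
            bool match = false;
            for (int j = n; j > i; j--) {          // try the longest candidate first
                auto it = vocab.find(w.substr(i, j - i));
                if (it != vocab.end()) {
                    out.push_back(it->second);
                    match = true;
                    match_any = true;
                    i = j;
                    break;
                }
            }
            if (!match) {
                i++;                               // skip a byte we cannot match
            }
        }
        if (!match_any) {
            out.push_back(unk_id);                 // whole word unknown
        }
        return out;
    }

    int main() {
        const std::unordered_map<std::string, int> vocab = {
            {"\xe2\x96\x81hello", 1}, {"\xe2\x96\x81wor", 2}, {"ld", 3},
        };
        for (int id : wpm_match("world", vocab, 0)) {
            printf("%d ", id);                     // prints: 2 3
        }
        printf("\n");
        return 0;
    }
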
8434
+ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
7723
8435
  FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
7724
8436
  FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
7725
8437
  } FRAGMENT_BUFFER_VARIANT_TYPE;
7726
8438
 
7727
- struct fragment_buffer_variant{
8439
+ struct fragment_buffer_variant {
7728
8440
  fragment_buffer_variant(llama_vocab::id _token)
7729
8441
  :
7730
8442
  type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
7731
8443
  token(_token),
7732
8444
  raw_text(_dummy),
7733
8445
  offset(0),
7734
- length(0){}
8446
+ length(0) {}
8447
+
7735
8448
  fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
7736
8449
  :
7737
8450
  type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
7738
- token((llama_vocab::id)-1),
8451
+ token((llama_vocab::id) - 1),
7739
8452
  raw_text(_raw_text),
7740
8453
  offset(_offset),
7741
8454
  length(_length){
7742
- GGML_ASSERT( _offset >= 0 );
7743
- GGML_ASSERT( _length >= 1 );
7744
- GGML_ASSERT( offset + length <= raw_text.length() );
8455
+ GGML_ASSERT(_offset >= 0);
8456
+ GGML_ASSERT(_length >= 1);
8457
+ GGML_ASSERT(offset + length <= raw_text.length());
7745
8458
  }
7746
8459
 
7747
8460
  const FRAGMENT_BUFFER_VARIANT_TYPE type;
@@ -7754,8 +8467,7 @@ struct fragment_buffer_variant{
7754
8467
 
7755
8468
  // #define PRETOKENIZERDEBUG
7756
8469
 
7757
- static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
7758
- {
8470
+ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
7759
8471
  // for each special token
7760
8472
  for (const auto & st: vocab.special_tokens_cache) {
7761
8473
  const auto & special_token = st.first;
@@ -7866,17 +8578,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7866
8578
  }
7867
8579
 
7868
8580
  std::forward_list<fragment_buffer_variant> fragment_buffer;
7869
- fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
8581
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
7870
8582
 
7871
- if (special) tokenizer_st_partition( vocab, fragment_buffer );
8583
+ if (special) tokenizer_st_partition(vocab, fragment_buffer);
7872
8584
 
7873
8585
  switch (vocab.type) {
7874
8586
  case LLAMA_VOCAB_TYPE_SPM:
7875
8587
  {
7876
- for (const auto & fragment: fragment_buffer)
7877
- {
7878
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
7879
- {
8588
+ for (const auto & fragment : fragment_buffer) {
8589
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
7880
8590
  // without adding this leading whitespace, we do not get the same results as the original tokenizer
7881
8591
 
7882
8592
  // TODO: It's likely possible to get rid of this string copy entirely
@@ -7896,19 +8606,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7896
8606
  llm_tokenizer_spm tokenizer(vocab);
7897
8607
  llama_escape_whitespace(raw_text);
7898
8608
  tokenizer.tokenize(raw_text, output);
7899
- }
7900
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
7901
- {
8609
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
7902
8610
  output.push_back(fragment.token);
7903
8611
  }
7904
8612
  }
7905
8613
  } break;
7906
8614
  case LLAMA_VOCAB_TYPE_BPE:
7907
8615
  {
7908
- for (const auto & fragment: fragment_buffer)
7909
- {
7910
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
7911
- {
8616
+ for (const auto & fragment : fragment_buffer) {
8617
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
7912
8618
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
7913
8619
 
7914
8620
  #ifdef PRETOKENIZERDEBUG
@@ -7916,9 +8622,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7916
8622
  #endif
7917
8623
  llm_tokenizer_bpe tokenizer(vocab);
7918
8624
  tokenizer.tokenize(raw_text, output);
8625
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
8626
+ output.push_back(fragment.token);
7919
8627
  }
7920
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
7921
- {
8628
+ }
8629
+ } break;
8630
+ case LLAMA_VOCAB_TYPE_WPM:
8631
+ {
8632
+ for (const auto & fragment : fragment_buffer) {
8633
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
8634
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
8635
+
8636
+ #ifdef PRETOKENIZERDEBUG
8637
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
8638
+ #endif
8639
+ llm_tokenizer_wpm tokenizer(vocab);
8640
+ tokenizer.tokenize(raw_text, output);
8641
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
7922
8642
  output.push_back(fragment.token);
7923
8643
  }
7924
8644
  }
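With the new LLAMA_VOCAB_TYPE_WPM branch, BERT-style models are tokenized through the same public entry point as SPM and BPE models. A hedged usage sketch, assuming the llama_tokenize(model, text, text_len, tokens, n_max_tokens, add_bos, special) signature shipped in this version and a model loaded elsewhere:

    // Hedged sketch: tokenizing text against an already-loaded model (WPM or otherwise).
    // `model` is assumed to come from llama_load_model_from_file() somewhere else.
    #include <cstring>
    #include <vector>
    #include "llama.h"

    static std::vector<llama_token> tokenize_text(const llama_model * model, const char * text) {
        std::vector<llama_token> tokens(512);      // illustrative upper bound
        const int n = llama_tokenize(model, text, (int) strlen(text),
                                     tokens.data(), (int) tokens.size(),
                                     /*add_bos*/ true, /*special*/ false);
        tokens.resize(n > 0 ? n : 0);              // a negative n signals a too-small buffer
        return tokens;
    }
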
@@ -8373,6 +9093,10 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
8373
9093
 
8374
9094
  const int64_t t_start_sample_us = ggml_time_us();
8375
9095
 
9096
+ if (k <= 0) {
9097
+ k = candidates->size;
9098
+ }
9099
+
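The added guard changes the meaning of a non-positive k: instead of only being clamped up to min_keep, k <= 0 now selects the entire candidate list, effectively disabling top-k filtering. A hedged sketch, assuming the llama_sample_top_k(ctx, candidates, k, min_keep) signature of this version:

    // Hedged sketch: with this change, top_k <= 0 keeps every candidate
    // (aside from the min_keep clamp applied just below).
    #include "llama.h"

    static void apply_top_k(llama_context * ctx, llama_token_data_array * candidates, int32_t top_k) {
        llama_sample_top_k(ctx, candidates, top_k, /*min_keep*/ 1);
    }
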
8376
9100
  k = std::max(k, (int) min_keep);
8377
9101
  k = std::min(k, (int) candidates->size);
8378
9102
 
@@ -9456,8 +10180,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9456
10180
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
9457
10181
  new_type = GGML_TYPE_Q4_K;
9458
10182
  }
9459
- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) {
9460
- new_type = GGML_TYPE_Q4_K;
10183
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
10184
+ new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
9461
10185
  }
9462
10186
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
9463
10187
  new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
@@ -9496,9 +10220,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9496
10220
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
9497
10221
  if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
9498
10222
  }
9499
- //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
9500
- // if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
9501
- //}
10223
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
10224
+ new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
10225
+ }
9502
10226
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
9503
10227
  new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
9504
10228
  : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
@@ -9566,6 +10290,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9566
10290
  }
9567
10291
  ++qs.i_ffn_up;
9568
10292
  }
10293
+
9569
10294
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
9570
10295
  //}
9571
10296
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
@@ -9625,19 +10350,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9625
10350
 
9626
10351
  // K-quants
9627
10352
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
9628
- case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
10353
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
9629
10354
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
9630
10355
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
9631
10356
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
9632
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
10357
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
9633
10358
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
9634
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
10359
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
9635
10360
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
9636
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
9637
- case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
9638
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
9639
- case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
9640
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
10361
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
10362
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
10363
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
10364
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
10365
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
9641
10366
 
9642
10367
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
9643
10368
  }
@@ -9767,7 +10492,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9767
10492
  quantize &= !params->only_copy;
9768
10493
 
9769
10494
  // do not quantize expert gating tensors
9770
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
10495
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
10496
+
10497
+ // do not quantize positional embeddings and token types (BERT)
10498
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
10499
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
9771
10500
 
9772
10501
  enum ggml_type new_type;
9773
10502
  void * new_data;
@@ -10269,6 +10998,7 @@ struct llama_context_params llama_context_default_params() {
10269
10998
  /*.logits_all =*/ false,
10270
10999
  /*.embedding =*/ false,
10271
11000
  /*.offload_kqv =*/ true,
11001
+ /*.do_pooling =*/ true,
10272
11002
  };
10273
11003
 
10274
11004
  return result;
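The new do_pooling flag defaults to true here and is forwarded into cparams.do_pooling further down in this diff. A hedged sketch of creating an embedding context using the field names confirmed above (model loading elided; `model` is assumed to be a llama_model * obtained elsewhere):

    // Hedged sketch: enabling embedding output together with the new pooling flag.
    #include "llama.h"

    static llama_context * make_embedding_context(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embedding  = true;   // allocate the embedding output buffer
        cparams.do_pooling = true;   // new field; defaults to true in this version
        return llama_new_context_with_model(model, cparams);
    }
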
@@ -10295,6 +11025,8 @@ size_t llama_max_devices(void) {
10295
11025
  return GGML_CUDA_MAX_DEVICES;
10296
11026
  #elif defined(GGML_USE_SYCL)
10297
11027
  return GGML_SYCL_MAX_DEVICES;
11028
+ #elif defined(GGML_USE_VULKAN)
11029
+ return GGML_VK_MAX_DEVICES;
10298
11030
  #else
10299
11031
  return 1;
10300
11032
  #endif
@@ -10422,6 +11154,7 @@ struct llama_context * llama_new_context_with_model(
10422
11154
  cparams.yarn_beta_slow = params.yarn_beta_slow;
10423
11155
  cparams.mul_mat_q = params.mul_mat_q;
10424
11156
  cparams.offload_kqv = params.offload_kqv;
11157
+ cparams.do_pooling = params.do_pooling;
10425
11158
 
10426
11159
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
10427
11160
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -10502,13 +11235,15 @@ struct llama_context * llama_new_context_with_model(
10502
11235
  }
10503
11236
  #elif defined(GGML_USE_VULKAN)
10504
11237
  if (model->n_gpu_layers > 0) {
10505
- ggml_backend_t backend = ggml_backend_vk_init();
10506
- if (backend == nullptr) {
10507
- LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
10508
- llama_free(ctx);
10509
- return nullptr;
11238
+ for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
11239
+ ggml_backend_t backend = ggml_backend_vk_init(device);
11240
+ if (backend == nullptr) {
11241
+ LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
11242
+ llama_free(ctx);
11243
+ return nullptr;
11244
+ }
11245
+ ctx->backends.push_back(backend);
10510
11246
  }
10511
- ctx->backends.push_back(backend);
10512
11247
  }
10513
11248
  #elif defined(GGML_USE_SYCL)
10514
11249
  if (model->n_gpu_layers > 0) {
@@ -10567,14 +11302,14 @@ struct llama_context * llama_new_context_with_model(
10567
11302
  // resized during inference, reserve maximum
10568
11303
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
10569
11304
 
10570
- if (params.embedding){
11305
+ if (params.embedding) {
10571
11306
  ctx->embedding.resize(hparams.n_embd);
10572
11307
  }
10573
11308
 
10574
11309
  // graph inputs
10575
11310
  {
10576
11311
  ggml_init_params init_params = {
10577
- /* .mem_size */ ggml_tensor_overhead()*5,
11312
+ /* .mem_size */ ggml_tensor_overhead()*7,
10578
11313
  /* .mem_buffer */ nullptr,
10579
11314
  /* .no_alloc */ true,
10580
11315
  };
@@ -10585,12 +11320,14 @@ struct llama_context * llama_new_context_with_model(
10585
11320
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
10586
11321
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
10587
11322
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
11323
+ ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
10588
11324
 
10589
11325
  ggml_set_name(ctx->inp_tokens, "inp_tokens");
10590
11326
  ggml_set_name(ctx->inp_embd, "inp_embd");
10591
11327
  ggml_set_name(ctx->inp_pos, "inp_pos");
10592
11328
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
10593
11329
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
11330
+ ggml_set_name(ctx->inp_sum, "inp_sum");
10594
11331
 
10595
11332
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
10596
11333
 
@@ -10616,23 +11353,27 @@ struct llama_context * llama_new_context_with_model(
10616
11353
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
10617
11354
 
10618
11355
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
10619
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
10620
11356
 
10621
11357
  // build worst-case graph
10622
11358
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
10623
11359
  int n_past = cparams.n_ctx - n_tokens;
10624
11360
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
10625
- ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
11361
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
10626
11362
 
10627
11363
  // initialize scheduler with the worst-case graph
10628
- ggml_backend_sched_init_measure(ctx->sched, gf);
10629
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
11364
+ if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
11365
+ LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
11366
+ llama_free(ctx);
11367
+ return nullptr;
11368
+ }
10630
11369
 
10631
- for (ggml_backend_t backend : ctx->backends) {
10632
- ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
11370
+ for (size_t i = 0; i < ctx->backends.size(); i++) {
11371
+ ggml_backend_t backend = ctx->backends[i];
11372
+ ggml_backend_buffer_type_t buft = backend_buft[i];
11373
+ size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
10633
11374
  LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
10634
- ggml_backend_buffer_name(buf),
10635
- ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
11375
+ ggml_backend_buft_name(buft),
11376
+ size / 1024.0 / 1024.0);
10636
11377
  }
10637
11378
 
10638
11379
  // note: the number of splits during measure is higher than during inference due to the kv shift
@@ -10735,7 +11476,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
10735
11476
 
10736
11477
  int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
10737
11478
  return snprintf(buf, buf_size, "%s %s %s",
10738
- llama_model_arch_name(model->arch).c_str(),
11479
+ llama_model_arch_name(model->arch),
10739
11480
  llama_model_type_name(model->type),
10740
11481
  llama_model_ftype_name(model->ftype).c_str());
10741
11482
  }
@@ -11437,6 +12178,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
11437
12178
  return ctx->embedding.data();
11438
12179
  }
11439
12180
 
12181
+ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12182
+ return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
12183
+ }
12184
+
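The new llama_get_embeddings_ith() simply offsets into the context's embedding buffer by i*n_embd and does no bounds checking of its own. A hedged sketch of reading one row after a decode call, with `ctx` assumed to be a context created with embeddings enabled:

    // Hedged sketch: printing the first few components of the i-th embedding row.
    #include <cstdio>
    #include "llama.h"

    static void print_embedding_row(llama_context * ctx, int32_t i) {
        const int n_embd  = llama_n_embd(llama_get_model(ctx));
        const float * emb = llama_get_embeddings_ith(ctx, i);   // points at row i of the buffer
        for (int j = 0; j < n_embd && j < 8; j++) {
            printf("%f ", emb[j]);
        }
        printf("\n");
    }
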
11440
12185
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
11441
12186
  return model->vocab.id_to_token[token].text.c_str();
11442
12187
  }
@@ -11521,6 +12266,7 @@ static std::string llama_decode_text(const std::string & text) {
11521
12266
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
11522
12267
  if (0 <= token && token < llama_n_vocab(model)) {
11523
12268
  switch (llama_vocab_get_type(model->vocab)) {
12269
+ case LLAMA_VOCAB_TYPE_WPM:
11524
12270
  case LLAMA_VOCAB_TYPE_SPM: {
11525
12271
  // NOTE: we accept all unsupported token types,
11526
12272
  // suppressing them like CONTROL tokens.
@@ -11644,6 +12390,7 @@ const char * llama_print_system_info(void) {
11644
12390
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
11645
12391
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
11646
12392
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
12393
+ s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
11647
12394
 
11648
12395
  return s.c_str();
11649
12396
  }
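
The system-info string now also reports the MATMUL_INT8 CPU feature alongside the existing flags. A minimal check, using the llama_print_system_info() signature shown in this hunk:

    // Prints the feature string; after this change it includes a "MATMUL_INT8 = 0|1" entry.
    #include <cstdio>
    #include "llama.h"

    int main() {
        printf("%s\n", llama_print_system_info());
        return 0;
    }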