llama_cpp 0.12.5 → 0.12.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -196,6 +196,7 @@ enum llm_arch {
  LLM_ARCH_STARCODER,
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
+ LLM_ARCH_BERT,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -220,6 +221,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_STARCODER, "starcoder" },
  { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
  { LLM_ARCH_BLOOM, "bloom" },
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
@@ -252,6 +254,7 @@ enum llm_kv {
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_POOLING_LAYER,

  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -261,6 +264,7 @@ enum llm_kv {
  LLM_KV_ATTENTION_VALUE_LENGTH,
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+ LLM_KV_ATTENTION_CAUSAL,

  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_FREQ_BASE,
@@ -273,6 +277,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_MODEL,
  LLM_KV_TOKENIZER_LIST,
  LLM_KV_TOKENIZER_TOKEN_TYPE,
+ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
  LLM_KV_TOKENIZER_SCORES,
  LLM_KV_TOKENIZER_MERGES,
  LLM_KV_TOKENIZER_BOS_ID,
@@ -307,6 +312,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_POOLING_LAYER, "%s.pooling_layer" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -316,6 +322,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -328,6 +335,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
  { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
  { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
  { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
@@ -355,6 +363,7 @@ struct LLM_KV {
  enum llm_tensor {
  LLM_TENSOR_TOKEN_EMBD,
  LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
  LLM_TENSOR_POS_EMBD,
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
@@ -536,6 +545,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_BERT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_BLOOM,
  {
@@ -748,22 +774,37 @@ struct LLM_TN {
  llm_arch arch;

  std::string operator()(llm_tensor tensor) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return LLM_TENSOR_NAMES[arch].at(tensor);
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
  }

  std::string operator()(llm_tensor tensor, int bid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
  }
  };
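
The guarded lookups above let LLM_TN return the sentinel "__missing__" instead of throwing std::out_of_range when an architecture defines only a subset of the tensor table (BERT, for example, has no output or rope tensors in its entry). A minimal standalone sketch of the same find-before-at idiom, using a hypothetical toy map rather than the real LLM_TENSOR_NAMES table:

    // Toy stand-ins; names here are illustrative only, not library symbols.
    #include <iostream>
    #include <map>
    #include <string>

    enum toy_tensor { TOY_TOKEN_EMBD, TOY_OUTPUT };

    static std::map<toy_tensor, std::string> TOY_NAMES = {
        { TOY_TOKEN_EMBD, "token_embd" },
        // TOY_OUTPUT intentionally absent, like LLM_TENSOR_OUTPUT for BERT
    };

    static std::string toy_name(toy_tensor tensor) {
        if (TOY_NAMES.find(tensor) == TOY_NAMES.end()) {
            return "__missing__"; // sentinel instead of std::out_of_range
        }
        return TOY_NAMES.at(tensor);
    }

    int main() {
        std::cout << toy_name(TOY_TOKEN_EMBD) << "\n"; // token_embd
        std::cout << toy_name(TOY_OUTPUT)     << "\n"; // __missing__
    }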
@@ -1440,6 +1481,11 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_17M,
+ MODEL_22M,
+ MODEL_33M,
+ MODEL_109M,
+ MODEL_335M,
  MODEL_0_5B,
  MODEL_1B,
  MODEL_2B,
@@ -1481,6 +1527,7 @@ struct llama_hparams {
  uint32_t n_ff;
  uint32_t n_expert = 0;
  uint32_t n_expert_used = 0;
+ uint32_t n_vocab_type = 0; // for BERT-style token types

  float f_norm_eps;
  float f_norm_rms_eps;
@@ -1493,6 +1540,9 @@ struct llama_hparams {
  float f_clamp_kqv;
  float f_max_alibi_bias;

+ bool causal_attn = true;
+ bool pooling_layer = false;
+

  bool operator!=(const llama_hparams & other) const {
  if (this->vocab_only != other.vocab_only) return true;
@@ -1554,6 +1604,7 @@ struct llama_cparams {

  bool mul_mat_q;
  bool offload_kqv;
+ bool do_pooling;

  ggml_backend_sched_eval_callback cb_eval;
  void * cb_eval_user_data;
@@ -1720,6 +1771,7 @@ struct llama_model {
  llama_vocab vocab;

  struct ggml_tensor * tok_embd;
+ struct ggml_tensor * type_embd;
  struct ggml_tensor * pos_embd;
  struct ggml_tensor * tok_norm;
  struct ggml_tensor * tok_norm_b;
@@ -1839,8 +1891,6 @@ struct llama_context {
  // memory buffers used to evaluate the model
  std::vector<uint8_t> buf_compute_meta;
  ggml_backend_sched_t sched = nullptr;
- // allocator for the input tensors
- ggml_tallocr * alloc = nullptr;

  // input tensors
  ggml_backend_buffer_t buf_input = nullptr;
@@ -1850,6 +1900,7 @@ struct llama_context {
  struct ggml_tensor * inp_pos; // I32 [n_batch]
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
+ struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]

  #ifdef GGML_USE_MPI
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2829,6 +2880,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
  switch (type) {
  case LLAMA_VOCAB_TYPE_SPM: return "SPM";
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
  default: return "unknown";
  }
  }
@@ -3000,6 +3052,27 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_BERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
+
+ switch (hparams.n_layer) {
+ case 3:
+ model.type = e_model::MODEL_17M; break; // bge-micro
+ case 6:
+ model.type = e_model::MODEL_22M; break; // MiniLM-L6
+ case 12:
+ switch (hparams.n_embd) {
+ case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
+ case 768: model.type = e_model::MODEL_109M; break; // bge-base
+ } break;
+ case 24:
+ model.type = e_model::MODEL_335M; break; // bge-large
+ }
+ } break;
  case LLM_ARCH_BLOOM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3204,6 +3277,16 @@ static void llm_load_vocab(
  vocab.special_unk_id = -1;
  vocab.special_sep_id = -1;
  vocab.special_pad_id = -1;
+ } else if (tokenizer_name == "bert") {
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
+
+ // default special tokens
+ vocab.special_bos_id = 101;
+ vocab.special_eos_id = 102;
+ vocab.special_unk_id = 100;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.add_space_prefix = false;
  } else {
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
  LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3231,7 +3314,14 @@ static void llm_load_vocab(

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
- vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ try {
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ } catch (const std::exception & e) {
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+ vocab.linefeed_id = vocab.special_pad_id;
+ }
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+ vocab.linefeed_id = vocab.special_pad_id;
  } else {
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -3569,6 +3659,7 @@ static bool llm_load_tensors(
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  const int64_t n_embd_gqa = n_embd_v_gqa;
  const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_vocab_type = hparams.n_vocab_type;
  const int64_t n_ff = hparams.n_ff;

  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
@@ -3783,11 +3874,50 @@ static bool llm_load_tensors(
  layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
  }
  } break;
- case LLM_ARCH_BLOOM:
+ case LLM_ARCH_BERT:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
- model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ }
+ } break;
+ case LLM_ARCH_BLOOM:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});

  // output
  {
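
Combined with the LLM_TENSOR_NAMES entries added earlier, the LLM_ARCH_BERT branch above implies a fixed set of tensor names that a converted BERT GGUF is expected to provide. A small illustrative helper (not part of the library) that prints those names for a given layer count, following the name formats and the "weight"/"bias" suffixes used in the create_tensor calls:

    #include <cstdio>

    static void print_expected_bert_tensors(int n_layer) {
        std::printf("token_embd.weight\n");
        std::printf("token_types.weight\n");
        std::printf("position_embd.weight\n");
        std::printf("token_embd_norm.weight\n");
        std::printf("token_embd_norm.bias\n");
        const char * per_layer[] = {
            "attn_output_norm", "attn_q", "attn_k", "attn_v",
            "attn_output", "layer_output_norm", "ffn_up", "ffn_down",
        };
        for (int i = 0; i < n_layer; ++i) {
            for (const char * name : per_layer) {
                std::printf("blk.%d.%s.weight\n", i, name);
                std::printf("blk.%d.%s.bias\n",   i, name);
            }
        }
    }

    // n_layer is model dependent, e.g. 12 for bge-base
    int main() { print_expected_bert_tensors(2); }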
@@ -4259,9 +4389,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

  model.hparams.vocab_only = params.vocab_only;

- llm_load_arch (ml, model);
- llm_load_hparams(ml, model);
- llm_load_vocab (ml, model);
+ try {
+ llm_load_arch(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
+ }
+ try {
+ llm_load_hparams(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+ }
+ try {
+ llm_load_vocab(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+ }

  llm_load_print_meta(ml, model);

@@ -4739,6 +4881,7 @@ struct llm_build_context {
  const int32_t n_orig_ctx;

  const bool do_rope_shift;
+ const bool do_pooling;

  const llm_build_cb & cb;

@@ -4782,6 +4925,7 @@ struct llm_build_context {
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
  n_orig_ctx (cparams.n_yarn_orig_ctx),
  do_rope_shift (worst_case || kv_self.has_shift),
+ do_pooling (hparams.pooling_layer && cparams.do_pooling),
  cb (cb),
  buf_compute_meta (lctx.buf_compute_meta) {
  // all initializations should be done in init()
@@ -5625,6 +5769,103 @@ struct llm_build_context {
  return gf;
  }

+ struct ggml_cgraph * build_bert() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ // get input vectors with right size
+ const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
+
+ // construct input embeddings (token, type, position)
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+
+ // token types are hardcoded to zero ("Sentence A")
+ struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+ inpL = ggml_add(ctx0, inpL, type_row0);
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ cb(inpL, "inp_embd", -1);
+
+ // embed layer norm
+ inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+ cb(inpL, "inp_norm", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+ cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
+
+ // iterate layers
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * cur = inpL;
+
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ // seems like we just need to do this for Q?
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ // re-add the layer input
+ cur = ggml_add(ctx0, cur, inpL);
+
+ // attention layer norm
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
+
+ struct ggml_tensor * ffn_inp = cur;
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+
+ // attentions bypass the intermediate layer
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // output layer norm
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ // final output
+ cur = inpL;
+
+ // pooling layer
+ if (do_pooling) {
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
+ }
+ cb(cur, "result_embd", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  struct ggml_cgraph * build_bloom() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

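
In build_bert(), pooling is a single matrix product: the final hidden states hold one n_embd vector per token, and multiplying their transpose by inp_sum yields one n_embd vector per output slot, each a weighted sum of token embeddings. A plain C++ reference of that computation, illustrative only (the names are made up; the actual weights come from inp_sum as filled in llama_set_inputs below):

    #include <vector>

    // hidden[t] is the n_embd-dim embedding of token t; weights[s][t] is the
    // inp_sum entry for output slot s and token t.
    static std::vector<std::vector<float>> pool(
            const std::vector<std::vector<float>> & hidden,    // [n_tokens][n_embd]
            const std::vector<std::vector<float>> & weights) { // [n_out][n_tokens]
        const size_t n_out  = weights.size();
        const size_t n_embd = hidden.empty() ? 0 : hidden[0].size();

        std::vector<std::vector<float>> out(n_out, std::vector<float>(n_embd, 0.0f));
        for (size_t s = 0; s < n_out; ++s) {
            for (size_t t = 0; t < hidden.size(); ++t) {
                for (size_t e = 0; e < n_embd; ++e) {
                    out[s][e] += weights[s][t] * hidden[t][e]; // weighted sum of token embeddings
                }
            }
        }
        return out;
    }

With the rows of ones that llama_set_inputs writes when pooling is enabled, this reduces to summing the token embeddings of each sequence.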
@@ -6996,12 +7237,10 @@ struct llm_build_context {

  static struct ggml_cgraph * llama_build_graph(
  llama_context & lctx,
- const llama_batch & batch) {
+ const llama_batch & batch,
+ bool worst_case) {
  const auto & model = lctx.model;

- // check if we should build the worst-case graph (for memory measurement)
- const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
-
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
  if (il >= 0) {
@@ -7022,67 +7261,6 @@ static struct ggml_cgraph * llama_build_graph(

  struct llm_build_context llm(lctx, batch, cb, worst_case);

- //
- // set input data
- //
-
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
- if (batch.token) {
- const int64_t n_tokens = batch.n_tokens;
-
- ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
- }
-
- if (batch.embd) {
- const int64_t n_embd = llm.n_embd;
- const int64_t n_tokens = batch.n_tokens;
-
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
- }
-
- if (batch.pos) {
- const int64_t n_tokens = batch.n_tokens;
-
- ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
- }
-
- {
- const int64_t n_kv = llm.n_kv;
- const int64_t n_tokens = batch.n_tokens;
-
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
- float * data = (float *) lctx.inp_KQ_mask->data;
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- float f;
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
- f = -INFINITY;
- } else {
- f = 0;
- }
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
- }
- }
- }
- }
-
- if (llm.do_rope_shift) {
- const int64_t n_ctx = llm.n_ctx;
-
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
-
- for (int i = 0; i < n_ctx; ++i) {
- data[i] = lctx.kv_self.cells[i].delta;
- }
- }
- }
-
  llm.init();

  switch (model.arch) {
@@ -7110,6 +7288,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_refact();
  } break;
+ case LLM_ARCH_BERT:
+ {
+ result = llm.build_bert();
+ } break;
  case LLM_ARCH_BLOOM:
  {
  result = llm.build_bloom();
@@ -7167,6 +7349,97 @@ static struct ggml_cgraph * llama_build_graph(
  return result;
  }

+ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+ //
+ // set input data
+ //
+
+ const auto & hparams = lctx.model.hparams;
+ const auto & cparams = lctx.cparams;
+ const auto & kv_self = lctx.kv_self;
+
+ if (batch.token) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
+ }
+
+ if (batch.embd) {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = batch.n_tokens;
+
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
+
+ if (batch.pos) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+ }
+
+ {
+ const int64_t n_kv = kv_self.n;
+ const int64_t n_tokens = batch.n_tokens;
+
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+
+ float * data = (float *) lctx.inp_KQ_mask->data;
+
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ const llama_pos pos = batch.pos[j];
+ const llama_seq_id seq_id = batch.seq_id[j][0];
+
+ for (int i = 0; i < n_kv; ++i) {
+ float f;
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
+ (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
+ f = -INFINITY;
+ } else {
+ f = 0;
+ }
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+ }
+ }
+ }
+ }
+
+ {
+ assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
+ float * data = (float *) lctx.inp_sum->data;
+
+ for (int i = 0; i < batch.n_tokens; ++i) {
+ data[i] = 1.0f/float(batch.n_tokens);
+ }
+ }
+
+ if (kv_self.has_shift) {
+ const int64_t n_ctx = cparams.n_ctx;
+
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+
+ for (int i = 0; i < n_ctx; ++i) {
+ data[i] = lctx.kv_self.cells[i].delta;
+ }
+ }
+
+ if (hparams.pooling_layer && cparams.do_pooling) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
+ float * data = (float *) lctx.inp_sum->data;
+
+ memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ data[seq_id*n_tokens + i] = 1.0f;
+ }
+ }
+ }
+
  // decode a batch of tokens by evaluating the transformer
  //
  // - lctx: llama context
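
Two details of llama_set_inputs matter for the new embedding path: with hparams.causal_attn == false the KQ mask only blocks attention across sequences (no future-token masking), and when pooling is active inp_sum is rebuilt as an n_tokens x n_tokens matrix with a 1.0 at (seq_id, i) for every token i of sequence seq_id. A standalone toy sketch of both arrays, simplified so that batch indices stand in for KV cache cells and "i <= j" stands in for the position check of the real code:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_tokens = 5;
        const int seq_id[n_tokens] = { 0, 0, 0, 1, 1 }; // two sequences in one batch
        const bool causal_attn = false;                 // BERT-style, per %s.attention.causal

        // KQ mask: token j may attend to token i only if both belong to the
        // same sequence and, for causal models, i is not in the future.
        std::vector<float> kq_mask(n_tokens * n_tokens, -INFINITY);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_tokens; ++i) {
                if (seq_id[i] == seq_id[j] && (!causal_attn || i <= j)) {
                    kq_mask[j*n_tokens + i] = 0.0f;
                }
            }
        }

        // inp_sum for pooling: row s has a 1.0 for every token of sequence s,
        // so the matmul in build_bert() sums each sequence's token embeddings.
        std::vector<float> inp_sum(n_tokens * n_tokens, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            inp_sum[seq_id[i]*n_tokens + i] = 1.0f;
        }

        // print which tokens i (columns) are visible to each token j (rows)
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_tokens; ++i) {
                std::printf("%c", kq_mask[j*n_tokens + i] == 0.0f ? '.' : 'x');
            }
            std::printf("\n");
        }
        return 0;
    }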
@@ -7265,17 +7538,22 @@ static int llama_decode_internal(
  ggml_backend_sched_reset(lctx.sched);
  ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);

- ggml_cgraph * gf = llama_build_graph(lctx, batch);
+ ggml_cgraph * gf = llama_build_graph(lctx, batch, false);

  // the output is always the last tensor in the graph
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-
- // the embeddings could be the second to last tensor, or the third to last tensor
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
- if (strcmp(embeddings->name, "result_norm") != 0) {
- embeddings = gf->nodes[gf->n_nodes - 3];
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+ if (strcmp(res->name, "result_output") == 0) {
+ // the embeddings could be the second to last tensor, or the third to last tensor
+ if (strcmp(embeddings->name, "result_norm") != 0) {
+ embeddings = gf->nodes[gf->n_nodes - 3];
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+ }
+ } else if (strcmp(res->name, "result_embd") == 0) {
+ embeddings = res;
+ res = nullptr;
+ } else {
+ GGML_ASSERT(false);
  }

  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7285,7 +7563,9 @@ static int llama_decode_internal(
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
  // with the BLAS calls. need a better solution
- if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+ // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+ // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+ if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
  n_threads = std::min(4, n_threads);
  }

@@ -7303,6 +7583,9 @@ static int llama_decode_internal(
  if (lctx.backend_cpu != nullptr) {
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
  }
+
+ llama_set_inputs(lctx, batch);
+
  ggml_backend_sched_graph_compute(lctx.sched, gf);

  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
@@ -7342,7 +7625,7 @@ static int llama_decode_internal(
  // extract logits
  // TODO: do not compute and extract logits if only embeddings are needed
  // need to update the graphs to skip "result_output"
- {
+ if (res) {
  auto & logits_out = lctx.logits;

  #ifndef NDEBUG
@@ -7386,9 +7669,12 @@ static int llama_decode_internal(
  if (!lctx.embedding.empty()) {
  auto & embedding_out = lctx.embedding;

- embedding_out.resize(n_embd);
+ const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
+ const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
+
+ embedding_out.resize(embd_size);
  ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
  ggml_backend_synchronize(embeddings_backend);
  }

@@ -7452,6 +7738,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
  GGML_ASSERT(false);
  return unicode_to_bytes_bpe(token_data.text);
  }
+ case LLAMA_VOCAB_TYPE_WPM: {
+ GGML_ASSERT(false);
+ }
  default:
  GGML_ASSERT(false);
  }
@@ -7462,8 +7751,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
  switch (llama_vocab_get_type(vocab)) {
  case LLAMA_VOCAB_TYPE_SPM: {
  const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
- return vocab.token_to_id.at(buf);
+ auto token = vocab.token_to_id.find(buf);
+ if (token != vocab.token_to_id.end()) {
+ return (*token).second;
+ }
+ // Try to fall back to just the byte as a string
+ const char buf2[2] = { (char)ch, 0 };
+ return vocab.token_to_id.at(buf2);
  }
+ case LLAMA_VOCAB_TYPE_WPM:
  case LLAMA_VOCAB_TYPE_BPE: {
  return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
  }
@@ -7509,7 +7805,7 @@ struct llm_bigram_spm {
  };

  struct llm_tokenizer_spm {
- llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+ llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}

  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  // split string into utf8 chars
@@ -7584,6 +7880,7 @@ private:

  if (p == rev_merge.end()) {
  // output any symbols that did not form tokens as bytes.
+ output.reserve(output.size() + symbol.n);
  for (int j = 0; j < (int)symbol.n; ++j) {
  llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
  output.push_back(token_id);
@@ -7934,29 +8231,230 @@ private:
  llm_bigram_bpe::queue work_queue;
  };

- typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
+ struct llm_tokenizer_wpm {
+ llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
+
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ auto * token_map = &vocab.token_to_id;
+
+ // normalize and split by whitespace
+ std::vector<std::string> words = preprocess(text);
+
+ // bos token prepended already
+
+ // find the longest tokens that form the words
+ for (const std::string &word : words) {
+ // skip empty words
+ if (word.size() == 0) {
+ continue;
+ }
+
+ // prepend phantom space
+ std::string word1 = "\xe2\x96\x81" + word;
+ int n = word1.size();
+
+ // we're at the start of a new word
+ int i = 0;
+ bool match_any = false;
+
+ // move through character position in word
+ while (i < n) {
+ // loop through possible match length
+ bool match = false;
+ for (int j = n; j > i; j--) {
+ auto it = token_map->find(word1.substr(i, j - i));
+ if (it != token_map->end()) {
+ output.push_back(it->second);
+ match = true;
+ match_any = true;
+ i = j;
+ break;
+ }
+ }
+
+ // must be an unknown character
+ if (!match) {
+ i++;
+ }
+ }
+
+ // we didn't find any matches for this word
+ if (!match_any) {
+ output.push_back(vocab.special_unk_id);
+ }
+ }
+
+ // append eos token
+ output.push_back(vocab.special_eos_id);
+ }
+
+ std::vector<std::string> preprocess(const std::string & text) {
+ std::string ori_str = normalize(text);
+ uint64_t ori_size = ori_str.size();
+
+ // single punct / single symbol / single digit
+ // baseline: add whitespace on the left and right of punct and chinese characters
+ std::vector<std::string> words;
+ std::string new_str = "";
+ uint64_t i = 0;
+ while (i < ori_size) {
+ int utf_char_len = utf8_len(ori_str[i]);
+ if ((utf_char_len == 1) && ispunct(ori_str[i])) {
+ new_str += " ";
+ new_str += ori_str[i];
+ new_str += " ";
+ i += 1;
+ }
+ else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
+ new_str += " ";
+ new_str += ori_str.substr(i, 3);
+ new_str += " ";
+ i += 3;
+ }
+ else {
+ new_str += ori_str[i];
+ i += 1;
+ }
+ }
+
+ // split by whitespace
+ uint64_t l = 0;
+ uint64_t r = 0;
+ while (r < new_str.size()) {
+ // if is whitespace
+ if (isspace(new_str[r])) {
+ if (r > l) words.push_back(new_str.substr(l, (r - l)));
+ l = r + 1;
+ r = l;
+ }
+ else {
+ r += 1;
+ }
+ }
+ if (r > l) {
+ words.push_back(new_str.substr(l, (r - l)));
+ }
+ return words;
+ }
+
+ std::string normalize(const std::string & text) {
+ // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
+ std::string text2 = strip_accents(text);
+ for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
+ char c = text2[i];
+ if (c >= 'A' && c <= 'Z') {
+ text2[i] = c - 'A' + 'a';
+ }
+ }
+ return text2;
+ }
+
+ bool is_chinese_char(const std::string & str) {
+ int len = str.length();
+ unsigned int codepoint = 0;
+ int num_bytes = 0;
+ int i = 0;
+ unsigned char ch = static_cast<unsigned char>(str[i]);
+ if (ch <= 0x7f) {
+ codepoint = ch;
+ num_bytes = 1;
+ } else if ((ch >> 5) == 0x06) {
+ codepoint = ch & 0x1f;
+ num_bytes = 2;
+ } else if ((ch >> 4) == 0x0e) {
+ codepoint = ch & 0x0f;
+ num_bytes = 3;
+ } else if ((ch >> 3) == 0x1e) {
+ codepoint = ch & 0x07;
+ num_bytes = 4;
+ }
+ for (int j = 1; j < num_bytes; ++j) {
+ if (i + j >= len) {
+ return false; // incomplete UTF-8 character
+ }
+ unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
+ if ((next_ch >> 6) != 0x02) {
+ return false; // invalid trailing byte
+ }
+ codepoint = (codepoint << 6) | (next_ch & 0x3f);
+ }
+ if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
+ (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
+ (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
+ (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
+ (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
+ (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+ (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
+ (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
+ (codepoint >= 0x3000 && codepoint <= 0x303F) ||
+ (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
+ return true; // NOLINT
+ }
+ return false;
+ }
+
+ std::string strip_accents(const std::string & input_string) {
+ std::string resultString;
+ std::map<std::string, char> accent_map = {
+ {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
+ {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
+ {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
+ {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
+ {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
+ {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
+ {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
+ {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
+ {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
+ };
+
+ for (size_t i = 0; i < input_string.length();) {
+ int len = utf8_len(input_string[i]);
+ std::string curChar = input_string.substr(i, len);
+ auto iter = accent_map.find(curChar);
+ if (iter != accent_map.end()) {
+ resultString += iter->second;
+ } else {
+ resultString += curChar;
+ }
+ i += len;
+ }
+
+ return resultString;
+ }
+
+ static size_t utf8_len(char src) {
+ const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+ return lookup[highbits];
+ }
+
+ const llama_vocab & vocab;
+ };
+
+ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
  FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
  FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
  } FRAGMENT_BUFFER_VARIANT_TYPE;

- struct fragment_buffer_variant{
+ struct fragment_buffer_variant {
  fragment_buffer_variant(llama_vocab::id _token)
  :
  type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
  token(_token),
  raw_text(_dummy),
  offset(0),
- length(0){}
+ length(0) {}
+
  fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
  :
  type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
- token((llama_vocab::id)-1),
+ token((llama_vocab::id) - 1),
  raw_text(_raw_text),
  offset(_offset),
  length(_length){
- GGML_ASSERT( _offset >= 0 );
- GGML_ASSERT( _length >= 1 );
- GGML_ASSERT( offset + length <= raw_text.length() );
+ GGML_ASSERT(_offset >= 0);
+ GGML_ASSERT(_length >= 1);
+ GGML_ASSERT(offset + length <= raw_text.length());
  }

  const FRAGMENT_BUFFER_VARIANT_TYPE type;
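
llm_tokenizer_wpm above implements WordPiece as greedy longest-prefix matching: each whitespace-separated word is prefixed with the phantom-space marker "\xe2\x96\x81" (so the converted vocab is expected to store word-initial pieces in that form) and the longest vocabulary entry starting at the current position is consumed; a word with no match at all becomes the unknown token. A standalone toy illustration of the matching loop over a hypothetical four-entry vocab (the vocab contents and the example word are made up):

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
        // hypothetical vocab fragment; real entries come from vocab.token_to_id
        std::map<std::string, int> vocab = {
            { "\xe2\x96\x81un", 10 },  // phantom space + "un"
            { "aff",            11 },
            { "able",           12 },
            { "\xe2\x96\x81",   13 },  // bare phantom space
        };

        std::string word = "\xe2\x96\x81" "unaffable"; // marker + word, as in tokenize()
        std::vector<int> out;

        int n = (int) word.size();
        int i = 0;
        while (i < n) {
            bool match = false;
            for (int j = n; j > i; j--) {          // longest match first
                auto it = vocab.find(word.substr(i, j - i));
                if (it != vocab.end()) {
                    out.push_back(it->second);
                    i = j;
                    match = true;
                    break;
                }
            }
            if (!match) {
                i++; // skip the unmatched byte, as the real tokenizer does
            }
        }

        for (int id : out) std::cout << id << " ";  // prints: 10 11 12
        std::cout << "\n";
    }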
@@ -7969,8 +8467,7 @@ struct fragment_buffer_variant{

  // #define PRETOKENIZERDEBUG

- static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
- {
+ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
  // for each special token
  for (const auto & st: vocab.special_tokens_cache) {
  const auto & special_token = st.first;
@@ -8081,17 +8578,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  }

  std::forward_list<fragment_buffer_variant> fragment_buffer;
- fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());

- if (special) tokenizer_st_partition( vocab, fragment_buffer );
+ if (special) tokenizer_st_partition(vocab, fragment_buffer);

  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_SPM:
  {
- for (const auto & fragment: fragment_buffer)
- {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
- {
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  // without adding this leading whitespace, we do not get the same results as the original tokenizer

  // TODO: It's likely possible to get rid of this string copy entirely
@@ -8111,19 +8606,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  llm_tokenizer_spm tokenizer(vocab);
  llama_escape_whitespace(raw_text);
  tokenizer.tokenize(raw_text, output);
- }
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- {
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
  }
  }
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
- for (const auto & fragment: fragment_buffer)
- {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
- {
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

  #ifdef PRETOKENIZERDEBUG
@@ -8131,9 +8622,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  #endif
  llm_tokenizer_bpe tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
  }
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- {
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_WPM:
+ {
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+ #endif
+ llm_tokenizer_wpm tokenizer(vocab);
+ tokenizer.tokenize(raw_text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
  }
  }
@@ -9785,6 +10290,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  }
  ++qs.i_ffn_up;
  }
+
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  //}
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
@@ -9844,19 +10350,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  // K-quants
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
- case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
- case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;

  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  }
@@ -9986,7 +10492,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  quantize &= !params->only_copy;

  // do not quantize expert gating tensors
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+ // do not quantize positional embeddings and token types (BERT)
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

  enum ggml_type new_type;
  void * new_data;
@@ -10488,6 +10998,7 @@ struct llama_context_params llama_context_default_params() {
  /*.logits_all =*/ false,
  /*.embedding =*/ false,
  /*.offload_kqv =*/ true,
+ /*.do_pooling =*/ true,
  };

  return result;
@@ -10643,6 +11154,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.yarn_beta_slow = params.yarn_beta_slow;
  cparams.mul_mat_q = params.mul_mat_q;
  cparams.offload_kqv = params.offload_kqv;
+ cparams.do_pooling = params.do_pooling;

  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -10790,14 +11302,14 @@ struct llama_context * llama_new_context_with_model(
  // resized during inference, reserve maximum
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

- if (params.embedding){
+ if (params.embedding) {
  ctx->embedding.resize(hparams.n_embd);
  }

  // graph inputs
  {
  ggml_init_params init_params = {
- /* .mem_size */ ggml_tensor_overhead()*5,
+ /* .mem_size */ ggml_tensor_overhead()*7,
  /* .mem_buffer */ nullptr,
  /* .no_alloc */ true,
  };
@@ -10808,12 +11320,14 @@ struct llama_context * llama_new_context_with_model(
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+ ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);

  ggml_set_name(ctx->inp_tokens, "inp_tokens");
  ggml_set_name(ctx->inp_embd, "inp_embd");
  ggml_set_name(ctx->inp_pos, "inp_pos");
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+ ggml_set_name(ctx->inp_sum, "inp_sum");

  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));

@@ -10839,23 +11353,27 @@ struct llama_context * llama_new_context_with_model(
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

  // build worst-case graph
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
  int n_past = cparams.n_ctx - n_tokens;
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);

  // initialize scheduler with the worst-case graph
- ggml_backend_sched_init_measure(ctx->sched, gf);
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
+ if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
+ LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+ llama_free(ctx);
+ return nullptr;
+ }

- for (ggml_backend_t backend : ctx->backends) {
- ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
+ for (size_t i = 0; i < ctx->backends.size(); i++) {
+ ggml_backend_t backend = ctx->backends[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
+ size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
  LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
- ggml_backend_buffer_name(buf),
- ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+ ggml_backend_buft_name(buft),
+ size / 1024.0 / 1024.0);
  }

  // note: the number of splits during measure is higher than during inference due to the kv shift
@@ -11660,6 +12178,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }

+ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+ return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+ }
+
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
  return model->vocab.id_to_token[token].text.c_str();
  }
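
Together with the embedding and do_pooling context parameters, llama_get_embeddings_ith() gives per-sequence access to the pooled vectors produced by the BERT graph. A minimal usage sketch, assuming the llama.h API of this release (later versions rename some parameters); the model path is a placeholder, and backend initialization and error handling are omitted:

    #include "llama.h"
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("bge-small-en-q8_0.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.embedding  = true;  // allocate the embedding output buffer
        cparams.do_pooling = true;  // keep the model's pooling layer (new in this release)
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        const std::string text = "hello world";
        std::vector<llama_token> tokens(text.size() + 2);
        const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                     tokens.data(), (int) tokens.size(),
                                     /*add_bos =*/ true, /*special =*/ false);
        tokens.resize(n);

        // single sequence (seq_id 0) starting at position 0
        llama_decode(ctx, llama_batch_get_one(tokens.data(), n, 0, 0));

        const float * embd = llama_get_embeddings_ith(ctx, 0); // pooled vector of sequence 0
        std::printf("dim = %d, embd[0] = %f\n", llama_n_embd(model), embd[0]);

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }

For a batch with several sequences, each sequence's pooled vector is read with its own index, matching the seq_id layout written into inp_sum.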
@@ -11744,6 +12266,7 @@ static std::string llama_decode_text(const std::string & text) {
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
  if (0 <= token && token < llama_n_vocab(model)) {
  switch (llama_vocab_get_type(model->vocab)) {
+ case LLAMA_VOCAB_TYPE_WPM:
  case LLAMA_VOCAB_TYPE_SPM: {
  // NOTE: we accept all unsupported token types,
  // suppressing them like CONTROL tokens.
@@ -11867,6 +12390,7 @@ const char * llama_print_system_info(void) {
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+ s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";

  return s.c_str();
  }