llama_cpp 0.12.5 → 0.12.6

@@ -196,6 +196,7 @@ enum llm_arch {
  LLM_ARCH_STARCODER,
  LLM_ARCH_PERSIMMON,
  LLM_ARCH_REFACT,
+ LLM_ARCH_BERT,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -220,6 +221,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_STARCODER, "starcoder" },
  { LLM_ARCH_PERSIMMON, "persimmon" },
  { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
  { LLM_ARCH_BLOOM, "bloom" },
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
@@ -252,6 +254,7 @@ enum llm_kv {
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
  LLM_KV_EXPERT_USED_COUNT,
+ LLM_KV_POOLING_LAYER,

  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -261,6 +264,7 @@ enum llm_kv {
  LLM_KV_ATTENTION_VALUE_LENGTH,
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+ LLM_KV_ATTENTION_CAUSAL,

  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_FREQ_BASE,
@@ -273,6 +277,7 @@ enum llm_kv {
  LLM_KV_TOKENIZER_MODEL,
  LLM_KV_TOKENIZER_LIST,
  LLM_KV_TOKENIZER_TOKEN_TYPE,
+ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
  LLM_KV_TOKENIZER_SCORES,
  LLM_KV_TOKENIZER_MERGES,
  LLM_KV_TOKENIZER_BOS_ID,
@@ -307,6 +312,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
  { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_POOLING_LAYER, "%s.pooling_layer" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -316,6 +322,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -328,6 +335,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
  { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
  { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
  { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
  { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
@@ -355,6 +363,7 @@ struct LLM_KV {
  enum llm_tensor {
  LLM_TENSOR_TOKEN_EMBD,
  LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_TOKEN_TYPES,
  LLM_TENSOR_POS_EMBD,
  LLM_TENSOR_OUTPUT,
  LLM_TENSOR_OUTPUT_NORM,
@@ -536,6 +545,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_BERT,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_BLOOM,
  {
@@ -748,22 +774,37 @@ struct LLM_TN {
  llm_arch arch;

  std::string operator()(llm_tensor tensor) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return LLM_TENSOR_NAMES[arch].at(tensor);
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
  }

  std::string operator()(llm_tensor tensor, int bid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
  }

  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
+ return "__missing__";
+ }
  return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
  }
  };
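Note on the LLM_TN changes above: the helper maps an (architecture, tensor, suffix, block id) tuple to a GGUF tensor name by running the per-architecture template through format(), and the patch makes unknown tensors resolve to "__missing__" instead of throwing. A minimal standalone sketch of that lookup-then-format pattern, using a plain std::map in place of LLM_TENSOR_NAMES (the names below are illustrative, not the library's API):

// sketch only: mirrors the guard-then-format behaviour of the patched operator()
#include <cstdio>
#include <map>
#include <string>

enum sketch_tensor { T_ATTN_Q, T_FFN_GATE };

static std::string tensor_name(const std::map<sketch_tensor, std::string> & names,
                               sketch_tensor t, const std::string & suffix, int bid) {
    auto it = names.find(t);
    if (it == names.end()) {
        return "__missing__";                                 // same fallback as the new code
    }
    char buf[256];
    snprintf(buf, sizeof(buf), it->second.c_str(), bid);      // "blk.%d.attn_q" -> "blk.0.attn_q"
    return std::string(buf) + "." + suffix;
}

int main() {
    const std::map<sketch_tensor, std::string> bert = { { T_ATTN_Q, "blk.%d.attn_q" } };
    printf("%s\n", tensor_name(bert, T_ATTN_Q,   "weight", 0).c_str()); // blk.0.attn_q.weight
    printf("%s\n", tensor_name(bert, T_FFN_GATE, "weight", 0).c_str()); // __missing__
    return 0;
}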
@@ -1440,6 +1481,11 @@ static llama_state g_state;
  // available llama models
  enum e_model {
  MODEL_UNKNOWN,
+ MODEL_17M,
+ MODEL_22M,
+ MODEL_33M,
+ MODEL_109M,
+ MODEL_335M,
  MODEL_0_5B,
  MODEL_1B,
  MODEL_2B,
@@ -1481,6 +1527,7 @@ struct llama_hparams {
  uint32_t n_ff;
  uint32_t n_expert = 0;
  uint32_t n_expert_used = 0;
+ uint32_t n_vocab_type = 0; // for BERT-style token types

  float f_norm_eps;
  float f_norm_rms_eps;
@@ -1493,6 +1540,9 @@ struct llama_hparams {
  float f_clamp_kqv;
  float f_max_alibi_bias;

+ bool causal_attn = true;
+ bool pooling_layer = false;
+

  bool operator!=(const llama_hparams & other) const {
  if (this->vocab_only != other.vocab_only) return true;
@@ -1554,6 +1604,7 @@ struct llama_cparams {

  bool mul_mat_q;
  bool offload_kqv;
+ bool do_pooling;

  ggml_backend_sched_eval_callback cb_eval;
  void * cb_eval_user_data;
@@ -1720,6 +1771,7 @@ struct llama_model {
  llama_vocab vocab;

  struct ggml_tensor * tok_embd;
+ struct ggml_tensor * type_embd;
  struct ggml_tensor * pos_embd;
  struct ggml_tensor * tok_norm;
  struct ggml_tensor * tok_norm_b;
@@ -1839,8 +1891,6 @@ struct llama_context {
  // memory buffers used to evaluate the model
  std::vector<uint8_t> buf_compute_meta;
  ggml_backend_sched_t sched = nullptr;
- // allocator for the input tensors
- ggml_tallocr * alloc = nullptr;

  // input tensors
  ggml_backend_buffer_t buf_input = nullptr;
@@ -1850,6 +1900,7 @@ struct llama_context {
  struct ggml_tensor * inp_pos; // I32 [n_batch]
  struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
  struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
+ struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch]

  #ifdef GGML_USE_MPI
  ggml_mpi_context * ctx_mpi = NULL;
@@ -2829,6 +2880,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
  switch (type) {
  case LLAMA_VOCAB_TYPE_SPM: return "SPM";
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
  default: return "unknown";
  }
  }
@@ -3000,6 +3052,27 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_BERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
+
+ switch (hparams.n_layer) {
+ case 3:
+ model.type = e_model::MODEL_17M; break; // bge-micro
+ case 6:
+ model.type = e_model::MODEL_22M; break; // MiniLM-L6
+ case 12:
+ switch (hparams.n_embd) {
+ case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
+ case 768: model.type = e_model::MODEL_109M; break; // bge-base
+ } break;
+ case 24:
+ model.type = e_model::MODEL_335M; break; // bge-large
+ }
+ } break;
  case LLM_ARCH_BLOOM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
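The LLM_KV entries added earlier are printf-style templates keyed by the architecture name, so for a model whose architecture string is "bert" the keys read back by llm_load_hparams above expand as sketched below (an illustration of the "%s" substitution only, not a call into the real loader; the tokenizer.* keys carry no "%s" and are used verbatim):

// illustrative expansion of the "%s"-templated GGUF keys
#include <cstdio>

int main() {
    const char * arch = "bert";  // LLM_ARCH_NAMES[LLM_ARCH_BERT]
    const char * templates[] = {
        "%s.pooling_layer",                 // LLM_KV_POOLING_LAYER
        "%s.attention.causal",              // LLM_KV_ATTENTION_CAUSAL
        "%s.attention.layer_norm_epsilon",  // LLM_KV_ATTENTION_LAYERNORM_EPS
    };
    for (const char * t : templates) {
        char key[128];
        snprintf(key, sizeof(key), t, arch);
        printf("%s\n", key);  // e.g. "bert.pooling_layer"
    }
    return 0;
}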
@@ -3204,6 +3277,16 @@ static void llm_load_vocab(
  vocab.special_unk_id = -1;
  vocab.special_sep_id = -1;
  vocab.special_pad_id = -1;
+ } else if (tokenizer_name == "bert") {
+ vocab.type = LLAMA_VOCAB_TYPE_WPM;
+
+ // default special tokens
+ vocab.special_bos_id = 101;
+ vocab.special_eos_id = 102;
+ vocab.special_unk_id = 100;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ vocab.add_space_prefix = false;
  } else {
  LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
  LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -3231,7 +3314,14 @@ static void llm_load_vocab(

  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
- vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ try {
+ vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+ } catch (const std::exception & e) {
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+ vocab.linefeed_id = vocab.special_pad_id;
+ }
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+ vocab.linefeed_id = vocab.special_pad_id;
  } else {
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
  GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -3569,6 +3659,7 @@ static bool llm_load_tensors(
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  const int64_t n_embd_gqa = n_embd_v_gqa;
  const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_vocab_type = hparams.n_vocab_type;
  const int64_t n_ff = hparams.n_ff;

  GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
@@ -3783,11 +3874,50 @@ static bool llm_load_tensors(
  layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
  }
  } break;
- case LLM_ARCH_BLOOM:
+ case LLM_ARCH_BERT:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
- model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ }
+ } break;
+ case LLM_ARCH_BLOOM:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});

  // output
  {
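The BERT branch above loads three embedding tables: token embeddings of shape {n_embd, n_vocab}, token-type embeddings of shape {n_embd, n_vocab_type} and absolute position embeddings of shape {n_embd, n_ctx_train}. The graph built later in build_bert adds one row from each table per input token, with the token type hardcoded to row 0. A toy CPU sketch of that sum, with made-up dimensions and no ggml types involved:

// toy illustration of the BERT input embedding: token row + type row 0 + position row
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4, n_vocab = 10, n_ctx = 8, n_types = 2;
    std::vector<float> tok_embd(n_vocab * n_embd, 0.1f);   // [n_vocab][n_embd], row-major
    std::vector<float> type_embd(n_types * n_embd, 0.2f);  // row 0 = "Sentence A"
    std::vector<float> pos_embd(n_ctx * n_embd, 0.3f);     // absolute positions

    const int token = 7, pos = 3;
    std::vector<float> inp(n_embd);
    for (int d = 0; d < n_embd; ++d) {
        inp[d] = tok_embd[token*n_embd + d] + type_embd[0*n_embd + d] + pos_embd[pos*n_embd + d];
    }
    printf("inp[0] = %.2f\n", inp[0]);  // 0.60 with these toy values
    return 0;
}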
@@ -4259,9 +4389,21 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

  model.hparams.vocab_only = params.vocab_only;

- llm_load_arch (ml, model);
- llm_load_hparams(ml, model);
- llm_load_vocab (ml, model);
+ try {
+ llm_load_arch(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
+ }
+ try {
+ llm_load_hparams(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+ }
+ try {
+ llm_load_vocab(ml, model);
+ } catch(const std::exception & e) {
+ throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+ }

  llm_load_print_meta(ml, model);

@@ -4739,6 +4881,7 @@ struct llm_build_context {
  const int32_t n_orig_ctx;

  const bool do_rope_shift;
+ const bool do_pooling;

  const llm_build_cb & cb;

@@ -4782,6 +4925,7 @@ struct llm_build_context {
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
  n_orig_ctx (cparams.n_yarn_orig_ctx),
  do_rope_shift (worst_case || kv_self.has_shift),
+ do_pooling (hparams.pooling_layer && cparams.do_pooling),
  cb (cb),
  buf_compute_meta (lctx.buf_compute_meta) {
  // all initializations should be done in init()
@@ -5625,6 +5769,103 @@ struct llm_build_context {
  return gf;
  }

+ struct ggml_cgraph * build_bert() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ // get input vectors with right size
+ const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+ struct ggml_tensor * inp_sum = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
+
+ // construct input embeddings (token, type, position)
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+
+ // token types are hardcoded to zero ("Sentence A")
+ struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+ inpL = ggml_add(ctx0, inpL, type_row0);
+ inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+ cb(inpL, "inp_embd", -1);
+
+ // embed layer norm
+ inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+ cb(inpL, "inp_norm", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+ cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
+
+ // iterate layers
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * cur = inpL;
+
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+
+ // seems like we just need to do this for Q?
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ // re-add the layer input
+ cur = ggml_add(ctx0, cur, inpL);
+
+ // attention layer norm
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
+
+ struct ggml_tensor * ffn_inp = cur;
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+
+ // attentions bypass the intermediate layer
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // output layer norm
+ cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ // final output
+ cur = inpL;
+
+ // pooling layer
+ if (do_pooling) {
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
+ }
+ cb(cur, "result_embd", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  struct ggml_cgraph * build_bloom() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

@@ -6996,12 +7237,10 @@ struct llm_build_context {

  static struct ggml_cgraph * llama_build_graph(
  llama_context & lctx,
- const llama_batch & batch) {
+ const llama_batch & batch,
+ bool worst_case) {
  const auto & model = lctx.model;

- // check if we should build the worst-case graph (for memory measurement)
- const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
-
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
  if (il >= 0) {
@@ -7022,67 +7261,6 @@ static struct ggml_cgraph * llama_build_graph(

  struct llm_build_context llm(lctx, batch, cb, worst_case);

- //
- // set input data
- //
-
- if (!ggml_tallocr_is_measure(lctx.alloc)) {
- if (batch.token) {
- const int64_t n_tokens = batch.n_tokens;
-
- ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
- }
-
- if (batch.embd) {
- const int64_t n_embd = llm.n_embd;
- const int64_t n_tokens = batch.n_tokens;
-
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
- }
-
- if (batch.pos) {
- const int64_t n_tokens = batch.n_tokens;
-
- ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
- }
-
- {
- const int64_t n_kv = llm.n_kv;
- const int64_t n_tokens = batch.n_tokens;
-
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
- float * data = (float *) lctx.inp_KQ_mask->data;
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const llama_pos pos = batch.pos[j];
- const llama_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- float f;
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
- f = -INFINITY;
- } else {
- f = 0;
- }
- data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
- }
- }
- }
- }
-
- if (llm.do_rope_shift) {
- const int64_t n_ctx = llm.n_ctx;
-
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
- int32_t * data = (int32_t *) lctx.inp_K_shift->data;
-
- for (int i = 0; i < n_ctx; ++i) {
- data[i] = lctx.kv_self.cells[i].delta;
- }
- }
- }
-
  llm.init();

  switch (model.arch) {
@@ -7110,6 +7288,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm.build_refact();
  } break;
+ case LLM_ARCH_BERT:
+ {
+ result = llm.build_bert();
+ } break;
  case LLM_ARCH_BLOOM:
  {
  result = llm.build_bloom();
@@ -7167,6 +7349,97 @@ static struct ggml_cgraph * llama_build_graph(
  return result;
  }

+ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+ //
+ // set input data
+ //
+
+ const auto & hparams = lctx.model.hparams;
+ const auto & cparams = lctx.cparams;
+ const auto & kv_self = lctx.kv_self;
+
+ if (batch.token) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
+ }
+
+ if (batch.embd) {
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_tokens = batch.n_tokens;
+
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
+
+ if (batch.pos) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+ }
+
+ {
+ const int64_t n_kv = kv_self.n;
+ const int64_t n_tokens = batch.n_tokens;
+
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+
+ float * data = (float *) lctx.inp_KQ_mask->data;
+
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ const llama_pos pos = batch.pos[j];
+ const llama_seq_id seq_id = batch.seq_id[j][0];
+
+ for (int i = 0; i < n_kv; ++i) {
+ float f;
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
+ (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
+ f = -INFINITY;
+ } else {
+ f = 0;
+ }
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+ }
+ }
+ }
+ }
+
+ {
+ assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
+ float * data = (float *) lctx.inp_sum->data;
+
+ for (int i = 0; i < batch.n_tokens; ++i) {
+ data[i] = 1.0f/float(batch.n_tokens);
+ }
+ }
+
+ if (kv_self.has_shift) {
+ const int64_t n_ctx = cparams.n_ctx;
+
+ assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+
+ int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+
+ for (int i = 0; i < n_ctx; ++i) {
+ data[i] = lctx.kv_self.cells[i].delta;
+ }
+ }
+
+ if (hparams.pooling_layer && cparams.do_pooling) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
+ float * data = (float *) lctx.inp_sum->data;
+
+ memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ data[seq_id*n_tokens + i] = 1.0f;
+ }
+ }
+ }
+
  // decode a batch of tokens by evaluating the transformer
  //
  // - lctx: llama context
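llama_set_inputs above also fills the new inp_sum tensor: each row holds one weight per token (a uniform 1/n_tokens fill by default, and a 0/1 row per sequence when the pooling layer is active), and build_bert multiplies the transposed hidden states by this matrix, so every pooled vector is just a weighted sum of the token embeddings. A small standalone sketch of that product with toy numbers, no ggml involved:

// toy version of the pooling matmul in build_bert: pooled = weights x hidden
#include <cstdio>

int main() {
    const int n_tokens = 3, n_embd = 2;
    const float hidden[3][2] = { {1, 2}, {3, 4}, {5, 6} };  // one row per token
    const float weights[3]   = { 1.0f/3, 1.0f/3, 1.0f/3 };  // 1/n -> mean; 0/1 -> per-sequence sum

    float pooled[2] = { 0, 0 };
    for (int d = 0; d < n_embd; ++d) {
        for (int t = 0; t < n_tokens; ++t) {
            pooled[d] += weights[t] * hidden[t][d];
        }
    }
    printf("pooled = (%.2f, %.2f)\n", pooled[0], pooled[1]); // (3.00, 4.00)
    return 0;
}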
@@ -7265,17 +7538,22 @@ static int llama_decode_internal(
  ggml_backend_sched_reset(lctx.sched);
  ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);

- ggml_cgraph * gf = llama_build_graph(lctx, batch);
+ ggml_cgraph * gf = llama_build_graph(lctx, batch, false);

  // the output is always the last tensor in the graph
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-
- // the embeddings could be the second to last tensor, or the third to last tensor
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
- if (strcmp(embeddings->name, "result_norm") != 0) {
- embeddings = gf->nodes[gf->n_nodes - 3];
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+ if (strcmp(res->name, "result_output") == 0) {
+ // the embeddings could be the second to last tensor, or the third to last tensor
+ if (strcmp(embeddings->name, "result_norm") != 0) {
+ embeddings = gf->nodes[gf->n_nodes - 3];
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+ }
+ } else if (strcmp(res->name, "result_embd") == 0) {
+ embeddings = res;
+ res = nullptr;
+ } else {
+ GGML_ASSERT(false);
  }

  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -7285,7 +7563,9 @@ static int llama_decode_internal(
  // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
  // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
  // with the BLAS calls. need a better solution
- if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+ // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+ // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+ if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
  n_threads = std::min(4, n_threads);
  }

@@ -7303,6 +7583,9 @@ static int llama_decode_internal(
  if (lctx.backend_cpu != nullptr) {
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
  }
+
+ llama_set_inputs(lctx, batch);
+
  ggml_backend_sched_graph_compute(lctx.sched, gf);

  // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
@@ -7342,7 +7625,7 @@ static int llama_decode_internal(
  // extract logits
  // TODO: do not compute and extract logits if only embeddings are needed
  // need to update the graphs to skip "result_output"
- {
+ if (res) {
  auto & logits_out = lctx.logits;

  #ifndef NDEBUG
@@ -7386,9 +7669,12 @@ static int llama_decode_internal(
  if (!lctx.embedding.empty()) {
  auto & embedding_out = lctx.embedding;

- embedding_out.resize(n_embd);
+ const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
+ const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
+
+ embedding_out.resize(embd_size);
  ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
  ggml_backend_synchronize(embeddings_backend);
  }

@@ -7452,6 +7738,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
  GGML_ASSERT(false);
  return unicode_to_bytes_bpe(token_data.text);
  }
+ case LLAMA_VOCAB_TYPE_WPM: {
+ GGML_ASSERT(false);
+ }
  default:
  GGML_ASSERT(false);
  }
@@ -7462,8 +7751,15 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
  switch (llama_vocab_get_type(vocab)) {
  case LLAMA_VOCAB_TYPE_SPM: {
  const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
- return vocab.token_to_id.at(buf);
+ auto token = vocab.token_to_id.find(buf);
+ if (token != vocab.token_to_id.end()) {
+ return (*token).second;
+ }
+ // Try to fall back to just the byte as a string
+ const char buf2[2] = { (char)ch, 0 };
+ return vocab.token_to_id.at(buf2);
  }
+ case LLAMA_VOCAB_TYPE_WPM:
  case LLAMA_VOCAB_TYPE_BPE: {
  return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
  }
@@ -7509,7 +7805,7 @@ struct llm_bigram_spm {
  };

  struct llm_tokenizer_spm {
- llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+ llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}

  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  // split string into utf8 chars
@@ -7584,6 +7880,7 @@ private:

  if (p == rev_merge.end()) {
  // output any symbols that did not form tokens as bytes.
+ output.reserve(output.size() + symbol.n);
  for (int j = 0; j < (int)symbol.n; ++j) {
  llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
  output.push_back(token_id);
@@ -7934,29 +8231,230 @@ private:
  llm_bigram_bpe::queue work_queue;
  };

- typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
+ struct llm_tokenizer_wpm {
+ llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
+
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ auto * token_map = &vocab.token_to_id;
+
+ // normalize and split by whitespace
+ std::vector<std::string> words = preprocess(text);
+
+ // bos token prepended already
+
+ // find the longest tokens that form the words
+ for (const std::string &word : words) {
+ // skip empty words
+ if (word.size() == 0) {
+ continue;
+ }
+
+ // prepend phantom space
+ std::string word1 = "\xe2\x96\x81" + word;
+ int n = word1.size();
+
+ // we're at the start of a new word
+ int i = 0;
+ bool match_any = false;
+
+ // move through character position in word
+ while (i < n) {
+ // loop through possible match length
+ bool match = false;
+ for (int j = n; j > i; j--) {
+ auto it = token_map->find(word1.substr(i, j - i));
+ if (it != token_map->end()) {
+ output.push_back(it->second);
+ match = true;
+ match_any = true;
+ i = j;
+ break;
+ }
+ }
+
+ // must be an unknown character
+ if (!match) {
+ i++;
+ }
+ }
+
+ // we didn't find any matches for this word
+ if (!match_any) {
+ output.push_back(vocab.special_unk_id);
+ }
+ }
+
+ // append eos token
+ output.push_back(vocab.special_eos_id);
+ }
+
+ std::vector<std::string> preprocess(const std::string & text) {
+ std::string ori_str = normalize(text);
+ uint64_t ori_size = ori_str.size();
+
+ // single punct / single symbol / single digit
+ // baseline: add whitespace on the left and right of punct and chinese characters
+ std::vector<std::string> words;
+ std::string new_str = "";
+ uint64_t i = 0;
+ while (i < ori_size) {
+ int utf_char_len = utf8_len(ori_str[i]);
+ if ((utf_char_len == 1) && ispunct(ori_str[i])) {
+ new_str += " ";
+ new_str += ori_str[i];
+ new_str += " ";
+ i += 1;
+ }
+ else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
+ new_str += " ";
+ new_str += ori_str.substr(i, 3);
+ new_str += " ";
+ i += 3;
+ }
+ else {
+ new_str += ori_str[i];
+ i += 1;
+ }
+ }
+
+ // split by whitespace
+ uint64_t l = 0;
+ uint64_t r = 0;
+ while (r < new_str.size()) {
+ // if is whitespace
+ if (isspace(new_str[r])) {
+ if (r > l) words.push_back(new_str.substr(l, (r - l)));
+ l = r + 1;
+ r = l;
+ }
+ else {
+ r += 1;
+ }
+ }
+ if (r > l) {
+ words.push_back(new_str.substr(l, (r - l)));
+ }
+ return words;
+ }
+
+ std::string normalize(const std::string & text) {
+ // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
+ std::string text2 = strip_accents(text);
+ for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
+ char c = text2[i];
+ if (c >= 'A' && c <= 'Z') {
+ text2[i] = c - 'A' + 'a';
+ }
+ }
+ return text2;
+ }
+
+ bool is_chinese_char(const std::string & str) {
+ int len = str.length();
+ unsigned int codepoint = 0;
+ int num_bytes = 0;
+ int i = 0;
+ unsigned char ch = static_cast<unsigned char>(str[i]);
+ if (ch <= 0x7f) {
+ codepoint = ch;
+ num_bytes = 1;
+ } else if ((ch >> 5) == 0x06) {
+ codepoint = ch & 0x1f;
+ num_bytes = 2;
+ } else if ((ch >> 4) == 0x0e) {
+ codepoint = ch & 0x0f;
+ num_bytes = 3;
+ } else if ((ch >> 3) == 0x1e) {
+ codepoint = ch & 0x07;
+ num_bytes = 4;
+ }
+ for (int j = 1; j < num_bytes; ++j) {
+ if (i + j >= len) {
+ return false; // incomplete UTF-8 character
+ }
+ unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
+ if ((next_ch >> 6) != 0x02) {
+ return false; // invalid trailing byte
+ }
+ codepoint = (codepoint << 6) | (next_ch & 0x3f);
+ }
+ if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
+ (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
+ (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
+ (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
+ (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
+ (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+ (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
+ (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
+ (codepoint >= 0x3000 && codepoint <= 0x303F) ||
+ (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
+ return true; // NOLINT
+ }
+ return false;
+ }
+
+ std::string strip_accents(const std::string & input_string) {
+ std::string resultString;
+ std::map<std::string, char> accent_map = {
+ {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
+ {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
+ {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
+ {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
+ {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
+ {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
+ {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
+ {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
+ {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
+ };
+
+ for (size_t i = 0; i < input_string.length();) {
+ int len = utf8_len(input_string[i]);
+ std::string curChar = input_string.substr(i, len);
+ auto iter = accent_map.find(curChar);
+ if (iter != accent_map.end()) {
+ resultString += iter->second;
+ } else {
+ resultString += curChar;
+ }
+ i += len;
+ }
+
+ return resultString;
+ }
+
+ static size_t utf8_len(char src) {
+ const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+ return lookup[highbits];
+ }
+
+ const llama_vocab & vocab;
+ };
+
+ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
  FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
  FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
  } FRAGMENT_BUFFER_VARIANT_TYPE;

- struct fragment_buffer_variant{
+ struct fragment_buffer_variant {
  fragment_buffer_variant(llama_vocab::id _token)
  :
  type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
  token(_token),
  raw_text(_dummy),
  offset(0),
- length(0){}
+ length(0) {}
+
  fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
  :
  type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
- token((llama_vocab::id)-1),
+ token((llama_vocab::id) - 1),
  raw_text(_raw_text),
  offset(_offset),
  length(_length){
- GGML_ASSERT( _offset >= 0 );
- GGML_ASSERT( _length >= 1 );
- GGML_ASSERT( offset + length <= raw_text.length() );
+ GGML_ASSERT(_offset >= 0);
+ GGML_ASSERT(_length >= 1);
+ GGML_ASSERT(offset + length <= raw_text.length());
  }

  const FRAGMENT_BUFFER_VARIANT_TYPE type;
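The llm_tokenizer_wpm added above implements greedy longest-match WordPiece: each whitespace-separated word gets the phantom-space prefix and is consumed left to right by the longest vocabulary entry found at the current position, falling back to the unknown token when nothing in the word matches at all. A compact sketch of that matching loop against a toy vocabulary (illustrative names, not the llama.cpp types):

// greedy longest-match lookup, as in llm_tokenizer_wpm::tokenize
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // toy vocab; "\xe2\x96\x81" is the phantom-space marker prepended to each word
    std::map<std::string, int> vocab = {
        { "\xe2\x96\x81un", 1 }, { "believ", 2 }, { "able", 3 },
    };
    const int unk_id = 0;

    std::string word1 = std::string("\xe2\x96\x81") + "unbelievable";
    std::vector<int> out;
    int n = (int) word1.size();
    int i = 0;
    bool match_any = false;
    while (i < n) {
        bool match = false;
        for (int j = n; j > i; j--) {                 // try the longest substring first
            auto it = vocab.find(word1.substr(i, j - i));
            if (it != vocab.end()) {
                out.push_back(it->second);
                match = match_any = true;
                i = j;
                break;
            }
        }
        if (!match) {
            i++;                                      // skip an unmatched character
        }
    }
    if (!match_any) {
        out.push_back(unk_id);
    }
    for (int id : out) printf("%d ", id);             // prints: 1 2 3
    printf("\n");
    return 0;
}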
@@ -7969,8 +8467,7 @@ struct fragment_buffer_variant{

  // #define PRETOKENIZERDEBUG

- static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
- {
+ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
  // for each special token
  for (const auto & st: vocab.special_tokens_cache) {
  const auto & special_token = st.first;
@@ -8081,17 +8578,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  }

  std::forward_list<fragment_buffer_variant> fragment_buffer;
- fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());

- if (special) tokenizer_st_partition( vocab, fragment_buffer );
+ if (special) tokenizer_st_partition(vocab, fragment_buffer);

  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_SPM:
  {
- for (const auto & fragment: fragment_buffer)
- {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
- {
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  // without adding this leading whitespace, we do not get the same results as the original tokenizer

  // TODO: It's likely possible to get rid of this string copy entirely
@@ -8111,19 +8606,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  llm_tokenizer_spm tokenizer(vocab);
  llama_escape_whitespace(raw_text);
  tokenizer.tokenize(raw_text, output);
- }
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- {
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
  }
  }
  } break;
  case LLAMA_VOCAB_TYPE_BPE:
  {
- for (const auto & fragment: fragment_buffer)
- {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
- {
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

  #ifdef PRETOKENIZERDEBUG
@@ -8131,9 +8622,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  #endif
  llm_tokenizer_bpe tokenizer(vocab);
  tokenizer.tokenize(raw_text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
  }
- else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- {
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_WPM:
+ {
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+ #endif
+ llm_tokenizer_wpm tokenizer(vocab);
+ tokenizer.tokenize(raw_text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
  output.push_back(fragment.token);
  }
  }
  }
@@ -9785,6 +10290,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
  }
  ++qs.i_ffn_up;
  }
+
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  //}
  // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
@@ -9844,19 +10350,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

  // K-quants
  case LLAMA_FTYPE_MOSTLY_Q2_K_S:
- case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
  case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
  case LLAMA_FTYPE_MOSTLY_Q4_K_S:
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
- case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
- case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;

  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  }
@@ -9986,7 +10492,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  quantize &= !params->only_copy;

  // do not quantize expert gating tensors
- quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+ // do not quantize positional embeddings and token types (BERT)
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
+ quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

  enum ggml_type new_type;
  void * new_data;
@@ -10488,6 +10998,7 @@ struct llama_context_params llama_context_default_params() {
  /*.logits_all =*/ false,
  /*.embedding =*/ false,
  /*.offload_kqv =*/ true,
+ /*.do_pooling =*/ true,
  };

  return result;
@@ -10643,6 +11154,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.yarn_beta_slow = params.yarn_beta_slow;
  cparams.mul_mat_q = params.mul_mat_q;
  cparams.offload_kqv = params.offload_kqv;
+ cparams.do_pooling = params.do_pooling;

  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -10790,14 +11302,14 @@ struct llama_context * llama_new_context_with_model(
  // resized during inference, reserve maximum
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

- if (params.embedding){
+ if (params.embedding) {
  ctx->embedding.resize(hparams.n_embd);
  }

  // graph inputs
  {
  ggml_init_params init_params = {
- /* .mem_size */ ggml_tensor_overhead()*5,
+ /* .mem_size */ ggml_tensor_overhead()*7,
  /* .mem_buffer */ nullptr,
  /* .no_alloc */ true,
  };
@@ -10808,12 +11320,14 @@ struct llama_context * llama_new_context_with_model(
  ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
  ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
  ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+ ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);

  ggml_set_name(ctx->inp_tokens, "inp_tokens");
  ggml_set_name(ctx->inp_embd, "inp_embd");
  ggml_set_name(ctx->inp_pos, "inp_pos");
  ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
  ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+ ggml_set_name(ctx->inp_sum, "inp_sum");

  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));

@@ -10839,23 +11353,27 @@ struct llama_context * llama_new_context_with_model(
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

  // build worst-case graph
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
  int n_past = cparams.n_ctx - n_tokens;
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
- ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
+ ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);

  // initialize scheduler with the worst-case graph
- ggml_backend_sched_init_measure(ctx->sched, gf);
- ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
+ if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
+ LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+ llama_free(ctx);
+ return nullptr;
+ }

- for (ggml_backend_t backend : ctx->backends) {
- ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
+ for (size_t i = 0; i < ctx->backends.size(); i++) {
+ ggml_backend_t backend = ctx->backends[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
+ size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
  LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
- ggml_backend_buffer_name(buf),
- ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+ ggml_backend_buft_name(buft),
+ size / 1024.0 / 1024.0);
  }

  // note: the number of splits during measure is higher than during inference due to the kv shift
@@ -11660,6 +12178,10 @@ float * llama_get_embeddings(struct llama_context * ctx) {
  return ctx->embedding.data();
  }

+ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+ return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
+ }
+
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
  return model->vocab.id_to_token[token].text.c_str();
  }
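The new llama_get_embeddings_ith accessor indexes into the flat buffer that llama_decode_internal now fills with one n_embd-sized vector per token whenever the graph ends in result_embd. A hedged usage sketch (it assumes a context created with embeddings enabled and an already-decoded batch of n_tokens tokens; error handling omitted):

// sketch: gather one embedding vector per token after llama_decode()
#include <vector>
#include "llama.h"

static std::vector<std::vector<float>> collect_embeddings(llama_context * ctx, int n_tokens, int n_embd) {
    std::vector<std::vector<float>> out;
    out.reserve(n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
        // returns ctx->embedding.data() + i*n_embd, as implemented above
        const float * row = llama_get_embeddings_ith(ctx, i);
        out.emplace_back(row, row + n_embd);
    }
    return out;
}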
@@ -11744,6 +12266,7 @@ static std::string llama_decode_text(const std::string & text) {
  int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
  if (0 <= token && token < llama_n_vocab(model)) {
  switch (llama_vocab_get_type(model->vocab)) {
+ case LLAMA_VOCAB_TYPE_WPM:
  case LLAMA_VOCAB_TYPE_SPM: {
  // NOTE: we accept all unsupported token types,
  // suppressing them like CONTROL tokens.
@@ -11867,6 +12390,7 @@ const char * llama_print_system_info(void) {
  s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
  s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
  s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+ s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";

  return s.c_str();
  }