llama_cpp 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -104,6 +104,7 @@
104
104
  #define LLAMA_MAX_NODES 8192
105
105
  #define LLAMA_MAX_EXPERTS 8
106
106
 
107
+
107
108
  //
108
109
  // logging
109
110
  //
@@ -211,10 +212,11 @@ enum llm_arch {
211
212
  LLM_ARCH_INTERNLM2,
212
213
  LLM_ARCH_MINICPM,
213
214
  LLM_ARCH_GEMMA,
215
+ LLM_ARCH_STARCODER2,
214
216
  LLM_ARCH_UNKNOWN,
215
217
  };
216
218
 
217
- static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
219
+ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
218
220
  { LLM_ARCH_LLAMA, "llama" },
219
221
  { LLM_ARCH_FALCON, "falcon" },
220
222
  { LLM_ARCH_GPT2, "gpt2" },
@@ -238,6 +240,8 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
238
240
  { LLM_ARCH_INTERNLM2, "internlm2" },
239
241
  { LLM_ARCH_MINICPM, "minicpm" },
240
242
  { LLM_ARCH_GEMMA, "gemma" },
243
+ { LLM_ARCH_STARCODER2, "starcoder2" },
244
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
241
245
  };
242
246
 
243
247
  enum llm_kv {
@@ -298,7 +302,7 @@ enum llm_kv {
298
302
  LLM_KV_TOKENIZER_RWKV,
299
303
  };
300
304
 
301
- static std::map<llm_kv, const char *> LLM_KV_NAMES = {
305
+ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
302
306
  { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
303
307
  { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
304
308
  { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
@@ -362,7 +366,7 @@ struct LLM_KV {
362
366
  llm_arch arch;
363
367
 
364
368
  std::string operator()(llm_kv kv) const {
365
- return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
369
+ return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
366
370
  }
367
371
  };
368
372
 
@@ -397,7 +401,7 @@ enum llm_tensor {
397
401
  LLM_TENSOR_LAYER_OUT_NORM,
398
402
  };
399
403
 
400
- static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
404
+ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
401
405
  {
402
406
  LLM_ARCH_LLAMA,
403
407
  {
@@ -779,6 +783,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
779
783
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
780
784
  },
781
785
  },
786
+ {
787
+ LLM_ARCH_STARCODER2,
788
+ {
789
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
790
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
791
+ { LLM_TENSOR_OUTPUT, "output" },
792
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
793
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
794
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
795
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
796
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
797
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
798
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
799
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
800
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
801
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
802
+ },
803
+ },
782
804
  {
783
805
  LLM_ARCH_UNKNOWN,
784
806
  {
@@ -812,38 +834,38 @@ struct LLM_TN {
812
834
  llm_arch arch;
813
835
 
814
836
  std::string operator()(llm_tensor tensor) const {
815
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
837
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
816
838
  return "__missing__";
817
839
  }
818
- return LLM_TENSOR_NAMES[arch].at(tensor);
840
+ return LLM_TENSOR_NAMES.at(arch).at(tensor);
819
841
  }
820
842
 
821
843
  std::string operator()(llm_tensor tensor, const std::string & suffix) const {
822
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
844
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
823
845
  return "__missing__";
824
846
  }
825
- return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
847
+ return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
826
848
  }
827
849
 
828
850
  std::string operator()(llm_tensor tensor, int bid) const {
829
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
851
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
830
852
  return "__missing__";
831
853
  }
832
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
854
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
833
855
  }
834
856
 
835
857
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
836
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
858
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
837
859
  return "__missing__";
838
860
  }
839
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
861
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
840
862
  }
841
863
 
842
864
  std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
843
- if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) {
865
+ if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
844
866
  return "__missing__";
845
867
  }
846
- return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
868
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
847
869
  }
848
870
  };
849
871
 
@@ -851,16 +873,16 @@ struct LLM_TN {
851
873
  // gguf helpers
852
874
  //
853
875
 
854
- static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
876
+ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
855
877
  { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
856
878
  { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
857
879
  { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
858
880
  };
859
881
 
860
- static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
882
+ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
861
883
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
862
884
  if (kv.second == name) {
863
- return kv.first;
885
+ return (llama_rope_scaling_type) kv.first;
864
886
  }
865
887
  }
866
888
 
@@ -1409,7 +1431,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
1409
1431
  buft = ggml_backend_cuda_host_buffer_type();
1410
1432
  }
1411
1433
  #elif defined(GGML_USE_SYCL)
1412
- buft = ggml_backend_sycl_host_buffer_type();
1434
+ if (host_buffer) {
1435
+ buft = ggml_backend_sycl_host_buffer_type();
1436
+ }
1413
1437
  #elif defined(GGML_USE_CPU_HBM)
1414
1438
  buft = ggml_backend_cpu_hbm_buffer_type();
1415
1439
  #elif defined(GGML_USE_VULKAN)
@@ -1463,6 +1487,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1463
1487
  }
1464
1488
  #endif
1465
1489
 
1490
+ #ifdef GGML_USE_SYCL
1491
+ if (ggml_backend_sycl_get_device_count() > 1) {
1492
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
1493
+ }
1494
+ #endif
1495
+
1466
1496
  if (buft == nullptr) {
1467
1497
  buft = llama_default_buffer_type_offload(fallback_gpu);
1468
1498
  }
@@ -1474,6 +1504,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
1474
1504
  static size_t llama_get_device_count() {
1475
1505
  #if defined(GGML_USE_CUBLAS)
1476
1506
  return ggml_backend_cuda_get_device_count();
1507
+ #elif defined(GGML_USE_SYCL)
1508
+ return ggml_backend_sycl_get_device_count();
1477
1509
  #elif defined(GGML_USE_VULKAN)
1478
1510
  return ggml_backend_vk_get_device_count();
1479
1511
  #else
@@ -1487,6 +1519,11 @@ static size_t llama_get_device_memory(int device) {
1487
1519
  size_t free;
1488
1520
  ggml_backend_cuda_get_device_memory(device, &total, &free);
1489
1521
  return free;
1522
+ #elif defined(GGML_USE_SYCL)
1523
+ size_t total;
1524
+ size_t free;
1525
+ ggml_backend_sycl_get_device_memory(device, &total, &free);
1526
+ return free;
1490
1527
  #elif defined(GGML_USE_VULKAN)
1491
1528
  size_t total;
1492
1529
  size_t free;
@@ -1575,7 +1612,6 @@ struct llama_hparams {
1575
1612
  float rope_freq_base_train;
1576
1613
  float rope_freq_scale_train;
1577
1614
  uint32_t n_yarn_orig_ctx;
1578
- int32_t rope_scaling_type_train;
1579
1615
 
1580
1616
  float f_clamp_kqv = 0.0f;
1581
1617
  float f_max_alibi_bias = 0.0f;
@@ -1583,8 +1619,9 @@ struct llama_hparams {
1583
1619
  bool causal_attn = true;
1584
1620
  bool need_kq_pos = false;
1585
1621
 
1586
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1587
- enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1622
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
1623
+ enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
1624
+ enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
1588
1625
 
1589
1626
  bool operator!=(const llama_hparams & other) const {
1590
1627
  if (this->vocab_only != other.vocab_only) return true;
@@ -1628,13 +1665,13 @@ struct llama_hparams {
1628
1665
  };
1629
1666
 
1630
1667
  struct llama_cparams {
1631
- uint32_t n_ctx; // context size used during inference
1668
+ uint32_t n_ctx; // context size used during inference
1632
1669
  uint32_t n_batch;
1633
1670
  uint32_t n_threads; // number of threads to use for generation
1634
1671
  uint32_t n_threads_batch; // number of threads to use for batch processing
1635
1672
 
1636
- float rope_freq_base;
1637
- float rope_freq_scale;
1673
+ float rope_freq_base;
1674
+ float rope_freq_scale;
1638
1675
 
1639
1676
  uint32_t n_yarn_orig_ctx;
1640
1677
  // These hyperparameters are not exposed in GGUF, because all
@@ -1645,8 +1682,10 @@ struct llama_cparams {
1645
1682
  float yarn_beta_slow;
1646
1683
  float defrag_thold;
1647
1684
 
1685
+ bool embeddings;
1648
1686
  bool offload_kqv;
1649
- bool do_pooling;
1687
+
1688
+ enum llama_pooling_type pooling_type;
1650
1689
 
1651
1690
  ggml_backend_sched_eval_callback cb_eval;
1652
1691
  void * cb_eval_user_data;
@@ -1935,7 +1974,7 @@ struct llama_context {
1935
1974
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1936
1975
  int32_t n_eval = 0; // number of eval calls
1937
1976
 
1938
- // decode output (2-dimensional array: [n_tokens][n_vocab])
1977
+ // logits output (2-dimensional array: [n_tokens][n_vocab])
1939
1978
  std::vector<float> logits;
1940
1979
  #ifndef NDEBUG
1941
1980
  // guard against access to unset logits
@@ -1943,13 +1982,21 @@ struct llama_context {
1943
1982
  #endif
1944
1983
  bool logits_all = false;
1945
1984
 
1946
- // input embedding (1-dimensional array: [n_embd])
1947
- std::vector<float> embedding;
1985
+ // embeddings output (2-dimensional array: [n_tokens][n_embd])
1986
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
1987
+ std::vector<float> embd;
1988
+
1989
+ // sequence embeddings output (map of [n_embd] vectors)
1990
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
1991
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
1948
1992
 
1949
1993
  // memory buffers used to evaluate the model
1950
1994
  std::vector<uint8_t> buf_compute_meta;
1951
1995
  ggml_backend_sched_t sched = nullptr;
1952
1996
 
1997
+ ggml_abort_callback abort_callback = nullptr;
1998
+ void * abort_callback_data = nullptr;
1999
+
1953
2000
  // input tensors
1954
2001
  ggml_backend_buffer_t buf_input = nullptr;
1955
2002
  ggml_context * ctx_input = nullptr;
@@ -2116,10 +2163,12 @@ static bool llama_kv_cache_find_slot(
2116
2163
  }
2117
2164
 
2118
2165
  // find how many cells are currently in use
2119
- static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2120
- for (uint32_t i = cache.size - 1; i > 0; --i) {
2121
- if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
2122
- return i + 1;
2166
+ static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
2167
+ for (uint32_t i = cache.size; i > 0; --i) {
2168
+ const llama_kv_cell & cell = cache.cells[i - 1];
2169
+
2170
+ if (cell.pos >= 0 && !cell.is_empty()) {
2171
+ return i;
2123
2172
  }
2124
2173
  }
2125
2174
 
@@ -2891,7 +2940,11 @@ template<>
2891
2940
  bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
2892
2941
  uint32_t tmp;
2893
2942
  const bool found = get_key(kid, tmp, required);
2894
- result = (enum llama_pooling_type) tmp;
2943
+ if (found) {
2944
+ result = (enum llama_pooling_type) tmp;
2945
+ } else {
2946
+ result = LLAMA_POOLING_TYPE_UNSPECIFIED;
2947
+ }
2895
2948
  return found;
2896
2949
  }
2897
2950
 
@@ -3168,7 +3221,7 @@ static void llm_load_hparams(
3168
3221
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3169
3222
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
3170
3223
  ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
3171
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
3224
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
3172
3225
 
3173
3226
  switch (hparams.n_layer) {
3174
3227
  case 3:
@@ -3320,6 +3373,16 @@ static void llm_load_hparams(
3320
3373
  default: model.type = e_model::MODEL_UNKNOWN;
3321
3374
  }
3322
3375
  } break;
3376
+ case LLM_ARCH_STARCODER2:
3377
+ {
3378
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
3379
+ switch (hparams.n_layer) {
3380
+ case 30: model.type = e_model::MODEL_3B; break;
3381
+ case 32: model.type = e_model::MODEL_7B; break;
3382
+ case 40: model.type = e_model::MODEL_15B; break;
3383
+ default: model.type = e_model::MODEL_UNKNOWN;
3384
+ }
3385
+ } break;
3323
3386
  default: (void)0;
3324
3387
  }
3325
3388
 
@@ -4490,6 +4553,56 @@ static bool llm_load_tensors(
4490
4553
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4491
4554
  }
4492
4555
  } break;
4556
+ case LLM_ARCH_STARCODER2:
4557
+ {
4558
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4559
+
4560
+ // output
4561
+ {
4562
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
4563
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
4564
+
4565
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
4566
+ // if output is NULL, init from the input tok embed
4567
+ if (model.output == NULL) {
4568
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
4569
+ ml.n_created--; // artificial tensor
4570
+ ml.size_data += ggml_nbytes(model.output);
4571
+ }
4572
+
4573
+ }
4574
+
4575
+ for (int i = 0; i < n_layer; ++i) {
4576
+ ggml_context * ctx_layer = ctx_for_layer(i);
4577
+ ggml_context * ctx_split = ctx_for_layer_split(i);
4578
+
4579
+ auto & layer = model.layers[i];
4580
+
4581
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
4582
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
4583
+
4584
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
4585
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
4586
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
4587
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
4588
+
4589
+ // optional bias tensors
4590
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
4591
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
4592
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
4593
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
4594
+
4595
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
4596
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
4597
+
4598
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
4599
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
4600
+
4601
+ // optional bias tensors
4602
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
4603
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff});
4604
+ }
4605
+ } break;
4493
4606
  default:
4494
4607
  throw std::runtime_error("unknown architecture");
4495
4608
  }
@@ -4901,8 +5014,8 @@ static struct ggml_tensor * llm_build_kqv(
4901
5014
  ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
4902
5015
  }
4903
5016
 
4904
- #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
4905
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
5017
+ #if defined(GGML_USE_KOMPUTE)
5018
+ #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
4906
5019
  #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
4907
5020
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
4908
5021
  if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4986,6 +5099,7 @@ static struct ggml_tensor * llm_build_kv(
4986
5099
  llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
4987
5100
 
4988
5101
  struct ggml_tensor * cur;
5102
+
4989
5103
  cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
4990
5104
  q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
4991
5105
  cb(cur, "kqv_out", il);
@@ -5073,7 +5187,7 @@ struct llm_build_context {
5073
5187
  n_kv (worst_case ? n_ctx : kv_self.n),
5074
5188
  kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
5075
5189
  n_orig_ctx (cparams.n_yarn_orig_ctx),
5076
- pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
5190
+ pooling_type (cparams.pooling_type),
5077
5191
  rope_type (hparams.rope_type),
5078
5192
  cb (cb),
5079
5193
  buf_compute_meta (lctx.buf_compute_meta) {
@@ -5979,6 +6093,7 @@ struct llm_build_context {
5979
6093
 
5980
6094
  const int64_t n_embd_head = hparams.n_embd_head_v;
5981
6095
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6096
+
5982
6097
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5983
6098
 
5984
6099
  struct ggml_tensor * cur;
@@ -5986,9 +6101,10 @@ struct llm_build_context {
5986
6101
 
5987
6102
  // get input vectors with right size
5988
6103
  const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5989
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
6104
+
6105
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5990
6106
  struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5991
- struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
6107
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5992
6108
 
5993
6109
  // construct input embeddings (token, type, position)
5994
6110
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -6006,39 +6122,38 @@ struct llm_build_context {
6006
6122
  cb(inpL, "inp_norm", -1);
6007
6123
 
6008
6124
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
6009
- struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
6010
- cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
6125
+ struct ggml_tensor * KQ_mask = ggml_cont(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_tokens, n_tokens, n_tokens*ggml_type_size(lctx.inp_KQ_mask->type), 0));
6126
+ cb(KQ_mask, "KQ_mask", -1); // [n_tokens, n_tokens]
6011
6127
 
6012
6128
  // iterate layers
6013
6129
  for (int il = 0; il < n_layer; ++il) {
6014
6130
  struct ggml_tensor * cur = inpL;
6015
6131
 
6132
+ struct ggml_tensor * Qcur;
6133
+ struct ggml_tensor * Kcur;
6134
+ struct ggml_tensor * Vcur;
6135
+
6016
6136
  // self-attention
6017
6137
  if (model.arch == LLM_ARCH_BERT) {
6018
- struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
6138
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
6019
6139
  cb(Qcur, "Qcur", il);
6020
6140
 
6021
- struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
6141
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
6022
6142
  cb(Kcur, "Kcur", il);
6023
6143
 
6024
- struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
6144
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
6025
6145
  cb(Vcur, "Vcur", il);
6026
6146
 
6027
- // seems like we just need to do this for Q?
6028
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6029
-
6030
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6031
- model.layers[il].wo, model.layers[il].bo,
6032
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6033
- cb(cur, "kqv_out", il);
6147
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6148
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6034
6149
  } else {
6035
6150
  // compute Q and K and RoPE them
6036
6151
  cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
6037
6152
  cb(cur, "wqkv", il);
6038
6153
 
6039
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6040
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6041
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6154
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6155
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6156
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6042
6157
 
6043
6158
  cb(Qcur, "Qcur", il);
6044
6159
  cb(Kcur, "Kcur", il);
@@ -6057,13 +6172,41 @@ struct llm_build_context {
6057
6172
  ext_factor, attn_factor, beta_fast, beta_slow
6058
6173
  );
6059
6174
  cb(Kcur, "Kcur", il);
6175
+ }
6060
6176
 
6061
- cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
6062
- model.layers[il].wo, model.layers[il].bo,
6063
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
6064
- cb(cur, "kqv_out", il);
6177
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
6178
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
6179
+
6180
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
6181
+ cb(kq, "kq", il);
6182
+
6183
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
6184
+ cb(kq, "kq_soft_max_ext", il);
6185
+
6186
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
6187
+ cb(v, "v", il);
6188
+
6189
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
6190
+ cb(kqv, "kqv", il);
6191
+
6192
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
6193
+ cb(kqv_merged, "kqv_merged", il);
6194
+
6195
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
6196
+ cb(cur, "kqv_merged_cont", il);
6197
+
6198
+ ggml_build_forward_expand(gf, cur);
6199
+
6200
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
6201
+ if (model.layers[il].bo) {
6202
+ cb(cur, "kqv_wo", il);
6065
6203
  }
6066
6204
 
6205
+ if (model.layers[il].bo) {
6206
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
6207
+ }
6208
+ cb(cur, "kqv_out", il);
6209
+
6067
6210
  // re-add the layer input
6068
6211
  cur = ggml_add(ctx0, cur, inpL);
6069
6212
 
@@ -6103,16 +6246,29 @@ struct llm_build_context {
6103
6246
 
6104
6247
  // final output
6105
6248
  cur = inpL;
6249
+ cb(cur, "result_embd", -1);
6106
6250
 
6107
6251
  // pooling layer
6108
- if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
6109
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6110
- } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
6111
- cur = ggml_get_rows(ctx0, cur, inp_cls);
6112
- } else {
6113
- GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
6252
+ switch (pooling_type) {
6253
+ case LLAMA_POOLING_TYPE_NONE:
6254
+ {
6255
+ // nop
6256
+ } break;
6257
+ case LLAMA_POOLING_TYPE_MEAN:
6258
+ {
6259
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
6260
+ cb(cur, "result_embd_pooled", -1);
6261
+ } break;
6262
+ case LLAMA_POOLING_TYPE_CLS:
6263
+ {
6264
+ cur = ggml_get_rows(ctx0, cur, inp_cls);
6265
+ cb(cur, "result_embd_pooled", -1);
6266
+ } break;
6267
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
6268
+ {
6269
+ GGML_ASSERT(false && "Invalid pooling type");
6270
+ } break;
6114
6271
  }
6115
- cb(cur, "result_embd", -1);
6116
6272
 
6117
6273
  ggml_build_forward_expand(gf, cur);
6118
6274
 
@@ -7559,6 +7715,120 @@ struct llm_build_context {
7559
7715
 
7560
7716
  return gf;
7561
7717
  }
7718
+
7719
+ struct ggml_cgraph * build_starcoder2() {
7720
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7721
+
7722
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7723
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7724
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7725
+
7726
+ struct ggml_tensor * cur;
7727
+ struct ggml_tensor * inpL;
7728
+
7729
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
7730
+ cb(inpL, "inp_embd", -1);
7731
+
7732
+ // inp_pos - contains the positions
7733
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
7734
+ cb(inp_pos, "inp_pos", -1);
7735
+
7736
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7737
+ struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
7738
+ cb(KQ_mask, "KQ_mask", -1);
7739
+
7740
+ for (int il = 0; il < n_layer; ++il) {
7741
+ struct ggml_tensor * inpSA = inpL;
7742
+
7743
+ // norm
7744
+ cur = llm_build_norm(ctx0, inpL, hparams,
7745
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
7746
+ LLM_NORM, cb, il);
7747
+ cb(cur, "attn_norm", il);
7748
+
7749
+ // self-attention
7750
+ {
7751
+ // compute Q and K and RoPE them
7752
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
7753
+ cb(Qcur, "Qcur", il);
7754
+ if (model.layers[il].bq) {
7755
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
7756
+ cb(Qcur, "Qcur", il);
7757
+ }
7758
+
7759
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
7760
+ cb(Kcur, "Kcur", il);
7761
+ if (model.layers[il].bk) {
7762
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
7763
+ cb(Kcur, "Kcur", il);
7764
+ }
7765
+
7766
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
7767
+ cb(Vcur, "Vcur", il);
7768
+ if (model.layers[il].bv) {
7769
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
7770
+ cb(Vcur, "Vcur", il);
7771
+ }
7772
+
7773
+ Qcur = ggml_rope_custom(
7774
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
7775
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7776
+ ext_factor, attn_factor, beta_fast, beta_slow
7777
+ );
7778
+ cb(Qcur, "Qcur", il);
7779
+
7780
+ Kcur = ggml_rope_custom(
7781
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
7782
+ n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
7783
+ ext_factor, attn_factor, beta_fast, beta_slow
7784
+ );
7785
+ cb(Kcur, "Kcur", il);
7786
+
7787
+ cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
7788
+ model.layers[il].wo, model.layers[il].bo,
7789
+ Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7790
+ cb(cur, "kqv_out", il);
7791
+ }
7792
+
7793
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7794
+ cb(ffn_inp, "ffn_inp", il);
7795
+
7796
+ // feed-forward network
7797
+
7798
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
7799
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
7800
+ LLM_NORM, cb, il);
7801
+ cb(cur, "ffn_norm", il);
7802
+
7803
+ cur = llm_build_ffn(ctx0, cur,
7804
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
7805
+ NULL, NULL,
7806
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
7807
+ NULL,
7808
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
7809
+ cb(cur, "ffn_out", il);
7810
+ cur = ggml_add(ctx0, cur, ffn_inp);
7811
+ cb(cur, "l_out", il);
7812
+
7813
+ // input for next layer
7814
+ inpL = cur;
7815
+ }
7816
+
7817
+ cur = inpL;
7818
+
7819
+ cur = llm_build_norm(ctx0, cur, hparams,
7820
+ model.output_norm, model.output_norm_b,
7821
+ LLM_NORM, cb, -1);
7822
+ cb(cur, "result_norm", -1);
7823
+
7824
+ // lm_head
7825
+ cur = ggml_mul_mat(ctx0, model.output, cur);
7826
+ cb(cur, "result_output", -1);
7827
+
7828
+ ggml_build_forward_expand(gf, cur);
7829
+
7830
+ return gf;
7831
+ }
7562
7832
  };
7563
7833
 
7564
7834
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -7705,6 +7975,10 @@ static struct ggml_cgraph * llama_build_graph(
7705
7975
  {
7706
7976
  result = llm.build_gemma();
7707
7977
  } break;
7978
+ case LLM_ARCH_STARCODER2:
7979
+ {
7980
+ result = llm.build_starcoder2();
7981
+ } break;
7708
7982
  default:
7709
7983
  GGML_ASSERT(false);
7710
7984
  }
@@ -7756,7 +8030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7756
8030
  ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
7757
8031
  }
7758
8032
 
7759
- {
8033
+ if (hparams.causal_attn) {
7760
8034
  const int64_t n_kv = kv_self.n;
7761
8035
  const int64_t n_tokens = batch.n_tokens;
7762
8036
 
@@ -7771,16 +8045,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7771
8045
 
7772
8046
  for (int i = 0; i < n_kv; ++i) {
7773
8047
  float f;
7774
- if (!lctx.kv_self.cells[i].has_seq_id(seq_id) ||
7775
- (hparams.causal_attn && lctx.kv_self.cells[i].pos > pos)) {
8048
+ if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
7776
8049
  f = -INFINITY;
7777
8050
  } else {
7778
- f = 0;
8051
+ f = 0.0f;
7779
8052
  }
7780
8053
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
7781
8054
  }
7782
8055
  }
7783
8056
  }
8057
+ } else {
8058
+ // non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
8059
+ const int64_t n_tokens = batch.n_tokens;
8060
+
8061
+ assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
8062
+
8063
+ float * data = (float *) lctx.inp_KQ_mask->data;
8064
+
8065
+ for (int h = 0; h < 1; ++h) {
8066
+ for (int j = 0; j < n_tokens; ++j) {
8067
+ const llama_seq_id seq_id = batch.seq_id[j][0];
8068
+
8069
+ for (int i = 0; i < n_tokens; ++i) {
8070
+ float f = -INFINITY;
8071
+ for (int s = 0; s < batch.n_seq_id[i]; ++s) {
8072
+ if (batch.seq_id[i][s] == seq_id) {
8073
+ f = 0.0f;
8074
+ break;
8075
+ }
8076
+ }
8077
+
8078
+ data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
8079
+ }
8080
+ }
8081
+ }
7784
8082
  }
7785
8083
 
7786
8084
  if (hparams.need_kq_pos) {
@@ -7795,17 +8093,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7795
8093
  }
7796
8094
  }
7797
8095
 
7798
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
8096
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
7799
8097
  const int64_t n_tokens = batch.n_tokens;
7800
8098
 
7801
8099
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
7802
- float * data = (float *) lctx.inp_mean->data;
7803
8100
 
8101
+ float * data = (float *) lctx.inp_mean->data;
7804
8102
  memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
7805
8103
 
7806
8104
  std::vector<uint64_t> sum(n_tokens, 0);
7807
8105
  for (int i = 0; i < n_tokens; ++i) {
7808
8106
  const llama_seq_id seq_id = batch.seq_id[i][0];
8107
+
8108
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
8109
+
7809
8110
  sum[seq_id] += 1;
7810
8111
  }
7811
8112
 
@@ -7823,15 +8124,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
7823
8124
  }
7824
8125
  }
7825
8126
 
7826
- if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
8127
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
7827
8128
  const int64_t n_tokens = batch.n_tokens;
7828
8129
 
7829
8130
  GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
8131
+
7830
8132
  uint32_t * data = (uint32_t *) lctx.inp_cls->data;
8133
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
7831
8134
 
7832
8135
  for (int i = 0; i < n_tokens; ++i) {
7833
8136
  const llama_seq_id seq_id = batch.seq_id[i][0];
7834
- const llama_pos pos = batch.pos[i];
8137
+ const llama_pos pos = batch.pos[i];
8138
+
8139
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
8140
+
7835
8141
  if (pos == 0) {
7836
8142
  data[seq_id] = i;
7837
8143
  }
@@ -7856,6 +8162,7 @@ static void llama_graph_compute(
7856
8162
 
7857
8163
  if (lctx.backend_cpu != nullptr) {
7858
8164
  ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
8165
+ ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
7859
8166
  }
7860
8167
 
7861
8168
  ggml_backend_sched_graph_compute(lctx.sched, gf);
@@ -7944,23 +8251,26 @@ static int llama_decode_internal(
7944
8251
  batch.seq_id = seq_id_arr.data();
7945
8252
  }
7946
8253
 
7947
- llama_kv_cache_update(&lctx);
8254
+ // non-causal masks do not use the KV cache
8255
+ if (hparams.causal_attn) {
8256
+ llama_kv_cache_update(&lctx);
7948
8257
 
7949
- // if we have enough unused cells before the current head ->
7950
- // better to start searching from the beginning of the cache, hoping to fill it
7951
- if (kv_self.head > kv_self.used + 2*n_tokens) {
7952
- kv_self.head = 0;
7953
- }
8258
+ // if we have enough unused cells before the current head ->
8259
+ // better to start searching from the beginning of the cache, hoping to fill it
8260
+ if (kv_self.head > kv_self.used + 2*n_tokens) {
8261
+ kv_self.head = 0;
8262
+ }
7954
8263
 
7955
- if (!llama_kv_cache_find_slot(kv_self, batch)) {
7956
- return 1;
7957
- }
8264
+ if (!llama_kv_cache_find_slot(kv_self, batch)) {
8265
+ return 1;
8266
+ }
7958
8267
 
7959
- // a heuristic, to avoid attending the full cache if it is not yet utilized
7960
- // after enough generations, the benefit from this heuristic disappears
7961
- // if we start defragmenting the cache, the benefit from this will be more important
7962
- kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
7963
- //kv_self.n = llama_kv_cache_cell_max(kv_self);
8268
+ // a heuristic, to avoid attending the full cache if it is not yet utilized
8269
+ // after enough generations, the benefit from this heuristic disappears
8270
+ // if we start defragmenting the cache, the benefit from this will be more important
8271
+ kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
8272
+ //kv_self.n = llama_kv_cache_cell_max(kv_self);
8273
+ }
7964
8274
 
7965
8275
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
7966
8276
 
@@ -7970,20 +8280,26 @@ static int llama_decode_internal(
7970
8280
  ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7971
8281
 
7972
8282
  // the output is always the last tensor in the graph
7973
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7974
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7975
-
7976
- if (strcmp(res->name, "result_output") == 0) {
7977
- // the embeddings could be the second to last tensor, or the third to last tensor
7978
- if (strcmp(embeddings->name, "result_norm") != 0) {
7979
- embeddings = gf->nodes[gf->n_nodes - 3];
7980
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7981
- }
7982
- } else if (strcmp(res->name, "result_embd") == 0) {
7983
- embeddings = res;
7984
- res = nullptr;
8283
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
8284
+ struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
8285
+
8286
+ if (!hparams.causal_attn) {
8287
+ res = nullptr; // do not extract logits for embedding models such as BERT
8288
+
8289
+ // token or sequence embeddings
8290
+ embd = gf->nodes[gf->n_nodes - 1];
8291
+
8292
+ GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
7985
8293
  } else {
7986
- GGML_ASSERT(false);
8294
+ if (strcmp(res->name, "result_output") == 0) {
8295
+ // the token embeddings could be the second to last tensor, or the third to last tensor
8296
+ if (strcmp(embd->name, "result_norm") != 0) {
8297
+ embd = gf->nodes[gf->n_nodes - 3];
8298
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
8299
+ }
8300
+ } else {
8301
+ GGML_ASSERT(false && "missing result_output tensor");
8302
+ }
7987
8303
  }
7988
8304
 
7989
8305
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -8050,46 +8366,82 @@ static int llama_decode_internal(
8050
8366
  logits_out.clear();
8051
8367
  #endif
8052
8368
 
8053
- ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
8054
- GGML_ASSERT(res_backend != nullptr);
8369
+ ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
8370
+ GGML_ASSERT(backend_res != nullptr);
8371
+
8055
8372
  if (batch.logits) {
8056
8373
  logits_out.resize(n_vocab * n_tokens);
8057
8374
  for (uint32_t i = 0; i < n_tokens; i++) {
8058
8375
  if (batch.logits[i] == 0) {
8059
8376
  continue;
8060
8377
  }
8061
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8378
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8062
8379
  #ifndef NDEBUG
8063
8380
  logits_valid[i] = true;
8064
8381
  #endif
8065
8382
  }
8066
8383
  } else if (lctx.logits_all) {
8067
8384
  logits_out.resize(n_vocab * n_tokens);
8068
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8385
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8069
8386
  #ifndef NDEBUG
8070
8387
  std::fill(logits_valid.begin(), logits_valid.end(), true);
8071
8388
  #endif
8072
8389
  } else {
8073
8390
  logits_out.resize(n_vocab);
8074
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8391
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8075
8392
  #ifndef NDEBUG
8076
8393
  logits_valid[0] = true;
8077
8394
  #endif
8078
8395
  }
8079
- ggml_backend_synchronize(res_backend);
8396
+ ggml_backend_synchronize(backend_res);
8080
8397
  }
8081
8398
 
8082
8399
  // extract embeddings
8083
- if (!lctx.embedding.empty()) {
8084
- auto & embedding_out = lctx.embedding;
8400
+ if (cparams.embeddings && embd) {
8401
+ ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
8402
+ GGML_ASSERT(backend_embd != nullptr);
8403
+
8404
+ switch (cparams.pooling_type) {
8405
+ case LLAMA_POOLING_TYPE_NONE:
8406
+ {
8407
+ // extract token embeddings
8408
+ auto & embd_out = lctx.embd;
8409
+
8410
+ if (batch.logits) {
8411
+ embd_out.resize(n_embd * n_tokens);
8412
+ for (uint32_t i = 0; i < n_tokens; i++) {
8413
+ if (batch.logits[i] == 0) {
8414
+ continue;
8415
+ }
8416
+
8417
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
8418
+ }
8419
+ }
8420
+ } break;
8421
+ case LLAMA_POOLING_TYPE_CLS:
8422
+ case LLAMA_POOLING_TYPE_MEAN:
8423
+ {
8424
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
8085
8425
 
8086
- const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8087
- const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8426
+ // extract sequence embeddings
8427
+ auto & embd_seq_out = lctx.embd_seq;
8428
+ embd_seq_out.clear();
8088
8429
 
8089
- embedding_out.resize(embd_size);
8090
- ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8091
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8092
- ggml_backend_synchronize(embeddings_backend);
8430
+ for (uint32_t i = 0; i < n_tokens; i++) {
8431
+ const llama_seq_id seq_id = batch.seq_id[i][0];
8432
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
8433
+ continue;
8434
+ }
8435
+ embd_seq_out[seq_id].resize(n_embd);
8436
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
8437
+ }
8438
+ } break;
8439
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
8440
+ {
8441
+ GGML_ASSERT(false && "unknown pooling type");
8442
+ } break;
8443
+ }
8444
+ ggml_backend_synchronize(backend_embd);
8093
8445
  }
8094
8446
 
8095
8447
  // measure the performance only for the single-token evals
@@ -8383,19 +8735,19 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
8383
8735
  GGML_ASSERT(llama_is_byte_token(vocab, id));
8384
8736
  const auto& token_data = vocab.id_to_token.at(id);
8385
8737
  switch (llama_vocab_get_type(vocab)) {
8386
- case LLAMA_VOCAB_TYPE_SPM: {
8387
- auto buf = token_data.text.substr(3, 2);
8388
- return strtol(buf.c_str(), NULL, 16);
8389
- }
8390
- case LLAMA_VOCAB_TYPE_BPE: {
8391
- GGML_ASSERT(false);
8392
- return unicode_to_bytes_bpe(token_data.text);
8393
- }
8394
- case LLAMA_VOCAB_TYPE_WPM: {
8395
- GGML_ASSERT(false);
8396
- }
8397
- default:
8398
- GGML_ASSERT(false);
8738
+ case LLAMA_VOCAB_TYPE_SPM: {
8739
+ auto buf = token_data.text.substr(3, 2);
8740
+ return strtol(buf.c_str(), NULL, 16);
8741
+ }
8742
+ case LLAMA_VOCAB_TYPE_BPE: {
8743
+ GGML_ASSERT(false);
8744
+ return unicode_to_bytes_bpe(token_data.text);
8745
+ }
8746
+ case LLAMA_VOCAB_TYPE_WPM: {
8747
+ GGML_ASSERT(false);
8748
+ }
8749
+ default:
8750
+ GGML_ASSERT(false);
8399
8751
  }
8400
8752
  }
8401
8753
 
@@ -10621,7 +10973,7 @@ struct quantize_state_internal {
10621
10973
  {}
10622
10974
  };
10623
10975
 
10624
- static void llama_convert_tensor_internal(
10976
+ static void llama_tensor_dequantize_internal(
10625
10977
  struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
10626
10978
  const size_t nelements, const int nthread
10627
10979
  ) {
@@ -10962,6 +11314,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
10962
11314
  return new_type;
10963
11315
  }
10964
11316
 
11317
+ static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
11318
+ std::mutex mutex;
11319
+ int counter = 0;
11320
+ size_t new_size = 0;
11321
+ if (nthread < 2) {
11322
+ // single-thread
11323
+ return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
11324
+ }
11325
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
11326
+ nrows, n_per_row, imatrix]() {
11327
+ std::array<int64_t, 1 << 4> local_hist = {};
11328
+ const int nrows_per_chunk = chunk_size / n_per_row;
11329
+ size_t local_size = 0;
11330
+ while (true) {
11331
+ std::unique_lock<std::mutex> lock(mutex);
11332
+ int first_row = counter; counter += nrows_per_chunk;
11333
+ if (first_row >= nrows) {
11334
+ if (local_size > 0) {
11335
+ for (int j=0; j<int(local_hist.size()); ++j) {
11336
+ hist_cur[j] += local_hist[j];
11337
+ }
11338
+ new_size += local_size;
11339
+ }
11340
+ break;
11341
+ }
11342
+ lock.unlock();
11343
+ const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
11344
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
11345
+ first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
11346
+ }
11347
+ };
11348
+ for (int it = 0; it < nthread - 1; ++it) {
11349
+ workers.emplace_back(compute);
11350
+ }
11351
+ compute();
11352
+ for (auto & w : workers) { w.join(); }
11353
+ workers.clear();
11354
+ return new_size;
11355
+ }
11356
+
10965
11357
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
10966
11358
  ggml_type quantized_type;
10967
11359
  llama_ftype ftype = params->ftype;
@@ -11074,7 +11466,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11074
11466
 
11075
11467
  std::vector<std::thread> workers;
11076
11468
  workers.reserve(nthread);
11077
- std::mutex mutex;
11078
11469
 
11079
11470
  int idx = 0;
11080
11471
 
@@ -11188,7 +11579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11188
11579
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
11189
11580
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
11190
11581
  } else {
11191
- llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
11582
+ llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
11192
11583
  f32_data = (float *) f32_conv_buf.data();
11193
11584
  }
11194
11585
 
@@ -11209,41 +11600,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
11209
11600
 
11210
11601
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
11211
11602
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
11212
- if (nthread_use < 2) {
11213
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
11214
- } else {
11215
- int counter = 0;
11216
- new_size = 0;
11217
- auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
11218
- nrows, n_per_row, imatrix]() {
11219
- std::array<int64_t, 1 << 4> local_hist = {};
11220
- const int nrows_per_chunk = chunk_size / n_per_row;
11221
- size_t local_size = 0;
11222
- while (true) {
11223
- std::unique_lock<std::mutex> lock(mutex);
11224
- int first_row = counter; counter += nrows_per_chunk;
11225
- if (first_row >= nrows) {
11226
- if (local_size > 0) {
11227
- for (int j=0; j<int(local_hist.size()); ++j) {
11228
- hist_cur[j] += local_hist[j];
11229
- }
11230
- new_size += local_size;
11231
- }
11232
- break;
11233
- }
11234
- lock.unlock();
11235
- const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
11236
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
11237
- first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
11238
- }
11239
- };
11240
- for (int it = 0; it < nthread_use - 1; ++it) {
11241
- workers.emplace_back(compute);
11242
- }
11243
- compute();
11244
- for (auto & w : workers) { w.join(); }
11245
- workers.clear();
11246
- }
11603
+ new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use);
11247
11604
 
11248
11605
  LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
11249
11606
  int64_t tot_count = 0;
@@ -11620,6 +11977,7 @@ struct llama_context_params llama_context_default_params() {
11620
11977
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
11621
11978
  /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
11622
11979
  /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
11980
+ /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
11623
11981
  /*.rope_freq_base =*/ 0.0f,
11624
11982
  /*.rope_freq_scale =*/ 0.0f,
11625
11983
  /*.yarn_ext_factor =*/ -1.0f,
@@ -11633,9 +11991,10 @@ struct llama_context_params llama_context_default_params() {
11633
11991
  /*.type_k =*/ GGML_TYPE_F16,
11634
11992
  /*.type_v =*/ GGML_TYPE_F16,
11635
11993
  /*.logits_all =*/ false,
11636
- /*.embedding =*/ false,
11994
+ /*.embeddings =*/ false,
11637
11995
  /*.offload_kqv =*/ true,
11638
- /*.do_pooling =*/ true,
11996
+ /*.abort_callback =*/ nullptr,
11997
+ /*.abort_callback_data =*/ nullptr,
11639
11998
  };
11640
11999
 
11641
12000
  return result;
@@ -11783,8 +12142,9 @@ struct llama_context * llama_new_context_with_model(
11783
12142
  cparams.yarn_beta_fast = params.yarn_beta_fast;
11784
12143
  cparams.yarn_beta_slow = params.yarn_beta_slow;
11785
12144
  cparams.defrag_thold = params.defrag_thold;
12145
+ cparams.embeddings = params.embeddings;
11786
12146
  cparams.offload_kqv = params.offload_kqv;
11787
- cparams.do_pooling = params.do_pooling;
12147
+ cparams.pooling_type = params.pooling_type;
11788
12148
 
11789
12149
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
11790
12150
  cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -11810,6 +12170,14 @@ struct llama_context * llama_new_context_with_model(
11810
12170
  cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
11811
12171
  }
11812
12172
 
12173
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12174
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
12175
+ cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
12176
+ } else {
12177
+ cparams.pooling_type = hparams.pooling_type;
12178
+ }
12179
+ }
12180
+
11813
12181
  if (params.seed == LLAMA_DEFAULT_SEED) {
11814
12182
  params.seed = time(NULL);
11815
12183
  }
@@ -11818,8 +12186,11 @@ struct llama_context * llama_new_context_with_model(
11818
12186
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
11819
12187
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
11820
12188
 
11821
- ctx->rng = std::mt19937(params.seed);
11822
- ctx->logits_all = params.logits_all;
12189
+ ctx->abort_callback = params.abort_callback;
12190
+ ctx->abort_callback_data = params.abort_callback_data;
12191
+
12192
+ ctx->rng = std::mt19937(params.seed);
12193
+ ctx->logits_all = params.logits_all;
11823
12194
 
11824
12195
  const ggml_type type_k = params.type_k;
11825
12196
  const ggml_type type_v = params.type_v;
@@ -11877,13 +12248,31 @@ struct llama_context * llama_new_context_with_model(
11877
12248
  }
11878
12249
  #elif defined(GGML_USE_SYCL)
11879
12250
  if (model->n_gpu_layers > 0) {
11880
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
11881
- if (backend == nullptr) {
11882
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
11883
- llama_free(ctx);
11884
- return nullptr;
12251
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
12252
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
12253
+ int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
12254
+ ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
12255
+ if (backend == nullptr) {
12256
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
12257
+ llama_free(ctx);
12258
+ return nullptr;
12259
+ }
12260
+ ctx->backends.push_back(backend);
12261
+ } else {
12262
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
12263
+ int id_list[GGML_SYCL_MAX_DEVICES];
12264
+ ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
12265
+ for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
12266
+ int device_id = id_list[i];
12267
+ ggml_backend_t backend = ggml_backend_sycl_init(i);
12268
+ if (backend == nullptr) {
12269
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
12270
+ llama_free(ctx);
12271
+ return nullptr;
12272
+ }
12273
+ ctx->backends.push_back(backend);
12274
+ }
11885
12275
  }
11886
- ctx->backends.push_back(backend);
11887
12276
  }
11888
12277
  #elif defined(GGML_USE_KOMPUTE)
11889
12278
  if (model->n_gpu_layers > 0) {
@@ -11931,8 +12320,8 @@ struct llama_context * llama_new_context_with_model(
11931
12320
  // resized during inference, reserve maximum
11932
12321
  ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11933
12322
 
11934
- if (params.embedding) {
11935
- ctx->embedding.resize(hparams.n_embd);
12323
+ if (params.embeddings) {
12324
+ ctx->embd.reserve(hparams.n_embd*cparams.n_batch);
11936
12325
  }
11937
12326
 
11938
12327
  // graph inputs
@@ -11963,7 +12352,6 @@ struct llama_context * llama_new_context_with_model(
11963
12352
  ggml_set_name(ctx->inp_cls, "inp_cls");
11964
12353
 
11965
12354
  ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
11966
-
11967
12355
  LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
11968
12356
  ggml_backend_buffer_name(ctx->buf_input),
11969
12357
  ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
@@ -12084,6 +12472,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
12084
12472
  case LLM_ARCH_QWEN2:
12085
12473
  case LLM_ARCH_PHI2:
12086
12474
  case LLM_ARCH_GEMMA:
12475
+ case LLM_ARCH_STARCODER2:
12087
12476
  return LLAMA_ROPE_TYPE_NEOX;
12088
12477
 
12089
12478
  // all model arches should be listed explicitly here
@@ -12367,10 +12756,15 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12367
12756
  // assume worst case for logits although only currently set ones are serialized
12368
12757
  const size_t s_logits = ctx->logits.capacity() * sizeof(float);
12369
12758
  const size_t s_embedding_size = sizeof(size_t);
12370
- const size_t s_embedding = ctx->embedding.size() * sizeof(float);
12371
- const size_t s_kv_size = sizeof(size_t);
12372
- const size_t s_kv_ntok = sizeof(int);
12759
+ const size_t s_embedding = ctx->embd.capacity() * sizeof(float);
12760
+ const size_t s_kv_buf_size = sizeof(size_t);
12761
+ const size_t s_kv_head = sizeof(uint32_t);
12762
+ const size_t s_kv_size = sizeof(uint32_t);
12763
+ const size_t s_kv_used = sizeof(uint32_t);
12373
12764
  const size_t s_kv = ctx->kv_self.total_size();
12765
+ // TODO: assume the max is more than 1 seq_id per KV cell
12766
+ const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
12767
+ const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
12374
12768
 
12375
12769
  const size_t s_total = (
12376
12770
  + s_rng_size
@@ -12379,9 +12773,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12379
12773
  + s_logits
12380
12774
  + s_embedding_size
12381
12775
  + s_embedding
12776
+ + s_kv_buf_size
12777
+ + s_kv_head
12382
12778
  + s_kv_size
12383
- + s_kv_ntok
12779
+ + s_kv_used
12384
12780
  + s_kv
12781
+ + s_kv_cells
12385
12782
  );
12386
12783
 
12387
12784
  return s_total;
@@ -12468,12 +12865,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12468
12865
 
12469
12866
  // copy embeddings
12470
12867
  {
12471
- const size_t embedding_size = ctx->embedding.size();
12868
+ const size_t embeddings_size = ctx->embd.size();
12472
12869
 
12473
- data_ctx->write(&embedding_size, sizeof(embedding_size));
12870
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
12474
12871
 
12475
- if (embedding_size) {
12476
- data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
12872
+ if (embeddings_size) {
12873
+ data_ctx->write(ctx->embd.data(), embeddings_size * sizeof(float));
12477
12874
  }
12478
12875
  }
12479
12876
 
@@ -12481,15 +12878,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12481
12878
  {
12482
12879
  const auto & kv_self = ctx->kv_self;
12483
12880
  const auto & hparams = ctx->model.hparams;
12484
- const auto & cparams = ctx->cparams;
12485
12881
 
12486
12882
  const uint32_t n_layer = hparams.n_layer;
12487
12883
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12488
12884
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12489
- const uint32_t n_ctx = cparams.n_ctx;
12490
12885
 
12491
12886
  const size_t kv_buf_size = kv_self.total_size();
12492
- const uint32_t kv_head = kv_self.head;
12887
+ const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
12493
12888
  const uint32_t kv_size = kv_self.size;
12494
12889
  const uint32_t kv_used = kv_self.used;
12495
12890
 
@@ -12509,7 +12904,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12509
12904
 
12510
12905
  // v is not contiguous, copy row by row
12511
12906
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12512
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
12907
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
12513
12908
 
12514
12909
  tmp_buf.resize(v_row_size);
12515
12910
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
@@ -12519,7 +12914,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12519
12914
  }
12520
12915
  }
12521
12916
 
12522
- for (uint32_t i = 0; i < kv_size; ++i) {
12917
+ for (uint32_t i = 0; i < kv_head; ++i) {
12523
12918
  const auto & cell = kv_self.cells[i];
12524
12919
 
12525
12920
  const llama_pos pos = cell.pos;
@@ -12579,15 +12974,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12579
12974
 
12580
12975
  // set embeddings
12581
12976
  {
12582
- size_t embedding_size;
12977
+ size_t embeddings_size;
12583
12978
 
12584
- memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
12979
+ memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
12585
12980
 
12586
- GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
12981
+ GGML_ASSERT(ctx->embd.capacity() == embeddings_size);
12587
12982
 
12588
- if (embedding_size) {
12589
- memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12590
- inp += embedding_size * sizeof(float);
12983
+ if (embeddings_size) {
12984
+ ctx->embd.resize(embeddings_size);
12985
+
12986
+ memcpy(ctx->embd.data(), inp, embeddings_size * sizeof(float));
12987
+ inp += embeddings_size * sizeof(float);
12591
12988
  }
12592
12989
  }
12593
12990
 
@@ -12595,12 +12992,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12595
12992
  {
12596
12993
  const auto & kv_self = ctx->kv_self;
12597
12994
  const auto & hparams = ctx->model.hparams;
12598
- const auto & cparams = ctx->cparams;
12599
12995
 
12600
12996
  const uint32_t n_layer = hparams.n_layer;
12601
12997
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
12602
12998
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
12603
- const uint32_t n_ctx = cparams.n_ctx;
12604
12999
 
12605
13000
  size_t kv_buf_size;
12606
13001
  uint32_t kv_head;
@@ -12623,7 +13018,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12623
13018
 
12624
13019
  // v is not contiguous, copy row by row
12625
13020
  const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
12626
- const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
13021
+ const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
12627
13022
 
12628
13023
  for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
12629
13024
  ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
@@ -12632,13 +13027,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12632
13027
  }
12633
13028
  }
12634
13029
 
13030
+ GGML_ASSERT(kv_self.size == kv_size);
13031
+
12635
13032
  ctx->kv_self.head = kv_head;
12636
13033
  ctx->kv_self.size = kv_size;
12637
13034
  ctx->kv_self.used = kv_used;
12638
13035
 
12639
13036
  ctx->kv_self.cells.resize(kv_size);
12640
13037
 
12641
- for (uint32_t i = 0; i < kv_size; ++i) {
13038
+ for (uint32_t i = 0; i < kv_head; ++i) {
12642
13039
  llama_pos pos;
12643
13040
  size_t seq_id_size;
12644
13041
 
@@ -12654,6 +13051,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
12654
13051
  ctx->kv_self.cells[i].seq_id.insert(seq_id);
12655
13052
  }
12656
13053
  }
13054
+
13055
+ for (uint32_t i = kv_head; i < kv_size; ++i) {
13056
+ ctx->kv_self.cells[i].pos = -1;
13057
+ ctx->kv_self.cells[i].seq_id.clear();
13058
+ }
12657
13059
  }
12658
13060
 
12659
13061
  const size_t nread = inp - src;
@@ -12751,6 +13153,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
12751
13153
  ctx->cparams.n_threads_batch = n_threads_batch;
12752
13154
  }
12753
13155
 
13156
+ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
13157
+ ctx->abort_callback = abort_callback;
13158
+ ctx->abort_callback_data = abort_callback_data;
13159
+ }
13160
+
12754
13161
  struct llama_batch llama_batch_get_one(
12755
13162
  llama_token * tokens,
12756
13163
  int32_t n_tokens,
@@ -12827,11 +13234,20 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
12827
13234
  }
12828
13235
 
12829
13236
  float * llama_get_embeddings(struct llama_context * ctx) {
12830
- return ctx->embedding.data();
13237
+ return ctx->embd.data();
12831
13238
  }
12832
13239
 
12833
13240
  float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12834
- return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
13241
+ return ctx->embd.data() + i*ctx->model.hparams.n_embd;
13242
+ }
13243
+
13244
+ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
13245
+ auto it = ctx->embd_seq.find(seq_id);
13246
+ if (it == ctx->embd_seq.end()) {
13247
+ return nullptr;
13248
+ }
13249
+
13250
+ return it->second.data();
12835
13251
  }
12836
13252
 
12837
13253
  const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
@@ -13005,7 +13421,7 @@ static int32_t llama_chat_apply_template_internal(
13005
13421
  std::string & dest, bool add_ass) {
13006
13422
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
13007
13423
  std::stringstream ss;
13008
- if (tmpl.find("<|im_start|>") != std::string::npos) {
13424
+ if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
13009
13425
  // chatml template
13010
13426
  for (auto message : chat) {
13011
13427
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -13013,7 +13429,7 @@ static int32_t llama_chat_apply_template_internal(
13013
13429
  if (add_ass) {
13014
13430
  ss << "<|im_start|>assistant\n";
13015
13431
  }
13016
- } else if (tmpl.find("[INST]") != std::string::npos) {
13432
+ } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
13017
13433
  // llama2 template and its variants
13018
13434
  // [variant] support system message
13019
13435
  bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
@@ -13048,7 +13464,7 @@ static int32_t llama_chat_apply_template_internal(
13048
13464
  }
13049
13465
  }
13050
13466
  // llama2 templates seem to not care about "add_generation_prompt"
13051
- } else if (tmpl.find("<|user|>") != std::string::npos) {
13467
+ } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
13052
13468
  // zephyr template
13053
13469
  for (auto message : chat) {
13054
13470
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -13056,7 +13472,7 @@ static int32_t llama_chat_apply_template_internal(
13056
13472
  if (add_ass) {
13057
13473
  ss << "<|assistant|>\n";
13058
13474
  }
13059
- } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
13475
+ } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
13060
13476
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
13061
13477
  for (auto message : chat) {
13062
13478
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -13065,7 +13481,7 @@ static int32_t llama_chat_apply_template_internal(
13065
13481
  if (add_ass) {
13066
13482
  ss << "<s>assistant\n";
13067
13483
  }
13068
- } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
13484
+ } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
13069
13485
  // google/gemma-7b-it
13070
13486
  std::string system_prompt = "";
13071
13487
  for (auto message : chat) {
@@ -13112,23 +13528,27 @@ LLAMA_API int32_t llama_chat_apply_template(
13112
13528
  int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
13113
13529
  if (res < 0) {
13114
13530
  // worst case: there is no information about template, we will use chatml by default
13115
- curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
13531
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
13116
13532
  } else {
13117
13533
  curr_tmpl = std::string(model_template.data(), model_template.size());
13118
13534
  }
13119
13535
  }
13536
+
13120
13537
  // format the chat to string
13121
13538
  std::vector<const llama_chat_message *> chat_vec;
13122
13539
  chat_vec.resize(n_msg);
13123
13540
  for (size_t i = 0; i < n_msg; i++) {
13124
13541
  chat_vec[i] = &chat[i];
13125
13542
  }
13543
+
13126
13544
  std::string formatted_chat;
13127
13545
  int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
13128
13546
  if (res < 0) {
13129
13547
  return res;
13130
13548
  }
13131
- strncpy(buf, formatted_chat.c_str(), length);
13549
+ if (buf && length > 0) {
13550
+ strncpy(buf, formatted_chat.c_str(), length);
13551
+ }
13132
13552
  return res;
13133
13553
  }
13134
13554