llama_cpp 0.15.0 → 0.15.2

This diff shows the changes between publicly released versions of this package as they appear in its public registry. It is provided for informational purposes only.
@@ -7,6 +7,10 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

+ #ifdef GGML_USE_RPC
+ # include "ggml-rpc.h"
+ #endif
+
  #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
@@ -205,6 +209,7 @@ enum llm_arch {
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_JINA_BERT_V2,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -228,39 +233,40 @@ enum llm_arch {
  };

  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GROK, "grok" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PHI3, "phi3" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
- { LLM_ARCH_GEMMA, "gemma" },
- { LLM_ARCH_STARCODER2, "starcoder2" },
- { LLM_ARCH_MAMBA, "mamba" },
- { LLM_ARCH_XVERSE, "xverse" },
- { LLM_ARCH_COMMAND_R, "command-r" },
- { LLM_ARCH_DBRX, "dbrx" },
- { LLM_ARCH_OLMO, "olmo" },
- { LLM_ARCH_UNKNOWN, "(unknown)" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_PERSIMMON, "persimmon" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
+ { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

  enum llm_kv {
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_JINA_BERT_V2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_BLOOM,
  {
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
  GGML_UNUSED(host_buffer);
  }

- static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
- ggml_backend_buffer_type_t buft = nullptr;
-
- #ifdef GGML_USE_METAL
- buft = ggml_backend_metal_buffer_type();
- #elif defined(GGML_USE_CUDA)
- buft = ggml_backend_cuda_buffer_type(gpu);
- #elif defined(GGML_USE_VULKAN)
- buft = ggml_backend_vk_buffer_type(gpu);
- #elif defined(GGML_USE_SYCL)
- buft = ggml_backend_sycl_buffer_type(gpu);
- #elif defined(GGML_USE_CLBLAST)
- buft = ggml_backend_opencl_buffer_type();
- #elif defined(GGML_USE_KOMPUTE)
- buft = ggml_backend_kompute_buffer_type(gpu);
- if (buft == nullptr) {
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
- }
- #endif
-
- if (buft == nullptr) {
- buft = llama_default_buffer_type_cpu(true);
- }
- return buft;
-
- GGML_UNUSED(gpu);
- }
-
- static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
- ggml_backend_buffer_type_t buft = nullptr;
-
- #ifdef GGML_USE_CUDA
- if (ggml_backend_cuda_get_device_count() > 1) {
- buft = ggml_backend_cuda_split_buffer_type(tensor_split);
- }
- #endif
-
- #ifdef GGML_USE_SYCL
- if (ggml_backend_sycl_get_device_count() > 1) {
- buft = ggml_backend_sycl_split_buffer_type(tensor_split);
- }
- #endif
-
- if (buft == nullptr) {
- buft = llama_default_buffer_type_offload(fallback_gpu);
- }
- return buft;
-
- GGML_UNUSED(tensor_split);
- }
-
- static size_t llama_get_device_count() {
- #if defined(GGML_USE_CUDA)
- return ggml_backend_cuda_get_device_count();
- #elif defined(GGML_USE_SYCL)
- return ggml_backend_sycl_get_device_count();
- #elif defined(GGML_USE_VULKAN)
- return ggml_backend_vk_get_device_count();
- #else
- return 1;
- #endif
- }
-
- static size_t llama_get_device_memory(int device) {
- #if defined(GGML_USE_CUDA)
- size_t total;
- size_t free;
- ggml_backend_cuda_get_device_memory(device, &free, &total);
- return free;
- #elif defined(GGML_USE_SYCL)
- size_t total;
- size_t free;
- ggml_backend_sycl_get_device_memory(device, &free, &total);
- return free;
- #elif defined(GGML_USE_VULKAN)
- size_t total;
- size_t free;
- ggml_backend_vk_get_device_memory(device, &free, &total);
- return free;
- #else
- return 1;
- GGML_UNUSED(device);
- #endif
- }
-
  //
  // globals
  //
@@ -1845,7 +1785,7 @@ struct llama_hparams {
  float f_logit_scale = 0.0f;

  bool causal_attn = true;
- bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
+ bool use_alibi = false;

  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -2189,6 +2129,8 @@ struct llama_model {
  int main_gpu;
  int n_gpu_layers;

+ std::vector<std::string> rpc_servers;
+
  // gguf metadata
  std::unordered_map<std::string, std::string> gguf_kv;

@@ -2317,7 +2259,6 @@ struct llama_context {
  struct ggml_tensor * inp_pos; // I32 [n_batch]
  struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2333,6 +2274,104 @@ struct llama_context {
  #endif
  };

+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_RPC
+ std::string endpoint = model.rpc_servers[gpu];
+ buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+ #elif defined(GGML_USE_METAL)
+ buft = ggml_backend_metal_buffer_type();
+ #elif defined(GGML_USE_CUDA)
+ buft = ggml_backend_cuda_buffer_type(gpu);
+ #elif defined(GGML_USE_VULKAN)
+ buft = ggml_backend_vk_buffer_type(gpu);
+ #elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_buffer_type(gpu);
+ #elif defined(GGML_USE_CLBLAST)
+ buft = ggml_backend_opencl_buffer_type();
+ #elif defined(GGML_USE_KOMPUTE)
+ buft = ggml_backend_kompute_buffer_type(gpu);
+ if (buft == nullptr) {
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_cpu(true);
+ }
+ return buft;
+ GGML_UNUSED(model);
+ GGML_UNUSED(gpu);
+ }
+
+ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_CUDA
+ if (ggml_backend_cuda_get_device_count() > 1) {
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+ }
+ #endif
+
+ #ifdef GGML_USE_SYCL
+ if (ggml_backend_sycl_get_device_count() > 1) {
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_offload(model, fallback_gpu);
+ }
+ return buft;
+
+ GGML_UNUSED(tensor_split);
+ }
+
+ static size_t llama_get_device_count(const llama_model & model) {
+ #if defined(GGML_USE_RPC)
+ return model.rpc_servers.size();
+ #elif defined(GGML_USE_CUDA)
+ return ggml_backend_cuda_get_device_count();
+ #elif defined(GGML_USE_SYCL)
+ return ggml_backend_sycl_get_device_count();
+ #elif defined(GGML_USE_VULKAN)
+ return ggml_backend_vk_get_device_count();
+ #else
+ return 1;
+ #endif
+ GGML_UNUSED(model);
+ }
+
+ static size_t llama_get_device_memory(const llama_model & model, int device) {
+ #if defined(GGML_USE_RPC)
+ size_t total;
+ size_t free;
+ std::string endpoint = model.rpc_servers[device];
+ ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+ return free;
+ #elif defined(GGML_USE_CUDA)
+ size_t total;
+ size_t free;
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
+ return free;
+ #elif defined(GGML_USE_SYCL)
+ size_t total;
+ size_t free;
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
+ return free;
+ #elif defined(GGML_USE_VULKAN)
+ size_t total;
+ size_t free;
+ ggml_backend_vk_get_device_memory(device, &free, &total);
+ return free;
+ #else
+ return 1;
+ #endif
+ GGML_UNUSED(model);
+ GGML_UNUSED(device);
+ }
+
  //
  // kv cache helpers
  //
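Note on the RPC additions above: when the library is built with GGML_USE_RPC, each entry in model.rpc_servers is treated as one offload device, and buffer allocation and memory queries are routed to that endpoint through ggml_backend_rpc_buffer_type() and ggml_backend_rpc_get_device_memory(). The sketch below shows one way a comma-separated endpoint list could be split into that vector; the helper name and the "host:port,host:port" input format are illustrative assumptions, not part of this diff.

    // illustrative sketch (assumed helper): split "192.168.1.10:50052,192.168.1.11:50052"
    // into per-device endpoints for model.rpc_servers
    #include <sstream>
    #include <string>
    #include <vector>

    static std::vector<std::string> split_rpc_servers(const std::string & csv) {
        std::vector<std::string> endpoints;
        std::stringstream ss(csv);
        std::string item;
        while (std::getline(ss, item, ',')) {
            if (!item.empty()) {
                endpoints.push_back(item); // one "host:port" endpoint per RPC device
            }
        }
        return endpoints;
    }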
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
  cache.do_defrag = true;
  }

+ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+ // the FA kernels require padding to avoid extra runtime boundary checks
+ return cparams.flash_attn ? 256u : 32u;
+ }
+
  //
  // model loading and saving
  //
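The new llama_kv_cache_get_padding() helper centralizes the KV-cache rounding that llama_decode_internal previously hard-coded to 256: with flash attention the attended cache view is padded to a multiple of 256 cells, otherwise 32. A small worked sketch of the rounding it feeds into (GGML_PAD rounds up to a multiple; the numeric values are only an example):

    // illustrative: how the padding interacts with GGML_PAD in llama_decode_internal
    #include <algorithm>
    #include <cstdint>

    static uint32_t padded_kv_n(uint32_t cell_max, uint32_t kv_size, bool flash_attn) {
        const uint32_t pad     = flash_attn ? 256u : 32u;            // llama_kv_cache_get_padding()
        const uint32_t rounded = ((cell_max + pad - 1) / pad) * pad; // GGML_PAD(cell_max, pad)
        return std::min(kv_size, std::max(pad, rounded));
    }
    // e.g. cell_max = 70, kv_size = 4096 -> 96 without flash attention, 256 with it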
@@ -3175,6 +3219,7 @@ struct llama_model_loader {
  switch (type_max) {
  case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
  case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+ case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
  case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
  case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
  case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3666,6 +3711,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  switch (ftype) {
  case LLAMA_FTYPE_ALL_F32: return "all F32";
  case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
  case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
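These two hunks are part of the bundled BF16 support: GGUF files whose dominant tensor type is GGML_TYPE_BF16 are now detected, reported as "BF16", and (further down) accepted by the importance-matrix check. A hedged sketch of requesting a BF16 conversion through the public quantize API; the enum value is taken from this diff, the surrounding calls are the standard llama.h quantization entry points:

    // sketch: convert an F32/F16 GGUF to BF16 via the quantization API
    #include "llama.h"

    int convert_to_bf16(const char * src, const char * dst) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_BF16;                // new in this release
        return (int) llama_model_quantize(src, dst, &params); // 0 on success
    }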
@@ -3777,6 +3823,12 @@ static void llm_load_hparams(

  // get hparams kv
  ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+ // everything past this point is not vocab-related
+ if (hparams.vocab_only) {
+ return;
+ }
+
  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
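With the early return above, a model opened in vocab-only mode no longer requires the non-vocabulary hyperparameter keys, so tokenizer-only GGUF files can be loaded. A minimal sketch of such a load, using the standard llama.h API (error handling omitted):

    // sketch: load only the vocabulary so the file can be used for tokenization
    #include "llama.h"

    llama_model * load_tokenizer_only(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.vocab_only = true;  // skips tensor data and, now, non-vocab hparams
        return llama_load_model_from_file(path, mparams);
    }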
@@ -3858,7 +3910,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+ case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3960,6 +4012,19 @@ static void llm_load_hparams(
  model.type = e_model::MODEL_335M; break; // bge-large
  }
  } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ hparams.f_max_alibi_bias = 8.0f;
+
+ switch (hparams.n_layer) {
+ case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+ case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+ }
+ } break;
  case LLM_ARCH_NOMIC_BERT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4381,8 +4446,27 @@ static void llm_load_vocab(
  tokenizer_pre == "starcoder") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
  } else if (
- tokenizer_pre == "gpt-2") {
+ tokenizer_pre == "gpt-2" ||
+ tokenizer_pre == "jina-es" ||
+ tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "jina-v2-es" ||
+ tokenizer_pre == "jina-v2-de") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "refact") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+ } else if (
+ tokenizer_pre == "command-r") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+ } else if (
+ tokenizer_pre == "qwen2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "olmo") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+ } else if (
+ tokenizer_pre == "dbrx") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
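The pre-tokenizer is chosen from the GGUF metadata key tokenizer.ggml.pre; this release maps the Jina values onto the GPT-2 regex set and adds dedicated refact, command-r, qwen2, olmo and dbrx entries. A sketch of inspecting that key with the public gguf API (read-only; the key name follows the upstream convert scripts):

    // sketch: print which pre-tokenizer a GGUF file declares
    #include "ggml.h"
    #include <cstdio>

    void print_pre_tokenizer(const char * path) {
        gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        gguf_context * ctx = gguf_init_from_file(path, params);
        if (ctx == nullptr) {
            return;
        }
        const int key = gguf_find_key(ctx, "tokenizer.ggml.pre");
        if (key >= 0) {
            printf("pre-tokenizer: %s\n", gguf_get_val_str(ctx, key));
        }
        gguf_free(ctx);
    }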
@@ -4726,13 +4810,13 @@ static bool llm_load_tensors(

  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
  // calculate the split points
- int device_count = llama_get_device_count();
+ int device_count = llama_get_device_count(model);
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
  std::vector<float> splits(device_count);
  if (all_zero) {
  // default split, by free memory
  for (int i = 0; i < device_count; ++i) {
- splits[i] = llama_get_device_memory(i);
+ splits[i] = llama_get_device_memory(model, i);
  }
  } else {
  std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4752,35 +4836,35 @@ static bool llm_load_tensors(
4752
4836
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
4753
4837
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
4754
4838
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
4755
- model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
4839
+ model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
4756
4840
  }
4757
4841
  // assign the output layer
4758
4842
  if (n_gpu_layers > n_layer) {
4759
4843
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
4760
- model.buft_output = llama_default_buffer_type_offload(layer_gpu);
4844
+ model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
4761
4845
  } else {
4762
4846
  model.buft_output = llama_default_buffer_type_cpu(true);
4763
4847
  }
4764
4848
  } else {
4765
4849
  ggml_backend_buffer_type_t split_buft;
4766
4850
  if (split_mode == LLAMA_SPLIT_MODE_ROW) {
4767
- split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
4851
+ split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
4768
4852
  } else {
4769
4853
  // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
4770
- split_buft = llama_default_buffer_type_offload(main_gpu);
4854
+ split_buft = llama_default_buffer_type_offload(model, main_gpu);
4771
4855
  }
4772
4856
  // assign the repeating layers
4773
4857
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
4774
4858
  model.buft_layer[i] = {
4775
4859
  split_buft,
4776
- llama_default_buffer_type_offload(main_gpu)
4860
+ llama_default_buffer_type_offload(model, main_gpu)
4777
4861
  };
4778
4862
  }
4779
4863
  // assign the output layer
4780
4864
  if (n_gpu_layers > n_layer) {
4781
4865
  model.buft_output = {
4782
4866
  split_buft,
4783
- llama_default_buffer_type_offload(main_gpu)
4867
+ llama_default_buffer_type_offload(model, main_gpu)
4784
4868
  };
4785
4869
  } else {
4786
4870
  model.buft_output = llama_default_buffer_type_cpu(true);
@@ -5225,6 +5309,50 @@ static bool llm_load_tensors(
5225
5309
  layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5226
5310
  }
5227
5311
  } break;
5312
+ case LLM_ARCH_JINA_BERT_V2:
5313
+ {
5314
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
5315
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
5316
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
5317
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
5318
+
5319
+ for (int i = 0; i < n_layer; ++i) {
5320
+ ggml_context * ctx_layer = ctx_for_layer(i);
5321
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5322
+
5323
+ auto & layer = model.layers[i]; // JinaBertLayer
5324
+
5325
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5326
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5327
+
5328
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
5329
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
5330
+
5331
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5332
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5333
+
5334
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
5335
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
5336
+
5337
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5338
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5339
+
5340
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
5341
+ layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
5342
+
5343
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
5344
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
5345
+
5346
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5347
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5348
+
5349
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5350
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5351
+
5352
+ layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
5353
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5354
+ }
5355
+ } break;
5228
5356
  case LLM_ARCH_BLOOM:
5229
5357
  {
5230
5358
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6120,6 +6248,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  || !(
  model.ftype == LLAMA_FTYPE_ALL_F32 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+ model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
  )
@@ -6300,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
  llm_ffn_gate_type type_gate,
  const llm_build_cb & cb,
  int il) {
- struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+ struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
  cb(tmp, "ffn_up", il);

  if (up_b) {
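Allowing up to be null means llm_build_ffn can now be called by architectures that ship no separate up projection: the matmul is simply skipped and the input flows through unchanged. The guard in isolation (plain ggml; the function and tensor names here are illustrative):

    // sketch: the "optional matmul" pattern introduced above
    #include "ggml.h"

    static ggml_tensor * maybe_project(ggml_context * ctx, ggml_tensor * up, ggml_tensor * cur) {
        // when the model ships no ffn_up tensor, fall through to the input unchanged
        return up ? ggml_mul_mat(ctx, up, cur) : cur;
    }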
@@ -6482,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
6482
6611
  struct ggml_tensor * wo_b,
6483
6612
  struct ggml_tensor * q_cur,
6484
6613
  struct ggml_tensor * kq_mask,
6485
- struct ggml_tensor * kq_pos,
6486
6614
  int32_t n_tokens,
6487
6615
  int32_t n_kv,
6488
6616
  float kq_scale,
@@ -6494,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
6494
6622
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
6495
6623
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6496
6624
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
6625
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6497
6626
 
6498
6627
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
6499
6628
  cb(q, "q", il);
@@ -6512,26 +6641,22 @@ static struct ggml_tensor * llm_build_kqv(
6512
6641
  GGML_UNUSED(model);
6513
6642
  GGML_UNUSED(n_ctx);
6514
6643
 
6515
- // note: if this assert triggers, then some check has failed earlier
6516
- // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6517
- GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6518
-
6519
6644
  // split cached v into n_head heads (not transposed)
6520
6645
  struct ggml_tensor * v =
6521
6646
  ggml_view_3d(ctx, kv.v_l[il],
6522
6647
  n_embd_head_v, n_kv, n_head_kv,
6523
- ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6524
- ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6648
+ ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
6649
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
6525
6650
  0);
6526
6651
  cb(v, "v", il);
6527
6652
 
6528
- cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6653
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6529
6654
 
6530
6655
  if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6531
6656
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6532
6657
  }
6533
6658
 
6534
- cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6659
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
6535
6660
  } else {
6536
6661
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6537
6662
  cb(kq, "kq", il);
@@ -6556,28 +6681,8 @@ static struct ggml_tensor * llm_build_kqv(
6556
6681
  kq = ggml_scale(ctx, kq, 30);
6557
6682
  }
6558
6683
 
6559
- #if defined(GGML_USE_KOMPUTE)
6560
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6561
- #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6562
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6563
- if (hparams.use_alibi) {
6564
- kq = ggml_scale(ctx, kq, kq_scale);
6565
- cb(kq, "kq_scaled", il);
6566
-
6567
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6568
- cb(kq, "kq_scaled_alibi", il);
6569
-
6570
- kq = ggml_add(ctx, kq, kq_mask);
6571
- cb(kq, "kq_masked", il);
6572
-
6573
- kq = ggml_soft_max(ctx, kq);
6574
- cb(kq, "kq_soft_max", il);
6575
- } else
6576
- #endif
6577
- {
6578
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6579
- cb(kq, "kq_soft_max_ext", il);
6580
- }
6684
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6685
+ cb(kq, "kq_soft_max_ext", il);
6581
6686
 
6582
6687
  GGML_ASSERT(kv.size == n_ctx);
6583
6688
 
@@ -6596,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
6596
6701
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6597
6702
  cb(kqv_merged, "kqv_merged", il);
6598
6703
 
6599
- cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6704
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
6600
6705
  cb(cur, "kqv_merged_cont", il);
6601
6706
  }
6602
6707
 
@@ -6627,7 +6732,6 @@ static struct ggml_tensor * llm_build_kv(
6627
6732
  struct ggml_tensor * v_cur,
6628
6733
  struct ggml_tensor * q_cur,
6629
6734
  struct ggml_tensor * kq_mask,
6630
- struct ggml_tensor * kq_pos,
6631
6735
  int32_t n_tokens,
6632
6736
  int32_t kv_head,
6633
6737
  int32_t n_kv,
@@ -6646,7 +6750,7 @@ static struct ggml_tensor * llm_build_kv(
6646
6750
  struct ggml_tensor * cur;
6647
6751
 
6648
6752
  cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6649
- q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6753
+ q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
6650
6754
  cb(cur, "kqv_out", il);
6651
6755
 
6652
6756
  return cur;
@@ -6753,18 +6857,17 @@ struct llm_build_context {
6753
6857
 
6754
6858
  ctx0 = ggml_init(params);
6755
6859
 
6756
- lctx.inp_tokens = nullptr;
6757
- lctx.inp_embd = nullptr;
6758
- lctx.inp_pos = nullptr;
6860
+ lctx.inp_tokens = nullptr;
6861
+ lctx.inp_embd = nullptr;
6862
+ lctx.inp_pos = nullptr;
6759
6863
  lctx.inp_out_ids = nullptr;
6760
6864
  lctx.inp_KQ_mask = nullptr;
6761
- lctx.inp_KQ_pos = nullptr;
6762
6865
  lctx.inp_K_shift = nullptr;
6763
- lctx.inp_mean = nullptr;
6764
- lctx.inp_cls = nullptr;
6765
- lctx.inp_s_copy = nullptr;
6766
- lctx.inp_s_mask = nullptr;
6767
- lctx.inp_s_seq = nullptr;
6866
+ lctx.inp_mean = nullptr;
6867
+ lctx.inp_cls = nullptr;
6868
+ lctx.inp_s_copy = nullptr;
6869
+ lctx.inp_s_mask = nullptr;
6870
+ lctx.inp_s_seq = nullptr;
6768
6871
  }
6769
6872
 
6770
6873
  void free() {
@@ -6914,19 +7017,6 @@ struct llm_build_context {
6914
7017
  return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6915
7018
  }
6916
7019
 
6917
- struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6918
- if (causal) {
6919
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6920
- } else {
6921
- // TODO: this will be needed for ALiBi-based BERT models
6922
- // https://github.com/ggerganov/llama.cpp/pull/6826
6923
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6924
- }
6925
- cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6926
- ggml_set_input(lctx.inp_KQ_pos);
6927
- return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6928
- }
6929
-
6930
7020
  struct ggml_tensor * build_inp_mean() {
6931
7021
  lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6932
7022
  cb(lctx.inp_mean, "inp_mean", -1);
@@ -7032,7 +7122,7 @@ struct llm_build_context {
7032
7122
 
7033
7123
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7034
7124
  model.layers[il].wo, model.layers[il].bo,
7035
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7125
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7036
7126
  }
7037
7127
 
7038
7128
  if (il == n_layer - 1) {
@@ -7125,9 +7215,6 @@ struct llm_build_context {
7125
7215
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7126
7216
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7127
7217
 
7128
- // positions of the tokens in the KV cache
7129
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7130
-
7131
7218
  for (int il = 0; il < n_layer; ++il) {
7132
7219
  struct ggml_tensor * inpSA = inpL;
7133
7220
 
@@ -7172,7 +7259,7 @@ struct llm_build_context {
7172
7259
 
7173
7260
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7174
7261
  model.layers[il].wo, NULL,
7175
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7262
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7176
7263
  }
7177
7264
 
7178
7265
  if (il == n_layer - 1) {
@@ -7242,9 +7329,6 @@ struct llm_build_context {
7242
7329
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7243
7330
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7244
7331
 
7245
- // positions of the tokens in the KV cache
7246
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7247
-
7248
7332
  for (int il = 0; il < n_layer; ++il) {
7249
7333
  struct ggml_tensor * inpSA = inpL;
7250
7334
 
@@ -7279,7 +7363,7 @@ struct llm_build_context {
7279
7363
  cb(Kcur, "Kcur", il);
7280
7364
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7281
7365
  model.layers[il].wo, NULL,
7282
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7366
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7283
7367
  }
7284
7368
 
7285
7369
  if (il == n_layer - 1) {
@@ -7399,7 +7483,7 @@ struct llm_build_context {
7399
7483
 
7400
7484
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7401
7485
  model.layers[il].wo, NULL,
7402
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7486
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7403
7487
  }
7404
7488
 
7405
7489
  if (il == n_layer - 1) {
@@ -7524,7 +7608,7 @@ struct llm_build_context {
7524
7608
 
7525
7609
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7526
7610
  model.layers[il].wo, model.layers[il].bo,
7527
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7611
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7528
7612
  }
7529
7613
 
7530
7614
  if (il == n_layer - 1) {
@@ -7676,7 +7760,7 @@ struct llm_build_context {
7676
7760
 
7677
7761
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7678
7762
  model.layers[il].wo, NULL,
7679
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7763
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7680
7764
  }
7681
7765
 
7682
7766
  if (il == n_layer - 1) {
@@ -7788,7 +7872,7 @@ struct llm_build_context {
7788
7872
 
7789
7873
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7790
7874
  model.layers[il].wo, model.layers[il].bo,
7791
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7875
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7792
7876
  }
7793
7877
 
7794
7878
  if (il == n_layer - 1) {
@@ -7992,7 +8076,7 @@ struct llm_build_context {
7992
8076
 
7993
8077
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7994
8078
  model.layers[il].wo, model.layers[il].bo,
7995
- Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8079
+ Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7996
8080
  }
7997
8081
 
7998
8082
  if (il == n_layer - 1) {
@@ -8058,9 +8142,6 @@ struct llm_build_context {
8058
8142
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8059
8143
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8060
8144
 
8061
- // positions of the tokens in the KV cache
8062
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8063
-
8064
8145
  for (int il = 0; il < n_layer; ++il) {
8065
8146
  struct ggml_tensor * inpSA = inpL;
8066
8147
 
@@ -8088,7 +8169,7 @@ struct llm_build_context {
8088
8169
 
8089
8170
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8090
8171
  model.layers[il].wo, NULL,
8091
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8172
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8092
8173
  }
8093
8174
 
8094
8175
  if (il == n_layer - 1) {
@@ -8150,8 +8231,11 @@ struct llm_build_context {
8150
8231
 
8151
8232
  struct ggml_tensor * cur;
8152
8233
  struct ggml_tensor * inpL;
8234
+ struct ggml_tensor * inp_pos = nullptr;
8153
8235
 
8154
- struct ggml_tensor * inp_pos = build_inp_pos();
8236
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
8237
+ inp_pos = build_inp_pos();
8238
+ }
8155
8239
  struct ggml_tensor * inp_mean = build_inp_mean();
8156
8240
  struct ggml_tensor * inp_cls = build_inp_cls();
8157
8241
 
@@ -8182,13 +8266,26 @@ struct llm_build_context {
8182
8266
  struct ggml_tensor * Vcur;
8183
8267
 
8184
8268
  // self-attention
8185
- if (model.arch == LLM_ARCH_BERT) {
8269
+ if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
8186
8270
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
8187
8271
  cb(Qcur, "Qcur", il);
8188
8272
 
8273
+ if (model.layers[il].attn_q_norm) {
8274
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8275
+ model.layers[il].attn_q_norm,
8276
+ model.layers[il].attn_q_norm_b,
8277
+ LLM_NORM, cb, il);
8278
+ }
8279
+
8189
8280
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
8190
8281
  cb(Kcur, "Kcur", il);
8191
8282
 
8283
+ if (model.layers[il].attn_k_norm) {
8284
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8285
+ model.layers[il].attn_k_norm,
8286
+ model.layers[il].attn_k_norm_b,
8287
+ LLM_NORM, cb, il);
8288
+ }
8192
8289
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
8193
8290
  cb(Vcur, "Vcur", il);
8194
8291
 
@@ -8228,7 +8325,7 @@ struct llm_build_context {
8228
8325
  struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
8229
8326
  cb(kq, "kq", il);
8230
8327
 
8231
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8328
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8232
8329
  cb(kq, "kq_soft_max_ext", il);
8233
8330
 
8234
8331
  struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8279,6 +8376,13 @@ struct llm_build_context {
8279
8376
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8280
8377
  NULL,
8281
8378
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8379
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
8380
+ cur = llm_build_ffn(ctx0, cur,
8381
+ model.layers[il].ffn_up, NULL,
8382
+ model.layers[il].ffn_gate, NULL,
8383
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8384
+ NULL,
8385
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
8282
8386
  } else {
8283
8387
  cur = llm_build_ffn(ctx0, cur,
8284
8388
  model.layers[il].ffn_up, NULL,
@@ -8345,9 +8449,6 @@ struct llm_build_context {
8345
8449
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8346
8450
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8347
8451
 
8348
- // positions of the tokens in the KV cache
8349
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8350
-
8351
8452
  inpL = llm_build_norm(ctx0, inpL, hparams,
8352
8453
  model.tok_norm,
8353
8454
  model.tok_norm_b,
@@ -8381,7 +8482,7 @@ struct llm_build_context {
8381
8482
 
8382
8483
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8383
8484
  model.layers[il].wo, model.layers[il].bo,
8384
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8485
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8385
8486
  }
8386
8487
 
8387
8488
  if (il == n_layer - 1) {
@@ -8446,9 +8547,6 @@ struct llm_build_context {
8446
8547
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8447
8548
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8448
8549
 
8449
- // positions of the tokens in the KV cache
8450
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8451
-
8452
8550
  if (model.pos_embd) {
8453
8551
  // inp_pos - contains the positions
8454
8552
  struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8512,13 +8610,13 @@ struct llm_build_context {
8512
8610
 
8513
8611
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8514
8612
  model.layers[il].wo, model.layers[il].bo,
8515
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8613
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8516
8614
  } else {
8517
8615
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8518
8616
 
8519
8617
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8520
8618
  model.layers[il].wo, model.layers[il].bo,
8521
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8619
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8522
8620
  }
8523
8621
  }
8524
8622
 
@@ -8662,7 +8760,7 @@ struct llm_build_context {
8662
8760
 
8663
8761
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8664
8762
  model.layers[il].wo, NULL,
8665
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8763
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8666
8764
  }
8667
8765
 
8668
8766
  if (il == n_layer - 1) {
@@ -8780,7 +8878,7 @@ struct llm_build_context {
8780
8878
 
8781
8879
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8782
8880
  model.layers[il].wo, NULL,
8783
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8881
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8784
8882
  }
8785
8883
 
8786
8884
  if (il == n_layer - 1) {
@@ -8893,7 +8991,7 @@ struct llm_build_context {
8893
8991
 
8894
8992
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8895
8993
  model.layers[il].wo, model.layers[il].bo,
8896
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8994
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8897
8995
  }
8898
8996
 
8899
8997
  if (il == n_layer - 1) {
@@ -9007,7 +9105,7 @@ struct llm_build_context {
9007
9105
 
9008
9106
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9009
9107
  model.layers[il].wo, model.layers[il].bo,
9010
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9108
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9011
9109
  }
9012
9110
 
9013
9111
  if (il == n_layer - 1) {
@@ -9162,7 +9260,7 @@ struct llm_build_context {
9162
9260
 
9163
9261
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9164
9262
  model.layers[il].wo, model.layers[il].bo,
9165
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9263
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9166
9264
  }
9167
9265
 
9168
9266
  if (il == n_layer - 1) {
@@ -9279,7 +9377,7 @@ struct llm_build_context {
9279
9377
 
9280
9378
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9281
9379
  model.layers[il].wo, model.layers[il].bo,
9282
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9380
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9283
9381
  }
9284
9382
 
9285
9383
  if (il == n_layer - 1) {
@@ -9392,7 +9490,7 @@ struct llm_build_context {
9392
9490
 
9393
9491
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9394
9492
  model.layers[il].wo, NULL,
9395
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9493
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9396
9494
  }
9397
9495
  struct ggml_tensor * sa_out = cur;
9398
9496
 
@@ -9495,7 +9593,7 @@ struct llm_build_context {
9495
9593
 
9496
9594
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9497
9595
  model.layers[il].wo, model.layers[il].bo,
9498
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9596
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9499
9597
  }
9500
9598
 
9501
9599
  if (il == n_layer - 1) {
@@ -9602,7 +9700,7 @@ struct llm_build_context {
9602
9700
 
9603
9701
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9604
9702
  model.layers[il].wo, model.layers[il].bo,
9605
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9703
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9606
9704
  }
9607
9705
 
9608
9706
  if (il == n_layer - 1) {
@@ -9718,7 +9816,7 @@ struct llm_build_context {
9718
9816
 
9719
9817
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9720
9818
  model.layers[il].wo, NULL,
9721
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9819
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9722
9820
  }
9723
9821
 
9724
9822
  if (il == n_layer - 1) {
@@ -9835,7 +9933,7 @@ struct llm_build_context {
9835
9933
 
9836
9934
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9837
9935
  model.layers[il].wo, model.layers[il].bo,
9838
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9936
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9839
9937
  }
9840
9938
 
9841
9939
  if (il == n_layer - 1) {
@@ -9965,7 +10063,7 @@ struct llm_build_context {
9965
10063
 
9966
10064
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9967
10065
  model.layers[il].wo, model.layers[il].bo,
9968
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10066
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9969
10067
  }
9970
10068
 
9971
10069
  if (il == n_layer - 1) {
@@ -10086,7 +10184,7 @@ struct llm_build_context {
10086
10184
 
10087
10185
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10088
10186
  model.layers[il].wo, NULL,
10089
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10187
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10090
10188
  }
10091
10189
 
10092
10190
  if (il == n_layer - 1) {
@@ -10205,7 +10303,7 @@ struct llm_build_context {
10205
10303
 
10206
10304
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10207
10305
  model.layers[il].wo, model.layers[il].bo,
10208
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10306
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10209
10307
  }
10210
10308
 
10211
10309
  if (il == n_layer - 1) {
@@ -10495,7 +10593,7 @@ struct llm_build_context {
10495
10593
 
10496
10594
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10497
10595
  model.layers[il].wo, model.layers[il].bo,
10498
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10596
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10499
10597
  }
10500
10598
 
10501
10599
  if (il == n_layer - 1) {
@@ -10626,7 +10724,7 @@ struct llm_build_context {
10626
10724
 
10627
10725
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10628
10726
  model.layers[il].wo, nullptr,
10629
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10727
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10630
10728
  }
10631
10729
 
10632
10730
  if (il == n_layer - 1) {
@@ -10807,6 +10905,7 @@ static struct ggml_cgraph * llama_build_graph(
10807
10905
  result = llm.build_refact();
10808
10906
  } break;
10809
10907
  case LLM_ARCH_BERT:
10908
+ case LLM_ARCH_JINA_BERT_V2:
10810
10909
  case LLM_ARCH_NOMIC_BERT:
10811
10910
  {
10812
10911
  result = llm.build_bert();
@@ -11014,11 +11113,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
  f = -INFINITY;
  } else {
- f = 0.0f;
+ if (hparams.use_alibi) {
+ f = -fabs(lctx.kv_self.cells[i].pos - pos);
+ } else {
+ f = 0.0f;
+ }
  }
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
  }
  }
+
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+ for (int j = 0; j < n_kv; ++j) {
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+ }
+ }
  }
  } else {
  // when using kv cache, the mask needs to match the kv cache size
@@ -11037,7 +11146,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  float f = -INFINITY;
  for (int s = 0; s < batch.n_seq_id[i]; ++s) {
  if (batch.seq_id[i][s] == seq_id) {
- f = 0.0f;
+ if (hparams.use_alibi) {
+ f = -fabs(batch.pos[i] - batch.pos[j]);
+ } else {
+ f = 0.0f;
+ }
  break;
  }
  }
@@ -11053,21 +11166,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  }
  }

- // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
- // this allows to process multiple sequences in parallel with ALiBi-based models
- if (hparams.use_alibi) {
- const int64_t n_kv = kv_self.n;
-
- GGML_ASSERT(lctx.inp_KQ_pos);
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
- float * data = (float *) lctx.inp_KQ_pos->data;
-
- for (int i = 0; i < n_kv; ++i) {
- data[i] = float(lctx.kv_self.cells[i].pos);
- }
- }
-
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
  const int64_t n_tokens = batch.n_tokens;

@@ -11437,7 +11535,8 @@ static int llama_decode_internal(
  // a heuristic, to avoid attending the full cache if it is not yet utilized
  // after enough generations, the benefit from this heuristic disappears
  // if we start defragmenting the cache, the benefit from this will be more important
- kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
+ kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
  }
  }
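The llama_set_inputs changes above move ALiBi from the dedicated KQ_pos input into the KQ mask itself: for ALiBi models the mask now stores the negative token distance -|pos_i - pos_j| instead of 0, and ggml_soft_max_ext / ggml_flash_attn_ext scale that distance by a per-head slope derived from f_max_alibi_bias. A numeric sketch of the resulting bias term, using the standard ALiBi slope formula for a power-of-two head count (the values are only an example):

    // sketch: per-head ALiBi bias reconstructed from the mask distance
    // slope_h = 2^(-max_bias * (h + 1) / n_head) for power-of-two n_head
    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head    = 8;
        const float max_bias  = 8.0f;   // hparams.f_max_alibi_bias (Jina/MPT-style)
        const float mask_dist = -3.0f;  // -|pos_i - pos_j| as stored in the KQ mask
        for (int h = 0; h < n_head; ++h) {
            const float slope = powf(2.0f, -max_bias * (h + 1) / n_head);
            printf("head %d: bias = %f\n", h, slope * mask_dist); // added to the KQ logits
        }
        return 0;
    }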
@@ -11952,7 +12051,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
11952
12051
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11953
12052
  GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
11954
12053
  GGML_ASSERT(llama_is_byte_token(vocab, id));
11955
- const auto& token_data = vocab.id_to_token.at(id);
12054
+ const auto & token_data = vocab.id_to_token.at(id);
11956
12055
  switch (llama_vocab_get_type(vocab)) {
11957
12056
  case LLAMA_VOCAB_TYPE_SPM: {
11958
12057
  auto buf = token_data.text.substr(3, 2);
@@ -12182,12 +12281,14 @@ struct llm_tokenizer_bpe {
12182
12281
 
12183
12282
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12184
12283
  int final_prev_index = -1;
12284
+ bool ignore_merges = false;
12185
12285
 
12186
12286
  std::vector<std::string> word_collection;
12187
12287
  switch (vocab.type) {
12188
12288
  case LLAMA_VOCAB_TYPE_BPE:
12189
12289
  switch (vocab.type_pre) {
12190
12290
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12291
+ ignore_merges = true;
12191
12292
  word_collection = unicode_regex_split(text, {
12192
12293
  // original regex from tokenizer.json
12193
12294
  //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12196,6 +12297,12 @@ struct llm_tokenizer_bpe {
12196
12297
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12197
12298
  });
12198
12299
  break;
12300
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12301
+ word_collection = unicode_regex_split(text, {
12302
+ // same as llama3
12303
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12304
+ });
12305
+ break;
12199
12306
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12200
12307
  word_collection = unicode_regex_split(text, {
12201
12308
  "[\r\n]",
@@ -12212,14 +12319,13 @@ struct llm_tokenizer_bpe {
12212
12319
  "\\s?\\p{L}+",
12213
12320
  "\\s?\\p{P}+",
12214
12321
  "[一-龥ࠀ-一가-퟿]+",
12215
- "\\p{N}+",
12322
+ "\\p{N}",
12216
12323
  });
12217
12324
  break;
12218
12325
  case LLAMA_VOCAB_PRE_TYPE_FALCON:
12219
12326
  word_collection = unicode_regex_split(text, {
12220
12327
  "[\\p{P}\\$\\+<=>\\^~\\|]+",
12221
12328
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12222
- "\\p{N}+",
12223
12329
  "[0-9][0-9][0-9]",
12224
12330
  });
12225
12331
  break;
@@ -12235,11 +12341,26 @@ struct llm_tokenizer_bpe {
12235
12341
  });
12236
12342
  break;
12237
12343
  case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12344
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
12345
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
12346
+ word_collection = unicode_regex_split(text, {
12347
+ "\\p{N}",
12348
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12349
+ });
12350
+ break;
12238
12351
  case LLAMA_VOCAB_PRE_TYPE_GPT2:
12352
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
12239
12353
  word_collection = unicode_regex_split(text, {
12240
12354
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
12355
  });
12242
12356
  break;
12357
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12358
+ word_collection = unicode_regex_split(text, {
12359
+ // original regex from tokenizer.json
12360
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
12361
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12362
+ });
12363
+ break;
12243
12364
  default:
12244
12365
  // default regex for BPE tokenization pre-processing
12245
12366
  word_collection = unicode_regex_split(text, {
@@ -12265,6 +12386,11 @@ struct llm_tokenizer_bpe {
12265
12386
  int index = 0;
12266
12387
  size_t offset = 0;
12267
12388
 
12389
+ if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
12390
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
12391
+ offset = word.size();
12392
+ }
12393
+
12268
12394
  while (offset < word.size()) {
12269
12395
  llm_symbol sym;
12270
12396
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
@@ -12450,16 +12576,16 @@ struct llm_tokenizer_wpm {
12450
12576
  // to lowercase, pad chinese characters, pad punctuation
12451
12577
  std::string new_str = "";
12452
12578
  for (uint32_t code : cpts_nfd) {
12453
- int type = unicode_cpt_type(code);
12454
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
12579
+ const codepoint_flags flags = unicode_cpt_flags(code);
12580
+ if (flags.is_accent_mark || flags.is_control) {
12455
12581
  continue;
12456
12582
  }
12457
12583
  code = unicode_tolower(code);
12458
- if (type == CODEPOINT_TYPE_WHITESPACE) {
12584
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12459
12585
  code = ' ';
12460
12586
  }
12461
12587
  std::string s = unicode_cpt_to_utf8(code);
12462
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
12588
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12463
12589
  new_str += " ";
12464
12590
  new_str += s;
12465
12591
  new_str += " ";
@@ -12693,6 +12819,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12693
12819
  }
12694
12820
  }
12695
12821
 
12822
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
12823
+ LLAMA_LOG_WARN(
12824
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
12825
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
12826
+ "Are you sure this is what you want?\n", __FUNCTION__);
12827
+ }
12828
+
12696
12829
  if (add_special && vocab.special_add_eos == 1) {
12697
12830
  GGML_ASSERT(vocab.special_eos_id != -1);
12698
12831
  output.push_back(vocab.special_eos_id);
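
A small sketch of the duplicated-BOS situation the new warning above describes: once BOS has been prepended, a prompt whose text already tokenized to a leading BOS ends up with two BOS tokens in a row. The token ids and helper below are hypothetical:

    #include <cstdio>
    #include <vector>

    using llama_token = int;

    // After BOS has been prepended at output[0], a prompt that itself began with BOS
    // leaves a second BOS at output[1].
    static bool has_double_bos(const std::vector<llama_token> & output, llama_token bos_id) {
        return output.size() >= 2 && output[0] == bos_id && output[1] == bos_id;
    }

    int main() {
        const llama_token bos = 1;                                         // hypothetical id
        const std::vector<llama_token> tokens = { bos, bos, 15043, 3186 }; // hypothetical prompt tokens
        if (has_double_bos(tokens, bos)) {
            std::fprintf(stderr, "warning: prompt starts with 2 BOS tokens\n");
        }
        return 0;
    }
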
@@ -12719,7 +12852,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12719
12852
  }
12720
12853
  }
12721
12854
 
12722
- GGML_ASSERT(vocab.special_add_eos != 1);
12855
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
12856
+ LLAMA_LOG_WARN(
12857
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
12858
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
12859
+ "Are you sure this is what you want?\n", __FUNCTION__);
12860
+ }
12861
+
12862
+ if (add_special && vocab.special_add_eos == 1) {
12863
+ GGML_ASSERT(vocab.special_add_eos != -1);
12864
+ output.push_back(vocab.special_eos_id);
12865
+ }
12723
12866
  } break;
12724
12867
  case LLAMA_VOCAB_TYPE_WPM:
12725
12868
  {
@@ -13073,6 +13216,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
13073
13216
  return rejects;
13074
13217
  }
13075
13218
 
13219
+ static bool llama_grammar_detect_left_recursion(
13220
+ const std::vector<std::vector<llama_grammar_element>> & rules,
13221
+ size_t rule_index,
13222
+ std::vector<bool> * rules_visited,
13223
+ std::vector<bool> * rules_in_progress,
13224
+ std::vector<bool> * rules_may_be_empty) {
13225
+ if ((*rules_in_progress)[rule_index]) {
13226
+ return true;
13227
+ }
13228
+
13229
+ (*rules_in_progress)[rule_index] = true;
13230
+
13231
+ const std::vector<llama_grammar_element> & rule = rules[rule_index];
13232
+
13233
+ // First check if the rule might produce the empty string. This could be done combined with the second
13234
+ // step but it's more readable as two steps.
13235
+ bool at_rule_start = true;
13236
+ for (size_t i = 0; i < rule.size(); i++) {
13237
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
13238
+ if (at_rule_start) {
13239
+ (*rules_may_be_empty)[rule_index] = true;
13240
+ break;
13241
+ }
13242
+ at_rule_start = true;
13243
+ } else {
13244
+ at_rule_start = false;
13245
+ }
13246
+ }
13247
+
13248
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
13249
+ // be empty)
13250
+ bool recurse_into_nonterminal = true;
13251
+ for (size_t i = 0; i < rule.size(); i++) {
13252
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
13253
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
13254
+ return true;
13255
+ }
13256
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
13257
+ recurse_into_nonterminal = false;
13258
+ }
13259
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
13260
+ recurse_into_nonterminal = true;
13261
+ } else {
13262
+ recurse_into_nonterminal = false;
13263
+ }
13264
+ }
13265
+
13266
+ (*rules_in_progress)[rule_index] = false;
13267
+ (*rules_visited)[rule_index] = true;
13268
+ return false;
13269
+ }
13270
+
13076
13271
  //
13077
13272
  // grammar - external
13078
13273
  //
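
The llama_grammar_detect_left_recursion function added above is essentially a cycle search over the relation "rule X may appear with rule Y as its leftmost symbol", with extra bookkeeping for rules that can produce the empty string. A reduced sketch that precomputes the leftmost references and omits the may-be-empty refinement:

    #include <iostream>
    #include <vector>

    // Depth-first search: a rule reached again while still "in progress" closes a cycle,
    // i.e. the grammar is left-recursive.
    static bool dfs(int r, const std::vector<std::vector<int>> & left_refs,
                    std::vector<int> & state /* 0 = unvisited, 1 = in progress, 2 = done */) {
        if (state[r] == 1) return true;
        if (state[r] == 2) return false;
        state[r] = 1;
        for (int next : left_refs[r]) {
            if (dfs(next, left_refs, state)) return true;
        }
        state[r] = 2;
        return false;
    }

    int main() {
        // rule 0: expr ::= expr "+" term | term   (leftmost references: expr, term)
        // rule 1: term ::= "x"                    (no leftmost rule references)
        const std::vector<std::vector<int>> left_refs = { {0, 1}, {} };
        std::vector<int> state(left_refs.size(), 0);
        std::cout << (dfs(0, left_refs, state) ? "left recursion detected" : "ok") << "\n";
        return 0;
    }

Such a grammar would have to be rewritten without left recursion (for example with repetition) before llama_grammar_init accepts it.
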
@@ -13092,6 +13287,19 @@ struct llama_grammar * llama_grammar_init(
13092
13287
  vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
13093
13288
  }
13094
13289
 
13290
+ // Check for left recursion
13291
+ std::vector<bool> rules_visited(n_rules);
13292
+ std::vector<bool> rules_in_progress(n_rules);
13293
+ std::vector<bool> rules_may_be_empty(n_rules);
13294
+ for (size_t i = 0; i < n_rules; i++) {
13295
+ if (rules_visited[i]) {
13296
+ continue;
13297
+ }
13298
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
13299
+ throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
13300
+ }
13301
+ }
13302
+
13095
13303
  // loop over alternates of start rule to build initial stacks
13096
13304
  std::vector<std::vector<const llama_grammar_element *>> stacks;
13097
13305
  pos = vec_rules[start_rule_index].data();
@@ -13114,6 +13322,9 @@ struct llama_grammar * llama_grammar_init(
13114
13322
  }
13115
13323
  } while (true);
13116
13324
 
13325
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
13326
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
13327
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
13117
13328
  return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
13118
13329
  }
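
The comment added above relies on a property of std::vector worth spelling out: moving a vector transfers its buffer, so pointers to its elements remain valid, whereas copying allocates a new buffer. A self-contained illustration:

    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<int> original = {1, 2, 3};
        const int * p = original.data();               // pointer into the original buffer

        std::vector<int> copied = original;            // copy: a new buffer is allocated
        std::vector<int> moved  = std::move(original); // move: the original buffer is adopted

        std::cout << "copy keeps the buffer: " << (copied.data() == p) << "\n"; // 0
        std::cout << "move keeps the buffer: " << (moved.data()  == p) << "\n"; // 1
        return 0;
    }
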
13119
13330
 
@@ -13708,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
13708
13919
 
13709
13920
  // Sample the next word X using top-k sampling
13710
13921
  llama_sample_top_k(nullptr, candidates, int(k), 1);
13711
- if (ctx) {
13712
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13713
- }
13922
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13714
13923
  llama_token X = llama_sample_token(ctx, candidates);
13715
13924
  t_start_sample_us = ggml_time_us();
13716
13925
 
@@ -13724,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
13724
13933
  // Update mu using the learning rate and error
13725
13934
  *mu = *mu - eta * e;
13726
13935
 
13727
- if (ctx) {
13728
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13729
- }
13936
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13730
13937
  return X;
13731
13938
  }
13732
13939
 
@@ -14142,13 +14349,16 @@ static void llama_tensor_dequantize_internal(
14142
14349
  if (qtype.to_float == NULL) {
14143
14350
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
14144
14351
  }
14145
- } else if (tensor->type != GGML_TYPE_F16) {
14352
+ } else if (tensor->type != GGML_TYPE_F16 &&
14353
+ tensor->type != GGML_TYPE_BF16) {
14146
14354
  throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
14147
14355
  }
14148
14356
 
14149
14357
  if (nthread < 2) {
14150
14358
  if (tensor->type == GGML_TYPE_F16) {
14151
14359
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
14360
+ } else if (tensor->type == GGML_TYPE_BF16) {
14361
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
14152
14362
  } else if (ggml_is_quantized(tensor->type)) {
14153
14363
  qtype.to_float(tensor->data, f32_output, nelements);
14154
14364
  } else {
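
The BF16 support added above is cheap because bfloat16 is the upper 16 bits of an IEEE-754 binary32 (same sign and 8-bit exponent, truncated mantissa), so each value converts to FP32 by shifting it into the high half of a 32-bit word. A standalone sketch, independent of ggml_bf16_to_fp32_row:

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Place the 16 bfloat16 bits in the upper half of a 32-bit pattern and reinterpret.
    static float bf16_to_f32(uint16_t h) {
        const uint32_t bits = (uint32_t) h << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main() {
        // 0x3F80 is 1.0f truncated to bfloat16; 0xC048 is -3.125f.
        std::cout << bf16_to_f32(0x3F80) << " " << bf16_to_f32(0xC048) << "\n";
        return 0;
    }
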
@@ -14157,7 +14367,14 @@ static void llama_tensor_dequantize_internal(
14157
14367
  return;
14158
14368
  }
14159
14369
 
14160
- size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
14370
+ size_t block_size;
14371
+ if (tensor->type == GGML_TYPE_F16 ||
14372
+ tensor->type == GGML_TYPE_BF16) {
14373
+ block_size = 1;
14374
+ } else {
14375
+ block_size = (size_t)ggml_blck_size(tensor->type);
14376
+ }
14377
+
14161
14378
  size_t block_size_bytes = ggml_type_size(tensor->type);
14162
14379
 
14163
14380
  GGML_ASSERT(nelements % block_size == 0);
@@ -14176,6 +14393,8 @@ static void llama_tensor_dequantize_internal(
14176
14393
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
14177
14394
  if (typ == GGML_TYPE_F16) {
14178
14395
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
14396
+ } else if (typ == GGML_TYPE_BF16) {
14397
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
14179
14398
  } else {
14180
14399
  qtype.to_float(inbuf, outbuf, nels);
14181
14400
  }
@@ -14536,6 +14755,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
14536
14755
  case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
14537
14756
  case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
14538
14757
  case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
14758
+ case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
14539
14759
  case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
14540
14760
 
14541
14761
  // K-quants
@@ -15200,6 +15420,7 @@ struct llama_model_params llama_model_default_params() {
15200
15420
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
15201
15421
  /*.main_gpu =*/ 0,
15202
15422
  /*.tensor_split =*/ nullptr,
15423
+ /*.rpc_servers =*/ nullptr,
15203
15424
  /*.progress_callback =*/ nullptr,
15204
15425
  /*.progress_callback_user_data =*/ nullptr,
15205
15426
  /*.kv_overrides =*/ nullptr,
@@ -15270,7 +15491,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
15270
15491
  }
15271
15492
 
15272
15493
  size_t llama_max_devices(void) {
15273
- #if defined(GGML_USE_METAL)
15494
+ #if defined(GGML_USE_RPC)
15495
+ return GGML_RPC_MAX_SERVERS;
15496
+ #elif defined(GGML_USE_METAL)
15274
15497
  return 1;
15275
15498
  #elif defined(GGML_USE_CUDA)
15276
15499
  return GGML_CUDA_MAX_DEVICES;
@@ -15293,7 +15516,7 @@ bool llama_supports_mlock(void) {
15293
15516
 
15294
15517
  bool llama_supports_gpu_offload(void) {
15295
15518
  #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
15296
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
15519
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
15297
15520
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
15298
15521
  return true;
15299
15522
  #else
@@ -15356,7 +15579,17 @@ struct llama_model * llama_load_model_from_file(
15356
15579
  return true;
15357
15580
  };
15358
15581
  }
15359
-
15582
+ if (params.rpc_servers != nullptr) {
15583
+ // split the servers set them into model->rpc_servers
15584
+ std::string servers(params.rpc_servers);
15585
+ size_t pos = 0;
15586
+ while ((pos = servers.find(",")) != std::string::npos) {
15587
+ std::string server = servers.substr(0, pos);
15588
+ model->rpc_servers.push_back(server);
15589
+ servers.erase(0, pos + 1);
15590
+ }
15591
+ model->rpc_servers.push_back(servers);
15592
+ }
15360
15593
  int status = llama_model_load(path_model, *model, params);
15361
15594
  GGML_ASSERT(status <= 0);
15362
15595
  if (status < 0) {
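
The loop above splits the comma-separated params.rpc_servers string with find/substr/erase. An equivalent standalone split using std::getline, which behaves the same for well-formed lists (the endpoints below are made up):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Split "host:port,host:port,..." into individual server strings.
    static std::vector<std::string> split_servers(const std::string & csv) {
        std::vector<std::string> out;
        std::stringstream ss(csv);
        std::string item;
        while (std::getline(ss, item, ',')) {
            out.push_back(item);
        }
        return out;
    }

    int main() {
        for (const auto & s : split_servers("192.168.1.10:50052,192.168.1.11:50052")) {
            std::cout << s << "\n";
        }
        return 0;
    }
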
@@ -15395,6 +15628,11 @@ struct llama_context * llama_new_context_with_model(
15395
15628
  return nullptr;
15396
15629
  }
15397
15630
 
15631
+ if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
15632
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15633
+ params.flash_attn = false;
15634
+ }
15635
+
15398
15636
  llama_context * ctx = new llama_context(*model);
15399
15637
 
15400
15638
  const auto & hparams = model->hparams;
@@ -15418,7 +15656,7 @@ struct llama_context * llama_new_context_with_model(
15418
15656
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15419
15657
 
15420
15658
  // this is necessary due to kv_self.n being padded later during inference
15421
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15659
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
15422
15660
 
15423
15661
  // with causal attention, the batch size is limited by the context size
15424
15662
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
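
The padding above rounds n_ctx up to a multiple of the KV-cache padding (previously a hard-coded 256). The usual round-up-to-a-multiple idiom, written here with integer division as a standalone illustration rather than GGML_PAD's own definition:

    #include <cstdint>
    #include <iostream>

    // Round x up to the next multiple of n (n > 0).
    static uint32_t pad_to(uint32_t x, uint32_t n) {
        return ((x + n - 1) / n) * n;
    }

    int main() {
        std::cout << pad_to(4000, 256) << "\n"; // 4096
        std::cout << pad_to(4096, 256) << "\n"; // 4096
        return 0;
    }
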
@@ -15463,23 +15701,6 @@ struct llama_context * llama_new_context_with_model(
15463
15701
  }
15464
15702
  }
15465
15703
 
15466
- if (cparams.flash_attn && hparams.use_alibi) {
15467
- LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
15468
- cparams.flash_attn = false;
15469
- }
15470
-
15471
- if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
15472
- LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15473
- cparams.flash_attn = false;
15474
- }
15475
-
15476
- #ifdef GGML_USE_HIPBLAS
15477
- if (cparams.flash_attn) {
15478
- LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
15479
- cparams.flash_attn = false;
15480
- }
15481
- #endif
15482
-
15483
15704
  if (params.seed == LLAMA_DEFAULT_SEED) {
15484
15705
  params.seed = time(NULL);
15485
15706
  }
@@ -15515,7 +15736,17 @@ struct llama_context * llama_new_context_with_model(
15515
15736
 
15516
15737
  if (!hparams.vocab_only) {
15517
15738
  // initialize backends
15518
- #ifdef GGML_USE_METAL
15739
+ #if defined(GGML_USE_RPC)
15740
+ for (auto & server : model->rpc_servers) {
15741
+ ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
15742
+ if (backend == nullptr) {
15743
+ LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
15744
+ llama_free(ctx);
15745
+ return nullptr;
15746
+ }
15747
+ ctx->backends.push_back(backend);
15748
+ }
15749
+ #elif defined(GGML_USE_METAL)
15519
15750
  if (model->n_gpu_layers > 0) {
15520
15751
  ctx->backend_metal = ggml_backend_metal_init();
15521
15752
  if (ctx->backend_metal == nullptr) {
@@ -15671,7 +15902,11 @@ struct llama_context * llama_new_context_with_model(
15671
15902
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
15672
15903
 
15673
15904
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
15674
- bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
15905
+ bool pipeline_parallel =
15906
+ llama_get_device_count(*model) > 1 &&
15907
+ model->n_gpu_layers > (int)model->hparams.n_layer &&
15908
+ model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
15909
+ params.offload_kqv;
15675
15910
  #ifndef GGML_USE_CUDA
15676
15911
  // pipeline parallelism requires support for async compute and events
15677
15912
  // currently this is only implemented in the CUDA backend
@@ -15769,6 +16004,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15769
16004
  case LLM_ARCH_REFACT:
15770
16005
  case LLM_ARCH_BLOOM:
15771
16006
  case LLM_ARCH_MAMBA:
16007
+ case LLM_ARCH_JINA_BERT_V2:
15772
16008
  return LLAMA_ROPE_TYPE_NONE;
15773
16009
 
15774
16010
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -16790,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16790
17026
  }
16791
17027
  else {
16792
17028
  if (cell_range_begin != kv_self.size) {
16793
- cell_ranges.push_back({ cell_range_begin, i });
17029
+ cell_ranges.emplace_back(cell_range_begin, i);
16794
17030
  cell_range_begin = kv_self.size;
16795
17031
  }
16796
17032
  }
16797
17033
  }
16798
17034
  if (cell_range_begin != kv_self.size) {
16799
- cell_ranges.push_back({ cell_range_begin, kv_self.size });
17035
+ cell_ranges.emplace_back(cell_range_begin, kv_self.size);
16800
17036
  }
16801
17037
 
16802
17038
  // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
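
The push_back to emplace_back change above constructs each (begin, end) pair in place from its arguments instead of building a temporary pair first. A minimal illustration with made-up ranges:

    #include <iostream>
    #include <vector>

    int main() {
        std::vector<std::pair<size_t, size_t>> ranges;

        // push_back({a, b}) first builds a temporary pair, then moves it in;
        // emplace_back(a, b) forwards the arguments and constructs the pair in place.
        ranges.push_back({0, 4});
        ranges.emplace_back(7, 9);

        for (const auto & r : ranges) {
            std::cout << "[" << r.first << ", " << r.second << ")\n";
        }
        return 0;
    }
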
@@ -17466,9 +17702,10 @@ int32_t llama_tokenize(
17466
17702
 
17467
17703
  static std::string llama_decode_text(const std::string & text) {
17468
17704
  std::string decoded_text;
17469
- auto unicode_sequences = unicode_cpts_from_utf8(text);
17470
- for (auto & unicode_sequence : unicode_sequences) {
17471
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
17705
+
17706
+ const auto cpts = unicode_cpts_from_utf8(text);
17707
+ for (const auto cpt : cpts) {
17708
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
17472
17709
  }
17473
17710
 
17474
17711
  return decoded_text;
@@ -17832,7 +18069,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
17832
18069
  /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
17833
18070
 
17834
18071
  /*.n_sample =*/ std::max(1, ctx->n_sample),
17835
- /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
18072
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
17836
18073
  /*.n_eval =*/ std::max(1, ctx->n_eval),
17837
18074
  };
17838
18075