llama_cpp 0.15.0 → 0.15.2

@@ -7,6 +7,10 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

+ #ifdef GGML_USE_RPC
+ # include "ggml-rpc.h"
+ #endif
+
  #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
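With GGML_USE_RPC enabled, later hunks in this diff treat every entry of model.rpc_servers as one offload device (llama_get_device_count returns model.rpc_servers.size() and buffer types are resolved per endpoint). How the endpoint list gets populated is not part of this excerpt; as a rough sketch under that assumption, a comma-separated list such as "host1:50052,host2:50052" could be split like this:

```cpp
// Hypothetical helper, not taken from llama.cpp: turn a comma-separated endpoint
// string into the per-device list that model.rpc_servers holds in this diff.
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> split_rpc_servers(const std::string & csv) {
    std::vector<std::string> endpoints;
    std::stringstream ss(csv);
    for (std::string item; std::getline(ss, item, ',');) {
        if (!item.empty()) {
            endpoints.push_back(item); // each endpoint is treated as one device
        }
    }
    return endpoints;
}
```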
@@ -205,6 +209,7 @@ enum llm_arch {
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_JINA_BERT_V2,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -228,39 +233,40 @@ enum llm_arch {
228
233
  };
229
234
 
230
235
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
231
- { LLM_ARCH_LLAMA, "llama" },
232
- { LLM_ARCH_FALCON, "falcon" },
233
- { LLM_ARCH_GROK, "grok" },
234
- { LLM_ARCH_GPT2, "gpt2" },
235
- { LLM_ARCH_GPTJ, "gptj" },
236
- { LLM_ARCH_GPTNEOX, "gptneox" },
237
- { LLM_ARCH_MPT, "mpt" },
238
- { LLM_ARCH_BAICHUAN, "baichuan" },
239
- { LLM_ARCH_STARCODER, "starcoder" },
240
- { LLM_ARCH_PERSIMMON, "persimmon" },
241
- { LLM_ARCH_REFACT, "refact" },
242
- { LLM_ARCH_BERT, "bert" },
243
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
244
- { LLM_ARCH_BLOOM, "bloom" },
245
- { LLM_ARCH_STABLELM, "stablelm" },
246
- { LLM_ARCH_QWEN, "qwen" },
247
- { LLM_ARCH_QWEN2, "qwen2" },
248
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
249
- { LLM_ARCH_PHI2, "phi2" },
250
- { LLM_ARCH_PHI3, "phi3" },
251
- { LLM_ARCH_PLAMO, "plamo" },
252
- { LLM_ARCH_CODESHELL, "codeshell" },
253
- { LLM_ARCH_ORION, "orion" },
254
- { LLM_ARCH_INTERNLM2, "internlm2" },
255
- { LLM_ARCH_MINICPM, "minicpm" },
256
- { LLM_ARCH_GEMMA, "gemma" },
257
- { LLM_ARCH_STARCODER2, "starcoder2" },
258
- { LLM_ARCH_MAMBA, "mamba" },
259
- { LLM_ARCH_XVERSE, "xverse" },
260
- { LLM_ARCH_COMMAND_R, "command-r" },
261
- { LLM_ARCH_DBRX, "dbrx" },
262
- { LLM_ARCH_OLMO, "olmo" },
263
- { LLM_ARCH_UNKNOWN, "(unknown)" },
236
+ { LLM_ARCH_LLAMA, "llama" },
237
+ { LLM_ARCH_FALCON, "falcon" },
238
+ { LLM_ARCH_GROK, "grok" },
239
+ { LLM_ARCH_GPT2, "gpt2" },
240
+ { LLM_ARCH_GPTJ, "gptj" },
241
+ { LLM_ARCH_GPTNEOX, "gptneox" },
242
+ { LLM_ARCH_MPT, "mpt" },
243
+ { LLM_ARCH_BAICHUAN, "baichuan" },
244
+ { LLM_ARCH_STARCODER, "starcoder" },
245
+ { LLM_ARCH_PERSIMMON, "persimmon" },
246
+ { LLM_ARCH_REFACT, "refact" },
247
+ { LLM_ARCH_BERT, "bert" },
248
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
249
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
250
+ { LLM_ARCH_BLOOM, "bloom" },
251
+ { LLM_ARCH_STABLELM, "stablelm" },
252
+ { LLM_ARCH_QWEN, "qwen" },
253
+ { LLM_ARCH_QWEN2, "qwen2" },
254
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
255
+ { LLM_ARCH_PHI2, "phi2" },
256
+ { LLM_ARCH_PHI3, "phi3" },
257
+ { LLM_ARCH_PLAMO, "plamo" },
258
+ { LLM_ARCH_CODESHELL, "codeshell" },
259
+ { LLM_ARCH_ORION, "orion" },
260
+ { LLM_ARCH_INTERNLM2, "internlm2" },
261
+ { LLM_ARCH_MINICPM, "minicpm" },
262
+ { LLM_ARCH_GEMMA, "gemma" },
263
+ { LLM_ARCH_STARCODER2, "starcoder2" },
264
+ { LLM_ARCH_MAMBA, "mamba" },
265
+ { LLM_ARCH_XVERSE, "xverse" },
266
+ { LLM_ARCH_COMMAND_R, "command-r" },
267
+ { LLM_ARCH_DBRX, "dbrx" },
268
+ { LLM_ARCH_OLMO, "olmo" },
269
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
264
270
  };
265
271
 
266
272
  enum llm_kv {
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
691
697
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
692
698
  },
693
699
  },
700
+ {
701
+ LLM_ARCH_JINA_BERT_V2,
702
+ {
703
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
704
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
705
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
706
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
707
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
708
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
709
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
710
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
711
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
712
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
713
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
714
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
715
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
716
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
717
+ },
718
+ },
694
719
  {
695
720
  LLM_ARCH_BLOOM,
696
721
  {
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
1664
1689
  GGML_UNUSED(host_buffer);
1665
1690
  }
1666
1691
 
1667
- static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
1668
- ggml_backend_buffer_type_t buft = nullptr;
1669
-
1670
- #ifdef GGML_USE_METAL
1671
- buft = ggml_backend_metal_buffer_type();
1672
- #elif defined(GGML_USE_CUDA)
1673
- buft = ggml_backend_cuda_buffer_type(gpu);
1674
- #elif defined(GGML_USE_VULKAN)
1675
- buft = ggml_backend_vk_buffer_type(gpu);
1676
- #elif defined(GGML_USE_SYCL)
1677
- buft = ggml_backend_sycl_buffer_type(gpu);
1678
- #elif defined(GGML_USE_CLBLAST)
1679
- buft = ggml_backend_opencl_buffer_type();
1680
- #elif defined(GGML_USE_KOMPUTE)
1681
- buft = ggml_backend_kompute_buffer_type(gpu);
1682
- if (buft == nullptr) {
1683
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
1684
- }
1685
- #endif
1686
-
1687
- if (buft == nullptr) {
1688
- buft = llama_default_buffer_type_cpu(true);
1689
- }
1690
- return buft;
1691
-
1692
- GGML_UNUSED(gpu);
1693
- }
1694
-
1695
- static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
1696
- ggml_backend_buffer_type_t buft = nullptr;
1697
-
1698
- #ifdef GGML_USE_CUDA
1699
- if (ggml_backend_cuda_get_device_count() > 1) {
1700
- buft = ggml_backend_cuda_split_buffer_type(tensor_split);
1701
- }
1702
- #endif
1703
-
1704
- #ifdef GGML_USE_SYCL
1705
- if (ggml_backend_sycl_get_device_count() > 1) {
1706
- buft = ggml_backend_sycl_split_buffer_type(tensor_split);
1707
- }
1708
- #endif
1709
-
1710
- if (buft == nullptr) {
1711
- buft = llama_default_buffer_type_offload(fallback_gpu);
1712
- }
1713
- return buft;
1714
-
1715
- GGML_UNUSED(tensor_split);
1716
- }
1717
-
1718
- static size_t llama_get_device_count() {
1719
- #if defined(GGML_USE_CUDA)
1720
- return ggml_backend_cuda_get_device_count();
1721
- #elif defined(GGML_USE_SYCL)
1722
- return ggml_backend_sycl_get_device_count();
1723
- #elif defined(GGML_USE_VULKAN)
1724
- return ggml_backend_vk_get_device_count();
1725
- #else
1726
- return 1;
1727
- #endif
1728
- }
1729
-
1730
- static size_t llama_get_device_memory(int device) {
1731
- #if defined(GGML_USE_CUDA)
1732
- size_t total;
1733
- size_t free;
1734
- ggml_backend_cuda_get_device_memory(device, &free, &total);
1735
- return free;
1736
- #elif defined(GGML_USE_SYCL)
1737
- size_t total;
1738
- size_t free;
1739
- ggml_backend_sycl_get_device_memory(device, &free, &total);
1740
- return free;
1741
- #elif defined(GGML_USE_VULKAN)
1742
- size_t total;
1743
- size_t free;
1744
- ggml_backend_vk_get_device_memory(device, &free, &total);
1745
- return free;
1746
- #else
1747
- return 1;
1748
- GGML_UNUSED(device);
1749
- #endif
1750
- }
1751
-
1752
1692
  //
1753
1693
  // globals
1754
1694
  //
@@ -1845,7 +1785,7 @@ struct llama_hparams {
  float f_logit_scale = 0.0f;

  bool causal_attn = true;
- bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
+ bool use_alibi = false;

  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -2189,6 +2129,8 @@ struct llama_model {
  int main_gpu;
  int n_gpu_layers;

+ std::vector<std::string> rpc_servers;
+
  // gguf metadata
  std::unordered_map<std::string, std::string> gguf_kv;

@@ -2317,7 +2259,6 @@ struct llama_context {
  struct ggml_tensor * inp_pos; // I32 [n_batch]
  struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2333,6 +2274,104 @@ struct llama_context {
2333
2274
  #endif
2334
2275
  };
2335
2276
 
2277
+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
2278
+ ggml_backend_buffer_type_t buft = nullptr;
2279
+
2280
+ #ifdef GGML_USE_RPC
2281
+ std::string endpoint = model.rpc_servers[gpu];
2282
+ buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
2283
+ #elif defined(GGML_USE_METAL)
2284
+ buft = ggml_backend_metal_buffer_type();
2285
+ #elif defined(GGML_USE_CUDA)
2286
+ buft = ggml_backend_cuda_buffer_type(gpu);
2287
+ #elif defined(GGML_USE_VULKAN)
2288
+ buft = ggml_backend_vk_buffer_type(gpu);
2289
+ #elif defined(GGML_USE_SYCL)
2290
+ buft = ggml_backend_sycl_buffer_type(gpu);
2291
+ #elif defined(GGML_USE_CLBLAST)
2292
+ buft = ggml_backend_opencl_buffer_type();
2293
+ #elif defined(GGML_USE_KOMPUTE)
2294
+ buft = ggml_backend_kompute_buffer_type(gpu);
2295
+ if (buft == nullptr) {
2296
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
2297
+ }
2298
+ #endif
2299
+
2300
+ if (buft == nullptr) {
2301
+ buft = llama_default_buffer_type_cpu(true);
2302
+ }
2303
+ return buft;
2304
+ GGML_UNUSED(model);
2305
+ GGML_UNUSED(gpu);
2306
+ }
2307
+
2308
+ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
2309
+ ggml_backend_buffer_type_t buft = nullptr;
2310
+
2311
+ #ifdef GGML_USE_CUDA
2312
+ if (ggml_backend_cuda_get_device_count() > 1) {
2313
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
2314
+ }
2315
+ #endif
2316
+
2317
+ #ifdef GGML_USE_SYCL
2318
+ if (ggml_backend_sycl_get_device_count() > 1) {
2319
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
2320
+ }
2321
+ #endif
2322
+
2323
+ if (buft == nullptr) {
2324
+ buft = llama_default_buffer_type_offload(model, fallback_gpu);
2325
+ }
2326
+ return buft;
2327
+
2328
+ GGML_UNUSED(tensor_split);
2329
+ }
2330
+
2331
+ static size_t llama_get_device_count(const llama_model & model) {
2332
+ #if defined(GGML_USE_RPC)
2333
+ return model.rpc_servers.size();
2334
+ #elif defined(GGML_USE_CUDA)
2335
+ return ggml_backend_cuda_get_device_count();
2336
+ #elif defined(GGML_USE_SYCL)
2337
+ return ggml_backend_sycl_get_device_count();
2338
+ #elif defined(GGML_USE_VULKAN)
2339
+ return ggml_backend_vk_get_device_count();
2340
+ #else
2341
+ return 1;
2342
+ #endif
2343
+ GGML_UNUSED(model);
2344
+ }
2345
+
2346
+ static size_t llama_get_device_memory(const llama_model & model, int device) {
2347
+ #if defined(GGML_USE_RPC)
2348
+ size_t total;
2349
+ size_t free;
2350
+ std::string endpoint = model.rpc_servers[device];
2351
+ ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
2352
+ return free;
2353
+ #elif defined(GGML_USE_CUDA)
2354
+ size_t total;
2355
+ size_t free;
2356
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
2357
+ return free;
2358
+ #elif defined(GGML_USE_SYCL)
2359
+ size_t total;
2360
+ size_t free;
2361
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
2362
+ return free;
2363
+ #elif defined(GGML_USE_VULKAN)
2364
+ size_t total;
2365
+ size_t free;
2366
+ ggml_backend_vk_get_device_memory(device, &free, &total);
2367
+ return free;
2368
+ #else
2369
+ return 1;
2370
+ #endif
2371
+ GGML_UNUSED(model);
2372
+ GGML_UNUSED(device);
2373
+ }
2374
+
2336
2375
  //
2337
2376
  // kv cache helpers
2338
2377
  //
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
  cache.do_defrag = true;
  }

+ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+ // the FA kernels require padding to avoid extra runtime boundary checks
+ return cparams.flash_attn ? 256u : 32u;
+ }
+
  //
  // model loading and saving
  //
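The new llama_kv_cache_get_padding helper makes the flash-attention padding requirement explicit; a later hunk in llama_decode_internal rounds kv_self.n up to a multiple of it instead of the hard-coded 256. A small self-contained check of that arithmetic (GGML_PAD is reproduced here in an equivalent form; the concrete numbers are made up for illustration):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// round x up to the next multiple of n (equivalent to ggml's GGML_PAD for these values)
static uint32_t pad_up(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; }

int main() {
    const bool     flash_attn = true;
    const uint32_t pad        = flash_attn ? 256u : 32u; // llama_kv_cache_get_padding
    const uint32_t kv_size    = 4096;                    // assumed cache size
    const uint32_t cell_max   = 1000;                    // assumed highest used cell + 1

    const uint32_t kv_n = std::min(kv_size, std::max(pad, pad_up(cell_max, pad)));
    assert(kv_n == 1024); // 1000 rounded up to a multiple of 256

    return 0;
}
```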
@@ -3175,6 +3219,7 @@ struct llama_model_loader {
  switch (type_max) {
  case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
  case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
+ case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
  case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
  case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
  case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3666,6 +3711,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  switch (ftype) {
  case LLAMA_FTYPE_ALL_F32: return "all F32";
  case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
  case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
  case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
  case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -3777,6 +3823,12 @@ static void llm_load_hparams(

  // get hparams kv
  ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+ // everything past this point is not vocab-related
+ if (hparams.vocab_only) {
+ return;
+ }
+
  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3858,7 +3910,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+ case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
@@ -3960,6 +4012,19 @@ static void llm_load_hparams(
  model.type = e_model::MODEL_335M; break; // bge-large
  }
  } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ hparams.f_max_alibi_bias = 8.0f;
+
+ switch (hparams.n_layer) {
+ case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+ case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+ }
+ } break;
  case LLM_ARCH_NOMIC_BERT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4381,8 +4446,27 @@ static void llm_load_vocab(
  tokenizer_pre == "starcoder") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
  } else if (
- tokenizer_pre == "gpt-2") {
+ tokenizer_pre == "gpt-2" ||
+ tokenizer_pre == "jina-es" ||
+ tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "jina-v2-es" ||
+ tokenizer_pre == "jina-v2-de") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "refact") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+ } else if (
+ tokenizer_pre == "command-r") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+ } else if (
+ tokenizer_pre == "qwen2") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else if (
+ tokenizer_pre == "olmo") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+ } else if (
+ tokenizer_pre == "dbrx") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
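The pre-tokenizer is selected from the tokenizer.ggml.pre metadata key, so GGUF conversions that lack it or carry an unknown value hit the runtime_error above. A hedged sketch of how one might inspect that key with the gguf API declared in ggml.h at this version (error handling kept minimal):

```cpp
#include "ggml.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (ctx == nullptr) {
        return 1;
    }
    const int kid = gguf_find_key(ctx, "tokenizer.ggml.pre");
    printf("tokenizer.ggml.pre = %s\n", kid >= 0 ? gguf_get_val_str(ctx, kid) : "(not set)");
    gguf_free(ctx);
    return 0;
}
```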
@@ -4726,13 +4810,13 @@ static bool llm_load_tensors(
4726
4810
 
4727
4811
  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
4728
4812
  // calculate the split points
4729
- int device_count = llama_get_device_count();
4813
+ int device_count = llama_get_device_count(model);
4730
4814
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
4731
4815
  std::vector<float> splits(device_count);
4732
4816
  if (all_zero) {
4733
4817
  // default split, by free memory
4734
4818
  for (int i = 0; i < device_count; ++i) {
4735
- splits[i] = llama_get_device_memory(i);
4819
+ splits[i] = llama_get_device_memory(model, i);
4736
4820
  }
4737
4821
  } else {
4738
4822
  std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4752,35 +4836,35 @@ static bool llm_load_tensors(
4752
4836
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
4753
4837
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
4754
4838
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
4755
- model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
4839
+ model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
4756
4840
  }
4757
4841
  // assign the output layer
4758
4842
  if (n_gpu_layers > n_layer) {
4759
4843
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
4760
- model.buft_output = llama_default_buffer_type_offload(layer_gpu);
4844
+ model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
4761
4845
  } else {
4762
4846
  model.buft_output = llama_default_buffer_type_cpu(true);
4763
4847
  }
4764
4848
  } else {
4765
4849
  ggml_backend_buffer_type_t split_buft;
4766
4850
  if (split_mode == LLAMA_SPLIT_MODE_ROW) {
4767
- split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
4851
+ split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
4768
4852
  } else {
4769
4853
  // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
4770
- split_buft = llama_default_buffer_type_offload(main_gpu);
4854
+ split_buft = llama_default_buffer_type_offload(model, main_gpu);
4771
4855
  }
4772
4856
  // assign the repeating layers
4773
4857
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
4774
4858
  model.buft_layer[i] = {
4775
4859
  split_buft,
4776
- llama_default_buffer_type_offload(main_gpu)
4860
+ llama_default_buffer_type_offload(model, main_gpu)
4777
4861
  };
4778
4862
  }
4779
4863
  // assign the output layer
4780
4864
  if (n_gpu_layers > n_layer) {
4781
4865
  model.buft_output = {
4782
4866
  split_buft,
4783
- llama_default_buffer_type_offload(main_gpu)
4867
+ llama_default_buffer_type_offload(model, main_gpu)
4784
4868
  };
4785
4869
  } else {
4786
4870
  model.buft_output = llama_default_buffer_type_cpu(true);
@@ -5225,6 +5309,50 @@ static bool llm_load_tensors(
5225
5309
  layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5226
5310
  }
5227
5311
  } break;
5312
+ case LLM_ARCH_JINA_BERT_V2:
5313
+ {
5314
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
5315
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
5316
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
5317
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
5318
+
5319
+ for (int i = 0; i < n_layer; ++i) {
5320
+ ggml_context * ctx_layer = ctx_for_layer(i);
5321
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5322
+
5323
+ auto & layer = model.layers[i]; // JinaBertLayer
5324
+
5325
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5326
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5327
+
5328
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
5329
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
5330
+
5331
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5332
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5333
+
5334
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
5335
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
5336
+
5337
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5338
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5339
+
5340
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
5341
+ layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
5342
+
5343
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
5344
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
5345
+
5346
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5347
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5348
+
5349
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5350
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5351
+
5352
+ layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
5353
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5354
+ }
5355
+ } break;
5228
5356
  case LLM_ARCH_BLOOM:
5229
5357
  {
5230
5358
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6120,6 +6248,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
6120
6248
  || !(
6121
6249
  model.ftype == LLAMA_FTYPE_ALL_F32 ||
6122
6250
  model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
6251
+ model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
6123
6252
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
6124
6253
  model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
6125
6254
  )
@@ -6300,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
6300
6429
  llm_ffn_gate_type type_gate,
6301
6430
  const llm_build_cb & cb,
6302
6431
  int il) {
6303
- struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
6432
+ struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
6304
6433
  cb(tmp, "ffn_up", il);
6305
6434
 
6306
6435
  if (up_b) {
@@ -6482,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
6482
6611
  struct ggml_tensor * wo_b,
6483
6612
  struct ggml_tensor * q_cur,
6484
6613
  struct ggml_tensor * kq_mask,
6485
- struct ggml_tensor * kq_pos,
6486
6614
  int32_t n_tokens,
6487
6615
  int32_t n_kv,
6488
6616
  float kq_scale,
@@ -6494,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
6494
6622
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
6495
6623
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6496
6624
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
6625
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6497
6626
 
6498
6627
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
6499
6628
  cb(q, "q", il);
@@ -6512,26 +6641,22 @@ static struct ggml_tensor * llm_build_kqv(
6512
6641
  GGML_UNUSED(model);
6513
6642
  GGML_UNUSED(n_ctx);
6514
6643
 
6515
- // note: if this assert triggers, then some check has failed earlier
6516
- // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6517
- GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6518
-
6519
6644
  // split cached v into n_head heads (not transposed)
6520
6645
  struct ggml_tensor * v =
6521
6646
  ggml_view_3d(ctx, kv.v_l[il],
6522
6647
  n_embd_head_v, n_kv, n_head_kv,
6523
- ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6524
- ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6648
+ ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
6649
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
6525
6650
  0);
6526
6651
  cb(v, "v", il);
6527
6652
 
6528
- cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6653
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6529
6654
 
6530
6655
  if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6531
6656
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6532
6657
  }
6533
6658
 
6534
- cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6659
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
6535
6660
  } else {
6536
6661
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6537
6662
  cb(kq, "kq", il);
@@ -6556,28 +6681,8 @@ static struct ggml_tensor * llm_build_kqv(
6556
6681
  kq = ggml_scale(ctx, kq, 30);
6557
6682
  }
6558
6683
 
6559
- #if defined(GGML_USE_KOMPUTE)
6560
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6561
- #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6562
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6563
- if (hparams.use_alibi) {
6564
- kq = ggml_scale(ctx, kq, kq_scale);
6565
- cb(kq, "kq_scaled", il);
6566
-
6567
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6568
- cb(kq, "kq_scaled_alibi", il);
6569
-
6570
- kq = ggml_add(ctx, kq, kq_mask);
6571
- cb(kq, "kq_masked", il);
6572
-
6573
- kq = ggml_soft_max(ctx, kq);
6574
- cb(kq, "kq_soft_max", il);
6575
- } else
6576
- #endif
6577
- {
6578
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6579
- cb(kq, "kq_soft_max_ext", il);
6580
- }
6684
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6685
+ cb(kq, "kq_soft_max_ext", il);
6581
6686
 
6582
6687
  GGML_ASSERT(kv.size == n_ctx);
6583
6688
 
@@ -6596,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
6596
6701
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6597
6702
  cb(kqv_merged, "kqv_merged", il);
6598
6703
 
6599
- cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6704
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
6600
6705
  cb(cur, "kqv_merged_cont", il);
6601
6706
  }
6602
6707
 
@@ -6627,7 +6732,6 @@ static struct ggml_tensor * llm_build_kv(
6627
6732
  struct ggml_tensor * v_cur,
6628
6733
  struct ggml_tensor * q_cur,
6629
6734
  struct ggml_tensor * kq_mask,
6630
- struct ggml_tensor * kq_pos,
6631
6735
  int32_t n_tokens,
6632
6736
  int32_t kv_head,
6633
6737
  int32_t n_kv,
@@ -6646,7 +6750,7 @@ static struct ggml_tensor * llm_build_kv(
6646
6750
  struct ggml_tensor * cur;
6647
6751
 
6648
6752
  cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6649
- q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6753
+ q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
6650
6754
  cb(cur, "kqv_out", il);
6651
6755
 
6652
6756
  return cur;
@@ -6753,18 +6857,17 @@ struct llm_build_context {
6753
6857
 
6754
6858
  ctx0 = ggml_init(params);
6755
6859
 
6756
- lctx.inp_tokens = nullptr;
6757
- lctx.inp_embd = nullptr;
6758
- lctx.inp_pos = nullptr;
6860
+ lctx.inp_tokens = nullptr;
6861
+ lctx.inp_embd = nullptr;
6862
+ lctx.inp_pos = nullptr;
6759
6863
  lctx.inp_out_ids = nullptr;
6760
6864
  lctx.inp_KQ_mask = nullptr;
6761
- lctx.inp_KQ_pos = nullptr;
6762
6865
  lctx.inp_K_shift = nullptr;
6763
- lctx.inp_mean = nullptr;
6764
- lctx.inp_cls = nullptr;
6765
- lctx.inp_s_copy = nullptr;
6766
- lctx.inp_s_mask = nullptr;
6767
- lctx.inp_s_seq = nullptr;
6866
+ lctx.inp_mean = nullptr;
6867
+ lctx.inp_cls = nullptr;
6868
+ lctx.inp_s_copy = nullptr;
6869
+ lctx.inp_s_mask = nullptr;
6870
+ lctx.inp_s_seq = nullptr;
6768
6871
  }
6769
6872
 
6770
6873
  void free() {
@@ -6914,19 +7017,6 @@ struct llm_build_context {
6914
7017
  return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6915
7018
  }
6916
7019
 
6917
- struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6918
- if (causal) {
6919
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6920
- } else {
6921
- // TODO: this will be needed for ALiBi-based BERT models
6922
- // https://github.com/ggerganov/llama.cpp/pull/6826
6923
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6924
- }
6925
- cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6926
- ggml_set_input(lctx.inp_KQ_pos);
6927
- return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6928
- }
6929
-
6930
7020
  struct ggml_tensor * build_inp_mean() {
6931
7021
  lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6932
7022
  cb(lctx.inp_mean, "inp_mean", -1);
@@ -7032,7 +7122,7 @@ struct llm_build_context {
7032
7122
 
7033
7123
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7034
7124
  model.layers[il].wo, model.layers[il].bo,
7035
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7125
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7036
7126
  }
7037
7127
 
7038
7128
  if (il == n_layer - 1) {
@@ -7125,9 +7215,6 @@ struct llm_build_context {
7125
7215
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7126
7216
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7127
7217
 
7128
- // positions of the tokens in the KV cache
7129
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7130
-
7131
7218
  for (int il = 0; il < n_layer; ++il) {
7132
7219
  struct ggml_tensor * inpSA = inpL;
7133
7220
 
@@ -7172,7 +7259,7 @@ struct llm_build_context {
7172
7259
 
7173
7260
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7174
7261
  model.layers[il].wo, NULL,
7175
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7262
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7176
7263
  }
7177
7264
 
7178
7265
  if (il == n_layer - 1) {
@@ -7242,9 +7329,6 @@ struct llm_build_context {
7242
7329
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7243
7330
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7244
7331
 
7245
- // positions of the tokens in the KV cache
7246
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7247
-
7248
7332
  for (int il = 0; il < n_layer; ++il) {
7249
7333
  struct ggml_tensor * inpSA = inpL;
7250
7334
 
@@ -7279,7 +7363,7 @@ struct llm_build_context {
7279
7363
  cb(Kcur, "Kcur", il);
7280
7364
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7281
7365
  model.layers[il].wo, NULL,
7282
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7366
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7283
7367
  }
7284
7368
 
7285
7369
  if (il == n_layer - 1) {
@@ -7399,7 +7483,7 @@ struct llm_build_context {
7399
7483
 
7400
7484
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7401
7485
  model.layers[il].wo, NULL,
7402
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7486
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7403
7487
  }
7404
7488
 
7405
7489
  if (il == n_layer - 1) {
@@ -7524,7 +7608,7 @@ struct llm_build_context {
7524
7608
 
7525
7609
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7526
7610
  model.layers[il].wo, model.layers[il].bo,
7527
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7611
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7528
7612
  }
7529
7613
 
7530
7614
  if (il == n_layer - 1) {
@@ -7676,7 +7760,7 @@ struct llm_build_context {
7676
7760
 
7677
7761
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7678
7762
  model.layers[il].wo, NULL,
7679
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7763
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7680
7764
  }
7681
7765
 
7682
7766
  if (il == n_layer - 1) {
@@ -7788,7 +7872,7 @@ struct llm_build_context {
7788
7872
 
7789
7873
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7790
7874
  model.layers[il].wo, model.layers[il].bo,
7791
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7875
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7792
7876
  }
7793
7877
 
7794
7878
  if (il == n_layer - 1) {
@@ -7992,7 +8076,7 @@ struct llm_build_context {
7992
8076
 
7993
8077
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7994
8078
  model.layers[il].wo, model.layers[il].bo,
7995
- Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8079
+ Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7996
8080
  }
7997
8081
 
7998
8082
  if (il == n_layer - 1) {
@@ -8058,9 +8142,6 @@ struct llm_build_context {
8058
8142
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8059
8143
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8060
8144
 
8061
- // positions of the tokens in the KV cache
8062
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8063
-
8064
8145
  for (int il = 0; il < n_layer; ++il) {
8065
8146
  struct ggml_tensor * inpSA = inpL;
8066
8147
 
@@ -8088,7 +8169,7 @@ struct llm_build_context {
8088
8169
 
8089
8170
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8090
8171
  model.layers[il].wo, NULL,
8091
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8172
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8092
8173
  }
8093
8174
 
8094
8175
  if (il == n_layer - 1) {
@@ -8150,8 +8231,11 @@ struct llm_build_context {
8150
8231
 
8151
8232
  struct ggml_tensor * cur;
8152
8233
  struct ggml_tensor * inpL;
8234
+ struct ggml_tensor * inp_pos = nullptr;
8153
8235
 
8154
- struct ggml_tensor * inp_pos = build_inp_pos();
8236
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
8237
+ inp_pos = build_inp_pos();
8238
+ }
8155
8239
  struct ggml_tensor * inp_mean = build_inp_mean();
8156
8240
  struct ggml_tensor * inp_cls = build_inp_cls();
8157
8241
 
@@ -8182,13 +8266,26 @@ struct llm_build_context {
8182
8266
  struct ggml_tensor * Vcur;
8183
8267
 
8184
8268
  // self-attention
8185
- if (model.arch == LLM_ARCH_BERT) {
8269
+ if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
8186
8270
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
8187
8271
  cb(Qcur, "Qcur", il);
8188
8272
 
8273
+ if (model.layers[il].attn_q_norm) {
8274
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8275
+ model.layers[il].attn_q_norm,
8276
+ model.layers[il].attn_q_norm_b,
8277
+ LLM_NORM, cb, il);
8278
+ }
8279
+
8189
8280
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
8190
8281
  cb(Kcur, "Kcur", il);
8191
8282
 
8283
+ if (model.layers[il].attn_k_norm) {
8284
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8285
+ model.layers[il].attn_k_norm,
8286
+ model.layers[il].attn_k_norm_b,
8287
+ LLM_NORM, cb, il);
8288
+ }
8192
8289
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
8193
8290
  cb(Vcur, "Vcur", il);
8194
8291
 
@@ -8228,7 +8325,7 @@ struct llm_build_context {
8228
8325
  struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
8229
8326
  cb(kq, "kq", il);
8230
8327
 
8231
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8328
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8232
8329
  cb(kq, "kq_soft_max_ext", il);
8233
8330
 
8234
8331
  struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8279,6 +8376,13 @@ struct llm_build_context {
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  NULL,
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
  } else {
  cur = llm_build_ffn(ctx0, cur,
  model.layers[il].ffn_up, NULL,
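For LLM_ARCH_JINA_BERT_V2 the FFN is built with LLM_FFN_GELU and LLM_FFN_PAR, i.e. a parallel gated MLP: down(gelu(gate(x)) * up(x)) + down_bias, with a bias only on the down projection. A standalone sketch of that dataflow (names and shapes here are illustrative; llama.cpp expresses this with ggml_mul_mat/ggml_mul inside llm_build_ffn):

```cpp
#include <cmath>
#include <vector>

// tanh approximation of GELU, as commonly used by ggml
static float gelu(float x) {
    return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
}

// x: n_embd, w_gate/w_up: n_ff x n_embd (row-major), w_down: n_embd x n_ff, b_down: n_embd
std::vector<float> ffn_par_gelu(const std::vector<float> & x,
                                const std::vector<std::vector<float>> & w_gate,
                                const std::vector<std::vector<float>> & w_up,
                                const std::vector<std::vector<float>> & w_down,
                                const std::vector<float> & b_down) {
    const size_t n_ff = w_gate.size(), n_embd = x.size();

    std::vector<float> h(n_ff);
    for (size_t i = 0; i < n_ff; ++i) {
        float g = 0.0f, u = 0.0f;
        for (size_t j = 0; j < n_embd; ++j) { g += w_gate[i][j] * x[j]; u += w_up[i][j] * x[j]; }
        h[i] = gelu(g) * u; // the two parallel branches are multiplied element-wise (LLM_FFN_PAR)
    }

    std::vector<float> y(b_down); // bias only on the down projection, as in the diff
    for (size_t i = 0; i < n_embd; ++i)
        for (size_t j = 0; j < n_ff; ++j) y[i] += w_down[i][j] * h[j];
    return y;
}
```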
@@ -8345,9 +8449,6 @@ struct llm_build_context {
8345
8449
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8346
8450
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8347
8451
 
8348
- // positions of the tokens in the KV cache
8349
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8350
-
8351
8452
  inpL = llm_build_norm(ctx0, inpL, hparams,
8352
8453
  model.tok_norm,
8353
8454
  model.tok_norm_b,
@@ -8381,7 +8482,7 @@ struct llm_build_context {
8381
8482
 
8382
8483
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8383
8484
  model.layers[il].wo, model.layers[il].bo,
8384
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8485
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8385
8486
  }
8386
8487
 
8387
8488
  if (il == n_layer - 1) {
@@ -8446,9 +8547,6 @@ struct llm_build_context {
8446
8547
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8447
8548
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8448
8549
 
8449
- // positions of the tokens in the KV cache
8450
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8451
-
8452
8550
  if (model.pos_embd) {
8453
8551
  // inp_pos - contains the positions
8454
8552
  struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8512,13 +8610,13 @@ struct llm_build_context {
8512
8610
 
8513
8611
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8514
8612
  model.layers[il].wo, model.layers[il].bo,
8515
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8613
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8516
8614
  } else {
8517
8615
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8518
8616
 
8519
8617
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8520
8618
  model.layers[il].wo, model.layers[il].bo,
8521
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8619
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8522
8620
  }
8523
8621
  }
8524
8622
 
@@ -8662,7 +8760,7 @@ struct llm_build_context {
8662
8760
 
8663
8761
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8664
8762
  model.layers[il].wo, NULL,
8665
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8763
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8666
8764
  }
8667
8765
 
8668
8766
  if (il == n_layer - 1) {
@@ -8780,7 +8878,7 @@ struct llm_build_context {
8780
8878
 
8781
8879
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8782
8880
  model.layers[il].wo, NULL,
8783
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8881
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8784
8882
  }
8785
8883
 
8786
8884
  if (il == n_layer - 1) {
@@ -8893,7 +8991,7 @@ struct llm_build_context {
8893
8991
 
8894
8992
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8895
8993
  model.layers[il].wo, model.layers[il].bo,
8896
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8994
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8897
8995
  }
8898
8996
 
8899
8997
  if (il == n_layer - 1) {
@@ -9007,7 +9105,7 @@ struct llm_build_context {
9007
9105
 
9008
9106
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9009
9107
  model.layers[il].wo, model.layers[il].bo,
9010
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9108
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9011
9109
  }
9012
9110
 
9013
9111
  if (il == n_layer - 1) {
@@ -9162,7 +9260,7 @@ struct llm_build_context {
9162
9260
 
9163
9261
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9164
9262
  model.layers[il].wo, model.layers[il].bo,
9165
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9263
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9166
9264
  }
9167
9265
 
9168
9266
  if (il == n_layer - 1) {
@@ -9279,7 +9377,7 @@ struct llm_build_context {
9279
9377
 
9280
9378
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9281
9379
  model.layers[il].wo, model.layers[il].bo,
9282
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9380
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9283
9381
  }
9284
9382
 
9285
9383
  if (il == n_layer - 1) {
@@ -9392,7 +9490,7 @@ struct llm_build_context {
9392
9490
 
9393
9491
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9394
9492
  model.layers[il].wo, NULL,
9395
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9493
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9396
9494
  }
9397
9495
  struct ggml_tensor * sa_out = cur;
9398
9496
 
@@ -9495,7 +9593,7 @@ struct llm_build_context {
9495
9593
 
9496
9594
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9497
9595
  model.layers[il].wo, model.layers[il].bo,
9498
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9596
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9499
9597
  }
9500
9598
 
9501
9599
  if (il == n_layer - 1) {
@@ -9602,7 +9700,7 @@ struct llm_build_context {
9602
9700
 
9603
9701
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9604
9702
  model.layers[il].wo, model.layers[il].bo,
9605
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9703
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9606
9704
  }
9607
9705
 
9608
9706
  if (il == n_layer - 1) {
@@ -9718,7 +9816,7 @@ struct llm_build_context {
9718
9816
 
9719
9817
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9720
9818
  model.layers[il].wo, NULL,
9721
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9819
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9722
9820
  }
9723
9821
 
9724
9822
  if (il == n_layer - 1) {
@@ -9835,7 +9933,7 @@ struct llm_build_context {
9835
9933
 
9836
9934
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9837
9935
  model.layers[il].wo, model.layers[il].bo,
9838
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9936
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9839
9937
  }
9840
9938
 
9841
9939
  if (il == n_layer - 1) {
@@ -9965,7 +10063,7 @@ struct llm_build_context {
9965
10063
 
9966
10064
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9967
10065
  model.layers[il].wo, model.layers[il].bo,
9968
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10066
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9969
10067
  }
9970
10068
 
9971
10069
  if (il == n_layer - 1) {
@@ -10086,7 +10184,7 @@ struct llm_build_context {
10086
10184
 
10087
10185
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10088
10186
  model.layers[il].wo, NULL,
10089
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10187
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10090
10188
  }
10091
10189
 
10092
10190
  if (il == n_layer - 1) {
@@ -10205,7 +10303,7 @@ struct llm_build_context {
10205
10303
 
10206
10304
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10207
10305
  model.layers[il].wo, model.layers[il].bo,
10208
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10306
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10209
10307
  }
10210
10308
 
10211
10309
  if (il == n_layer - 1) {
@@ -10495,7 +10593,7 @@ struct llm_build_context {
10495
10593
 
10496
10594
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10497
10595
  model.layers[il].wo, model.layers[il].bo,
10498
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10596
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10499
10597
  }
10500
10598
 
10501
10599
  if (il == n_layer - 1) {
@@ -10626,7 +10724,7 @@ struct llm_build_context {
10626
10724
 
10627
10725
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10628
10726
  model.layers[il].wo, nullptr,
10629
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10727
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10630
10728
  }
10631
10729
 
10632
10730
  if (il == n_layer - 1) {
@@ -10807,6 +10905,7 @@ static struct ggml_cgraph * llama_build_graph(
10807
10905
  result = llm.build_refact();
10808
10906
  } break;
10809
10907
  case LLM_ARCH_BERT:
10908
+ case LLM_ARCH_JINA_BERT_V2:
10810
10909
  case LLM_ARCH_NOMIC_BERT:
10811
10910
  {
10812
10911
  result = llm.build_bert();
@@ -11014,11 +11113,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
  f = -INFINITY;
  } else {
- f = 0.0f;
+ if (hparams.use_alibi) {
+ f = -fabs(lctx.kv_self.cells[i].pos - pos);
+ } else {
+ f = 0.0f;
+ }
  }
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
  }
  }
+
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+ for (int j = 0; j < n_kv; ++j) {
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+ }
+ }
  }
  } else {
  // when using kv cache, the mask needs to match the kv cache size
@@ -11037,7 +11146,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  float f = -INFINITY;
  for (int s = 0; s < batch.n_seq_id[i]; ++s) {
  if (batch.seq_id[i][s] == seq_id) {
- f = 0.0f;
+ if (hparams.use_alibi) {
+ f = -fabs(batch.pos[i] - batch.pos[j]);
+ } else {
+ f = 0.0f;
+ }
  break;
  }
  }
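Instead of a separate KQ_pos input, the ALiBi bias is now folded into the KQ mask itself as -|pos_i - pos_j| and scaled per head inside ggml_soft_max_ext / ggml_flash_attn_ext via hparams.f_max_alibi_bias (the next hunk removes the old inp_KQ_pos plumbing). A small sketch of the standard ALiBi slope schedule that this max_bias parameter drives, shown here for illustration rather than lifted from the ggml kernels:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 8;
    const float max_bias = 8.0f; // hparams.f_max_alibi_bias for jina-bert-v2 in this diff

    const int   n_head_log2 = 1 << (int) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -(max_bias)        / n_head_log2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);

    for (int h = 0; h < n_head; ++h) {
        const float slope = h < n_head_log2 ? std::pow(m0, h + 1)
                                            : std::pow(m1, 2*(h - n_head_log2) + 1);
        // a key 5 positions away from the query gets slope * -5 added to its attention logit
        printf("head %d: slope %.6f, bias at distance 5 = %.6f\n", h, slope, slope * -5.0f);
    }
    return 0;
}
```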
@@ -11053,21 +11166,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  }
  }

- // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
- // this allows to process multiple sequences in parallel with ALiBi-based models
- if (hparams.use_alibi) {
- const int64_t n_kv = kv_self.n;
-
- GGML_ASSERT(lctx.inp_KQ_pos);
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
- float * data = (float *) lctx.inp_KQ_pos->data;
-
- for (int i = 0; i < n_kv; ++i) {
- data[i] = float(lctx.kv_self.cells[i].pos);
- }
- }
-
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
  const int64_t n_tokens = batch.n_tokens;

@@ -11437,7 +11535,8 @@ static int llama_decode_internal(
11437
11535
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11438
11536
  // after enough generations, the benefit from this heuristic disappears
11439
11537
  // if we start defragmenting the cache, the benefit from this will be more important
11440
- kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11538
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
11539
+ kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
11441
11540
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11442
11541
  }
11443
11542
  }
@@ -11952,7 +12051,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
11952
12051
  static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
11953
12052
  GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
11954
12053
  GGML_ASSERT(llama_is_byte_token(vocab, id));
11955
- const auto& token_data = vocab.id_to_token.at(id);
12054
+ const auto & token_data = vocab.id_to_token.at(id);
11956
12055
  switch (llama_vocab_get_type(vocab)) {
11957
12056
  case LLAMA_VOCAB_TYPE_SPM: {
11958
12057
  auto buf = token_data.text.substr(3, 2);
@@ -12182,12 +12281,14 @@ struct llm_tokenizer_bpe {
12182
12281
 
12183
12282
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12184
12283
  int final_prev_index = -1;
12284
+ bool ignore_merges = false;
12185
12285
 
12186
12286
  std::vector<std::string> word_collection;
12187
12287
  switch (vocab.type) {
12188
12288
  case LLAMA_VOCAB_TYPE_BPE:
12189
12289
  switch (vocab.type_pre) {
12190
12290
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12291
+ ignore_merges = true;
12191
12292
  word_collection = unicode_regex_split(text, {
12192
12293
  // original regex from tokenizer.json
12193
12294
  //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12196,6 +12297,12 @@ struct llm_tokenizer_bpe {
12196
12297
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12197
12298
  });
12198
12299
  break;
12300
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12301
+ word_collection = unicode_regex_split(text, {
12302
+ // same as llama3
12303
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12304
+ });
12305
+ break;
12199
12306
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12200
12307
  word_collection = unicode_regex_split(text, {
12201
12308
  "[\r\n]",
@@ -12212,14 +12319,13 @@ struct llm_tokenizer_bpe {
12212
12319
  "\\s?\\p{L}+",
12213
12320
  "\\s?\\p{P}+",
12214
12321
  "[一-龥ࠀ-一가-퟿]+",
12215
- "\\p{N}+",
12322
+ "\\p{N}",
12216
12323
  });
12217
12324
  break;
12218
12325
  case LLAMA_VOCAB_PRE_TYPE_FALCON:
12219
12326
  word_collection = unicode_regex_split(text, {
12220
12327
  "[\\p{P}\\$\\+<=>\\^~\\|]+",
12221
12328
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12222
- "\\p{N}+",
12223
12329
  "[0-9][0-9][0-9]",
12224
12330
  });
12225
12331
  break;
@@ -12235,11 +12341,26 @@ struct llm_tokenizer_bpe {
12235
12341
  });
12236
12342
  break;
12237
12343
  case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12344
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
12345
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
12346
+ word_collection = unicode_regex_split(text, {
12347
+ "\\p{N}",
12348
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12349
+ });
12350
+ break;
12238
12351
  case LLAMA_VOCAB_PRE_TYPE_GPT2:
12352
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
12239
12353
  word_collection = unicode_regex_split(text, {
12240
12354
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241
12355
  });
12242
12356
  break;
12357
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12358
+ word_collection = unicode_regex_split(text, {
12359
+ // original regex from tokenizer.json
12360
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
12361
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12362
+ });
12363
+ break;
12243
12364
  default:
12244
12365
  // default regex for BPE tokenization pre-processing
12245
12366
  word_collection = unicode_regex_split(text, {
@@ -12265,6 +12386,11 @@ struct llm_tokenizer_bpe {
12265
12386
  int index = 0;
12266
12387
  size_t offset = 0;
12267
12388
 
12389
+ if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
12390
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
12391
+ offset = word.size();
12392
+ }
12393
+
12268
12394
  while (offset < word.size()) {
12269
12395
  llm_symbol sym;
12270
12396
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
@@ -12450,16 +12576,16 @@ struct llm_tokenizer_wpm {
12450
12576
  // to lowercase, pad chinese characters, pad punctuation
12451
12577
  std::string new_str = "";
12452
12578
  for (uint32_t code : cpts_nfd) {
12453
- int type = unicode_cpt_type(code);
12454
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
12579
+ const codepoint_flags flags = unicode_cpt_flags(code);
12580
+ if (flags.is_accent_mark || flags.is_control) {
12455
12581
  continue;
12456
12582
  }
12457
12583
  code = unicode_tolower(code);
12458
- if (type == CODEPOINT_TYPE_WHITESPACE) {
12584
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12459
12585
  code = ' ';
12460
12586
  }
12461
12587
  std::string s = unicode_cpt_to_utf8(code);
12462
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
12588
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12463
12589
  new_str += " ";
12464
12590
  new_str += s;
12465
12591
  new_str += " ";
@@ -12693,6 +12819,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  }
  }
 
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
  if (add_special && vocab.special_add_eos == 1) {
  GGML_ASSERT(vocab.special_eos_id != -1);
  output.push_back(vocab.special_eos_id);
@@ -12719,7 +12852,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
  }
  }
 
- GGML_ASSERT(vocab.special_add_eos != 1);
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && vocab.special_add_eos == 1) {
+ GGML_ASSERT(vocab.special_add_eos != -1);
+ output.push_back(vocab.special_eos_id);
+ }
  } break;
  case LLAMA_VOCAB_TYPE_WPM:
  {
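Both tokenizer branches above now warn when the model requests an automatic BOS token and the prompt text already begins with one, which would otherwise silently produce a prompt starting with two BOS tokens. A small standalone version of that check; the token id and flags are placeholders, not real model values:

#include <cstdio>
#include <vector>

constexpr int BOS_ID = 1;  // hypothetical id for the example

// After tokenization with add_special, output[0] is the BOS that was inserted
// automatically; if output[1] is also BOS, the prompt itself started with one.
static bool has_double_bos(const std::vector<int> & output, bool add_special, bool model_adds_bos) {
    return add_special && model_adds_bos && output.size() >= 2 &&
           output[0] == BOS_ID && output[1] == BOS_ID;
}

int main() {
    const std::vector<int> tokens = { BOS_ID, BOS_ID, 123, 456 };
    if (has_double_bos(tokens, /*add_special=*/true, /*model_adds_bos=*/true)) {
        std::printf("warning: prompt starts with two BOS tokens\n");
    }
    return 0;
}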
@@ -13073,6 +13216,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
  return rejects;
  }
 
+ static bool llama_grammar_detect_left_recursion(
+ const std::vector<std::vector<llama_grammar_element>> & rules,
+ size_t rule_index,
+ std::vector<bool> * rules_visited,
+ std::vector<bool> * rules_in_progress,
+ std::vector<bool> * rules_may_be_empty) {
+ if ((*rules_in_progress)[rule_index]) {
+ return true;
+ }
+
+ (*rules_in_progress)[rule_index] = true;
+
+ const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+ // First check if the rule might produce the empty string. This could be done combined with the second
+ // step but it's more readable as two steps.
+ bool at_rule_start = true;
+ for (size_t i = 0; i < rule.size(); i++) {
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
+ if (at_rule_start) {
+ (*rules_may_be_empty)[rule_index] = true;
+ break;
+ }
+ at_rule_start = true;
+ } else {
+ at_rule_start = false;
+ }
+ }
+
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+ // be empty)
+ bool recurse_into_nonterminal = true;
+ for (size_t i = 0; i < rule.size(); i++) {
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+ return true;
+ }
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+ recurse_into_nonterminal = false;
+ }
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+ recurse_into_nonterminal = true;
+ } else {
+ recurse_into_nonterminal = false;
+ }
+ }
+
+ (*rules_in_progress)[rule_index] = false;
+ (*rules_visited)[rule_index] = true;
+ return false;
+ }
+
  //
  // grammar - external
  //
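llama_grammar_detect_left_recursion is a depth-first search with an "in progress" set: a rule is left-recursive if, while expanding its leftmost symbols (skipping over nonterminals that may derive the empty string), the search reaches a rule it is still in the middle of expanding. The same idea on a deliberately simplified grammar representation, single-sequence rules with no alternates; all names here are invented for the sketch:

#include <cstdio>
#include <vector>

// Toy grammar: each rule is one sequence of symbols; a symbol is either a
// terminal character or a reference to another rule.
struct sym { bool is_ref; int ref; char term; };

static bool detect_left_recursion(const std::vector<std::vector<sym>> & rules,
                                  size_t r,
                                  std::vector<bool> & in_progress) {
    if (in_progress[r]) return true;          // came back to a rule still being expanded
    in_progress[r] = true;

    // Walk the leading symbols: a rule reference in leftmost position recurses;
    // an empty (nullable) referenced rule lets expansion continue to the next symbol.
    for (const sym & s : rules[r]) {
        if (!s.is_ref) break;                 // a terminal stops leftmost expansion
        if (detect_left_recursion(rules, (size_t) s.ref, in_progress)) return true;
        if (!rules[s.ref].empty()) break;     // non-nullable reference: later symbols are not leftmost
    }

    in_progress[r] = false;
    return false;
}

int main() {
    // rule 0: rule1 'a'    rule 1: rule0 'b'    -> indirect left recursion
    std::vector<std::vector<sym>> rules = {
        { { true, 1, 0 }, { false, 0, 'a' } },
        { { true, 0, 0 }, { false, 0, 'b' } },
    };
    std::vector<bool> in_progress(rules.size(), false);
    std::printf("left recursive: %s\n", detect_left_recursion(rules, 0, in_progress) ? "yes" : "no");
    return 0;
}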
@@ -13092,6 +13287,19 @@ struct llama_grammar * llama_grammar_init(
  vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
  }
 
+ // Check for left recursion
+ std::vector<bool> rules_visited(n_rules);
+ std::vector<bool> rules_in_progress(n_rules);
+ std::vector<bool> rules_may_be_empty(n_rules);
+ for (size_t i = 0; i < n_rules; i++) {
+ if (rules_visited[i]) {
+ continue;
+ }
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+ throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+ }
+ }
+
  // loop over alternates of start rule to build initial stacks
  std::vector<std::vector<const llama_grammar_element *>> stacks;
  pos = vec_rules[start_rule_index].data();
@@ -13114,6 +13322,9 @@ struct llama_grammar * llama_grammar_init(
  }
  } while (true);
 
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
  return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
  }
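The comment added above is about object lifetime: the grammar keeps raw pointers into the rule storage, so the rules must be moved (buffer ownership transferred), not copied, into the returned object. A compact illustration of the difference with a made-up holder type:

#include <cstdio>
#include <utility>
#include <vector>

// Keeps both the rules and raw pointers into them, mirroring how llama_grammar
// stores stacks of element pointers.
struct holder {
    std::vector<std::vector<int>> rules;
    std::vector<const int *>      ptrs;
};

int main() {
    std::vector<std::vector<int>> rules = { { 10, 20, 30 } };
    std::vector<const int *> ptrs = { rules[0].data() };   // pointer into rules' storage

    // Moving transfers the vectors' heap buffers, so ptrs stays valid.
    // Copying instead (holder h{ rules, ptrs };) would duplicate the rule storage,
    // and in a function that returned h the copied ptrs would dangle as soon as the
    // local rules vector was destroyed.
    holder h{ std::move(rules), std::move(ptrs) };

    std::printf("%d\n", *h.ptrs[0]);   // prints 10, still valid after the move
    return 0;
}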
@@ -13708,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
  // Sample the next word X using top-k sampling
  llama_sample_top_k(nullptr, candidates, int(k), 1);
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  llama_token X = llama_sample_token(ctx, candidates);
  t_start_sample_us = ggml_time_us();
 
@@ -13724,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
  // Update mu using the learning rate and error
  *mu = *mu - eta * e;
 
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  return X;
  }
@@ -14142,13 +14349,16 @@ static void llama_tensor_dequantize_internal(
  if (qtype.to_float == NULL) {
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
  }
- } else if (tensor->type != GGML_TYPE_F16) {
+ } else if (tensor->type != GGML_TYPE_F16 &&
+ tensor->type != GGML_TYPE_BF16) {
  throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
  }
 
  if (nthread < 2) {
  if (tensor->type == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+ } else if (tensor->type == GGML_TYPE_BF16) {
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
  } else if (ggml_is_quantized(tensor->type)) {
  qtype.to_float(tensor->data, f32_output, nelements);
  } else {
@@ -14157,7 +14367,14 @@ static void llama_tensor_dequantize_internal(
  return;
  }
 
- size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+ size_t block_size;
+ if (tensor->type == GGML_TYPE_F16 ||
+ tensor->type == GGML_TYPE_BF16) {
+ block_size = 1;
+ } else {
+ block_size = (size_t)ggml_blck_size(tensor->type);
+ }
+
  size_t block_size_bytes = ggml_type_size(tensor->type);
 
  GGML_ASSERT(nelements % block_size == 0);
@@ -14176,6 +14393,8 @@ static void llama_tensor_dequantize_internal(
  auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
  if (typ == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+ } else if (typ == GGML_TYPE_BF16) {
+ ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
  } else {
  qtype.to_float(inbuf, outbuf, nels);
  }
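The BF16 additions above rely on bfloat16 being the upper 16 bits of an IEEE-754 float32 (sign, 8 exponent bits, top 7 mantissa bits), so widening to float is a 16-bit shift into the high half. A minimal row converter written under that assumption; the names are illustrative, not ggml's:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Place the 16 bf16 bits into the high half of a 32-bit pattern and reinterpret.
static float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

static void bf16_to_f32_row(const uint16_t * src, float * dst, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = bf16_to_f32(src[i]);
    }
}

int main() {
    const uint16_t row[3] = { 0x3F80, 0x4000, 0xBF80 };   // 1.0f, 2.0f, -1.0f in bf16
    float out[3];
    bf16_to_f32_row(row, out, 3);
    std::printf("%.1f %.1f %.1f\n", out[0], out[1], out[2]);
    return 0;
}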
@@ -14536,6 +14755,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
  case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
  case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
+ case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
  case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
 
  // K-quants
@@ -15200,6 +15420,7 @@ struct llama_model_params llama_model_default_params() {
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
+ /*.rpc_servers =*/ nullptr,
  /*.progress_callback =*/ nullptr,
  /*.progress_callback_user_data =*/ nullptr,
  /*.kv_overrides =*/ nullptr,
@@ -15270,7 +15491,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  }
 
  size_t llama_max_devices(void) {
- #if defined(GGML_USE_METAL)
+ #if defined(GGML_USE_RPC)
+ return GGML_RPC_MAX_SERVERS;
+ #elif defined(GGML_USE_METAL)
  return 1;
  #elif defined(GGML_USE_CUDA)
  return GGML_CUDA_MAX_DEVICES;
@@ -15293,7 +15516,7 @@ bool llama_supports_mlock(void) {
 
  bool llama_supports_gpu_offload(void) {
  #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  return true;
  #else
@@ -15356,7 +15579,17 @@ struct llama_model * llama_load_model_from_file(
  return true;
  };
  }
-
+ if (params.rpc_servers != nullptr) {
+ // split the servers set them into model->rpc_servers
+ std::string servers(params.rpc_servers);
+ size_t pos = 0;
+ while ((pos = servers.find(",")) != std::string::npos) {
+ std::string server = servers.substr(0, pos);
+ model->rpc_servers.push_back(server);
+ servers.erase(0, pos + 1);
+ }
+ model->rpc_servers.push_back(servers);
+ }
  int status = llama_model_load(path_model, *model, params);
  GGML_ASSERT(status <= 0);
  if (status < 0) {
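The new rpc_servers handling above splits a comma-separated endpoint list into model->rpc_servers, one RPC backend per entry. The same splitting expressed as a standalone helper using std::getline for illustration; the addresses below are placeholders:

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Split "host1:port,host2:port" into its individual endpoints.
static std::vector<std::string> split_servers(const std::string & csv) {
    std::vector<std::string> out;
    std::stringstream ss(csv);
    std::string item;
    while (std::getline(ss, item, ',')) {
        out.push_back(item);
    }
    return out;
}

int main() {
    for (const auto & s : split_servers("192.168.1.10:50052,192.168.1.11:50052")) {
        std::printf("rpc server: %s\n", s.c_str());
    }
    return 0;
}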
@@ -15395,6 +15628,11 @@ struct llama_context * llama_new_context_with_model(
  return nullptr;
  }
 
+ if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+ params.flash_attn = false;
+ }
+
  llama_context * ctx = new llama_context(*model);
 
  const auto & hparams = model->hparams;
@@ -15418,7 +15656,7 @@ struct llama_context * llama_new_context_with_model(
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
  // this is necessary due to kv_self.n being padded later during inference
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
 
  // with causal attention, the batch size is limited by the context size
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
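The context size above is now padded to whatever llama_kv_cache_get_padding returns instead of a hard-coded 256; padding here means rounding n_ctx up to the next multiple of that value. A quick check of the arithmetic using the old constant:

#include <cstdio>

// Round x up to the next multiple of n (equivalent to GGML_PAD when n is a power of two).
static int pad_to_multiple(int x, int n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    std::printf("%d\n", pad_to_multiple(4000, 256));  // 4096
    std::printf("%d\n", pad_to_multiple(4096, 256));  // 4096 (already aligned)
    return 0;
}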
@@ -15463,23 +15701,6 @@ struct llama_context * llama_new_context_with_model(
  }
  }
 
- if (cparams.flash_attn && hparams.use_alibi) {
- LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
- cparams.flash_attn = false;
- }
-
- if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
- LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
- cparams.flash_attn = false;
- }
-
- #ifdef GGML_USE_HIPBLAS
- if (cparams.flash_attn) {
- LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
- cparams.flash_attn = false;
- }
- #endif
-
  if (params.seed == LLAMA_DEFAULT_SEED) {
  params.seed = time(NULL);
  }
@@ -15515,7 +15736,17 @@ struct llama_context * llama_new_context_with_model(
 
  if (!hparams.vocab_only) {
  // initialize backends
- #ifdef GGML_USE_METAL
+ #if defined(GGML_USE_RPC)
+ for (auto & server : model->rpc_servers) {
+ ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
+ #elif defined(GGML_USE_METAL)
  if (model->n_gpu_layers > 0) {
  ctx->backend_metal = ggml_backend_metal_init();
  if (ctx->backend_metal == nullptr) {
@@ -15671,7 +15902,11 @@ struct llama_context * llama_new_context_with_model(
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
- bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
+ bool pipeline_parallel =
+ llama_get_device_count(*model) > 1 &&
+ model->n_gpu_layers > (int)model->hparams.n_layer &&
+ model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+ params.offload_kqv;
  #ifndef GGML_USE_CUDA
  // pipeline parallelism requires support for async compute and events
  // currently this is only implemented in the CUDA backend
@@ -15769,6 +16004,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_REFACT:
  case LLM_ARCH_BLOOM:
  case LLM_ARCH_MAMBA:
+ case LLM_ARCH_JINA_BERT_V2:
  return LLAMA_ROPE_TYPE_NONE;
 
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -16790,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
  }
  else {
  if (cell_range_begin != kv_self.size) {
- cell_ranges.push_back({ cell_range_begin, i });
+ cell_ranges.emplace_back(cell_range_begin, i);
  cell_range_begin = kv_self.size;
  }
  }
  }
  if (cell_range_begin != kv_self.size) {
- cell_ranges.push_back({ cell_range_begin, kv_self.size });
+ cell_ranges.emplace_back(cell_range_begin, kv_self.size);
  }
 
  // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
@@ -17466,9 +17702,10 @@ int32_t llama_tokenize(
 
  static std::string llama_decode_text(const std::string & text) {
  std::string decoded_text;
- auto unicode_sequences = unicode_cpts_from_utf8(text);
- for (auto & unicode_sequence : unicode_sequences) {
- decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
+
+ const auto cpts = unicode_cpts_from_utf8(text);
+ for (const auto cpt : cpts) {
+ decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
  }
 
  return decoded_text;
@@ -17832,7 +18069,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
  /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
 
  /*.n_sample =*/ std::max(1, ctx->n_sample),
- /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
  /*.n_eval =*/ std::max(1, ctx->n_eval),
  };