llama_cpp 0.15.1 → 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,10 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"
 
+ #ifdef GGML_USE_RPC
+ # include "ggml-rpc.h"
+ #endif
+
  #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
@@ -205,6 +209,7 @@ enum llm_arch {
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_JINA_BERT_V2,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -228,39 +233,40 @@ enum llm_arch {
  };
 
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GROK, "grok" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PHI3, "phi3" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
- { LLM_ARCH_GEMMA, "gemma" },
- { LLM_ARCH_STARCODER2, "starcoder2" },
- { LLM_ARCH_MAMBA, "mamba" },
- { LLM_ARCH_XVERSE, "xverse" },
- { LLM_ARCH_COMMAND_R, "command-r" },
- { LLM_ARCH_DBRX, "dbrx" },
- { LLM_ARCH_OLMO, "olmo" },
- { LLM_ARCH_UNKNOWN, "(unknown)" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_PERSIMMON, "persimmon" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
+ { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
 
  enum llm_kv {
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_JINA_BERT_V2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_BLOOM,
  {
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
  GGML_UNUSED(host_buffer);
  }
 
- static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
- ggml_backend_buffer_type_t buft = nullptr;
-
- #ifdef GGML_USE_METAL
- buft = ggml_backend_metal_buffer_type();
- #elif defined(GGML_USE_CUDA)
- buft = ggml_backend_cuda_buffer_type(gpu);
- #elif defined(GGML_USE_VULKAN)
- buft = ggml_backend_vk_buffer_type(gpu);
- #elif defined(GGML_USE_SYCL)
- buft = ggml_backend_sycl_buffer_type(gpu);
- #elif defined(GGML_USE_CLBLAST)
- buft = ggml_backend_opencl_buffer_type();
- #elif defined(GGML_USE_KOMPUTE)
- buft = ggml_backend_kompute_buffer_type(gpu);
- if (buft == nullptr) {
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
- }
- #endif
-
- if (buft == nullptr) {
- buft = llama_default_buffer_type_cpu(true);
- }
- return buft;
-
- GGML_UNUSED(gpu);
- }
-
- static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
- ggml_backend_buffer_type_t buft = nullptr;
-
- #ifdef GGML_USE_CUDA
- if (ggml_backend_cuda_get_device_count() > 1) {
- buft = ggml_backend_cuda_split_buffer_type(tensor_split);
- }
- #endif
-
- #ifdef GGML_USE_SYCL
- if (ggml_backend_sycl_get_device_count() > 1) {
- buft = ggml_backend_sycl_split_buffer_type(tensor_split);
- }
- #endif
-
- if (buft == nullptr) {
- buft = llama_default_buffer_type_offload(fallback_gpu);
- }
- return buft;
-
- GGML_UNUSED(tensor_split);
- }
-
- static size_t llama_get_device_count() {
- #if defined(GGML_USE_CUDA)
- return ggml_backend_cuda_get_device_count();
- #elif defined(GGML_USE_SYCL)
- return ggml_backend_sycl_get_device_count();
- #elif defined(GGML_USE_VULKAN)
- return ggml_backend_vk_get_device_count();
- #else
- return 1;
- #endif
- }
-
- static size_t llama_get_device_memory(int device) {
- #if defined(GGML_USE_CUDA)
- size_t total;
- size_t free;
- ggml_backend_cuda_get_device_memory(device, &free, &total);
- return free;
- #elif defined(GGML_USE_SYCL)
- size_t total;
- size_t free;
- ggml_backend_sycl_get_device_memory(device, &free, &total);
- return free;
- #elif defined(GGML_USE_VULKAN)
- size_t total;
- size_t free;
- ggml_backend_vk_get_device_memory(device, &free, &total);
- return free;
- #else
- return 1;
- GGML_UNUSED(device);
- #endif
- }
-
  //
  // globals
  //
@@ -1845,7 +1785,7 @@ struct llama_hparams {
  float f_logit_scale = 0.0f;
 
  bool causal_attn = true;
- bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
+ bool use_alibi = false;
 
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -2189,6 +2129,8 @@ struct llama_model {
  int main_gpu;
  int n_gpu_layers;
 
+ std::vector<std::string> rpc_servers;
+
  // gguf metadata
  std::unordered_map<std::string, std::string> gguf_kv;
 
@@ -2317,7 +2259,6 @@ struct llama_context {
  struct ggml_tensor * inp_pos; // I32 [n_batch]
  struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2333,6 +2274,104 @@ struct llama_context {
  #endif
  };
 
+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_RPC
+ std::string endpoint = model.rpc_servers[gpu];
+ buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+ #elif defined(GGML_USE_METAL)
+ buft = ggml_backend_metal_buffer_type();
+ #elif defined(GGML_USE_CUDA)
+ buft = ggml_backend_cuda_buffer_type(gpu);
+ #elif defined(GGML_USE_VULKAN)
+ buft = ggml_backend_vk_buffer_type(gpu);
+ #elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_buffer_type(gpu);
+ #elif defined(GGML_USE_CLBLAST)
+ buft = ggml_backend_opencl_buffer_type();
+ #elif defined(GGML_USE_KOMPUTE)
+ buft = ggml_backend_kompute_buffer_type(gpu);
+ if (buft == nullptr) {
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_cpu(true);
+ }
+ return buft;
+ GGML_UNUSED(model);
+ GGML_UNUSED(gpu);
+ }
+
+ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_CUDA
+ if (ggml_backend_cuda_get_device_count() > 1) {
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+ }
+ #endif
+
+ #ifdef GGML_USE_SYCL
+ if (ggml_backend_sycl_get_device_count() > 1) {
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_offload(model, fallback_gpu);
+ }
+ return buft;
+
+ GGML_UNUSED(tensor_split);
+ }
+
+ static size_t llama_get_device_count(const llama_model & model) {
+ #if defined(GGML_USE_RPC)
+ return model.rpc_servers.size();
+ #elif defined(GGML_USE_CUDA)
+ return ggml_backend_cuda_get_device_count();
+ #elif defined(GGML_USE_SYCL)
+ return ggml_backend_sycl_get_device_count();
+ #elif defined(GGML_USE_VULKAN)
+ return ggml_backend_vk_get_device_count();
+ #else
+ return 1;
+ #endif
+ GGML_UNUSED(model);
+ }
+
+ static size_t llama_get_device_memory(const llama_model & model, int device) {
+ #if defined(GGML_USE_RPC)
+ size_t total;
+ size_t free;
+ std::string endpoint = model.rpc_servers[device];
+ ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+ return free;
+ #elif defined(GGML_USE_CUDA)
+ size_t total;
+ size_t free;
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
+ return free;
+ #elif defined(GGML_USE_SYCL)
+ size_t total;
+ size_t free;
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
+ return free;
+ #elif defined(GGML_USE_VULKAN)
+ size_t total;
+ size_t free;
+ ggml_backend_vk_get_device_memory(device, &free, &total);
+ return free;
+ #else
+ return 1;
+ #endif
+ GGML_UNUSED(model);
+ GGML_UNUSED(device);
+ }
+
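
With GGML_USE_RPC, a "device" is a remote rpc-server endpoint rather than a local GPU, which is why these helpers now take the model: the GPU index doubles as an index into model.rpc_servers. A minimal sketch of that mapping, assuming an RPC-enabled build and purely hypothetical endpoints:

```cpp
#include <cstdio>
#include <string>
#include <vector>

#include "ggml-rpc.h"

int main() {
    // stands in for model.rpc_servers after the comma-separated list is parsed
    const std::vector<std::string> rpc_servers = {
        "192.168.1.10:50052", "192.168.1.11:50052",
    };

    // llama_get_device_count(model) == rpc_servers.size(); device i is server i
    for (size_t i = 0; i < rpc_servers.size(); ++i) {
        size_t free = 0, total = 0;
        ggml_backend_rpc_get_device_memory(rpc_servers[i].c_str(), &free, &total);
        printf("rpc device %zu (%s): %zu free / %zu total bytes\n",
               i, rpc_servers[i].c_str(), free, total);
    }
    return 0;
}
```
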
  //
  // kv cache helpers
  //
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
  cache.do_defrag = true;
  }
 
+ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+ // the FA kernels require padding to avoid extra runtime boundary checks
+ return cparams.flash_attn ? 256u : 32u;
+ }
+
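
The padding value feeds GGML_PAD, which rounds the attended KV range up to a multiple of its second argument; flash-attention kernels want a coarser multiple (256) so they never have to bounds-check a partial tile. A worked example of the rounding, reimplemented locally rather than relying on the ggml macro:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// same result as ggml's GGML_PAD(x, n) for the power-of-two n used here
static uint32_t pad_up(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; }

int main() {
    const uint32_t cell_max   = 1000;  // hypothetical KV-cache high-water mark
    const bool     flash_attn = true;
    const uint32_t pad        = flash_attn ? 256u : 32u; // llama_kv_cache_get_padding
    const uint32_t kv_n       = std::max(pad, pad_up(cell_max, pad));
    printf("attend to %u KV cells (1000 rounded up to 1024)\n", kv_n);
    return 0;
}
```
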
  //
  // model loading and saving
  //
@@ -3779,6 +3823,12 @@ static void llm_load_hparams(
 
  // get hparams kv
  ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+ // everything past this point is not vocab-related
+ if (hparams.vocab_only) {
+ return;
+ }
+
  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3860,7 +3910,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+ case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
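
The new 32-layer test works because Llama 2 7B and Llama 3 8B share a layer count but have very different vocabularies (32,000 versus 128,256 tokens), so a 40,000-token threshold separates them cleanly. The old test keyed on Llama 3's grouped-query attention (n_head != n_head_kv), which mislabels other 32-layer GQA models, such as Mistral 7B, as 8B.
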
@@ -3962,6 +4012,19 @@ static void llm_load_hparams(
  model.type = e_model::MODEL_335M; break; // bge-large
  }
  } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ hparams.f_max_alibi_bias = 8.0f;
+
+ switch (hparams.n_layer) {
+ case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+ case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+ }
+ } break;
  case LLM_ARCH_NOMIC_BERT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4383,7 +4446,11 @@ static void llm_load_vocab(
  tokenizer_pre == "starcoder") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
  } else if (
- tokenizer_pre == "gpt-2") {
+ tokenizer_pre == "gpt-2" ||
+ tokenizer_pre == "jina-es" ||
+ tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "jina-v2-es" ||
+ tokenizer_pre == "jina-v2-de") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "refact") {
@@ -4743,13 +4810,13 @@ static bool llm_load_tensors(
 
  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
  // calculate the split points
- int device_count = llama_get_device_count();
+ int device_count = llama_get_device_count(model);
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
  std::vector<float> splits(device_count);
  if (all_zero) {
  // default split, by free memory
  for (int i = 0; i < device_count; ++i) {
- splits[i] = llama_get_device_memory(i);
+ splits[i] = llama_get_device_memory(model, i);
  }
  } else {
  std::copy(tensor_split, tensor_split + device_count, splits.begin());
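
Downstream, the splits vector becomes a normalized cumulative distribution and layers are assigned with std::upper_bound (visible in the next hunk). A sketch of that arithmetic with hypothetical free-memory figures:

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // hypothetical free memory per device (bytes), as gathered above
    std::vector<float> splits = {8e9f, 24e9f};

    // normalized prefix sum -> {0.25, 1.0}
    std::partial_sum(splits.begin(), splits.end(), splits.begin());
    const float total = splits.back();
    for (float & s : splits) s /= total;

    // a layer at relative position p in [0,1) goes to the first device whose
    // cumulative share exceeds p (the std::upper_bound step below)
    const float p   = 0.5f;
    const int   gpu = int(std::upper_bound(splits.begin(), splits.end(), p) - splits.begin());
    printf("layer at p = %.2f -> device %d\n", p, gpu); // -> device 1
    return 0;
}
```
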
@@ -4769,35 +4836,35 @@ static bool llm_load_tensors(
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
- model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+ model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
  }
  // assign the output layer
  if (n_gpu_layers > n_layer) {
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
- model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+ model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
  } else {
  model.buft_output = llama_default_buffer_type_cpu(true);
  }
  } else {
  ggml_backend_buffer_type_t split_buft;
  if (split_mode == LLAMA_SPLIT_MODE_ROW) {
- split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+ split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
  } else {
  // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
- split_buft = llama_default_buffer_type_offload(main_gpu);
+ split_buft = llama_default_buffer_type_offload(model, main_gpu);
  }
  // assign the repeating layers
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
  model.buft_layer[i] = {
  split_buft,
- llama_default_buffer_type_offload(main_gpu)
+ llama_default_buffer_type_offload(model, main_gpu)
  };
  }
  // assign the output layer
  if (n_gpu_layers > n_layer) {
  model.buft_output = {
  split_buft,
- llama_default_buffer_type_offload(main_gpu)
+ llama_default_buffer_type_offload(model, main_gpu)
  };
  } else {
  model.buft_output = llama_default_buffer_type_cpu(true);
@@ -5242,6 +5309,50 @@ static bool llm_load_tensors(
  layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
  }
  } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); // token_type_embeddings
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // LayerNorm bias
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i]; // JinaBertLayer
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // output_dense
+ layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); // output_dense
+
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); // output_norm
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+ layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
+ }
+ } break;
  case LLM_ARCH_BLOOM:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6318,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
  llm_ffn_gate_type type_gate,
  const llm_build_cb & cb,
  int il) {
- struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+ struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
  cb(tmp, "ffn_up", il);
 
  if (up_b) {
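
This makes the up projection optional: when `up` is null the input tensor is used as-is, so callers whose checkpoints lack that projection can still reuse llm_build_ffn instead of needing a bespoke feed-forward path.
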
@@ -6500,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * wo_b,
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
- struct ggml_tensor * kq_pos,
  int32_t n_tokens,
  int32_t n_kv,
  float kq_scale,
@@ -6512,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
  cb(q, "q", il);
@@ -6530,26 +6641,22 @@
  GGML_UNUSED(model);
  GGML_UNUSED(n_ctx);
 
- // note: if this assert triggers, then some check has failed earlier
- // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
- GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
-
  // split cached v into n_head heads (not transposed)
  struct ggml_tensor * v =
  ggml_view_3d(ctx, kv.v_l[il],
  n_embd_head_v, n_kv, n_head_kv,
- ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+ ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
  0);
  cb(v, "v", il);
 
- cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
 
  if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
  }
 
- cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
  } else {
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  cb(kq, "kq", il);
@@ -6574,28 +6681,8 @@ static struct ggml_tensor * llm_build_kqv(
  kq = ggml_scale(ctx, kq, 30);
  }
 
- #if defined(GGML_USE_KOMPUTE)
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
- #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
- if (hparams.use_alibi) {
- kq = ggml_scale(ctx, kq, kq_scale);
- cb(kq, "kq_scaled", il);
-
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
- cb(kq, "kq_scaled_alibi", il);
-
- kq = ggml_add(ctx, kq, kq_mask);
- cb(kq, "kq_masked", il);
-
- kq = ggml_soft_max(ctx, kq);
- cb(kq, "kq_soft_max", il);
- } else
- #endif
- {
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
- }
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
 
  GGML_ASSERT(kv.size == n_ctx);
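
ggml_soft_max_ext now fuses scaling, masking, and ALiBi in one op, driven by f_max_alibi_bias instead of a separate KQ_pos tensor, so the Kompute-specific ggml_alibi fallback above could be dropped. A sketch of the per-head slopes this implies, using the power-of-two head-count formula from the ALiBi paper (ggml's interpolation for other head counts differs slightly):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 8;
    const float max_bias = 8.0f; // hparams.f_max_alibi_bias
    for (int h = 0; h < n_head; ++h) {
        // head h adds slope * (-|pos_i - pos_j|) to its attention logits
        const float slope = powf(2.0f, -max_bias * (h + 1) / n_head);
        printf("head %d: slope %g\n", h, slope);
    }
    return 0;
}
```
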
 
@@ -6614,7 +6701,7 @@
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
  cb(kqv_merged, "kqv_merged", il);
 
- cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
  cb(cur, "kqv_merged_cont", il);
  }
 
@@ -6645,7 +6732,6 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_tensor * v_cur,
  struct ggml_tensor * q_cur,
  struct ggml_tensor * kq_mask,
- struct ggml_tensor * kq_pos,
  int32_t n_tokens,
  int32_t kv_head,
  int32_t n_kv,
@@ -6664,7 +6750,7 @@ static struct ggml_tensor * llm_build_kv(
  struct ggml_tensor * cur;
 
  cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
- q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
+ q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
  cb(cur, "kqv_out", il);
 
  return cur;
@@ -6771,18 +6857,17 @@ struct llm_build_context {
 
  ctx0 = ggml_init(params);
 
- lctx.inp_tokens = nullptr;
- lctx.inp_embd = nullptr;
- lctx.inp_pos = nullptr;
+ lctx.inp_tokens = nullptr;
+ lctx.inp_embd = nullptr;
+ lctx.inp_pos = nullptr;
  lctx.inp_out_ids = nullptr;
  lctx.inp_KQ_mask = nullptr;
- lctx.inp_KQ_pos = nullptr;
  lctx.inp_K_shift = nullptr;
- lctx.inp_mean = nullptr;
- lctx.inp_cls = nullptr;
- lctx.inp_s_copy = nullptr;
- lctx.inp_s_mask = nullptr;
- lctx.inp_s_seq = nullptr;
+ lctx.inp_mean = nullptr;
+ lctx.inp_cls = nullptr;
+ lctx.inp_s_copy = nullptr;
+ lctx.inp_s_mask = nullptr;
+ lctx.inp_s_seq = nullptr;
  }
 
  void free() {
@@ -6932,19 +7017,6 @@ struct llm_build_context {
  return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
  }
 
- struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
- if (causal) {
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
- } else {
- // TODO: this will be needed for ALiBi-based BERT models
- // https://github.com/ggerganov/llama.cpp/pull/6826
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
- }
- cb(lctx.inp_KQ_pos, "KQ_pos", -1);
- ggml_set_input(lctx.inp_KQ_pos);
- return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
- }
-
  struct ggml_tensor * build_inp_mean() {
  lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
  cb(lctx.inp_mean, "inp_mean", -1);
@@ -7050,7 +7122,7 @@ struct llm_build_context {
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -7143,9 +7215,6 @@
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
- // positions of the tokens in the KV cache
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
  for (int il = 0; il < n_layer; ++il) {
  struct ggml_tensor * inpSA = inpL;
 
@@ -7190,7 +7259,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -7260,9 +7329,6 @@
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
- // positions of the tokens in the KV cache
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
  for (int il = 0; il < n_layer; ++il) {
  struct ggml_tensor * inpSA = inpL;
 
@@ -7297,7 +7363,7 @@
  cb(Kcur, "Kcur", il);
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -7417,7 +7483,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -7542,7 +7608,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -7694,7 +7760,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -7806,7 +7872,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -8010,7 +8076,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -8076,9 +8142,6 @@
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
- // positions of the tokens in the KV cache
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
  for (int il = 0; il < n_layer; ++il) {
  struct ggml_tensor * inpSA = inpL;
 
@@ -8106,7 +8169,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -8168,8 +8231,11 @@
 
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;
+ struct ggml_tensor * inp_pos = nullptr;
 
- struct ggml_tensor * inp_pos = build_inp_pos();
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+ inp_pos = build_inp_pos();
+ }
  struct ggml_tensor * inp_mean = build_inp_mean();
  struct ggml_tensor * inp_cls = build_inp_cls();
 
@@ -8200,13 +8266,26 @@
  struct ggml_tensor * Vcur;
 
  // self-attention
- if (model.arch == LLM_ARCH_BERT) {
+ if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
  cb(Qcur, "Qcur", il);
 
+ if (model.layers[il].attn_q_norm) {
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
+ model.layers[il].attn_q_norm,
+ model.layers[il].attn_q_norm_b,
+ LLM_NORM, cb, il);
+ }
+
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
  cb(Kcur, "Kcur", il);
 
+ if (model.layers[il].attn_k_norm) {
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
+ model.layers[il].attn_k_norm,
+ model.layers[il].attn_k_norm_b,
+ LLM_NORM, cb, il);
+ }
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
  cb(Vcur, "Vcur", il);
 
@@ -8246,7 +8325,7 @@
  struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  cb(kq, "kq", il);
 
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
  cb(kq, "kq_soft_max_ext", il);
 
  struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
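
The optional attn_q_norm/attn_k_norm branches implement per-projection LayerNorm (QK-norm) for checkpoints such as jina-bert-v2 that normalize queries and keys before attention; since those tensors are loaded with the optional (`false`) flag, models without them skip both branches.
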
@@ -8297,6 +8376,13 @@
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
  NULL,
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
  } else {
  cur = llm_build_ffn(ctx0, cur,
  model.layers[il].ffn_up, NULL,
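
LLM_FFN_GELU combined with LLM_FFN_PAR selects a parallel (gated) feed-forward, ffn(x) = W_down * (gelu(W_gate * x) ⊙ (W_up * x)) + b_down, matching the GLU-style MLP of the Jina BERT v2 checkpoints. A toy scalar rendering of that composition, with made-up weights and ggml's tanh approximation of GELU:

```cpp
#include <cmath>
#include <cstdio>

// tanh-based GELU approximation, as used by ggml
static float gelu(float x) {
    return 0.5f * x * (1.0f + tanhf(0.79788456f * (x + 0.044715f * x * x * x)));
}

int main() {
    const float x = 1.0f;
    const float w_up = 0.5f, w_gate = -1.25f, w_down = 2.0f, b_down = 0.1f; // toy weights
    const float y = w_down * (gelu(w_gate * x) * (w_up * x)) + b_down;      // gated, parallel
    printf("ffn(%g) = %g\n", x, y);
    return 0;
}
```
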
@@ -8363,9 +8449,6 @@
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
- // positions of the tokens in the KV cache
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
  inpL = llm_build_norm(ctx0, inpL, hparams,
  model.tok_norm,
  model.tok_norm_b,
@@ -8399,7 +8482,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -8464,9 +8547,6 @@
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
- // positions of the tokens in the KV cache
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
  if (model.pos_embd) {
  // inp_pos - contains the positions
  struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8530,13 +8610,13 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  } else {
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
  }
@@ -8680,7 +8760,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -8798,7 +8878,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -8911,7 +8991,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -9025,7 +9105,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -9180,7 +9260,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -9297,7 +9377,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -9410,7 +9490,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
  struct ggml_tensor * sa_out = cur;
 
@@ -9513,7 +9593,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -9620,7 +9700,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -9736,7 +9816,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -9853,7 +9933,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -9983,7 +10063,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -10104,7 +10184,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -10223,7 +10303,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -10513,7 +10593,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -10644,7 +10724,7 @@
 
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
  model.layers[il].wo, nullptr,
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  }
 
  if (il == n_layer - 1) {
@@ -10825,6 +10905,7 @@ static struct ggml_cgraph * llama_build_graph(
  result = llm.build_refact();
  } break;
  case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V2:
  case LLM_ARCH_NOMIC_BERT:
  {
  result = llm.build_bert();
@@ -11032,11 +11113,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
  f = -INFINITY;
  } else {
- f = 0.0f;
+ if (hparams.use_alibi) {
+ f = -fabs(lctx.kv_self.cells[i].pos - pos);
+ } else {
+ f = 0.0f;
+ }
  }
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
  }
  }
+
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+ for (int j = 0; j < n_kv; ++j) {
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+ }
+ }
  }
  } else {
  // when using kv cache, the mask needs to match the kv cache size
@@ -11055,7 +11146,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  float f = -INFINITY;
  for (int s = 0; s < batch.n_seq_id[i]; ++s) {
  if (batch.seq_id[i][s] == seq_id) {
- f = 0.0f;
+ if (hparams.use_alibi) {
+ f = -fabs(batch.pos[i] - batch.pos[j]);
+ } else {
+ f = 0.0f;
+ }
  break;
  }
  }
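
Two details worth noting here. For ALiBi models (hparams.use_alibi), visible positions in the KQ mask now carry the negative token distance -|pos_i - pos_j| rather than 0; the softmax and flash-attention ops scale that value by the per-head ALiBi slope, which is what made the dedicated KQ_pos input removable while still supporting multiple sequences per batch. And the rows between n_tokens and GGML_PAD(n_tokens, GGML_KQ_MASK_PAD) are filled with -INFINITY so the padded query rows the graph allocates are fully masked.
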
@@ -11071,21 +11166,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
  }
  }
 
- // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
- // this allows to process multiple sequences in parallel with ALiBi-based models
- if (hparams.use_alibi) {
- const int64_t n_kv = kv_self.n;
-
- GGML_ASSERT(lctx.inp_KQ_pos);
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
- float * data = (float *) lctx.inp_KQ_pos->data;
-
- for (int i = 0; i < n_kv; ++i) {
- data[i] = float(lctx.kv_self.cells[i].pos);
- }
- }
-
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
  const int64_t n_tokens = batch.n_tokens;
 
@@ -11455,7 +11535,8 @@
  // a heuristic, to avoid attending the full cache if it is not yet utilized
  // after enough generations, the benefit from this heuristic disappears
  // if we start defragmenting the cache, the benefit from this will be more important
- kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
+ kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
  }
  }
@@ -12200,13 +12281,14 @@
 
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  int final_prev_index = -1;
+ bool ignore_merges = false;
 
  std::vector<std::string> word_collection;
  switch (vocab.type) {
  case LLAMA_VOCAB_TYPE_BPE:
  switch (vocab.type_pre) {
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ ignore_merges = true;
  word_collection = unicode_regex_split(text, {
  // original regex from tokenizer.json
  //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12215,6 +12297,12 @@ struct llm_tokenizer_bpe {
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  });
  break;
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ word_collection = unicode_regex_split(text, {
+ // same as llama3
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ });
+ break;
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
  word_collection = unicode_regex_split(text, {
  "[\r\n]",
@@ -12298,6 +12386,11 @@ struct llm_tokenizer_bpe {
  int index = 0;
  size_t offset = 0;
 
+ if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+ offset = word.size();
+ }
+
  while (offset < word.size()) {
  llm_symbol sym;
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
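
With ignore_merges set (the Llama 3 pretokenizer), a word that already exists verbatim in the vocabulary is emitted as a single symbol, and the per-character split plus merge loop below is skipped for it. This mirrors the `ignore_merges` option of Hugging Face tokenizers' BPE model, where an exact vocabulary hit takes precedence over rebuilding the token from merge rules.
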
@@ -12483,16 +12576,16 @@ struct llm_tokenizer_wpm {
  // to lowercase, pad chinese characters, pad punctuation
  std::string new_str = "";
  for (uint32_t code : cpts_nfd) {
- int type = unicode_cpt_type(code);
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+ const codepoint_flags flags = unicode_cpt_flags(code);
+ if (flags.is_accent_mark || flags.is_control) {
  continue;
  }
  code = unicode_tolower(code);
- if (type == CODEPOINT_TYPE_SEPARATOR) {
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
  code = ' ';
  }
  std::string s = unicode_cpt_to_utf8(code);
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
  new_str += " ";
  new_str += s;
  new_str += " ";
12726
12819
  }
12727
12820
  }
12728
12821
 
12822
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
12823
+ LLAMA_LOG_WARN(
12824
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
12825
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
12826
+ "Are you sure this is what you want?\n", __FUNCTION__);
12827
+ }
12828
+
12729
12829
  if (add_special && vocab.special_add_eos == 1) {
12730
12830
  GGML_ASSERT(vocab.special_eos_id != -1);
12731
12831
  output.push_back(vocab.special_eos_id);
@@ -12752,7 +12852,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12752
12852
  }
12753
12853
  }
12754
12854
 
12755
- GGML_ASSERT(vocab.special_add_eos != 1);
12855
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
12856
+ LLAMA_LOG_WARN(
12857
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
12858
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
12859
+ "Are you sure this is what you want?\n", __FUNCTION__);
12860
+ }
12861
+
12862
+ if (add_special && vocab.special_add_eos == 1) {
12863
+ GGML_ASSERT(vocab.special_add_eos != -1);
12864
+ output.push_back(vocab.special_eos_id);
12865
+ }
12756
12866
  } break;
12757
12867
  case LLAMA_VOCAB_TYPE_WPM:
12758
12868
  {
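
The warning fires when the caller's text already begins with the BOS token (for instance, a prompt template that spells it out while special-token parsing is enabled) and add_special prepends another: the result starts with two BOS tokens, which typically hurts generation quality, so it is now reported instead of passing silently. The BPE branch also gains the same optional-EOS append the SPM branch already had, replacing an assertion that simply forbade it.
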
@@ -13106,6 +13216,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
  return rejects;
  }
 
+ static bool llama_grammar_detect_left_recursion(
+ const std::vector<std::vector<llama_grammar_element>> & rules,
+ size_t rule_index,
+ std::vector<bool> * rules_visited,
+ std::vector<bool> * rules_in_progress,
+ std::vector<bool> * rules_may_be_empty) {
+ if ((*rules_in_progress)[rule_index]) {
+ return true;
+ }
+
+ (*rules_in_progress)[rule_index] = true;
+
+ const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+ // First check if the rule might produce the empty string. This could be done combined with the second
+ // step but it's more readable as two steps.
+ bool at_rule_start = true;
+ for (size_t i = 0; i < rule.size(); i++) {
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
+ if (at_rule_start) {
+ (*rules_may_be_empty)[rule_index] = true;
+ break;
+ }
+ at_rule_start = true;
+ } else {
+ at_rule_start = false;
+ }
+ }
+
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+ // be empty)
+ bool recurse_into_nonterminal = true;
+ for (size_t i = 0; i < rule.size(); i++) {
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+ return true;
+ }
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+ recurse_into_nonterminal = false;
+ }
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+ recurse_into_nonterminal = true;
+ } else {
+ recurse_into_nonterminal = false;
+ }
+ }
+
+ (*rules_in_progress)[rule_index] = false;
+ (*rules_visited)[rule_index] = true;
+ return false;
+ }
+
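
For intuition: a GBNF rule such as `expr ::= expr "+" term | term` reaches itself in leftmost position without consuming any input, so the stack-based grammar sampler would recurse forever. The helper also tracks which rules can produce the empty string, because in a rule like `a ::= b a "x"` with a nullable `b`, `a` is still effectively left-recursive. Rewriting to right recursion or repetition, e.g. `expr ::= term ("+" term)*`, avoids the new error.
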
  //
  // grammar - external
  //
@@ -13125,6 +13287,19 @@ struct llama_grammar * llama_grammar_init(
13125
13287
  vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
13126
13288
  }
13127
13289
 
13290
+ // Check for left recursion
13291
+ std::vector<bool> rules_visited(n_rules);
13292
+ std::vector<bool> rules_in_progress(n_rules);
13293
+ std::vector<bool> rules_may_be_empty(n_rules);
13294
+ for (size_t i = 0; i < n_rules; i++) {
13295
+ if (rules_visited[i]) {
13296
+ continue;
13297
+ }
13298
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
13299
+ throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
13300
+ }
13301
+ }
13302
+
13128
13303
  // loop over alternates of start rule to build initial stacks
13129
13304
  std::vector<std::vector<const llama_grammar_element *>> stacks;
13130
13305
  pos = vec_rules[start_rule_index].data();
@@ -13147,6 +13322,9 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
     return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 
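The new comment before the return is worth spelling out: stacks holds raw pointers into the element storage of vec_rules, and moving a std::vector hands the same heap buffer to the new owner, while copying would allocate a fresh one and leave those pointers dangling once the local vector is destroyed. A toy illustration with simplified types:

    #include <cstdio>
    #include <utility>
    #include <vector>

    struct Holder {
        std::vector<int>   rules;   // owns the element storage
        const int        * stack;   // points into `rules`'s storage
    };

    int main() {
        std::vector<int> rules = { 1, 2, 3 };
        const int * p = rules.data();

        Holder h { std::move(rules), p };   // move: same heap buffer, new owner
        printf("%d\n", *h.stack);           // prints 1; with a copy instead of a
                                            // move, `p` would dangle once the
                                            // local `rules` is destroyed
    }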
@@ -13741,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
@@ -13757,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;
 
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     return X;
 }
 
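The two hunks above drop the `if (ctx)` guards around the timing bookkeeping; they bought little, since the immediately following llama_sample_token(ctx, ...) call already requires a non-null context. The Mirostat step itself is a simple feedback loop: e is the gap between the sampled token's surprise and the target tau, and mu is nudged against it. A hedged numeric sketch with illustrative constants:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float tau = 5.0f;         // target surprise, in bits
        const float eta = 0.1f;         // learning rate
        float       mu  = 2.0f * tau;   // customary starting value

        // Suppose the sampled token X had model probability 1/1024:
        const float p = 1.0f / 1024.0f;
        const float observed_surprise = -log2f(p);   // 10 bits
        const float e = observed_surprise - tau;     // error vs. target = 5
        mu = mu - eta * e;                           // same form as *mu = *mu - eta * e;
        printf("mu: %.2f\n", mu);                    // 9.50
    }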
@@ -15246,6 +15420,7 @@ struct llama_model_params llama_model_default_params() {
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
+        /*.rpc_servers                 =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
@@ -15316,7 +15491,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }
 
 size_t llama_max_devices(void) {
-#if defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    return GGML_RPC_MAX_SERVERS;
+#elif defined(GGML_USE_METAL)
     return 1;
 #elif defined(GGML_USE_CUDA)
     return GGML_CUDA_MAX_DEVICES;
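With an RPC build, llama_max_devices() now reports GGML_RPC_MAX_SERVERS, which matters to callers that size per-device arrays such as llama_model_params::tensor_split (documented as holding llama_max_devices() floats). A hedged sketch:

    #include "llama.h"
    #include <vector>

    // Size a per-device array without hard-coding a backend (RPC, CUDA, ...).
    // Pass data() as llama_model_params::tensor_split for an even split.
    std::vector<float> make_even_tensor_split() {
        return std::vector<float>(llama_max_devices(), 1.0f);
    }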
@@ -15339,7 +15516,7 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15402,7 +15579,17 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-
+    if (params.rpc_servers != nullptr) {
+        // split the comma-separated server list into model->rpc_servers
+        std::string servers(params.rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(",")) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            model->rpc_servers.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        model->rpc_servers.push_back(servers);
+    }
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
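The parsing above is a plain destructive split on ','; the final push_back keeps the remainder, so a single server without commas still yields one entry, while a trailing comma would produce a trailing empty entry. A standalone demo of the same loop (addresses and port are placeholders):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        std::string servers = "192.168.1.10:50052,192.168.1.11:50052";
        std::vector<std::string> out;
        size_t pos = 0;
        while ((pos = servers.find(',')) != std::string::npos) {
            out.push_back(servers.substr(0, pos));  // everything before the comma
            servers.erase(0, pos + 1);              // drop it plus the comma
        }
        out.push_back(servers);  // remainder: the last (or only) entry
        for (const auto & s : out) printf("'%s'\n", s.c_str());
    }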
@@ -15441,6 +15628,11 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
+    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     llama_context * ctx = new llama_context(*model);
 
     const auto & hparams = model->hparams;
@@ -15464,7 +15656,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
 
     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, 256);
+    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
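The context length is still rounded up, but the granularity now comes from llama_kv_cache_get_padding(cparams) instead of a hard-coded 256; in this release the required padding differs between the flash-attention and non-flash-attention KV-cache paths. The operation is a round-up to a multiple (pad_up below is an illustrative helper, not the GGML_PAD macro itself):

    #include <cstdint>
    #include <cstdio>

    static uint32_t pad_up(uint32_t x, uint32_t n) {
        return (x + n - 1) / n * n;   // round x up to a multiple of n
    }

    int main() {
        printf("%u\n", pad_up(1000, 256));   // 1024
        printf("%u\n", pad_up(1024, 256));   // 1024 (already a multiple)
    }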
@@ -15509,16 +15701,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-    if (cparams.flash_attn && hparams.use_alibi) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
-    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
@@ -15554,7 +15736,17 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#ifdef GGML_USE_METAL
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
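Putting the RPC pieces together: a caller opts in purely through llama_model_params, and each listed server becomes one backend here. A hedged sketch (placeholder addresses; assumes a GGML_USE_RPC build with rpc-server instances already listening):

    #include "llama.h"

    llama_model * load_over_rpc(const char * gguf_path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";
        mparams.n_gpu_layers = 99;   // offload all layers to the RPC devices
        return llama_load_model_from_file(gguf_path, mparams);  // nullptr on failure
    }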
@@ -15710,7 +15902,11 @@ struct llama_context * llama_new_context_with_model(
         ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-        bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
+        bool pipeline_parallel =
+            llama_get_device_count(*model) > 1 &&
+            model->n_gpu_layers > (int)model->hparams.n_layer &&
+            model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+            params.offload_kqv;
 #ifndef GGML_USE_CUDA
         // pipeline parallelism requires support for async compute and events
         // currently this is only implemented in the CUDA backend
@@ -15808,6 +16004,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -16829,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
         else {
             if (cell_range_begin != kv_self.size) {
-                cell_ranges.push_back({ cell_range_begin, i });
+                cell_ranges.emplace_back(cell_range_begin, i);
                 cell_range_begin = kv_self.size;
             }
         }
     }
     if (cell_range_begin != kv_self.size) {
-        cell_ranges.push_back({ cell_range_begin, kv_self.size });
+        cell_ranges.emplace_back(cell_range_begin, kv_self.size);
     }
 
     // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
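The push_back-to-emplace_back change is a small tidy-up: the two values are forwarded straight to the pair constructor rather than first materializing a temporary from the braced list. A toy illustration with a simplified element type:

    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::pair<size_t, size_t>> cell_ranges;
        size_t begin = 4, end = 9;
        cell_ranges.emplace_back(begin, end);        // constructs the pair in place
        // same result via a temporary: cell_ranges.push_back({ begin, end });
        printf("[%zu, %zu)\n", cell_ranges[0].first, cell_ranges[0].second);
    }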