llama_cpp 0.15.1 → 0.15.2

@@ -7,6 +7,10 @@
  #include "ggml-alloc.h"
  #include "ggml-backend.h"

+ #ifdef GGML_USE_RPC
+ # include "ggml-rpc.h"
+ #endif
+
  #ifdef GGML_USE_CUDA
  # include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
@@ -205,6 +209,7 @@ enum llm_arch {
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_JINA_BERT_V2,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
@@ -228,39 +233,40 @@ enum llm_arch {
  };

  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GROK, "grok" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_PERSIMMON, "persimmon" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PHI3, "phi3" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
- { LLM_ARCH_GEMMA, "gemma" },
- { LLM_ARCH_STARCODER2, "starcoder2" },
- { LLM_ARCH_MAMBA, "mamba" },
- { LLM_ARCH_XVERSE, "xverse" },
- { LLM_ARCH_COMMAND_R, "command-r" },
- { LLM_ARCH_DBRX, "dbrx" },
- { LLM_ARCH_OLMO, "olmo" },
- { LLM_ARCH_UNKNOWN, "(unknown)" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_PERSIMMON, "persimmon" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
+ { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

  enum llm_kv {
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_JINA_BERT_V2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+ { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_BLOOM,
  {
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
  GGML_UNUSED(host_buffer);
  }

- static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
- ggml_backend_buffer_type_t buft = nullptr;
-
- #ifdef GGML_USE_METAL
- buft = ggml_backend_metal_buffer_type();
- #elif defined(GGML_USE_CUDA)
- buft = ggml_backend_cuda_buffer_type(gpu);
- #elif defined(GGML_USE_VULKAN)
- buft = ggml_backend_vk_buffer_type(gpu);
- #elif defined(GGML_USE_SYCL)
- buft = ggml_backend_sycl_buffer_type(gpu);
- #elif defined(GGML_USE_CLBLAST)
- buft = ggml_backend_opencl_buffer_type();
- #elif defined(GGML_USE_KOMPUTE)
- buft = ggml_backend_kompute_buffer_type(gpu);
- if (buft == nullptr) {
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
- }
- #endif
-
- if (buft == nullptr) {
- buft = llama_default_buffer_type_cpu(true);
- }
- return buft;
-
- GGML_UNUSED(gpu);
- }
-
- static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
- ggml_backend_buffer_type_t buft = nullptr;
-
- #ifdef GGML_USE_CUDA
- if (ggml_backend_cuda_get_device_count() > 1) {
- buft = ggml_backend_cuda_split_buffer_type(tensor_split);
- }
- #endif
-
- #ifdef GGML_USE_SYCL
- if (ggml_backend_sycl_get_device_count() > 1) {
- buft = ggml_backend_sycl_split_buffer_type(tensor_split);
- }
- #endif
-
- if (buft == nullptr) {
- buft = llama_default_buffer_type_offload(fallback_gpu);
- }
- return buft;
-
- GGML_UNUSED(tensor_split);
- }
-
- static size_t llama_get_device_count() {
- #if defined(GGML_USE_CUDA)
- return ggml_backend_cuda_get_device_count();
- #elif defined(GGML_USE_SYCL)
- return ggml_backend_sycl_get_device_count();
- #elif defined(GGML_USE_VULKAN)
- return ggml_backend_vk_get_device_count();
- #else
- return 1;
- #endif
- }
-
- static size_t llama_get_device_memory(int device) {
- #if defined(GGML_USE_CUDA)
- size_t total;
- size_t free;
- ggml_backend_cuda_get_device_memory(device, &free, &total);
- return free;
- #elif defined(GGML_USE_SYCL)
- size_t total;
- size_t free;
- ggml_backend_sycl_get_device_memory(device, &free, &total);
- return free;
- #elif defined(GGML_USE_VULKAN)
- size_t total;
- size_t free;
- ggml_backend_vk_get_device_memory(device, &free, &total);
- return free;
- #else
- return 1;
- GGML_UNUSED(device);
- #endif
- }
-
  //
  // globals
  //
@@ -1845,7 +1785,7 @@ struct llama_hparams {
  float f_logit_scale = 0.0f;

  bool causal_attn = true;
- bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
+ bool use_alibi = false;

  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -2189,6 +2129,8 @@ struct llama_model {
  int main_gpu;
  int n_gpu_layers;

+ std::vector<std::string> rpc_servers;
+
  // gguf metadata
  std::unordered_map<std::string, std::string> gguf_kv;

@@ -2317,7 +2259,6 @@ struct llama_context {
  struct ggml_tensor * inp_pos; // I32 [n_batch]
  struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
- struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
  struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2333,6 +2274,104 @@ struct llama_context {
  #endif
  };

+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_RPC
+ std::string endpoint = model.rpc_servers[gpu];
+ buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+ #elif defined(GGML_USE_METAL)
+ buft = ggml_backend_metal_buffer_type();
+ #elif defined(GGML_USE_CUDA)
+ buft = ggml_backend_cuda_buffer_type(gpu);
+ #elif defined(GGML_USE_VULKAN)
+ buft = ggml_backend_vk_buffer_type(gpu);
+ #elif defined(GGML_USE_SYCL)
+ buft = ggml_backend_sycl_buffer_type(gpu);
+ #elif defined(GGML_USE_CLBLAST)
+ buft = ggml_backend_opencl_buffer_type();
+ #elif defined(GGML_USE_KOMPUTE)
+ buft = ggml_backend_kompute_buffer_type(gpu);
+ if (buft == nullptr) {
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_cpu(true);
+ }
+ return buft;
+ GGML_UNUSED(model);
+ GGML_UNUSED(gpu);
+ }
+
+ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_CUDA
+ if (ggml_backend_cuda_get_device_count() > 1) {
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+ }
+ #endif
+
+ #ifdef GGML_USE_SYCL
+ if (ggml_backend_sycl_get_device_count() > 1) {
+ buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_offload(model, fallback_gpu);
+ }
+ return buft;
+
+ GGML_UNUSED(tensor_split);
+ }
+
+ static size_t llama_get_device_count(const llama_model & model) {
+ #if defined(GGML_USE_RPC)
+ return model.rpc_servers.size();
+ #elif defined(GGML_USE_CUDA)
+ return ggml_backend_cuda_get_device_count();
+ #elif defined(GGML_USE_SYCL)
+ return ggml_backend_sycl_get_device_count();
+ #elif defined(GGML_USE_VULKAN)
+ return ggml_backend_vk_get_device_count();
+ #else
+ return 1;
+ #endif
+ GGML_UNUSED(model);
+ }
+
+ static size_t llama_get_device_memory(const llama_model & model, int device) {
+ #if defined(GGML_USE_RPC)
+ size_t total;
+ size_t free;
+ std::string endpoint = model.rpc_servers[device];
+ ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+ return free;
+ #elif defined(GGML_USE_CUDA)
+ size_t total;
+ size_t free;
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
+ return free;
+ #elif defined(GGML_USE_SYCL)
+ size_t total;
+ size_t free;
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
+ return free;
+ #elif defined(GGML_USE_VULKAN)
+ size_t total;
+ size_t free;
+ ggml_backend_vk_get_device_memory(device, &free, &total);
+ return free;
+ #else
+ return 1;
+ #endif
+ GGML_UNUSED(model);
+ GGML_UNUSED(device);
+ }
+
  //
  // kv cache helpers
  //
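Note (illustrative, not part of the diff): the GGML_USE_RPC branches above treat each configured RPC endpoint as one device. A minimal sketch of querying a single endpoint's memory with the call used in this hunk; the endpoint address is a hypothetical placeholder and the build is assumed to have the RPC backend enabled.

// sketch only - assumes GGML_USE_RPC and the ggml-rpc.h header included earlier in this diff
#include "ggml-rpc.h"
#include <cstdio>

int main() {
    size_t free_mem  = 0;
    size_t total_mem = 0;
    // same call the new llama_get_device_memory() makes per endpoint
    ggml_backend_rpc_get_device_memory("127.0.0.1:50052", &free_mem, &total_mem);
    std::printf("free: %zu bytes, total: %zu bytes\n", free_mem, total_mem);
    return 0;
}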
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
  cache.do_defrag = true;
  }

+ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+ // the FA kernels require padding to avoid extra runtime boundary checks
+ return cparams.flash_attn ? 256u : 32u;
+ }
+
  //
  // model loading and saving
  //
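Note (illustrative, not part of the diff): llama_kv_cache_get_padding() above is what later rounds the active KV-cache size up to 256 cells when flash attention is enabled (32 otherwise). A small self-contained sketch of that rounding, with hypothetical numbers; pad_up() is a stand-in for the GGML_PAD macro used later in this diff.

// sketch only
#include <algorithm>
#include <cstdint>

static uint32_t pad_up(uint32_t n, uint32_t pad) {
    return ((n + pad - 1) / pad) * pad; // same effect as GGML_PAD(n, pad)
}

int main() {
    const bool     flash_attn = true;
    const uint32_t pad        = flash_attn ? 256u : 32u; // llama_kv_cache_get_padding()
    const uint32_t cell_max   = 1000;                    // hypothetical number of used KV cells
    const uint32_t kv_n       = std::max(pad, pad_up(cell_max, pad)); // -> 1024
    return kv_n == 1024 ? 0 : 1;
}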
@@ -3779,6 +3823,12 @@ static void llm_load_hparams(

  // get hparams kv
  ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+ // everything past this point is not vocab-related
+ if (hparams.vocab_only) {
+ return;
+ }
+
  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3860,7 +3910,7 @@ static void llm_load_hparams(
  switch (hparams.n_layer) {
  case 22: model.type = e_model::MODEL_1B; break;
  case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+ case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
  case 40: model.type = e_model::MODEL_13B; break;
  case 48: model.type = e_model::MODEL_34B; break;
  case 60: model.type = e_model::MODEL_30B; break;
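Note (illustrative, not part of the diff): the 32-layer case now keys off vocabulary size rather than GQA, since Llama 2 7B ships a 32000-token vocabulary while Llama 3 8B uses 128256. A trivial sketch of the new test; the enum and helper names are hypothetical.

// sketch only
enum class e_model_sketch { MODEL_7B, MODEL_8B };

static e_model_sketch classify_32_layer(int n_vocab) {
    return n_vocab < 40000 ? e_model_sketch::MODEL_7B   // e.g. Llama 2 7B, n_vocab = 32000
                           : e_model_sketch::MODEL_8B;  // e.g. Llama 3 8B, n_vocab = 128256
}

int main() {
    return classify_32_layer(32000) == e_model_sketch::MODEL_7B ? 0 : 1;
}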
@@ -3962,6 +4012,19 @@ static void llm_load_hparams(
  model.type = e_model::MODEL_335M; break; // bge-large
  }
  } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ hparams.f_max_alibi_bias = 8.0f;
+
+ switch (hparams.n_layer) {
+ case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+ case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+ }
+ } break;
  case LLM_ARCH_NOMIC_BERT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4383,7 +4446,11 @@ static void llm_load_vocab(
  tokenizer_pre == "starcoder") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
  } else if (
- tokenizer_pre == "gpt-2") {
+ tokenizer_pre == "gpt-2" ||
+ tokenizer_pre == "jina-es" ||
+ tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "jina-v2-es" ||
+ tokenizer_pre == "jina-v2-de") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "refact") {
@@ -4743,13 +4810,13 @@ static bool llm_load_tensors(

  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
  // calculate the split points
- int device_count = llama_get_device_count();
+ int device_count = llama_get_device_count(model);
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
  std::vector<float> splits(device_count);
  if (all_zero) {
  // default split, by free memory
  for (int i = 0; i < device_count; ++i) {
- splits[i] = llama_get_device_memory(i);
+ splits[i] = llama_get_device_memory(model, i);
  }
  } else {
  std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4836,35 @@ static bool llm_load_tensors(
4769
4836
  int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
4770
4837
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
4771
4838
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
4772
- model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
4839
+ model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
4773
4840
  }
4774
4841
  // assign the output layer
4775
4842
  if (n_gpu_layers > n_layer) {
4776
4843
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
4777
- model.buft_output = llama_default_buffer_type_offload(layer_gpu);
4844
+ model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
4778
4845
  } else {
4779
4846
  model.buft_output = llama_default_buffer_type_cpu(true);
4780
4847
  }
4781
4848
  } else {
4782
4849
  ggml_backend_buffer_type_t split_buft;
4783
4850
  if (split_mode == LLAMA_SPLIT_MODE_ROW) {
4784
- split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
4851
+ split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
4785
4852
  } else {
4786
4853
  // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
4787
- split_buft = llama_default_buffer_type_offload(main_gpu);
4854
+ split_buft = llama_default_buffer_type_offload(model, main_gpu);
4788
4855
  }
4789
4856
  // assign the repeating layers
4790
4857
  for (int64_t i = i_gpu_start; i < n_layer; ++i) {
4791
4858
  model.buft_layer[i] = {
4792
4859
  split_buft,
4793
- llama_default_buffer_type_offload(main_gpu)
4860
+ llama_default_buffer_type_offload(model, main_gpu)
4794
4861
  };
4795
4862
  }
4796
4863
  // assign the output layer
4797
4864
  if (n_gpu_layers > n_layer) {
4798
4865
  model.buft_output = {
4799
4866
  split_buft,
4800
- llama_default_buffer_type_offload(main_gpu)
4867
+ llama_default_buffer_type_offload(model, main_gpu)
4801
4868
  };
4802
4869
  } else {
4803
4870
  model.buft_output = llama_default_buffer_type_cpu(true);
@@ -5242,6 +5309,50 @@ static bool llm_load_tensors(
5242
5309
  layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5243
5310
  }
5244
5311
  } break;
5312
+ case LLM_ARCH_JINA_BERT_V2:
5313
+ {
5314
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
5315
+ model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
5316
+ model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
5317
+ model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
5318
+
5319
+ for (int i = 0; i < n_layer; ++i) {
5320
+ ggml_context * ctx_layer = ctx_for_layer(i);
5321
+ ggml_context * ctx_split = ctx_for_layer_split(i);
5322
+
5323
+ auto & layer = model.layers[i]; // JinaBertLayer
5324
+
5325
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
5326
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
5327
+
5328
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
5329
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
5330
+
5331
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
5332
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
5333
+
5334
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
5335
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
5336
+
5337
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
5338
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
5339
+
5340
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
5341
+ layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
5342
+
5343
+ layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
5344
+ layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
5345
+
5346
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
5347
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
5348
+
5349
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
5350
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
5351
+
5352
+ layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
5353
+ layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
5354
+ }
5355
+ } break;
5245
5356
  case LLM_ARCH_BLOOM:
5246
5357
  {
5247
5358
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6318,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
6318
6429
  llm_ffn_gate_type type_gate,
6319
6430
  const llm_build_cb & cb,
6320
6431
  int il) {
6321
- struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
6432
+ struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
6322
6433
  cb(tmp, "ffn_up", il);
6323
6434
 
6324
6435
  if (up_b) {
@@ -6500,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
6500
6611
  struct ggml_tensor * wo_b,
6501
6612
  struct ggml_tensor * q_cur,
6502
6613
  struct ggml_tensor * kq_mask,
6503
- struct ggml_tensor * kq_pos,
6504
6614
  int32_t n_tokens,
6505
6615
  int32_t n_kv,
6506
6616
  float kq_scale,
@@ -6512,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
6512
6622
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
6513
6623
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
6514
6624
  const int64_t n_embd_head_v = hparams.n_embd_head_v;
6625
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
6515
6626
 
6516
6627
  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
6517
6628
  cb(q, "q", il);
@@ -6530,26 +6641,22 @@ static struct ggml_tensor * llm_build_kqv(
6530
6641
  GGML_UNUSED(model);
6531
6642
  GGML_UNUSED(n_ctx);
6532
6643
 
6533
- // note: if this assert triggers, then some check has failed earlier
6534
- // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
6535
- GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
6536
-
6537
6644
  // split cached v into n_head heads (not transposed)
6538
6645
  struct ggml_tensor * v =
6539
6646
  ggml_view_3d(ctx, kv.v_l[il],
6540
6647
  n_embd_head_v, n_kv, n_head_kv,
6541
- ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
6542
- ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
6648
+ ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
6649
+ ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
6543
6650
  0);
6544
6651
  cb(v, "v", il);
6545
6652
 
6546
- cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
6653
+ cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6547
6654
 
6548
6655
  if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
6549
6656
  ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
6550
6657
  }
6551
6658
 
6552
- cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
6659
+ cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
6553
6660
  } else {
6554
6661
  struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
6555
6662
  cb(kq, "kq", il);
@@ -6574,28 +6681,8 @@ static struct ggml_tensor * llm_build_kqv(
6574
6681
  kq = ggml_scale(ctx, kq, 30);
6575
6682
  }
6576
6683
 
6577
- #if defined(GGML_USE_KOMPUTE)
6578
- #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
6579
- #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
6580
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
6581
- if (hparams.use_alibi) {
6582
- kq = ggml_scale(ctx, kq, kq_scale);
6583
- cb(kq, "kq_scaled", il);
6584
-
6585
- kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
6586
- cb(kq, "kq_scaled_alibi", il);
6587
-
6588
- kq = ggml_add(ctx, kq, kq_mask);
6589
- cb(kq, "kq_masked", il);
6590
-
6591
- kq = ggml_soft_max(ctx, kq);
6592
- cb(kq, "kq_soft_max", il);
6593
- } else
6594
- #endif
6595
- {
6596
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
6597
- cb(kq, "kq_soft_max_ext", il);
6598
- }
6684
+ kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
6685
+ cb(kq, "kq_soft_max_ext", il);
6599
6686
 
6600
6687
  GGML_ASSERT(kv.size == n_ctx);
6601
6688
 
@@ -6614,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
6614
6701
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
6615
6702
  cb(kqv_merged, "kqv_merged", il);
6616
6703
 
6617
- cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
6704
+ cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
6618
6705
  cb(cur, "kqv_merged_cont", il);
6619
6706
  }
6620
6707
 
@@ -6645,7 +6732,6 @@ static struct ggml_tensor * llm_build_kv(
6645
6732
  struct ggml_tensor * v_cur,
6646
6733
  struct ggml_tensor * q_cur,
6647
6734
  struct ggml_tensor * kq_mask,
6648
- struct ggml_tensor * kq_pos,
6649
6735
  int32_t n_tokens,
6650
6736
  int32_t kv_head,
6651
6737
  int32_t n_kv,
@@ -6664,7 +6750,7 @@ static struct ggml_tensor * llm_build_kv(
6664
6750
  struct ggml_tensor * cur;
6665
6751
 
6666
6752
  cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
6667
- q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
6753
+ q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
6668
6754
  cb(cur, "kqv_out", il);
6669
6755
 
6670
6756
  return cur;
@@ -6771,18 +6857,17 @@ struct llm_build_context {
6771
6857
 
6772
6858
  ctx0 = ggml_init(params);
6773
6859
 
6774
- lctx.inp_tokens = nullptr;
6775
- lctx.inp_embd = nullptr;
6776
- lctx.inp_pos = nullptr;
6860
+ lctx.inp_tokens = nullptr;
6861
+ lctx.inp_embd = nullptr;
6862
+ lctx.inp_pos = nullptr;
6777
6863
  lctx.inp_out_ids = nullptr;
6778
6864
  lctx.inp_KQ_mask = nullptr;
6779
- lctx.inp_KQ_pos = nullptr;
6780
6865
  lctx.inp_K_shift = nullptr;
6781
- lctx.inp_mean = nullptr;
6782
- lctx.inp_cls = nullptr;
6783
- lctx.inp_s_copy = nullptr;
6784
- lctx.inp_s_mask = nullptr;
6785
- lctx.inp_s_seq = nullptr;
6866
+ lctx.inp_mean = nullptr;
6867
+ lctx.inp_cls = nullptr;
6868
+ lctx.inp_s_copy = nullptr;
6869
+ lctx.inp_s_mask = nullptr;
6870
+ lctx.inp_s_seq = nullptr;
6786
6871
  }
6787
6872
 
6788
6873
  void free() {
@@ -6932,19 +7017,6 @@ struct llm_build_context {
6932
7017
  return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
6933
7018
  }
6934
7019
 
6935
- struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
6936
- if (causal) {
6937
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
6938
- } else {
6939
- // TODO: this will be needed for ALiBi-based BERT models
6940
- // https://github.com/ggerganov/llama.cpp/pull/6826
6941
- lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
6942
- }
6943
- cb(lctx.inp_KQ_pos, "KQ_pos", -1);
6944
- ggml_set_input(lctx.inp_KQ_pos);
6945
- return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
6946
- }
6947
-
6948
7020
  struct ggml_tensor * build_inp_mean() {
6949
7021
  lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
6950
7022
  cb(lctx.inp_mean, "inp_mean", -1);
@@ -7050,7 +7122,7 @@ struct llm_build_context {
7050
7122
 
7051
7123
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7052
7124
  model.layers[il].wo, model.layers[il].bo,
7053
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7125
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7054
7126
  }
7055
7127
 
7056
7128
  if (il == n_layer - 1) {
@@ -7143,9 +7215,6 @@ struct llm_build_context {
7143
7215
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7144
7216
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7145
7217
 
7146
- // positions of the tokens in the KV cache
7147
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7148
-
7149
7218
  for (int il = 0; il < n_layer; ++il) {
7150
7219
  struct ggml_tensor * inpSA = inpL;
7151
7220
 
@@ -7190,7 +7259,7 @@ struct llm_build_context {
7190
7259
 
7191
7260
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7192
7261
  model.layers[il].wo, NULL,
7193
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7262
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7194
7263
  }
7195
7264
 
7196
7265
  if (il == n_layer - 1) {
@@ -7260,9 +7329,6 @@ struct llm_build_context {
7260
7329
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
7261
7330
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
7262
7331
 
7263
- // positions of the tokens in the KV cache
7264
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
7265
-
7266
7332
  for (int il = 0; il < n_layer; ++il) {
7267
7333
  struct ggml_tensor * inpSA = inpL;
7268
7334
 
@@ -7297,7 +7363,7 @@ struct llm_build_context {
7297
7363
  cb(Kcur, "Kcur", il);
7298
7364
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7299
7365
  model.layers[il].wo, NULL,
7300
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7366
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7301
7367
  }
7302
7368
 
7303
7369
  if (il == n_layer - 1) {
@@ -7417,7 +7483,7 @@ struct llm_build_context {
7417
7483
 
7418
7484
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7419
7485
  model.layers[il].wo, NULL,
7420
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7486
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7421
7487
  }
7422
7488
 
7423
7489
  if (il == n_layer - 1) {
@@ -7542,7 +7608,7 @@ struct llm_build_context {
7542
7608
 
7543
7609
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7544
7610
  model.layers[il].wo, model.layers[il].bo,
7545
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7611
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
7546
7612
  }
7547
7613
 
7548
7614
  if (il == n_layer - 1) {
@@ -7694,7 +7760,7 @@ struct llm_build_context {
7694
7760
 
7695
7761
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7696
7762
  model.layers[il].wo, NULL,
7697
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7763
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7698
7764
  }
7699
7765
 
7700
7766
  if (il == n_layer - 1) {
@@ -7806,7 +7872,7 @@ struct llm_build_context {
7806
7872
 
7807
7873
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
7808
7874
  model.layers[il].wo, model.layers[il].bo,
7809
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7875
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
7810
7876
  }
7811
7877
 
7812
7878
  if (il == n_layer - 1) {
@@ -8010,7 +8076,7 @@ struct llm_build_context {
8010
8076
 
8011
8077
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8012
8078
  model.layers[il].wo, model.layers[il].bo,
8013
- Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8079
+ Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8014
8080
  }
8015
8081
 
8016
8082
  if (il == n_layer - 1) {
@@ -8076,9 +8142,6 @@ struct llm_build_context {
8076
8142
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8077
8143
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8078
8144
 
8079
- // positions of the tokens in the KV cache
8080
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8081
-
8082
8145
  for (int il = 0; il < n_layer; ++il) {
8083
8146
  struct ggml_tensor * inpSA = inpL;
8084
8147
 
@@ -8106,7 +8169,7 @@ struct llm_build_context {
8106
8169
 
8107
8170
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8108
8171
  model.layers[il].wo, NULL,
8109
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8172
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8110
8173
  }
8111
8174
 
8112
8175
  if (il == n_layer - 1) {
@@ -8168,8 +8231,11 @@ struct llm_build_context {
8168
8231
 
8169
8232
  struct ggml_tensor * cur;
8170
8233
  struct ggml_tensor * inpL;
8234
+ struct ggml_tensor * inp_pos = nullptr;
8171
8235
 
8172
- struct ggml_tensor * inp_pos = build_inp_pos();
8236
+ if (model.arch != LLM_ARCH_JINA_BERT_V2) {
8237
+ inp_pos = build_inp_pos();
8238
+ }
8173
8239
  struct ggml_tensor * inp_mean = build_inp_mean();
8174
8240
  struct ggml_tensor * inp_cls = build_inp_cls();
8175
8241
 
@@ -8200,13 +8266,26 @@ struct llm_build_context {
8200
8266
  struct ggml_tensor * Vcur;
8201
8267
 
8202
8268
  // self-attention
8203
- if (model.arch == LLM_ARCH_BERT) {
8269
+ if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
8204
8270
  Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
8205
8271
  cb(Qcur, "Qcur", il);
8206
8272
 
8273
+ if (model.layers[il].attn_q_norm) {
8274
+ Qcur = llm_build_norm(ctx0, Qcur, hparams,
8275
+ model.layers[il].attn_q_norm,
8276
+ model.layers[il].attn_q_norm_b,
8277
+ LLM_NORM, cb, il);
8278
+ }
8279
+
8207
8280
  Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
8208
8281
  cb(Kcur, "Kcur", il);
8209
8282
 
8283
+ if (model.layers[il].attn_k_norm) {
8284
+ Kcur = llm_build_norm(ctx0, Kcur, hparams,
8285
+ model.layers[il].attn_k_norm,
8286
+ model.layers[il].attn_k_norm_b,
8287
+ LLM_NORM, cb, il);
8288
+ }
8210
8289
  Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
8211
8290
  cb(Vcur, "Vcur", il);
8212
8291
 
@@ -8246,7 +8325,7 @@ struct llm_build_context {
8246
8325
  struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
8247
8326
  cb(kq, "kq", il);
8248
8327
 
8249
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8328
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
8250
8329
  cb(kq, "kq_soft_max_ext", il);
8251
8330
 
8252
8331
  struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8297,6 +8376,13 @@ struct llm_build_context {
8297
8376
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8298
8377
  NULL,
8299
8378
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
8379
+ } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
8380
+ cur = llm_build_ffn(ctx0, cur,
8381
+ model.layers[il].ffn_up, NULL,
8382
+ model.layers[il].ffn_gate, NULL,
8383
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
8384
+ NULL,
8385
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
8300
8386
  } else {
8301
8387
  cur = llm_build_ffn(ctx0, cur,
8302
8388
  model.layers[il].ffn_up, NULL,
@@ -8363,9 +8449,6 @@ struct llm_build_context {
8363
8449
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8364
8450
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8365
8451
 
8366
- // positions of the tokens in the KV cache
8367
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8368
-
8369
8452
  inpL = llm_build_norm(ctx0, inpL, hparams,
8370
8453
  model.tok_norm,
8371
8454
  model.tok_norm_b,
@@ -8399,7 +8482,7 @@ struct llm_build_context {
8399
8482
 
8400
8483
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8401
8484
  model.layers[il].wo, model.layers[il].bo,
8402
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8485
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8403
8486
  }
8404
8487
 
8405
8488
  if (il == n_layer - 1) {
@@ -8464,9 +8547,6 @@ struct llm_build_context {
8464
8547
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
8465
8548
  struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
8466
8549
 
8467
- // positions of the tokens in the KV cache
8468
- struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
8469
-
8470
8550
  if (model.pos_embd) {
8471
8551
  // inp_pos - contains the positions
8472
8552
  struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8530,13 +8610,13 @@ struct llm_build_context {
8530
8610
 
8531
8611
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8532
8612
  model.layers[il].wo, model.layers[il].bo,
8533
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8613
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8534
8614
  } else {
8535
8615
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8536
8616
 
8537
8617
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8538
8618
  model.layers[il].wo, model.layers[il].bo,
8539
- Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8619
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8540
8620
  }
8541
8621
  }
8542
8622
 
@@ -8680,7 +8760,7 @@ struct llm_build_context {
8680
8760
 
8681
8761
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8682
8762
  model.layers[il].wo, NULL,
8683
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8763
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8684
8764
  }
8685
8765
 
8686
8766
  if (il == n_layer - 1) {
@@ -8798,7 +8878,7 @@ struct llm_build_context {
8798
8878
 
8799
8879
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8800
8880
  model.layers[il].wo, NULL,
8801
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8881
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8802
8882
  }
8803
8883
 
8804
8884
  if (il == n_layer - 1) {
@@ -8911,7 +8991,7 @@ struct llm_build_context {
8911
8991
 
8912
8992
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
8913
8993
  model.layers[il].wo, model.layers[il].bo,
8914
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8994
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
8915
8995
  }
8916
8996
 
8917
8997
  if (il == n_layer - 1) {
@@ -9025,7 +9105,7 @@ struct llm_build_context {
9025
9105
 
9026
9106
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9027
9107
  model.layers[il].wo, model.layers[il].bo,
9028
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9108
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9029
9109
  }
9030
9110
 
9031
9111
  if (il == n_layer - 1) {
@@ -9180,7 +9260,7 @@ struct llm_build_context {
9180
9260
 
9181
9261
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9182
9262
  model.layers[il].wo, model.layers[il].bo,
9183
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9263
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9184
9264
  }
9185
9265
 
9186
9266
  if (il == n_layer - 1) {
@@ -9297,7 +9377,7 @@ struct llm_build_context {
9297
9377
 
9298
9378
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9299
9379
  model.layers[il].wo, model.layers[il].bo,
9300
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9380
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
9301
9381
  }
9302
9382
 
9303
9383
  if (il == n_layer - 1) {
@@ -9410,7 +9490,7 @@ struct llm_build_context {
9410
9490
 
9411
9491
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9412
9492
  model.layers[il].wo, NULL,
9413
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9493
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9414
9494
  }
9415
9495
  struct ggml_tensor * sa_out = cur;
9416
9496
 
@@ -9513,7 +9593,7 @@ struct llm_build_context {
9513
9593
 
9514
9594
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9515
9595
  model.layers[il].wo, model.layers[il].bo,
9516
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9596
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9517
9597
  }
9518
9598
 
9519
9599
  if (il == n_layer - 1) {
@@ -9620,7 +9700,7 @@ struct llm_build_context {
9620
9700
 
9621
9701
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9622
9702
  model.layers[il].wo, model.layers[il].bo,
9623
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9703
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9624
9704
  }
9625
9705
 
9626
9706
  if (il == n_layer - 1) {
@@ -9736,7 +9816,7 @@ struct llm_build_context {
9736
9816
 
9737
9817
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9738
9818
  model.layers[il].wo, NULL,
9739
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9819
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9740
9820
  }
9741
9821
 
9742
9822
  if (il == n_layer - 1) {
@@ -9853,7 +9933,7 @@ struct llm_build_context {
9853
9933
 
9854
9934
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9855
9935
  model.layers[il].wo, model.layers[il].bo,
9856
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9936
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9857
9937
  }
9858
9938
 
9859
9939
  if (il == n_layer - 1) {
@@ -9983,7 +10063,7 @@ struct llm_build_context {
9983
10063
 
9984
10064
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
9985
10065
  model.layers[il].wo, model.layers[il].bo,
9986
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10066
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
9987
10067
  }
9988
10068
 
9989
10069
  if (il == n_layer - 1) {
@@ -10104,7 +10184,7 @@ struct llm_build_context {
10104
10184
 
10105
10185
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10106
10186
  model.layers[il].wo, NULL,
10107
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10187
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
10108
10188
  }
10109
10189
 
10110
10190
  if (il == n_layer - 1) {
@@ -10223,7 +10303,7 @@ struct llm_build_context {
10223
10303
 
10224
10304
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10225
10305
  model.layers[il].wo, model.layers[il].bo,
10226
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10306
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10227
10307
  }
10228
10308
 
10229
10309
  if (il == n_layer - 1) {
@@ -10513,7 +10593,7 @@ struct llm_build_context {
10513
10593
 
10514
10594
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10515
10595
  model.layers[il].wo, model.layers[il].bo,
10516
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10596
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10517
10597
  }
10518
10598
 
10519
10599
  if (il == n_layer - 1) {
@@ -10644,7 +10724,7 @@ struct llm_build_context {
10644
10724
 
10645
10725
  cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
10646
10726
  model.layers[il].wo, nullptr,
10647
- Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10727
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
10648
10728
  }
10649
10729
 
10650
10730
  if (il == n_layer - 1) {
@@ -10825,6 +10905,7 @@ static struct ggml_cgraph * llama_build_graph(
10825
10905
  result = llm.build_refact();
10826
10906
  } break;
10827
10907
  case LLM_ARCH_BERT:
10908
+ case LLM_ARCH_JINA_BERT_V2:
10828
10909
  case LLM_ARCH_NOMIC_BERT:
10829
10910
  {
10830
10911
  result = llm.build_bert();
@@ -11032,11 +11113,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11032
11113
  if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
11033
11114
  f = -INFINITY;
11034
11115
  } else {
11035
- f = 0.0f;
11116
+ if (hparams.use_alibi) {
11117
+ f = -fabs(lctx.kv_self.cells[i].pos - pos);
11118
+ } else {
11119
+ f = 0.0f;
11120
+ }
11036
11121
  }
11037
11122
  data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
11038
11123
  }
11039
11124
  }
11125
+
11126
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
11127
+ for (int j = 0; j < n_kv; ++j) {
11128
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
11129
+ }
11130
+ }
11040
11131
  }
11041
11132
  } else {
11042
11133
  // when using kv cache, the mask needs to match the kv cache size
@@ -11055,7 +11146,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11055
11146
  float f = -INFINITY;
11056
11147
  for (int s = 0; s < batch.n_seq_id[i]; ++s) {
11057
11148
  if (batch.seq_id[i][s] == seq_id) {
11058
- f = 0.0f;
11149
+ if (hparams.use_alibi) {
11150
+ f = -fabs(batch.pos[i] - batch.pos[j]);
11151
+ } else {
11152
+ f = 0.0f;
11153
+ }
11059
11154
  break;
11060
11155
  }
11061
11156
  }
@@ -11071,21 +11166,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11071
11166
  }
11072
11167
  }
11073
11168
 
11074
- // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
11075
- // this allows to process multiple sequences in parallel with ALiBi-based models
11076
- if (hparams.use_alibi) {
11077
- const int64_t n_kv = kv_self.n;
11078
-
11079
- GGML_ASSERT(lctx.inp_KQ_pos);
11080
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
11081
-
11082
- float * data = (float *) lctx.inp_KQ_pos->data;
11083
-
11084
- for (int i = 0; i < n_kv; ++i) {
11085
- data[i] = float(lctx.kv_self.cells[i].pos);
11086
- }
11087
- }
11088
-
11089
11169
  if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
11090
11170
  const int64_t n_tokens = batch.n_tokens;
11091
11171
 
@@ -11455,7 +11535,8 @@ static int llama_decode_internal(
11455
11535
  // a heuristic, to avoid attending the full cache if it is not yet utilized
11456
11536
  // after enough generations, the benefit from this heuristic disappears
11457
11537
  // if we start defragmenting the cache, the benefit from this will be more important
11458
- kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
11538
+ const uint32_t pad = llama_kv_cache_get_padding(cparams);
11539
+ kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
11459
11540
  //kv_self.n = llama_kv_cache_cell_max(kv_self);
11460
11541
  }
11461
11542
  }
@@ -12200,13 +12281,14 @@ struct llm_tokenizer_bpe {
12200
12281
 
12201
12282
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12202
12283
  int final_prev_index = -1;
12284
+ bool ignore_merges = false;
12203
12285
 
12204
12286
  std::vector<std::string> word_collection;
12205
12287
  switch (vocab.type) {
12206
12288
  case LLAMA_VOCAB_TYPE_BPE:
12207
12289
  switch (vocab.type_pre) {
12208
12290
  case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12209
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
12291
+ ignore_merges = true;
12210
12292
  word_collection = unicode_regex_split(text, {
12211
12293
  // original regex from tokenizer.json
12212
12294
  //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12215,6 +12297,12 @@ struct llm_tokenizer_bpe {
12215
12297
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12216
12298
  });
12217
12299
  break;
12300
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12301
+ word_collection = unicode_regex_split(text, {
12302
+ // same as llama3
12303
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12304
+ });
12305
+ break;
12218
12306
  case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12219
12307
  word_collection = unicode_regex_split(text, {
12220
12308
  "[\r\n]",
@@ -12298,6 +12386,11 @@ struct llm_tokenizer_bpe {
12298
12386
  int index = 0;
12299
12387
  size_t offset = 0;
12300
12388
 
12389
+ if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
12390
+ symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
12391
+ offset = word.size();
12392
+ }
12393
+
12301
12394
  while (offset < word.size()) {
12302
12395
  llm_symbol sym;
12303
12396
  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
@@ -12483,16 +12576,16 @@ struct llm_tokenizer_wpm {
12483
12576
  // to lowercase, pad chinese characters, pad punctuation
12484
12577
  std::string new_str = "";
12485
12578
  for (uint32_t code : cpts_nfd) {
12486
- int type = unicode_cpt_type(code);
12487
- if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
12579
+ const codepoint_flags flags = unicode_cpt_flags(code);
12580
+ if (flags.is_accent_mark || flags.is_control) {
12488
12581
  continue;
12489
12582
  }
12490
12583
  code = unicode_tolower(code);
12491
- if (type == CODEPOINT_TYPE_SEPARATOR) {
12584
+ if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
12492
12585
  code = ' ';
12493
12586
  }
12494
12587
  std::string s = unicode_cpt_to_utf8(code);
12495
- if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
12588
+ if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
12496
12589
  new_str += " ";
12497
12590
  new_str += s;
12498
12591
  new_str += " ";
@@ -12726,6 +12819,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12726
12819
  }
12727
12820
  }
12728
12821
 
12822
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
12823
+ LLAMA_LOG_WARN(
12824
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
12825
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
12826
+ "Are you sure this is what you want?\n", __FUNCTION__);
12827
+ }
12828
+
12729
12829
  if (add_special && vocab.special_add_eos == 1) {
12730
12830
  GGML_ASSERT(vocab.special_eos_id != -1);
12731
12831
  output.push_back(vocab.special_eos_id);
@@ -12752,7 +12852,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12752
12852
  }
12753
12853
  }
12754
12854
 
12755
- GGML_ASSERT(vocab.special_add_eos != 1);
12855
+ if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
12856
+ LLAMA_LOG_WARN(
12857
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
12858
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
12859
+ "Are you sure this is what you want?\n", __FUNCTION__);
12860
+ }
12861
+
12862
+ if (add_special && vocab.special_add_eos == 1) {
12863
+ GGML_ASSERT(vocab.special_add_eos != -1);
12864
+ output.push_back(vocab.special_eos_id);
12865
+ }
12756
12866
  } break;
12757
12867
  case LLAMA_VOCAB_TYPE_WPM:
12758
12868
  {
@@ -13106,6 +13216,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
13106
13216
  return rejects;
13107
13217
  }
13108
13218
 
13219
+ static bool llama_grammar_detect_left_recursion(
13220
+ const std::vector<std::vector<llama_grammar_element>> & rules,
13221
+ size_t rule_index,
13222
+ std::vector<bool> * rules_visited,
13223
+ std::vector<bool> * rules_in_progress,
13224
+ std::vector<bool> * rules_may_be_empty) {
13225
+ if ((*rules_in_progress)[rule_index]) {
13226
+ return true;
13227
+ }
13228
+
13229
+ (*rules_in_progress)[rule_index] = true;
13230
+
13231
+ const std::vector<llama_grammar_element> & rule = rules[rule_index];
13232
+
13233
+ // First check if the rule might produce the empty string. This could be done combined with the second
13234
+ // step but it's more readable as two steps.
13235
+ bool at_rule_start = true;
13236
+ for (size_t i = 0; i < rule.size(); i++) {
13237
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
13238
+ if (at_rule_start) {
13239
+ (*rules_may_be_empty)[rule_index] = true;
13240
+ break;
13241
+ }
13242
+ at_rule_start = true;
13243
+ } else {
13244
+ at_rule_start = false;
13245
+ }
13246
+ }
13247
+
13248
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
13249
+ // be empty)
13250
+ bool recurse_into_nonterminal = true;
13251
+ for (size_t i = 0; i < rule.size(); i++) {
13252
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
13253
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
13254
+ return true;
13255
+ }
13256
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
13257
+ recurse_into_nonterminal = false;
13258
+ }
13259
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
13260
+ recurse_into_nonterminal = true;
13261
+ } else {
13262
+ recurse_into_nonterminal = false;
13263
+ }
13264
+ }
13265
+
13266
+ (*rules_in_progress)[rule_index] = false;
13267
+ (*rules_visited)[rule_index] = true;
13268
+ return false;
13269
+ }
13270
+
13109
13271
  //
13110
13272
  // grammar - external
13111
13273
  //
@@ -13125,6 +13287,19 @@ struct llama_grammar * llama_grammar_init(
13125
13287
  vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
13126
13288
  }
13127
13289
 
13290
+ // Check for left recursion
13291
+ std::vector<bool> rules_visited(n_rules);
13292
+ std::vector<bool> rules_in_progress(n_rules);
13293
+ std::vector<bool> rules_may_be_empty(n_rules);
13294
+ for (size_t i = 0; i < n_rules; i++) {
13295
+ if (rules_visited[i]) {
13296
+ continue;
13297
+ }
13298
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
13299
+ throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
13300
+ }
13301
+ }
13302
+
13128
13303
  // loop over alternates of start rule to build initial stacks
13129
13304
  std::vector<std::vector<const llama_grammar_element *>> stacks;
13130
13305
  pos = vec_rules[start_rule_index].data();
@@ -13147,6 +13322,9 @@ struct llama_grammar * llama_grammar_init(
13147
13322
  }
13148
13323
  } while (true);
13149
13324
 
13325
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
13326
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
13327
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
13150
13328
  return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
13151
13329
  }
13152
13330
 
@@ -13741,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
13741
13919
 
13742
13920
  // Sample the next word X using top-k sampling
13743
13921
  llama_sample_top_k(nullptr, candidates, int(k), 1);
13744
- if (ctx) {
13745
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13746
- }
13922
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13747
13923
  llama_token X = llama_sample_token(ctx, candidates);
13748
13924
  t_start_sample_us = ggml_time_us();
13749
13925
 
@@ -13757,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
13757
13933
  // Update mu using the learning rate and error
13758
13934
  *mu = *mu - eta * e;
13759
13935
 
13760
- if (ctx) {
13761
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13762
- }
13936
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
13763
13937
  return X;
13764
13938
  }
13765
13939
 
@@ -15246,6 +15420,7 @@ struct llama_model_params llama_model_default_params() {
15246
15420
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
15247
15421
  /*.main_gpu =*/ 0,
15248
15422
  /*.tensor_split =*/ nullptr,
15423
+ /*.rpc_servers =*/ nullptr,
15249
15424
  /*.progress_callback =*/ nullptr,
15250
15425
  /*.progress_callback_user_data =*/ nullptr,
15251
15426
  /*.kv_overrides =*/ nullptr,
@@ -15316,7 +15491,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
15316
15491
  }
15317
15492
 
15318
15493
  size_t llama_max_devices(void) {
15319
- #if defined(GGML_USE_METAL)
15494
+ #if defined(GGML_USE_RPC)
15495
+ return GGML_RPC_MAX_SERVERS;
15496
+ #elif defined(GGML_USE_METAL)
15320
15497
  return 1;
15321
15498
  #elif defined(GGML_USE_CUDA)
15322
15499
  return GGML_CUDA_MAX_DEVICES;
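Editor's note: llama_max_devices() is how callers size per-device arrays such as tensor_split, so with the RPC backend compiled in those arrays now need GGML_RPC_MAX_SERVERS entries. A small usage sketch, not taken from the diff; the "everything on device 0" policy is only for illustration.

    #include <vector>
    #include "llama.h"

    static llama_model_params make_params_with_split() {
        // size the split array from the API instead of hard-coding a device count
        static std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        tensor_split[0] = 1.0f; // hypothetical policy: place the whole model on the first device

        llama_model_params mparams = llama_model_default_params();
        mparams.tensor_split = tensor_split.data(); // must stay valid for the load call
        return mparams;
    }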
@@ -15339,7 +15516,7 @@ bool llama_supports_mlock(void) {
15339
15516
 
15340
15517
  bool llama_supports_gpu_offload(void) {
15341
15518
  #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
15342
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
15519
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
15343
15520
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
15344
15521
  return true;
15345
15522
  #else
@@ -15402,7 +15579,17 @@ struct llama_model * llama_load_model_from_file(
15402
15579
  return true;
15403
15580
  };
15404
15581
  }
15405
-
15582
+ if (params.rpc_servers != nullptr) {
15583
+ // split the comma-separated server list and store each entry in model->rpc_servers
15584
+ std::string servers(params.rpc_servers);
15585
+ size_t pos = 0;
15586
+ while ((pos = servers.find(",")) != std::string::npos) {
15587
+ std::string server = servers.substr(0, pos);
15588
+ model->rpc_servers.push_back(server);
15589
+ servers.erase(0, pos + 1);
15590
+ }
15591
+ model->rpc_servers.push_back(servers);
15592
+ }
15406
15593
  int status = llama_model_load(path_model, *model, params);
15407
15594
  GGML_ASSERT(status <= 0);
15408
15595
  if (status < 0) {
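Editor's note: the splitting above is driven entirely by the new rpc_servers field of llama_model_params. A hedged usage sketch follows (host addresses and model path are made up, not from the diff); each entry in the list becomes one RPC device.

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        // comma-separated "host:port" list; each entry is connected as an RPC backend
        mparams.rpc_servers = "192.168.0.10:50052,192.168.0.11:50052";

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) {
            std::fprintf(stderr, "failed to load model\n");
            return 1;
        }
        llama_free_model(model);
        return 0;
    }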
@@ -15441,6 +15628,11 @@ struct llama_context * llama_new_context_with_model(
15441
15628
  return nullptr;
15442
15629
  }
15443
15630
 
15631
+ if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
15632
+ LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15633
+ params.flash_attn = false;
15634
+ }
15635
+
15444
15636
  llama_context * ctx = new llama_context(*model);
15445
15637
 
15446
15638
  const auto & hparams = model->hparams;
@@ -15464,7 +15656,7 @@ struct llama_context * llama_new_context_with_model(
15464
15656
  cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
15465
15657
 
15466
15658
  // this is necessary due to kv_self.n being padded later during inference
15467
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
15659
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
15468
15660
 
15469
15661
  // with causal attention, the batch size is limited by the context size
15470
15662
  cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
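Editor's note: llama_kv_cache_get_padding is introduced elsewhere in this release; the diff only shows the call site. As a reading aid, here is an assumed sketch of what such a helper looks like (the exact constants are an assumption, not taken from this diff): the flash-attention path keeps the old 256-cell padding, while the non-FA path can use a smaller pad.

    // assumed shape of the helper referenced above -- not part of this diff
    static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
        // GGML_PAD(x, n) rounds x up to the next multiple of n
        return cparams.flash_attn ? 256u : 32u;
    }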
@@ -15509,16 +15701,6 @@ struct llama_context * llama_new_context_with_model(
15509
15701
  }
15510
15702
  }
15511
15703
 
15512
- if (cparams.flash_attn && hparams.use_alibi) {
15513
- LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
15514
- cparams.flash_attn = false;
15515
- }
15516
-
15517
- if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
15518
- LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
15519
- cparams.flash_attn = false;
15520
- }
15521
-
15522
15704
  if (params.seed == LLAMA_DEFAULT_SEED) {
15523
15705
  params.seed = time(NULL);
15524
15706
  }
@@ -15554,7 +15736,17 @@ struct llama_context * llama_new_context_with_model(
15554
15736
 
15555
15737
  if (!hparams.vocab_only) {
15556
15738
  // initialize backends
15557
- #ifdef GGML_USE_METAL
15739
+ #if defined(GGML_USE_RPC)
15740
+ for (auto & server : model->rpc_servers) {
15741
+ ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
15742
+ if (backend == nullptr) {
15743
+ LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
15744
+ llama_free(ctx);
15745
+ return nullptr;
15746
+ }
15747
+ ctx->backends.push_back(backend);
15748
+ }
15749
+ #elif defined(GGML_USE_METAL)
15558
15750
  if (model->n_gpu_layers > 0) {
15559
15751
  ctx->backend_metal = ggml_backend_metal_init();
15560
15752
  if (ctx->backend_metal == nullptr) {
@@ -15710,7 +15902,11 @@ struct llama_context * llama_new_context_with_model(
15710
15902
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
15711
15903
 
15712
15904
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
15713
- bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
15905
+ bool pipeline_parallel =
15906
+ llama_get_device_count(*model) > 1 &&
15907
+ model->n_gpu_layers > (int)model->hparams.n_layer &&
15908
+ model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
15909
+ params.offload_kqv;
15714
15910
  #ifndef GGML_USE_CUDA
15715
15911
  // pipeline parallelism requires support for async compute and events
15716
15912
  // currently this is only implemented in the CUDA backend
@@ -15808,6 +16004,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
15808
16004
  case LLM_ARCH_REFACT:
15809
16005
  case LLM_ARCH_BLOOM:
15810
16006
  case LLM_ARCH_MAMBA:
16007
+ case LLM_ARCH_JINA_BERT_V2:
15811
16008
  return LLAMA_ROPE_TYPE_NONE;
15812
16009
 
15813
16010
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -16829,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
16829
17026
  }
16830
17027
  else {
16831
17028
  if (cell_range_begin != kv_self.size) {
16832
- cell_ranges.push_back({ cell_range_begin, i });
17029
+ cell_ranges.emplace_back(cell_range_begin, i);
16833
17030
  cell_range_begin = kv_self.size;
16834
17031
  }
16835
17032
  }
16836
17033
  }
16837
17034
  if (cell_range_begin != kv_self.size) {
16838
- cell_ranges.push_back({ cell_range_begin, kv_self.size });
17035
+ cell_ranges.emplace_back(cell_range_begin, kv_self.size);
16839
17036
  }
16840
17037
 
16841
17038
  // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count