@fugood/llama.node 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/llama.cpp/CMakeLists.txt +14 -12
  24. package/src/llama.cpp/common/common.cpp +19 -5
  25. package/src/llama.cpp/common/common.h +2 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +9 -0
  27. package/src/llama.cpp/common/sampling.cpp +3 -3
  28. package/src/llama.cpp/common/sampling.h +1 -1
  29. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
  31. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
  32. package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
  33. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
  34. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
  36. package/src/llama.cpp/examples/main/main.cpp +5 -1
  37. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  38. package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
  39. package/src/llama.cpp/examples/server/server.cpp +12 -16
  40. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  41. package/src/llama.cpp/ggml-backend.c +2 -2
  42. package/src/llama.cpp/ggml-kompute.cpp +9 -3
  43. package/src/llama.cpp/ggml-quants.c +6 -0
  44. package/src/llama.cpp/ggml-rpc.cpp +1023 -0
  45. package/src/llama.cpp/ggml-rpc.h +24 -0
  46. package/src/llama.cpp/ggml-sycl.cpp +20 -143
  47. package/src/llama.cpp/ggml-vulkan.cpp +4 -2
  48. package/src/llama.cpp/ggml.c +116 -271
  49. package/src/llama.cpp/ggml.h +12 -15
  50. package/src/llama.cpp/llama.cpp +451 -265
  51. package/src/llama.cpp/llama.h +3 -0
  52. package/src/llama.cpp/requirements.txt +0 -1
  53. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
  55. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  56. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  57. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
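The headline change in the vendored llama.cpp (diffed below for package/src/llama.cpp/llama.cpp) is the new RPC backend: ggml-rpc.cpp and ggml-rpc.h are added, and llama_model_params gains an rpc_servers field that llama_load_model_from_file splits on commas into model->rpc_servers, one backend per endpoint. A minimal C++ sketch of how that field could be used against this version of the library; the endpoint addresses and model path are placeholders, and the field only has an effect when llama.cpp is built with RPC support (GGML_USE_RPC):

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        // comma-separated RPC endpoints (placeholder hosts); ignored unless built with GGML_USE_RPC
        mparams.rpc_servers = "192.168.0.10:50052,192.168.0.11:50052";
        mparams.n_gpu_layers = 99; // offload layers to the remote devices

        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (model == nullptr) {
            return 1;
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }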
package/src/llama.cpp/llama.cpp
@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

+#ifdef GGML_USE_RPC
+# include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -205,6 +209,7 @@ enum llm_arch {
 LLM_ARCH_REFACT,
 LLM_ARCH_BERT,
 LLM_ARCH_NOMIC_BERT,
+LLM_ARCH_JINA_BERT_V2,
 LLM_ARCH_BLOOM,
 LLM_ARCH_STABLELM,
 LLM_ARCH_QWEN,
@@ -228,39 +233,40 @@ enum llm_arch {
 };

 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-{ LLM_ARCH_LLAMA, "llama" },
-{ LLM_ARCH_FALCON, "falcon" },
-{ LLM_ARCH_GROK, "grok" },
-{ LLM_ARCH_GPT2, "gpt2" },
-{ LLM_ARCH_GPTJ, "gptj" },
-{ LLM_ARCH_GPTNEOX, "gptneox" },
-{ LLM_ARCH_MPT, "mpt" },
-{ LLM_ARCH_BAICHUAN, "baichuan" },
-{ LLM_ARCH_STARCODER, "starcoder" },
-{ LLM_ARCH_PERSIMMON, "persimmon" },
-{ LLM_ARCH_REFACT, "refact" },
-{ LLM_ARCH_BERT, "bert" },
-{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
-{ LLM_ARCH_BLOOM, "bloom" },
-{ LLM_ARCH_STABLELM, "stablelm" },
-{ LLM_ARCH_QWEN, "qwen" },
-{ LLM_ARCH_QWEN2, "qwen2" },
-{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
-{ LLM_ARCH_PHI2, "phi2" },
-{ LLM_ARCH_PHI3, "phi3" },
-{ LLM_ARCH_PLAMO, "plamo" },
-{ LLM_ARCH_CODESHELL, "codeshell" },
-{ LLM_ARCH_ORION, "orion" },
-{ LLM_ARCH_INTERNLM2, "internlm2" },
-{ LLM_ARCH_MINICPM, "minicpm" },
-{ LLM_ARCH_GEMMA, "gemma" },
-{ LLM_ARCH_STARCODER2, "starcoder2" },
-{ LLM_ARCH_MAMBA, "mamba" },
-{ LLM_ARCH_XVERSE, "xverse" },
-{ LLM_ARCH_COMMAND_R, "command-r" },
-{ LLM_ARCH_DBRX, "dbrx" },
-{ LLM_ARCH_OLMO, "olmo" },
-{ LLM_ARCH_UNKNOWN, "(unknown)" },
+{ LLM_ARCH_LLAMA, "llama" },
+{ LLM_ARCH_FALCON, "falcon" },
+{ LLM_ARCH_GROK, "grok" },
+{ LLM_ARCH_GPT2, "gpt2" },
+{ LLM_ARCH_GPTJ, "gptj" },
+{ LLM_ARCH_GPTNEOX, "gptneox" },
+{ LLM_ARCH_MPT, "mpt" },
+{ LLM_ARCH_BAICHUAN, "baichuan" },
+{ LLM_ARCH_STARCODER, "starcoder" },
+{ LLM_ARCH_PERSIMMON, "persimmon" },
+{ LLM_ARCH_REFACT, "refact" },
+{ LLM_ARCH_BERT, "bert" },
+{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+{ LLM_ARCH_BLOOM, "bloom" },
+{ LLM_ARCH_STABLELM, "stablelm" },
+{ LLM_ARCH_QWEN, "qwen" },
+{ LLM_ARCH_QWEN2, "qwen2" },
+{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
+{ LLM_ARCH_PHI2, "phi2" },
+{ LLM_ARCH_PHI3, "phi3" },
+{ LLM_ARCH_PLAMO, "plamo" },
+{ LLM_ARCH_CODESHELL, "codeshell" },
+{ LLM_ARCH_ORION, "orion" },
+{ LLM_ARCH_INTERNLM2, "internlm2" },
+{ LLM_ARCH_MINICPM, "minicpm" },
+{ LLM_ARCH_GEMMA, "gemma" },
+{ LLM_ARCH_STARCODER2, "starcoder2" },
+{ LLM_ARCH_MAMBA, "mamba" },
+{ LLM_ARCH_XVERSE, "xverse" },
+{ LLM_ARCH_COMMAND_R, "command-r" },
+{ LLM_ARCH_DBRX, "dbrx" },
+{ LLM_ARCH_OLMO, "olmo" },
+{ LLM_ARCH_UNKNOWN, "(unknown)" },
 };

 enum llm_kv {
@@ -691,6 +697,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
 },
 },
+{
+LLM_ARCH_JINA_BERT_V2,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
+{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
 {
 LLM_ARCH_BLOOM,
 {
@@ -1664,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
 GGML_UNUSED(host_buffer);
 }

-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-buft = ggml_backend_kompute_buffer_type(gpu);
-if (buft == nullptr) {
-LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-}
-#endif
-
-if (buft == nullptr) {
-buft = llama_default_buffer_type_cpu(true);
-}
-return buft;
-
-GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-if (ggml_backend_cuda_get_device_count() > 1) {
-buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-}
-#endif
-
-#ifdef GGML_USE_SYCL
-if (ggml_backend_sycl_get_device_count() > 1) {
-buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-}
-#endif
-
-if (buft == nullptr) {
-buft = llama_default_buffer_type_offload(fallback_gpu);
-}
-return buft;
-
-GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-return ggml_backend_vk_get_device_count();
-#else
-return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-size_t total;
-size_t free;
-ggml_backend_cuda_get_device_memory(device, &free, &total);
-return free;
-#elif defined(GGML_USE_SYCL)
-size_t total;
-size_t free;
-ggml_backend_sycl_get_device_memory(device, &free, &total);
-return free;
-#elif defined(GGML_USE_VULKAN)
-size_t total;
-size_t free;
-ggml_backend_vk_get_device_memory(device, &free, &total);
-return free;
-#else
-return 1;
-GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -1845,7 +1785,7 @@ struct llama_hparams {
 float f_logit_scale = 0.0f;

 bool causal_attn = true;
-bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
+bool use_alibi = false;

 enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
 enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -2189,6 +2129,8 @@ struct llama_model {
 int main_gpu;
 int n_gpu_layers;

+std::vector<std::string> rpc_servers;
+
 // gguf metadata
 std::unordered_map<std::string, std::string> gguf_kv;

@@ -2317,7 +2259,6 @@ struct llama_context {
 struct ggml_tensor * inp_pos; // I32 [n_batch]
 struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
 struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
 struct ggml_tensor * inp_K_shift; // I32 [kv_size]
 struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
 struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2333,6 +2274,104 @@ struct llama_context {
 #endif
 };

+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+std::string endpoint = model.rpc_servers[gpu];
+buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
+buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+buft = ggml_backend_kompute_buffer_type(gpu);
+if (buft == nullptr) {
+LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+}
+#endif
+
+if (buft == nullptr) {
+buft = llama_default_buffer_type_cpu(true);
+}
+return buft;
+GGML_UNUSED(model);
+GGML_UNUSED(gpu);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+if (ggml_backend_cuda_get_device_count() > 1) {
+buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+}
+#endif
+
+#ifdef GGML_USE_SYCL
+if (ggml_backend_sycl_get_device_count() > 1) {
+buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+}
+#endif
+
+if (buft == nullptr) {
+buft = llama_default_buffer_type_offload(model, fallback_gpu);
+}
+return buft;
+
+GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+return ggml_backend_vk_get_device_count();
+#else
+return 1;
+#endif
+GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+size_t total;
+size_t free;
+std::string endpoint = model.rpc_servers[device];
+ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+return free;
+#elif defined(GGML_USE_CUDA)
+size_t total;
+size_t free;
+ggml_backend_cuda_get_device_memory(device, &free, &total);
+return free;
+#elif defined(GGML_USE_SYCL)
+size_t total;
+size_t free;
+ggml_backend_sycl_get_device_memory(device, &free, &total);
+return free;
+#elif defined(GGML_USE_VULKAN)
+size_t total;
+size_t free;
+ggml_backend_vk_get_device_memory(device, &free, &total);
+return free;
+#else
+return 1;
+#endif
+GGML_UNUSED(model);
+GGML_UNUSED(device);
+}
+
 //
 // kv cache helpers
 //
@@ -2785,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
 cache.do_defrag = true;
 }

+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+// the FA kernels require padding to avoid extra runtime boundary checks
+return cparams.flash_attn ? 256u : 32u;
+}
+
 //
 // model loading and saving
 //
@@ -3779,6 +3823,12 @@ static void llm_load_hparams(

 // get hparams kv
 ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+// everything past this point is not vocab-related
+if (hparams.vocab_only) {
+return;
+}
+
 ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
 ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
 ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3860,7 +3910,7 @@ static void llm_load_hparams(
 switch (hparams.n_layer) {
 case 22: model.type = e_model::MODEL_1B; break;
 case 26: model.type = e_model::MODEL_3B; break;
-case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
+case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
 case 40: model.type = e_model::MODEL_13B; break;
 case 48: model.type = e_model::MODEL_34B; break;
 case 60: model.type = e_model::MODEL_30B; break;
@@ -3962,6 +4012,19 @@ static void llm_load_hparams(
 model.type = e_model::MODEL_335M; break; // bge-large
 }
 } break;
+case LLM_ARCH_JINA_BERT_V2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+hparams.f_max_alibi_bias = 8.0f;
+
+switch (hparams.n_layer) {
+case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+}
+} break;
 case LLM_ARCH_NOMIC_BERT:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4383,7 +4446,11 @@ static void llm_load_vocab(
 tokenizer_pre == "starcoder") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
 } else if (
-tokenizer_pre == "gpt-2") {
+tokenizer_pre == "gpt-2" ||
+tokenizer_pre == "jina-es" ||
+tokenizer_pre == "jina-de" ||
+tokenizer_pre == "jina-v2-es" ||
+tokenizer_pre == "jina-v2-de") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
 } else if (
 tokenizer_pre == "refact") {
@@ -4743,13 +4810,13 @@ static bool llm_load_tensors(

 if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
 // calculate the split points
-int device_count = llama_get_device_count();
+int device_count = llama_get_device_count(model);
 bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
 std::vector<float> splits(device_count);
 if (all_zero) {
 // default split, by free memory
 for (int i = 0; i < device_count; ++i) {
-splits[i] = llama_get_device_memory(i);
+splits[i] = llama_get_device_memory(model, i);
 }
 } else {
 std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4836,35 @@ static bool llm_load_tensors(
 int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
 for (int64_t i = i_gpu_start; i < n_layer; ++i) {
 int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
 }
 // assign the output layer
 if (n_gpu_layers > n_layer) {
 int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
 } else {
 model.buft_output = llama_default_buffer_type_cpu(true);
 }
 } else {
 ggml_backend_buffer_type_t split_buft;
 if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
 } else {
 // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-split_buft = llama_default_buffer_type_offload(main_gpu);
+split_buft = llama_default_buffer_type_offload(model, main_gpu);
 }
 // assign the repeating layers
 for (int64_t i = i_gpu_start; i < n_layer; ++i) {
 model.buft_layer[i] = {
 split_buft,
-llama_default_buffer_type_offload(main_gpu)
+llama_default_buffer_type_offload(model, main_gpu)
 };
 }
 // assign the output layer
 if (n_gpu_layers > n_layer) {
 model.buft_output = {
 split_buft,
-llama_default_buffer_type_offload(main_gpu)
+llama_default_buffer_type_offload(model, main_gpu)
 };
 } else {
 model.buft_output = llama_default_buffer_type_cpu(true);
@@ -5242,6 +5309,50 @@ static bool llm_load_tensors(
 layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
 }
 } break;
+case LLM_ARCH_JINA_BERT_V2:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
+model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
+model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
+model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i]; // JinaBertLayer
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
+layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
+
+layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
+layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
+
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
+}
+} break;
 case LLM_ARCH_BLOOM:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6318,7 +6429,7 @@ static struct ggml_tensor * llm_build_ffn(
 llm_ffn_gate_type type_gate,
 const llm_build_cb & cb,
 int il) {
-struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
 cb(tmp, "ffn_up", il);

 if (up_b) {
@@ -6500,7 +6611,6 @@ static struct ggml_tensor * llm_build_kqv(
 struct ggml_tensor * wo_b,
 struct ggml_tensor * q_cur,
 struct ggml_tensor * kq_mask,
-struct ggml_tensor * kq_pos,
 int32_t n_tokens,
 int32_t n_kv,
 float kq_scale,
@@ -6530,10 +6640,6 @@ static struct ggml_tensor * llm_build_kqv(
 GGML_UNUSED(model);
 GGML_UNUSED(n_ctx);

-// note: if this assert triggers, then some check has failed earlier
-// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
-GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
-
 // split cached v into n_head heads (not transposed)
 struct ggml_tensor * v =
 ggml_view_3d(ctx, kv.v_l[il],
@@ -6543,7 +6649,7 @@
 0);
 cb(v, "v", il);

-cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);

 if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
 ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
@@ -6574,28 +6680,8 @@
 kq = ggml_scale(ctx, kq, 30);
 }

-#if defined(GGML_USE_KOMPUTE)
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
-#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
-if (hparams.use_alibi) {
-kq = ggml_scale(ctx, kq, kq_scale);
-cb(kq, "kq_scaled", il);
-
-kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-cb(kq, "kq_scaled_alibi", il);
-
-kq = ggml_add(ctx, kq, kq_mask);
-cb(kq, "kq_masked", il);
-
-kq = ggml_soft_max(ctx, kq);
-cb(kq, "kq_soft_max", il);
-} else
-#endif
-{
-kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
-cb(kq, "kq_soft_max_ext", il);
-}
+kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+cb(kq, "kq_soft_max_ext", il);

 GGML_ASSERT(kv.size == n_ctx);

@@ -6645,7 +6731,6 @@ static struct ggml_tensor * llm_build_kv(
 struct ggml_tensor * v_cur,
 struct ggml_tensor * q_cur,
 struct ggml_tensor * kq_mask,
-struct ggml_tensor * kq_pos,
 int32_t n_tokens,
 int32_t kv_head,
 int32_t n_kv,
@@ -6664,7 +6749,7 @@
 struct ggml_tensor * cur;

 cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
-q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
+q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
 cb(cur, "kqv_out", il);

 return cur;
@@ -6771,18 +6856,17 @@ struct llm_build_context {

 ctx0 = ggml_init(params);

-lctx.inp_tokens = nullptr;
-lctx.inp_embd = nullptr;
-lctx.inp_pos = nullptr;
+lctx.inp_tokens = nullptr;
+lctx.inp_embd = nullptr;
+lctx.inp_pos = nullptr;
 lctx.inp_out_ids = nullptr;
 lctx.inp_KQ_mask = nullptr;
-lctx.inp_KQ_pos = nullptr;
 lctx.inp_K_shift = nullptr;
-lctx.inp_mean = nullptr;
-lctx.inp_cls = nullptr;
-lctx.inp_s_copy = nullptr;
-lctx.inp_s_mask = nullptr;
-lctx.inp_s_seq = nullptr;
+lctx.inp_mean = nullptr;
+lctx.inp_cls = nullptr;
+lctx.inp_s_copy = nullptr;
+lctx.inp_s_mask = nullptr;
+lctx.inp_s_seq = nullptr;
 }

 void free() {
@@ -6932,19 +7016,6 @@ struct llm_build_context {
 return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
 }

-struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
-if (causal) {
-lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
-} else {
-// TODO: this will be needed for ALiBi-based BERT models
-// https://github.com/ggerganov/llama.cpp/pull/6826
-lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
-}
-cb(lctx.inp_KQ_pos, "KQ_pos", -1);
-ggml_set_input(lctx.inp_KQ_pos);
-return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
-}
-
 struct ggml_tensor * build_inp_mean() {
 lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
 cb(lctx.inp_mean, "inp_mean", -1);
@@ -7050,7 +7121,7 @@ struct llm_build_context {

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7143,9 +7214,6 @@ struct llm_build_context {
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-// positions of the tokens in the KV cache
-struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
 for (int il = 0; il < n_layer; ++il) {
 struct ggml_tensor * inpSA = inpL;

@@ -7190,7 +7258,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7260,9 +7328,6 @@
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-// positions of the tokens in the KV cache
-struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
 for (int il = 0; il < n_layer; ++il) {
 struct ggml_tensor * inpSA = inpL;

@@ -7297,7 +7362,7 @@
 cb(Kcur, "Kcur", il);
 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7417,7 +7482,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7542,7 +7607,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }

 if (il == n_layer - 1) {
@@ -7694,7 +7759,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7806,7 +7871,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -8010,7 +8075,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -8076,9 +8141,6 @@
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-// positions of the tokens in the KV cache
-struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
 for (int il = 0; il < n_layer; ++il) {
 struct ggml_tensor * inpSA = inpL;

@@ -8106,7 +8168,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -8168,8 +8230,11 @@ struct llm_build_context {

 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;
+struct ggml_tensor * inp_pos = nullptr;

-struct ggml_tensor * inp_pos = build_inp_pos();
+if (model.arch != LLM_ARCH_JINA_BERT_V2) {
+inp_pos = build_inp_pos();
+}
 struct ggml_tensor * inp_mean = build_inp_mean();
 struct ggml_tensor * inp_cls = build_inp_cls();

@@ -8200,13 +8265,26 @@
 struct ggml_tensor * Vcur;

 // self-attention
-if (model.arch == LLM_ARCH_BERT) {
+if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
 Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
 cb(Qcur, "Qcur", il);

+if (model.layers[il].attn_q_norm) {
+Qcur = llm_build_norm(ctx0, Qcur, hparams,
+model.layers[il].attn_q_norm,
+model.layers[il].attn_q_norm_b,
+LLM_NORM, cb, il);
+}
+
 Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
 cb(Kcur, "Kcur", il);

+if (model.layers[il].attn_k_norm) {
+Kcur = llm_build_norm(ctx0, Kcur, hparams,
+model.layers[il].attn_k_norm,
+model.layers[il].attn_k_norm_b,
+LLM_NORM, cb, il);
+}
 Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
 cb(Vcur, "Vcur", il);

@@ -8246,7 +8324,7 @@
 struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
 cb(kq, "kq", il);

-kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
+kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
 cb(kq, "kq_soft_max_ext", il);

 struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -8297,6 +8375,13 @@
 model.layers[il].ffn_down, model.layers[il].ffn_down_b,
 NULL,
 LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+cur = llm_build_ffn(ctx0, cur,
+model.layers[il].ffn_up, NULL,
+model.layers[il].ffn_gate, NULL,
+model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+NULL,
+LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
 } else {
 cur = llm_build_ffn(ctx0, cur,
 model.layers[il].ffn_up, NULL,
@@ -8363,9 +8448,6 @@
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-// positions of the tokens in the KV cache
-struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
 inpL = llm_build_norm(ctx0, inpL, hparams,
 model.tok_norm,
 model.tok_norm_b,
@@ -8399,7 +8481,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -8464,9 +8546,6 @@
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-// positions of the tokens in the KV cache
-struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
 if (model.pos_embd) {
 // inp_pos - contains the positions
 struct ggml_tensor * inp_pos = build_inp_pos();
@@ -8530,13 +8609,13 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 } else {
 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 }

@@ -8680,7 +8759,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -8798,7 +8877,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -8911,7 +8990,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -9025,7 +9104,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -9180,7 +9259,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }

 if (il == n_layer - 1) {
@@ -9297,7 +9376,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }

 if (il == n_layer - 1) {
@@ -9410,7 +9489,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }
 struct ggml_tensor * sa_out = cur;

@@ -9513,7 +9592,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -9620,7 +9699,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -9736,7 +9815,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -9853,7 +9932,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -9983,7 +10062,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -10104,7 +10183,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }

 if (il == n_layer - 1) {
@@ -10223,7 +10302,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -10513,7 +10592,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -10644,7 +10723,7 @@

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, nullptr,
-Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -10825,6 +10904,7 @@ static struct ggml_cgraph * llama_build_graph(
 result = llm.build_refact();
 } break;
 case LLM_ARCH_BERT:
+case LLM_ARCH_JINA_BERT_V2:
 case LLM_ARCH_NOMIC_BERT:
 {
 result = llm.build_bert();
@@ -11032,11 +11112,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
 f = -INFINITY;
 } else {
-f = 0.0f;
+if (hparams.use_alibi) {
+f = -fabs(lctx.kv_self.cells[i].pos - pos);
+} else {
+f = 0.0f;
+}
 }
 data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
 }
 }
+
+for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+for (int j = 0; j < n_kv; ++j) {
+data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+}
+}
 }
 } else {
 // when using kv cache, the mask needs to match the kv cache size
@@ -11055,7 +11145,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 float f = -INFINITY;
 for (int s = 0; s < batch.n_seq_id[i]; ++s) {
 if (batch.seq_id[i][s] == seq_id) {
-f = 0.0f;
+if (hparams.use_alibi) {
+f = -fabs(batch.pos[i] - batch.pos[j]);
+} else {
+f = 0.0f;
+}
 break;
 }
 }
@@ -11071,21 +11165,6 @@
 }
 }

-// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
-// this allows to process multiple sequences in parallel with ALiBi-based models
-if (hparams.use_alibi) {
-const int64_t n_kv = kv_self.n;
-
-GGML_ASSERT(lctx.inp_KQ_pos);
-GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
-
-float * data = (float *) lctx.inp_KQ_pos->data;
-
-for (int i = 0; i < n_kv; ++i) {
-data[i] = float(lctx.kv_self.cells[i].pos);
-}
-}
-
 if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
 const int64_t n_tokens = batch.n_tokens;

@@ -11455,7 +11534,8 @@ static int llama_decode_internal(
 // a heuristic, to avoid attending the full cache if it is not yet utilized
 // after enough generations, the benefit from this heuristic disappears
 // if we start defragmenting the cache, the benefit from this will be more important
-kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
+const uint32_t pad = llama_kv_cache_get_padding(cparams);
+kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
 //kv_self.n = llama_kv_cache_cell_max(kv_self);
 }
 }
@@ -12200,13 +12280,14 @@ struct llm_tokenizer_bpe {

 void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
 int final_prev_index = -1;
+bool ignore_merges = false;

 std::vector<std::string> word_collection;
 switch (vocab.type) {
 case LLAMA_VOCAB_TYPE_BPE:
 switch (vocab.type_pre) {
 case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
-case LLAMA_VOCAB_PRE_TYPE_DBRX:
+ignore_merges = true;
 word_collection = unicode_regex_split(text, {
 // original regex from tokenizer.json
 //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12215,6 +12296,12 @@ struct llm_tokenizer_bpe {
 "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
 });
 break;
+case LLAMA_VOCAB_PRE_TYPE_DBRX:
+word_collection = unicode_regex_split(text, {
+// same as llama3
+"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+});
+break;
 case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
 word_collection = unicode_regex_split(text, {
 "[\r\n]",
@@ -12298,6 +12385,11 @@ struct llm_tokenizer_bpe {
 int index = 0;
 size_t offset = 0;

+if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+offset = word.size();
+}
+
 while (offset < word.size()) {
 llm_symbol sym;
 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
@@ -12752,7 +12844,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 }
 }

-GGML_ASSERT(vocab.special_add_eos != 1);
+if (add_special && vocab.special_add_eos == 1) {
+GGML_ASSERT(vocab.special_add_eos != -1);
+output.push_back(vocab.special_eos_id);
+}
 } break;
 case LLAMA_VOCAB_TYPE_WPM:
 {
@@ -13106,6 +13201,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
 return rejects;
 }

+static bool llama_grammar_detect_left_recursion(
+const std::vector<std::vector<llama_grammar_element>> & rules,
+size_t rule_index,
+std::vector<bool> * rules_visited,
+std::vector<bool> * rules_in_progress,
+std::vector<bool> * rules_may_be_empty) {
+if ((*rules_in_progress)[rule_index]) {
+return true;
+}
+
+(*rules_in_progress)[rule_index] = true;
+
+const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+// First check if the rule might produce the empty string. This could be done combined with the second
+// step but it's more readable as two steps.
+bool at_rule_start = true;
+for (size_t i = 0; i < rule.size(); i++) {
+if (llama_grammar_is_end_of_sequence(&rule[i])) {
+if (at_rule_start) {
+(*rules_may_be_empty)[rule_index] = true;
+break;
+}
+at_rule_start = true;
+} else {
+at_rule_start = false;
+}
+}
+
+// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+// be empty)
+bool recurse_into_nonterminal = true;
+for (size_t i = 0; i < rule.size(); i++) {
+if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+return true;
+}
+if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+recurse_into_nonterminal = false;
+}
+} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+recurse_into_nonterminal = true;
+} else {
+recurse_into_nonterminal = false;
+}
+}
+
+(*rules_in_progress)[rule_index] = false;
+(*rules_visited)[rule_index] = true;
+return false;
+}
+
 //
 // grammar - external
 //
@@ -13125,6 +13272,19 @@ struct llama_grammar * llama_grammar_init(
 vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
 }

+// Check for left recursion
+std::vector<bool> rules_visited(n_rules);
+std::vector<bool> rules_in_progress(n_rules);
+std::vector<bool> rules_may_be_empty(n_rules);
+for (size_t i = 0; i < n_rules; i++) {
+if (rules_visited[i]) {
+continue;
+}
+if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+}
+}
+
 // loop over alternates of start rule to build initial stacks
 std::vector<std::vector<const llama_grammar_element *>> stacks;
 pos = vec_rules[start_rule_index].data();
@@ -13147,6 +13307,9 @@
 }
 } while (true);

+// Important: vec_rules has to be moved here, not copied, because stacks contains
+// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+// then the pointers would be invalidated when the local vec_rules goes out of scope.
 return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }

@@ -15246,6 +15409,7 @@ struct llama_model_params llama_model_default_params() {
 /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
 /*.main_gpu =*/ 0,
 /*.tensor_split =*/ nullptr,
+/*.rpc_servers =*/ nullptr,
 /*.progress_callback =*/ nullptr,
 /*.progress_callback_user_data =*/ nullptr,
 /*.kv_overrides =*/ nullptr,
@@ -15316,7 +15480,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }

 size_t llama_max_devices(void) {
-#if defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+return GGML_RPC_MAX_SERVERS;
+#elif defined(GGML_USE_METAL)
 return 1;
 #elif defined(GGML_USE_CUDA)
 return GGML_CUDA_MAX_DEVICES;
@@ -15339,7 +15505,7 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 return true;
 #else
@@ -15402,7 +15568,17 @@ struct llama_model * llama_load_model_from_file(
 return true;
 };
 }
-
+if (params.rpc_servers != nullptr) {
+// split the servers set them into model->rpc_servers
+std::string servers(params.rpc_servers);
+size_t pos = 0;
+while ((pos = servers.find(",")) != std::string::npos) {
+std::string server = servers.substr(0, pos);
+model->rpc_servers.push_back(server);
+servers.erase(0, pos + 1);
+}
+model->rpc_servers.push_back(servers);
+}
 int status = llama_model_load(path_model, *model, params);
 GGML_ASSERT(status <= 0);
 if (status < 0) {
@@ -15441,6 +15617,11 @@ struct llama_context * llama_new_context_with_model(
 return nullptr;
 }

+if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+params.flash_attn = false;
+}
+
 llama_context * ctx = new llama_context(*model);

 const auto & hparams = model->hparams;
@@ -15464,7 +15645,7 @@ struct llama_context * llama_new_context_with_model(
 cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

 // this is necessary due to kv_self.n being padded later during inference
-cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));

 // with causal attention, the batch size is limited by the context size
 cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15509,16 +15690,6 @@ struct llama_context * llama_new_context_with_model(
 }
 }

-if (cparams.flash_attn && hparams.use_alibi) {
-LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-cparams.flash_attn = false;
-}
-
-if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-cparams.flash_attn = false;
-}
-
 if (params.seed == LLAMA_DEFAULT_SEED) {
 params.seed = time(NULL);
 }
@@ -15554,7 +15725,17 @@ struct llama_context * llama_new_context_with_model(

 if (!hparams.vocab_only) {
 // initialize backends
-#ifdef GGML_USE_METAL
+#if defined(GGML_USE_RPC)
+for (auto & server : model->rpc_servers) {
+ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+if (backend == nullptr) {
+LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+llama_free(ctx);
+return nullptr;
+}
+ctx->backends.push_back(backend);
+}
+#elif defined(GGML_USE_METAL)
 if (model->n_gpu_layers > 0) {
 ctx->backend_metal = ggml_backend_metal_init();
 if (ctx->backend_metal == nullptr) {
@@ -15710,7 +15891,11 @@ struct llama_context * llama_new_context_with_model(
 ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));

 // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
+bool pipeline_parallel =
+llama_get_device_count(*model) > 1 &&
+model->n_gpu_layers > (int)model->hparams.n_layer &&
+model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+params.offload_kqv;
 #ifndef GGML_USE_CUDA
 // pipeline parallelism requires support for async compute and events
 // currently this is only implemented in the CUDA backend
@@ -15808,6 +15993,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 case LLM_ARCH_REFACT:
 case LLM_ARCH_BLOOM:
 case LLM_ARCH_MAMBA:
+case LLM_ARCH_JINA_BERT_V2:
 return LLAMA_ROPE_TYPE_NONE;

 // use what we call a normal RoPE, operating on pairs of consecutive head values
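A side effect of the new llama_kv_cache_get_padding helper above (and of cparams.n_ctx now being padded with it instead of a fixed 256) is that the effective context size depends on whether flash attention is enabled. A small, self-contained sketch of the arithmetic, reusing the GGML_PAD rounding from ggml.h; the sample context size is arbitrary:

    #include <cstdint>
    #include <cstdio>

    // same rounding macro ggml.h defines: round x up to a multiple of n
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    // mirrors llama_kv_cache_get_padding() from the diff:
    // the flash-attention kernels need 256-element padding, the regular path needs 32
    static uint32_t kv_cache_padding(bool flash_attn) {
        return flash_attn ? 256u : 32u;
    }

    int main() {
        const uint32_t n_ctx = 1030; // arbitrary requested context size
        printf("flash_attn=1: n_ctx padded to %u\n", GGML_PAD(n_ctx, kv_cache_padding(true)));  // 1280
        printf("flash_attn=0: n_ctx padded to %u\n", GGML_PAD(n_ctx, kv_cache_padding(false))); // 1056
        return 0;
    }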