cui-llama.rn 1.2.6 → 1.3.3

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +26 -6
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +228 -40
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/amx/amx.cpp +196 -0
  9. package/cpp/amx/amx.h +20 -0
  10. package/cpp/amx/common.h +101 -0
  11. package/cpp/amx/mmq.cpp +2524 -0
  12. package/cpp/amx/mmq.h +16 -0
  13. package/cpp/common.cpp +118 -251
  14. package/cpp/common.h +53 -30
  15. package/cpp/ggml-aarch64.c +46 -3395
  16. package/cpp/ggml-aarch64.h +0 -20
  17. package/cpp/ggml-alloc.c +6 -8
  18. package/cpp/ggml-backend-impl.h +33 -11
  19. package/cpp/ggml-backend-reg.cpp +423 -0
  20. package/cpp/ggml-backend.cpp +14 -676
  21. package/cpp/ggml-backend.h +46 -9
  22. package/cpp/ggml-common.h +6 -0
  23. package/cpp/ggml-cpu-aarch64.c +3823 -0
  24. package/cpp/ggml-cpu-aarch64.h +32 -0
  25. package/cpp/ggml-cpu-impl.h +14 -242
  26. package/cpp/ggml-cpu-quants.c +10835 -0
  27. package/cpp/ggml-cpu-quants.h +63 -0
  28. package/cpp/ggml-cpu.c +13971 -13720
  29. package/cpp/ggml-cpu.cpp +715 -0
  30. package/cpp/ggml-cpu.h +65 -63
  31. package/cpp/ggml-impl.h +285 -25
  32. package/cpp/ggml-metal.h +8 -8
  33. package/cpp/ggml-metal.m +1221 -728
  34. package/cpp/ggml-quants.c +189 -10681
  35. package/cpp/ggml-quants.h +78 -125
  36. package/cpp/ggml-threading.cpp +12 -0
  37. package/cpp/ggml-threading.h +12 -0
  38. package/cpp/ggml.c +688 -1460
  39. package/cpp/ggml.h +58 -244
  40. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  41. package/cpp/json.hpp +24766 -24766
  42. package/cpp/llama-sampling.cpp +5 -2
  43. package/cpp/llama.cpp +409 -123
  44. package/cpp/llama.h +8 -4
  45. package/cpp/rn-llama.hpp +89 -25
  46. package/cpp/sampling.cpp +42 -3
  47. package/cpp/sampling.h +22 -1
  48. package/cpp/sgemm.cpp +608 -0
  49. package/cpp/speculative.cpp +270 -0
  50. package/cpp/speculative.h +28 -0
  51. package/cpp/unicode.cpp +11 -0
  52. package/ios/RNLlama.mm +43 -20
  53. package/ios/RNLlamaContext.h +9 -3
  54. package/ios/RNLlamaContext.mm +146 -33
  55. package/jest/mock.js +0 -1
  56. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  57. package/lib/commonjs/grammar.js +4 -2
  58. package/lib/commonjs/grammar.js.map +1 -1
  59. package/lib/commonjs/index.js +52 -15
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/module/NativeRNLlama.js.map +1 -1
  62. package/lib/module/grammar.js +2 -1
  63. package/lib/module/grammar.js.map +1 -1
  64. package/lib/module/index.js +51 -15
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/typescript/NativeRNLlama.d.ts +122 -8
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  68. package/lib/typescript/grammar.d.ts +5 -6
  69. package/lib/typescript/grammar.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +15 -6
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +2 -1
  73. package/src/NativeRNLlama.ts +135 -13
  74. package/src/grammar.ts +10 -8
  75. package/src/index.ts +104 -28
package/cpp/llama.cpp CHANGED
@@ -190,6 +190,7 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_OLMO2,
  LLM_ARCH_OLMOE,
  LLM_ARCH_OPENELM,
  LLM_ARCH_ARCTIC,
@@ -243,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_OLMO2, "olmo2" },
  { LLM_ARCH_OLMOE, "olmoe" },
  { LLM_ARCH_OPENELM, "openelm" },
  { LLM_ARCH_ARCTIC, "arctic" },
@@ -1218,6 +1220,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_OLMO2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_OLMOE,
  {
@@ -2312,6 +2333,7 @@ enum e_model {
  MODEL_1B,
  MODEL_1_3B,
  MODEL_1_4B,
+ MODEL_1_5B,
  MODEL_1_6B,
  MODEL_2B,
  MODEL_2_8B,
@@ -2330,6 +2352,7 @@ enum e_model {
  MODEL_16B,
  MODEL_20B,
  MODEL_30B,
+ MODEL_32B,
  MODEL_34B,
  MODEL_35B,
  MODEL_40B,
@@ -2917,9 +2940,15 @@ struct llama_model {
  // for quantize-stats only
  std::vector<std::pair<std::string, struct lm_ggml_tensor *>> tensors_by_name;

- int64_t t_load_us = 0;
+ int64_t t_load_us = 0;
  int64_t t_start_us = 0;

+ // total number of parameters in the model
+ uint64_t n_elements = 0;
+
+ // total size of all the tensors in the model in bytes
+ size_t n_bytes = 0;
+
  // keep track of loaded lora adapters
  std::set<struct llama_lora_adapter *> lora_adapters;

@@ -3464,21 +3493,13 @@ static bool llama_kv_cache_init(
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

- const llama_model::buft_list_t * buft_list;
+ lm_ggml_backend_buffer_type_t buft;
  if (offload) {
- buft_list = model.dev_layer.at(i).buft_list;
+ auto * dev = model.dev_layer.at(i).dev;
+ buft = lm_ggml_backend_dev_buffer_type(dev);
  } else {
- buft_list = &model.cpu_buft_list;
+ buft = lm_ggml_backend_cpu_buffer_type();
  }
- lm_ggml_backend_buffer_type_t buft = select_buft(*buft_list,
- [&](lm_ggml_context * ctx) {
- lm_ggml_tensor * k = lm_ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
- return k;
- }
- lm_ggml_tensor * p = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, 1);
- return lm_ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
- });
  lm_ggml_context * ctx = ctx_for_buft(buft);

  if (!ctx) {
@@ -3512,11 +3533,24 @@ static bool llama_kv_cache_init(
  return true;
  }

+ // a structure holds information about the slot found in llama_kv_cache_find_slot
+ struct llama_kv_cache_slot_info {
+ std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+ bool found = false; // the slot was found
+
+ explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
+ llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
+
+ operator bool() const { return found; }
+ };
+ static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
+
  // find an empty slot of size "n_tokens" in the cache
  // updates the cache head
+ // returns a structure holding information about the slot found
  // Note: On success, it's important that cache.head points
  // to the first cell of the slot.
- static bool llama_kv_cache_find_slot(
+ static struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_ubatch & batch) {
  const uint32_t n_tokens = batch.n_tokens;
@@ -3544,7 +3578,7 @@ static bool llama_kv_cache_find_slot(
  // too big seq_id
  // TODO: would it be possible to resize the cache instead?
  LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
- return false;
+ return llama_kv_cache_slot_info_failed;
  }
  if (j > 0) {
  llama_kv_cell & seq = cache.cells[seq_id];
@@ -3679,15 +3713,17 @@ static bool llama_kv_cache_find_slot(
  // allow getting the range of used cells, from head to head + n
  cache.head = min;
  cache.n = max - min + 1;
+ cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
+ [](const llama_kv_cell& cell){ return !cell.is_empty(); });

  // sanity check
- return cache.n >= n_seqs;
+ return llama_kv_cache_slot_info(cache.n >= n_seqs);
  }
  // otherwise, one cell per token.

  if (n_tokens > cache.size) {
  LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
- return false;
+ return llama_kv_cache_slot_info_failed;
  }

  uint32_t n_tested = 0;
@@ -3715,7 +3751,7 @@ static bool llama_kv_cache_find_slot(

  if (n_tested >= cache.size) {
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
- return false;
+ return llama_kv_cache_slot_info_failed;
  }
  }

@@ -3732,7 +3768,7 @@ static bool llama_kv_cache_find_slot(

  cache.used += n_tokens;

- return true;
+ return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
  }

  // find how many cells are currently in use
@@ -4008,6 +4044,53 @@ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams)
  return cparams.flash_attn ? 256u : 32u;
  }

+ // saves the kv_cache state for future recovery.
+ // used to rollback llama_kv_cache_find_slot changes.
+ struct llama_kv_slot_restorer {
+ struct llama_kv_cache_state {
+ uint32_t head = 0;
+ uint32_t n = 0;
+ } old_state;
+
+ // for non-recurrent models only
+ // list of slots to restore
+ std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
+
+ bool do_restore = false;
+
+ explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+ old_state.head = cache.head;
+ old_state.n = cache.n;
+ }
+
+ // saves a slot information for future restoration
+ void save(const struct llama_kv_cache_slot_info & slot) {
+ if (slot) {
+ do_restore = true;
+ if (slot.boundaries.first != slot.boundaries.second) {
+ slot_boundaries.push_back(slot.boundaries);
+ }
+ }
+ }
+
+ // must be explicitly called to restore the kv_cache state
+ // and rollback changes from all llama_kv_cache_find_slot calls
+ void restore(struct llama_kv_cache & cache) {
+ if (do_restore) {
+ cache.head = old_state.head;
+ cache.n = old_state.n;
+
+ if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
+ llama_kv_cache_seq_rm(cache, -1, -1, -1);
+ } else {
+ for (auto & slot : slot_boundaries) {
+ llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+ }
+ }
+ }
+ }
+ };
+
  //
  // model loading and saving
  //
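Note on the hunk above: the restorer exists so that a failed graph compute can undo the cells claimed by llama_kv_cache_find_slot. A minimal sketch of the intended pattern (the same one the llama_decode_internal hunks further below adopt; kv_self, ubatch and compute_status are assumed to be in scope):

    llama_kv_slot_restorer restorer(kv_self);                     // snapshot cache.head / cache.n
    const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);  // claims cells on success
    if (slot) {
        restorer.save(slot);                    // remember the claimed [begin, end) range
        // ... build and run the compute graph ...
        if (compute_status != LM_GGML_STATUS_SUCCESS) {
            restorer.restore(kv_self);          // roll the cache back to the snapshot
        }
    }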
@@ -4223,8 +4306,8 @@ struct llama_model_loader {
  int n_tensors = 0;
  int n_created = 0;

- int64_t n_elements = 0;
- size_t n_bytes = 0;
+ uint64_t n_elements = 0;
+ size_t n_bytes = 0;

  bool use_mmap = false;
  bool check_tensors;
@@ -4795,7 +4878,9 @@ struct llama_model_loader {
  mappings.reserve(files.size());
  mmaps_used.reserve(files.size());
  for (const auto & file : files) {
- std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, lm_ggml_is_numa()));
+ auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU));
+ auto * is_numa_fn = (decltype(lm_ggml_is_numa) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_is_numa");
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
  mmaps_used.emplace_back(mapping->size, 0);
  if (mlock_mmaps) {
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -5238,6 +5323,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_1B: return "1B";
  case MODEL_1_3B: return "1.3B";
  case MODEL_1_4B: return "1.4B";
+ case MODEL_1_5B: return "1.5B";
  case MODEL_1_6B: return "1.6B";
  case MODEL_2B: return "2B";
  case MODEL_2_8B: return "2.8B";
@@ -5256,6 +5342,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_16B: return "16B";
  case MODEL_20B: return "20B";
  case MODEL_30B: return "30B";
+ case MODEL_32B: return "32B";
  case MODEL_34B: return "34B";
  case MODEL_35B: return "35B";
  case MODEL_40B: return "40B";
@@ -5291,6 +5378,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
  }
  }

+ static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+ model.n_elements = ml.n_elements;
+ model.n_bytes = ml.n_bytes;
+ }
+
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
  model.arch = ml.get_arch();
  if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -5609,8 +5701,12 @@ static void llm_load_hparams(
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
  case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
+ case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
  case 32: model.type = e_model::MODEL_7B; break;
+ case 36: model.type = e_model::MODEL_3B; break;
  case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_14B; break;
+ case 64: model.type = e_model::MODEL_32B; break;
  case 80: model.type = e_model::MODEL_70B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
@@ -5820,6 +5916,17 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_OLMO2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 16: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_OLMOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7022,7 +7129,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
  {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
- {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_RWKV_WKV}},
+ {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_RWKV_WKV6}},
  {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  {LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
@@ -7092,12 +7199,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
  } break;
  case LM_GGML_OP_ADD:
  {
- lm_ggml_tensor * a = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, w->ne[0], 512);
+ lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  op_tensor = lm_ggml_add(ctx, a, w);
  } break;
  case LM_GGML_OP_MUL:
  {
- lm_ggml_tensor * a = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, w->ne[0], 512);
+ lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  op_tensor = lm_ggml_mul(ctx, a, w);
  } break;
  case LM_GGML_OP_DIV:
@@ -7138,7 +7245,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
  lm_ggml_tensor * C = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
  op_tensor = lm_ggml_ssm_scan(ctx, s, x, dt, w, B, C);
  } break;
- case LM_GGML_OP_RWKV_WKV:
+ case LM_GGML_OP_RWKV_WKV6:
  {
  // FIXME
  const int64_t S = 123;
@@ -7151,7 +7258,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
  lm_ggml_tensor * tf = w;
  lm_ggml_tensor * td = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
  lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
- op_tensor = lm_ggml_rwkv_wkv(ctx, k, v, r, tf, td, state);
+ op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
  } break;
  default:
  LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
@@ -7200,7 +7307,7 @@ static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
  auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
  auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
  auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
- lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_cpu_get_extra_bufts");
+ lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
  if (lm_ggml_backend_dev_get_extra_bufts_fn) {
  lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
  while (extra_bufts && *extra_bufts) {
@@ -7467,7 +7574,7 @@ static bool llm_load_tensors(

  // avoid using a host buffer when using mmap
  auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
- if (ml.use_mmap && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
+ if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
  auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
  buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
  }
@@ -8502,6 +8609,31 @@ static bool llm_load_tensors(
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  }
  } break;
+ case LLM_ARCH_OLMO2:
+ {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
  case LLM_ARCH_OLMOE:
  {
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -9074,6 +9206,10 @@ static bool llm_load_tensors(

  // check if it is possible to use buffer_from_host_ptr with this buffer type
  lm_ggml_backend_dev_t dev = lm_ggml_backend_buft_get_device(buft);
+ if (!dev) {
+ // FIXME: workaround for CPU backend buft having a NULL device
+ dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ }
  lm_ggml_backend_dev_props props;
  lm_ggml_backend_dev_get_props(dev, &props);
  bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
@@ -9145,7 +9281,7 @@ static bool llm_load_tensors(

  // print memory requirements per buffer type
  for (auto & buf : model.bufs) {
- LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  }

  // populate tensors_by_name
@@ -9198,6 +9334,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
  }

+ llm_load_stats(ml, model);
  llm_load_print_meta(ml, model);

  if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
@@ -10094,7 +10231,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  v = lm_ggml_transpose(ctx, v);
  r = lm_ggml_transpose(ctx, r);

- struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+ struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
  cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
  *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));

@@ -14362,6 +14499,130 @@ struct llm_build_context {
  return gf;
  }

+ struct lm_ggml_cgraph * build_olmo2() {
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct lm_ggml_tensor * inpSA = inpL;
+
+ cur = inpL;
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = lm_ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur_rope", il);
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur_rope", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_ffn(ctx0, lctx, ffn_inp,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  // based on the build_qwen2moe() function, changes:
  // * removed shared experts
  // * removed bias
@@ -16554,6 +16815,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
  {
  result = llm.build_olmo();
  } break;
+ case LLM_ARCH_OLMO2:
+ {
+ result = llm.build_olmo2();
+ } break;
  case LLM_ARCH_OLMOE:
  {
  result = llm.build_olmoe();
@@ -17189,14 +17454,16 @@ static void llama_output_reorder(struct llama_context * ctx) {
  }
  }

- static void llama_graph_compute(
+ // returns the result of lm_ggml_backend_sched_graph_compute_async execution
+ static enum lm_ggml_status llama_graph_compute(
  llama_context & lctx,
  lm_ggml_cgraph * gf,
  int n_threads,
  lm_ggml_threadpool * threadpool) {
  if (lctx.backend_cpu != nullptr) {
- lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
- lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+ auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_get_device(lctx.backend_cpu));
+ auto * set_threadpool_fn = (decltype(lm_ggml_backend_cpu_set_threadpool) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_set_threadpool");
+ set_threadpool_fn(lctx.backend_cpu, threadpool);
  }

  // set the number of threads for all the backends
@@ -17204,15 +17471,20 @@ static void llama_graph_compute(
  set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
  }

- auto err = lm_ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
- if (err != LM_GGML_STATUS_SUCCESS) {
- LLAMA_LOG_ERROR("%s: lm_ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
+ auto status = lm_ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
+ if (status != LM_GGML_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: lm_ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
  }

  // fprintf(stderr, "splits: %d\n", lm_ggml_backend_sched_get_n_splits(lctx.sched));
+
+ return status;
  }

  // decode a batch of tokens by evaluating the transformer
+ // in case of unsuccessful decoding (error or warning),
+ // the kv_cache state will be returned to its original state
+ // (for non-recurrent models) or cleaned (for recurrent models)
  //
  // - lctx: llama context
  // - batch: batch to evaluate
@@ -17262,6 +17534,7 @@ static int llama_decode_internal(
  lctx.n_queued_tokens += n_tokens_all;

  auto & kv_self = lctx.kv_self;
+ llama_kv_slot_restorer kv_slot_restorer(kv_self);

  const int64_t n_embd = hparams.n_embd;
  const int64_t n_vocab = hparams.n_vocab;
@@ -17346,9 +17619,11 @@ static int llama_decode_internal(
  kv_self.head = 0;
  }

- if (!llama_kv_cache_find_slot(kv_self, ubatch)) {
+ const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
  return 1;
  }
+ kv_slot_restorer.save(slot);

  if (!kv_self.recurrent) {
  // a heuristic, to avoid attending the full cache if it is not yet utilized
@@ -17395,7 +17670,19 @@ static int llama_decode_internal(

  llama_set_inputs(lctx, ubatch);

- llama_graph_compute(lctx, gf, n_threads, threadpool);
+ const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+ if (compute_status != LM_GGML_STATUS_SUCCESS) {
+ kv_slot_restorer.restore(kv_self);
+ switch (compute_status) {
+ case LM_GGML_STATUS_ABORTED:
+ return 2;
+ case LM_GGML_STATUS_ALLOC_FAILED:
+ return -2;
+ case LM_GGML_STATUS_FAILED:
+ default:
+ return -3;
+ }
+ }

  // update the kv ring buffer
  {
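With these hunks, llama_decode (and llama_encode below) can distinguish an abort from a hard failure instead of silently continuing. A hedged caller-side sketch based on the codes above (0 = success, 1 = no KV-cache slot, 2 = aborted, -2 = allocation failed, -3 = backend failure); the recovery strategy itself is illustrative:

    const int ret = llama_decode(ctx, batch);
    if (ret == 1) {
        // no free KV-cache slot: free cache space or shrink the batch, then retry
    } else if (ret == 2) {
        // aborted via the abort callback; the KV cache was rolled back by the restorer
    } else if (ret < 0) {
        // -2 = allocation failed, -3 = graph compute failed: treat as fatal for this context
    }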
@@ -17632,7 +17919,18 @@ static int llama_encode_internal(

  llama_set_inputs(lctx, ubatch);

- llama_graph_compute(lctx, gf, n_threads, threadpool);
+ const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+ switch (compute_status) {
+ case LM_GGML_STATUS_SUCCESS:
+ break;
+ case LM_GGML_STATUS_ABORTED:
+ return 2;
+ case LM_GGML_STATUS_ALLOC_FAILED:
+ return -2;
+ case LM_GGML_STATUS_FAILED:
+ default:
+ return -3;
+ }

  // extract embeddings
  if (embd) {
@@ -17932,13 +18230,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
  static void llama_kv_cache_update_internal(struct llama_context & lctx) {
  bool need_reserve = false;

- // apply K-shift if needed
- if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
- if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
- LM_GGML_ABORT("Deepseek2 does not support K-shift");
+ if (lctx.kv_self.has_shift) {
+ if (!llama_kv_cache_can_shift(&lctx)) {
+ LM_GGML_ABORT("The current context does not support K-shift");
  }

- {
+ // apply K-shift if needed
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
  lm_ggml_backend_sched_reset(lctx.sched.get());

  lm_ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
@@ -18511,6 +18809,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  llama_model model;
  llm_load_arch(ml, model);
  llm_load_hparams(ml, model);
+ llm_load_stats(ml, model);

  struct quantize_state_internal qs(model, params);

@@ -19081,6 +19380,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
  //
  struct llama_model_params llama_model_default_params() {
  struct llama_model_params result = {
+ /*.devices =*/ nullptr,
  /*.n_gpu_layers =*/ 0,
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
@@ -19198,7 +19498,11 @@ void llama_backend_init(void) {

  void llama_numa_init(enum lm_ggml_numa_strategy numa) {
  if (numa != LM_GGML_NUMA_STRATEGY_DISABLED) {
- lm_ggml_numa_init(numa);
+ auto * dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ LM_GGML_ASSERT(dev && "CPU backend is not loaded");
+ auto * reg = lm_ggml_backend_dev_backend_reg(dev);
+ auto * numa_init_fn = (decltype(lm_ggml_numa_init) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_numa_init");
+ numa_init_fn(numa);
  }
  }

@@ -19289,19 +19593,24 @@ struct llama_model * llama_load_model_from_file(
  }

  // create list of devices to use with this model
- // currently, we use all available devices
- // TODO: rework API to give user more control over device selection
- for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
- switch (lm_ggml_backend_dev_type(dev)) {
- case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
- case LM_GGML_BACKEND_DEVICE_TYPE_ACCEL:
- // skip CPU backends since they are handled separately
- break;
+ if (params.devices) {
+ for (lm_ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+ model->devices.push_back(*dev);
+ }
+ } else {
+ // use all available devices
+ for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
+ switch (lm_ggml_backend_dev_type(dev)) {
+ case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
+ case LM_GGML_BACKEND_DEVICE_TYPE_ACCEL:
+ // skip CPU backends since they are handled separately
+ break;

- case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
- model->devices.push_back(dev);
- break;
+ case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
+ model->devices.push_back(dev);
+ break;
+ }
  }
  }

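The new devices field in llama_model_params (added to the defaults in the hunk before this one) lets a caller pin a model to an explicit, null-terminated device list instead of using every registered device. A hedged sketch; the GPU lookup is an assumption and may return null when no GPU backend is compiled in:

    #include <vector>

    std::vector<lm_ggml_backend_dev_t> devs;
    if (auto * gpu = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU)) {
        devs.push_back(gpu);
    }
    devs.push_back(nullptr); // the list must be null-terminated

    llama_model_params mparams = llama_model_default_params();
    mparams.devices = devs.data();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);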
@@ -19472,9 +19781,6 @@ struct llama_context * llama_new_context_with_model(
  __func__, n_ctx_per_seq, hparams.n_ctx_train);
  }

- ctx->abort_callback = params.abort_callback;
- ctx->abort_callback_data = params.abort_callback_data;
-
  ctx->logits_all = params.logits_all;

  // build worst-case graph for encoder if a model contains encoder
@@ -19523,7 +19829,7 @@ struct llama_context * llama_new_context_with_model(
  }

  // add CPU backend
- ctx->backend_cpu = lm_ggml_backend_cpu_init();
+ ctx->backend_cpu = lm_ggml_backend_init_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
  if (ctx->backend_cpu == nullptr) {
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
  llama_free(ctx);
@@ -19543,6 +19849,8 @@ struct llama_context * llama_new_context_with_model(
  }
  }

+ llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
  if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
@@ -19588,7 +19896,8 @@ struct llama_context * llama_new_context_with_model(
  std::vector<lm_ggml_backend_t> backend_ptrs;
  for (auto & backend : ctx->backends) {
  auto * buft = lm_ggml_backend_get_default_buffer_type(backend.get());
- if (lm_ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
+ auto backend_type = lm_ggml_backend_dev_type(lm_ggml_backend_get_device(backend.get()));
+ if (backend_type == LM_GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
  // use the host buffer of the first device CPU for faster transfer of the intermediate state
  auto * dev = model->devices[0];
  auto * host_buft = lm_ggml_backend_dev_host_buffer_type(dev);
@@ -19616,7 +19925,8 @@ struct llama_context * llama_new_context_with_model(
  // pipeline parallelism requires support for async compute and events in all devices
  if (pipeline_parallel) {
  for (auto & backend : ctx->backends) {
- if (lm_ggml_backend_is_cpu(backend.get())) {
+ auto dev_type = lm_ggml_backend_dev_type(lm_ggml_backend_get_device(backend.get()));
+ if (dev_type == LM_GGML_BACKEND_DEVICE_TYPE_CPU) {
  // ignore CPU backend
  continue;
  }
@@ -19790,6 +20100,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_QWEN:
  case LLM_ARCH_QWEN2:
  case LLM_ARCH_QWEN2MOE:
+ case LLM_ARCH_OLMO2:
  case LLM_ARCH_OLMOE:
  case LLM_ARCH_PHI2:
  case LLM_ARCH_PHI3:
@@ -19863,19 +20174,11 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
  }

  uint64_t llama_model_size(const struct llama_model * model) {
- uint64_t size = 0;
- for (const auto & it : model->tensors_by_name) {
- size += lm_ggml_nbytes(it.second);
- }
- return size;
+ return model->n_bytes;
  }

  uint64_t llama_model_n_params(const struct llama_model * model) {
- uint64_t nparams = 0;
- for (const auto & it : model->tensors_by_name) {
- nparams += lm_ggml_nelements(it.second);
- }
- return nparams;
+ return model->n_elements;
  }

  struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
@@ -20189,6 +20492,10 @@ void llama_kv_cache_update(struct llama_context * ctx) {
  llama_kv_cache_update_internal(*ctx);
  }

+ bool llama_kv_cache_can_shift(struct llama_context * ctx) {
+ return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+ }
+
  // deprecated
  size_t llama_get_state_size(struct llama_context * ctx) {
  return llama_state_get_size(ctx);
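A usage note for the new capability check above: context shifting ("keep n_keep tokens, discard n_discard, slide the rest back") should now be guarded by llama_kv_cache_can_shift, since recurrent models and DeepSeek2 cannot K-shift. A hedged sketch; n_keep, n_past and n_discard are illustrative variables, not part of this diff:

    if (llama_kv_cache_can_shift(ctx)) {
        // drop the oldest n_discard tokens after the kept prefix, then slide the rest back
        llama_kv_cache_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
        llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);
        n_past -= n_discard;
    } else {
        // fall back to clearing the cache and re-evaluating the prompt
        llama_kv_cache_clear(ctx);
    }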
@@ -21173,6 +21480,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
  ctx->abort_callback = abort_callback;
  ctx->abort_callback_data = abort_callback_data;
+
+ for (auto & backend : ctx->backends) {
+ auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_get_device(backend.get()));
+ auto * set_abort_callback_fn = (lm_ggml_backend_set_abort_callback_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_set_abort_callback");
+ if (set_abort_callback_fn) {
+ set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+ }
+ }
  }

  void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
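Since llama_set_abort_callback now forwards the callback to every backend that exposes "lm_ggml_backend_set_abort_callback" (and is called during llama_new_context_with_model), a single flag can interrupt generation on any device. A hedged sketch; the atomic flag and lambda are illustrative:

    #include <atomic>

    static std::atomic<bool> g_cancel{false};

    // a capture-less lambda decays to the required bool (*)(void *) signature
    llama_set_abort_callback(ctx,
        [](void * /*data*/) -> bool { return g_cancel.load(); },
        /*abort_callback_data=*/nullptr);

    // from another thread: g_cancel = true;  a running llama_decode() then returns 2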
@@ -21810,8 +22125,11 @@ static int32_t llama_chat_apply_template_internal(
  // IBM Granite template
  for (const auto & message : chat) {
  std::string role(message->role);
- ss << "<|start_of_role|>" << role << "<|end_of_role|>"
- << message->content << "<|end_of_text|>\n";
+ ss << "<|start_of_role|>" << role << "<|end_of_role|>";
+ if (role == "assistant_tool_call") {
+ ss << "<|tool_call|>";
+ }
+ ss << message->content << "<|end_of_text|>\n";
  }
  if (add_ass) {
  ss << "<|start_of_role|>assistant<|end_of_role|>\n";
@@ -21911,33 +22229,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
  }

  const char * llama_print_system_info(void) {
- lm_ggml_cpu_init(); // some ARM features are detected at runtime
-
  static std::string s;

- s = "";
- s += "AVX = " + std::to_string(lm_ggml_cpu_has_avx()) + " | ";
- s += "AVX_VNNI = " + std::to_string(lm_ggml_cpu_has_avx_vnni()) + " | ";
- s += "AVX2 = " + std::to_string(lm_ggml_cpu_has_avx2()) + " | ";
- s += "AVX512 = " + std::to_string(lm_ggml_cpu_has_avx512()) + " | ";
- s += "AVX512_VBMI = " + std::to_string(lm_ggml_cpu_has_avx512_vbmi()) + " | ";
- s += "AVX512_VNNI = " + std::to_string(lm_ggml_cpu_has_avx512_vnni()) + " | ";
- s += "AVX512_BF16 = " + std::to_string(lm_ggml_cpu_has_avx512_bf16()) + " | ";
- s += "AMX_INT8 = " + std::to_string(lm_ggml_cpu_has_amx_int8()) + " | ";
- s += "FMA = " + std::to_string(lm_ggml_cpu_has_fma()) + " | ";
- s += "NEON = " + std::to_string(lm_ggml_cpu_has_neon()) + " | ";
- s += "SVE = " + std::to_string(lm_ggml_cpu_has_sve()) + " | ";
- s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | ";
- s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | ";
- s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | ";
- s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
- s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
- s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
- s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
- s += "SSSE3 = " + std::to_string(lm_ggml_cpu_has_ssse3()) + " | ";
- s += "VSX = " + std::to_string(lm_ggml_cpu_has_vsx()) + " | ";
- s += "MATMUL_INT8 = " + std::to_string(lm_ggml_cpu_has_matmul_int8()) + " | ";
- s += "LLAMAFILE = " + std::to_string(lm_ggml_cpu_has_llamafile()) + " | ";
+ for (size_t i = 0; i < lm_ggml_backend_reg_count(); i++) {
+ auto * reg = lm_ggml_backend_reg_get(i);
+ auto * get_features_fn = (lm_ggml_backend_get_features_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_get_features");
+ if (get_features_fn) {
+ lm_ggml_backend_feature * features = get_features_fn(reg);
+ s += lm_ggml_backend_reg_name(reg);
+ s += " : ";
+ for (; features->name; features++) {
+ s += features->name;
+ s += " = ";
+ s += features->value;
+ s += " | ";
+ }
+ }
+ }

  return s.c_str();
  }
@@ -21978,28 +22286,6 @@ void llama_perf_context_reset(struct llama_context * ctx) {
  ctx->t_p_eval_us = ctx->n_p_eval = 0;
  }

- void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
- fprintf(stream, "\n");
- fprintf(stream, "###########\n");
- fprintf(stream, "# Timings #\n");
- fprintf(stream, "###########\n");
- fprintf(stream, "\n");
-
- fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
- 1.0e-3 * ctx->t_eval_us / ctx->n_eval);
- fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
- 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
- fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
- fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
- fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
- fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
- fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
- fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
- 1.0e6 * ctx->n_eval / ctx->t_eval_us);
- fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
- 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
- }
-
  // For internal test use
  const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(
  struct llama_context * ctx