cui-llama.rn 1.2.6 → 1.3.3
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- package/README.md +3 -2
- package/android/src/main/CMakeLists.txt +26 -6
- package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
- package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
- package/android/src/main/jni.cpp +228 -40
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/cpp/amx/amx.cpp +196 -0
- package/cpp/amx/amx.h +20 -0
- package/cpp/amx/common.h +101 -0
- package/cpp/amx/mmq.cpp +2524 -0
- package/cpp/amx/mmq.h +16 -0
- package/cpp/common.cpp +118 -251
- package/cpp/common.h +53 -30
- package/cpp/ggml-aarch64.c +46 -3395
- package/cpp/ggml-aarch64.h +0 -20
- package/cpp/ggml-alloc.c +6 -8
- package/cpp/ggml-backend-impl.h +33 -11
- package/cpp/ggml-backend-reg.cpp +423 -0
- package/cpp/ggml-backend.cpp +14 -676
- package/cpp/ggml-backend.h +46 -9
- package/cpp/ggml-common.h +6 -0
- package/cpp/ggml-cpu-aarch64.c +3823 -0
- package/cpp/ggml-cpu-aarch64.h +32 -0
- package/cpp/ggml-cpu-impl.h +14 -242
- package/cpp/ggml-cpu-quants.c +10835 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu.c +13971 -13720
- package/cpp/ggml-cpu.cpp +715 -0
- package/cpp/ggml-cpu.h +65 -63
- package/cpp/ggml-impl.h +285 -25
- package/cpp/ggml-metal.h +8 -8
- package/cpp/ggml-metal.m +1221 -728
- package/cpp/ggml-quants.c +189 -10681
- package/cpp/ggml-quants.h +78 -125
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +12 -0
- package/cpp/ggml.c +688 -1460
- package/cpp/ggml.h +58 -244
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-sampling.cpp +5 -2
- package/cpp/llama.cpp +409 -123
- package/cpp/llama.h +8 -4
- package/cpp/rn-llama.hpp +89 -25
- package/cpp/sampling.cpp +42 -3
- package/cpp/sampling.h +22 -1
- package/cpp/sgemm.cpp +608 -0
- package/cpp/speculative.cpp +270 -0
- package/cpp/speculative.h +28 -0
- package/cpp/unicode.cpp +11 -0
- package/ios/RNLlama.mm +43 -20
- package/ios/RNLlamaContext.h +9 -3
- package/ios/RNLlamaContext.mm +146 -33
- package/jest/mock.js +0 -1
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +4 -2
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +52 -15
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +2 -1
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +51 -15
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +122 -8
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts +5 -6
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +15 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/NativeRNLlama.ts +135 -13
- package/src/grammar.ts +10 -8
- package/src/index.ts +104 -28
package/cpp/llama.cpp
CHANGED
@@ -190,6 +190,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_OLMO2,
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
@@ -243,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_DBRX, "dbrx" },
     { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_OLMO2, "olmo2" },
     { LLM_ARCH_OLMOE, "olmoe" },
     { LLM_ARCH_OPENELM, "openelm" },
     { LLM_ARCH_ARCTIC, "arctic" },
@@ -1218,6 +1220,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OLMO2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_OLMOE,
         {
@@ -2312,6 +2333,7 @@ enum e_model {
     MODEL_1B,
     MODEL_1_3B,
     MODEL_1_4B,
+    MODEL_1_5B,
     MODEL_1_6B,
     MODEL_2B,
     MODEL_2_8B,
@@ -2330,6 +2352,7 @@ enum e_model {
     MODEL_16B,
     MODEL_20B,
     MODEL_30B,
+    MODEL_32B,
     MODEL_34B,
     MODEL_35B,
     MODEL_40B,
@@ -2917,9 +2940,15 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct lm_ggml_tensor *>> tensors_by_name;

-    int64_t t_load_us
+    int64_t t_load_us = 0;
     int64_t t_start_us = 0;

+    // total number of parameters in the model
+    uint64_t n_elements = 0;
+
+    // total size of all the tensors in the model in bytes
+    size_t n_bytes = 0;
+
     // keep track of loaded lora adapters
     std::set<struct llama_lora_adapter *> lora_adapters;

@@ -3464,21 +3493,13 @@ static bool llama_kv_cache_init(
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

-
+        lm_ggml_backend_buffer_type_t buft;
         if (offload) {
-
+            auto * dev = model.dev_layer.at(i).dev;
+            buft = lm_ggml_backend_dev_buffer_type(dev);
         } else {
-
+            buft = lm_ggml_backend_cpu_buffer_type();
         }
-        lm_ggml_backend_buffer_type_t buft = select_buft(*buft_list,
-            [&](lm_ggml_context * ctx) {
-                lm_ggml_tensor * k = lm_ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-                if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
-                    return k;
-                }
-                lm_ggml_tensor * p = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, 1);
-                return lm_ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
-            });
         lm_ggml_context * ctx = ctx_for_buft(buft);

         if (!ctx) {
@@ -3512,11 +3533,24 @@ static bool llama_kv_cache_init(
     return true;
 }

+// a structure holds information about the slot found in llama_kv_cache_find_slot
+struct llama_kv_cache_slot_info {
+    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+    bool found = false;                       // the slot was found
+
+    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
+    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
+
+    operator bool() const { return found; }
+};
+static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
+
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
+// returns a structure holding information about the slot found
 // Note: On success, it's important that cache.head points
 // to the first cell of the slot.
-static
+static struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
        struct llama_kv_cache & cache,
        const struct llama_ubatch & batch) {
     const uint32_t n_tokens = batch.n_tokens;
@@ -3544,7 +3578,7 @@ static bool llama_kv_cache_find_slot(
                 // too big seq_id
                 // TODO: would it be possible to resize the cache instead?
                 LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
-                return
+                return llama_kv_cache_slot_info_failed;
             }
             if (j > 0) {
                 llama_kv_cell & seq = cache.cells[seq_id];
@@ -3679,15 +3713,17 @@ static bool llama_kv_cache_find_slot(
         // allow getting the range of used cells, from head to head + n
         cache.head = min;
         cache.n = max - min + 1;
+        cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
+            [](const llama_kv_cell& cell){ return !cell.is_empty(); });

         // sanity check
-        return cache.n >= n_seqs;
+        return llama_kv_cache_slot_info(cache.n >= n_seqs);
     }
     // otherwise, one cell per token.

     if (n_tokens > cache.size) {
         LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
-        return
+        return llama_kv_cache_slot_info_failed;
     }

     uint32_t n_tested = 0;
@@ -3715,7 +3751,7 @@ static bool llama_kv_cache_find_slot(

         if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
-            return
+            return llama_kv_cache_slot_info_failed;
         }
     }

@@ -3732,7 +3768,7 @@ static bool llama_kv_cache_find_slot(

     cache.used += n_tokens;

-    return
+    return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
 }

 // find how many cells are currently in use
@@ -4008,6 +4044,53 @@ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams)
     return cparams.flash_attn ? 256u : 32u;
 }

+// saves the kv_cache state for future recovery.
+// used to rollback llama_kv_cache_find_slot changes.
+struct llama_kv_slot_restorer {
+    struct llama_kv_cache_state {
+        uint32_t head = 0;
+        uint32_t n = 0;
+    } old_state;
+
+    // for non-recurrent models only
+    // list of slots to restore
+    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
+
+    bool do_restore = false;
+
+    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+        old_state.head = cache.head;
+        old_state.n = cache.n;
+    }
+
+    // saves a slot information for future restoration
+    void save(const struct llama_kv_cache_slot_info & slot) {
+        if (slot) {
+            do_restore = true;
+            if (slot.boundaries.first != slot.boundaries.second) {
+                slot_boundaries.push_back(slot.boundaries);
+            }
+        }
+    }
+
+    // must be explicitly called to restore the kv_cache state
+    // and rollback changes from all llama_kv_cache_find_slot calls
+    void restore(struct llama_kv_cache & cache) {
+        if (do_restore) {
+            cache.head = old_state.head;
+            cache.n = old_state.n;
+
+            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
+                llama_kv_cache_seq_rm(cache, -1, -1, -1);
+            } else {
+                for (auto & slot : slot_boundaries) {
+                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+                }
+            }
+        }
+    }
+};
+
 //
 // model loading and saving
 //
@@ -4223,8 +4306,8 @@ struct llama_model_loader {
     int n_tensors = 0;
     int n_created = 0;

-
-    size_t n_bytes
+    uint64_t n_elements = 0;
+    size_t n_bytes = 0;

     bool use_mmap = false;
     bool check_tensors;
@@ -4795,7 +4878,9 @@ struct llama_model_loader {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
-
+            auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU));
+            auto * is_numa_fn = (decltype(lm_ggml_is_numa) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_is_numa");
+            std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
             mmaps_used.emplace_back(mapping->size, 0);
             if (mlock_mmaps) {
                 std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -5238,6 +5323,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B: return "1B";
         case MODEL_1_3B: return "1.3B";
         case MODEL_1_4B: return "1.4B";
+        case MODEL_1_5B: return "1.5B";
         case MODEL_1_6B: return "1.6B";
         case MODEL_2B: return "2B";
         case MODEL_2_8B: return "2.8B";
@@ -5256,6 +5342,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_16B: return "16B";
         case MODEL_20B: return "20B";
         case MODEL_30B: return "30B";
+        case MODEL_32B: return "32B";
         case MODEL_34B: return "34B";
         case MODEL_35B: return "35B";
         case MODEL_40B: return "40B";
@@ -5291,6 +5378,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     }
 }

+static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+    model.n_elements = ml.n_elements;
+    model.n_bytes = ml.n_bytes;
+}
+
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -5609,8 +5701,12 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
+                    case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
                     case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+                    case 48: model.type = e_model::MODEL_14B; break;
+                    case 64: model.type = e_model::MODEL_32B; break;
                     case 80: model.type = e_model::MODEL_70B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -5820,6 +5916,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_OLMO2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 16: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7022,7 +7129,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
-    {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING,
+    {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_RWKV_WKV6}},
     {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
@@ -7092,12 +7199,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
             } break;
         case LM_GGML_OP_ADD:
             {
-                lm_ggml_tensor * a =
+                lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = lm_ggml_add(ctx, a, w);
             } break;
         case LM_GGML_OP_MUL:
            {
-                lm_ggml_tensor * a =
+                lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = lm_ggml_mul(ctx, a, w);
             } break;
         case LM_GGML_OP_DIV:
@@ -7138,7 +7245,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
                 lm_ggml_tensor * C = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
                 op_tensor = lm_ggml_ssm_scan(ctx, s, x, dt, w, B, C);
             } break;
-        case
+        case LM_GGML_OP_RWKV_WKV6:
             {
                 // FIXME
                 const int64_t S = 123;
@@ -7151,7 +7258,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
                 lm_ggml_tensor * tf = w;
                 lm_ggml_tensor * td = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
                 lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
-                op_tensor =
+                op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
             } break;
         default:
             LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
@@ -7200,7 +7307,7 @@ static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
     auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
     auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
     auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
-        lm_ggml_backend_reg_get_proc_address(cpu_reg, "
+        lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
     if (lm_ggml_backend_dev_get_extra_bufts_fn) {
         lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
         while (extra_bufts && *extra_bufts) {
@@ -7467,7 +7574,7 @@ static bool llm_load_tensors(

         // avoid using a host buffer when using mmap
         auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
-        if (ml.use_mmap && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
+        if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
            auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
            buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
         }
@@ -8502,6 +8609,31 @@ static bool llm_load_tensors(
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_OLMO2:
+                {
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_OLMOE:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -9074,6 +9206,10 @@ static bool llm_load_tensors(

         // check if it is possible to use buffer_from_host_ptr with this buffer type
         lm_ggml_backend_dev_t dev = lm_ggml_backend_buft_get_device(buft);
+        if (!dev) {
+            // FIXME: workaround for CPU backend buft having a NULL device
+            dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+        }
         lm_ggml_backend_dev_props props;
         lm_ggml_backend_dev_get_props(dev, &props);
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
@@ -9145,7 +9281,7 @@ static bool llm_load_tensors(

     // print memory requirements per buffer type
     for (auto & buf : model.bufs) {
-        LLAMA_LOG_INFO("%s: %
+        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }

     // populate tensors_by_name
@@ -9198,6 +9334,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
     }

+    llm_load_stats(ml, model);
     llm_load_print_meta(ml, model);

     if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
@@ -10094,7 +10231,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
     v = lm_ggml_transpose(ctx, v);
     r = lm_ggml_transpose(ctx, r);

-    struct lm_ggml_tensor * wkv_output =
+    struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
     cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
     *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));

@@ -14362,6 +14499,130 @@ struct llm_build_context {
         return gf;
     }

+    struct lm_ggml_cgraph * build_olmo2() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            cur = inpL;
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_ffn(ctx0, lctx, ffn_inp,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // based on the build_qwen2moe() function, changes:
     // * removed shared experts
     // * removed bias
@@ -16554,6 +16815,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
+        case LLM_ARCH_OLMO2:
+            {
+                result = llm.build_olmo2();
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 result = llm.build_olmoe();
@@ -17189,14 +17454,16 @@ static void llama_output_reorder(struct llama_context * ctx) {
     }
 }

-
+// returns the result of lm_ggml_backend_sched_graph_compute_async execution
+static enum lm_ggml_status llama_graph_compute(
        llama_context & lctx,
        lm_ggml_cgraph * gf,
        int n_threads,
        lm_ggml_threadpool * threadpool) {
     if (lctx.backend_cpu != nullptr) {
-
-
+        auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_get_device(lctx.backend_cpu));
+        auto * set_threadpool_fn = (decltype(lm_ggml_backend_cpu_set_threadpool) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_set_threadpool");
+        set_threadpool_fn(lctx.backend_cpu, threadpool);
     }

     // set the number of threads for all the backends
@@ -17204,15 +17471,20 @@ static void llama_graph_compute(
         set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
     }

-    auto
-    if (
-        LLAMA_LOG_ERROR("%s: lm_ggml_backend_sched_graph_compute_async failed with error %d\n", __func__,
+    auto status = lm_ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
+    if (status != LM_GGML_STATUS_SUCCESS) {
+        LLAMA_LOG_ERROR("%s: lm_ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
     }

     // fprintf(stderr, "splits: %d\n", lm_ggml_backend_sched_get_n_splits(lctx.sched));
+
+    return status;
 }

 // decode a batch of tokens by evaluating the transformer
+// in case of unsuccessful decoding (error or warning),
+// the kv_cache state will be returned to its original state
+// (for non-recurrent models) or cleaned (for recurrent models)
 //
 // - lctx: llama context
 // - batch: batch to evaluate
@@ -17262,6 +17534,7 @@ static int llama_decode_internal(
     lctx.n_queued_tokens += n_tokens_all;

     auto & kv_self = lctx.kv_self;
+    llama_kv_slot_restorer kv_slot_restorer(kv_self);

     const int64_t n_embd = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
@@ -17346,9 +17619,11 @@ static int llama_decode_internal(
                 kv_self.head = 0;
             }

-
+            const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+            if (!slot) {
                 return 1;
             }
+            kv_slot_restorer.save(slot);

             if (!kv_self.recurrent) {
                 // a heuristic, to avoid attending the full cache if it is not yet utilized
@@ -17395,7 +17670,19 @@ static int llama_decode_internal(

         llama_set_inputs(lctx, ubatch);

-        llama_graph_compute(lctx, gf, n_threads, threadpool);
+        const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+        if (compute_status != LM_GGML_STATUS_SUCCESS) {
+            kv_slot_restorer.restore(kv_self);
+            switch (compute_status) {
+                case LM_GGML_STATUS_ABORTED:
+                    return 2;
+                case LM_GGML_STATUS_ALLOC_FAILED:
+                    return -2;
+                case LM_GGML_STATUS_FAILED:
+                default:
+                    return -3;
+            }
+        }

         // update the kv ring buffer
         {
@@ -17632,7 +17919,18 @@ static int llama_encode_internal(

     llama_set_inputs(lctx, ubatch);

-    llama_graph_compute(lctx, gf, n_threads, threadpool);
+    const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+    switch (compute_status) {
+        case LM_GGML_STATUS_SUCCESS:
+            break;
+        case LM_GGML_STATUS_ABORTED:
+            return 2;
+        case LM_GGML_STATUS_ALLOC_FAILED:
+            return -2;
+        case LM_GGML_STATUS_FAILED:
+        default:
+            return -3;
+    }

     // extract embeddings
     if (embd) {
@@ -17932,13 +18230,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 static void llama_kv_cache_update_internal(struct llama_context & lctx) {
     bool need_reserve = false;

-
-
-
-        LM_GGML_ABORT("Deepseek2 does not support K-shift");
+    if (lctx.kv_self.has_shift) {
+        if (!llama_kv_cache_can_shift(&lctx)) {
+            LM_GGML_ABORT("The current context does not support K-shift");
         }

-
+        // apply K-shift if needed
+        if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
             lm_ggml_backend_sched_reset(lctx.sched.get());

             lm_ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
@@ -18511,6 +18809,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llama_model model;
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
+    llm_load_stats(ml, model);

     struct quantize_state_internal qs(model, params);

@@ -19081,6 +19380,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        /*.devices =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
@@ -19198,7 +19498,11 @@ void llama_backend_init(void) {

 void llama_numa_init(enum lm_ggml_numa_strategy numa) {
     if (numa != LM_GGML_NUMA_STRATEGY_DISABLED) {
-
+        auto * dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+        LM_GGML_ASSERT(dev && "CPU backend is not loaded");
+        auto * reg = lm_ggml_backend_dev_backend_reg(dev);
+        auto * numa_init_fn = (decltype(lm_ggml_numa_init) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_numa_init");
+        numa_init_fn(numa);
     }
 }

@@ -19289,19 +19593,24 @@ struct llama_model * llama_load_model_from_file(
     }

     // create list of devices to use with this model
-
-
-
-
-
-
-
-
-
+    if (params.devices) {
+        for (lm_ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+            model->devices.push_back(*dev);
+        }
+    } else {
+        // use all available devices
+        for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
+            lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
+            switch (lm_ggml_backend_dev_type(dev)) {
+                case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
+                case LM_GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;

-
-
-
+                case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
+                    model->devices.push_back(dev);
+                    break;
+            }
         }
     }

@@ -19472,9 +19781,6 @@ struct llama_context * llama_new_context_with_model(
             __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

-    ctx->abort_callback = params.abort_callback;
-    ctx->abort_callback_data = params.abort_callback_data;
-
     ctx->logits_all = params.logits_all;

     // build worst-case graph for encoder if a model contains encoder
@@ -19523,7 +19829,7 @@ struct llama_context * llama_new_context_with_model(
     }

     // add CPU backend
-    ctx->backend_cpu =
+    ctx->backend_cpu = lm_ggml_backend_init_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
     if (ctx->backend_cpu == nullptr) {
         LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
         llama_free(ctx);
@@ -19543,6 +19849,8 @@ struct llama_context * llama_new_context_with_model(
         }
     }

+    llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
     if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
         LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
         llama_free(ctx);
@@ -19588,7 +19896,8 @@ struct llama_context * llama_new_context_with_model(
     std::vector<lm_ggml_backend_t> backend_ptrs;
     for (auto & backend : ctx->backends) {
         auto * buft = lm_ggml_backend_get_default_buffer_type(backend.get());
-
+        auto backend_type = lm_ggml_backend_dev_type(lm_ggml_backend_get_device(backend.get()));
+        if (backend_type == LM_GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
             // use the host buffer of the first device CPU for faster transfer of the intermediate state
             auto * dev = model->devices[0];
             auto * host_buft = lm_ggml_backend_dev_host_buffer_type(dev);
@@ -19616,7 +19925,8 @@ struct llama_context * llama_new_context_with_model(
     // pipeline parallelism requires support for async compute and events in all devices
     if (pipeline_parallel) {
         for (auto & backend : ctx->backends) {
-
+            auto dev_type = lm_ggml_backend_dev_type(lm_ggml_backend_get_device(backend.get()));
+            if (dev_type == LM_GGML_BACKEND_DEVICE_TYPE_CPU) {
                 // ignore CPU backend
                 continue;
             }
@@ -19790,6 +20100,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
@@ -19863,19 +20174,11 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
 }

 uint64_t llama_model_size(const struct llama_model * model) {
-
-    for (const auto & it : model->tensors_by_name) {
-        size += lm_ggml_nbytes(it.second);
-    }
-    return size;
+    return model->n_bytes;
 }

 uint64_t llama_model_n_params(const struct llama_model * model) {
-
-    for (const auto & it : model->tensors_by_name) {
-        nparams += lm_ggml_nelements(it.second);
-    }
-    return nparams;
+    return model->n_elements;
 }

 struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
@@ -20189,6 +20492,10 @@ void llama_kv_cache_update(struct llama_context * ctx) {
     llama_kv_cache_update_internal(*ctx);
 }

+bool llama_kv_cache_can_shift(struct llama_context * ctx) {
+    return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+}
+
 // deprecated
 size_t llama_get_state_size(struct llama_context * ctx) {
     return llama_state_get_size(ctx);
@@ -21173,6 +21480,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
+
+    for (auto & backend : ctx->backends) {
+        auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_get_device(backend.get()));
+        auto * set_abort_callback_fn = (lm_ggml_backend_set_abort_callback_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_set_abort_callback");
+        if (set_abort_callback_fn) {
+            set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+        }
+    }
 }

 void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
@@ -21810,8 +22125,11 @@ static int32_t llama_chat_apply_template_internal(
         // IBM Granite template
         for (const auto & message : chat) {
             std::string role(message->role);
-            ss << "<|start_of_role|>" << role << "<|end_of_role|>"
-
+            ss << "<|start_of_role|>" << role << "<|end_of_role|>";
+            if (role == "assistant_tool_call") {
+                ss << "<|tool_call|>";
+            }
+            ss << message->content << "<|end_of_text|>\n";
         }
         if (add_ass) {
             ss << "<|start_of_role|>assistant<|end_of_role|>\n";
@@ -21911,33 +22229,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 }

 const char * llama_print_system_info(void) {
-    lm_ggml_cpu_init(); // some ARM features are detected at runtime
-
     static std::string s;

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
-    s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
-    s += "SSSE3 = " + std::to_string(lm_ggml_cpu_has_ssse3()) + " | ";
-    s += "VSX = " + std::to_string(lm_ggml_cpu_has_vsx()) + " | ";
-    s += "MATMUL_INT8 = " + std::to_string(lm_ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = " + std::to_string(lm_ggml_cpu_has_llamafile()) + " | ";
+    for (size_t i = 0; i < lm_ggml_backend_reg_count(); i++) {
+        auto * reg = lm_ggml_backend_reg_get(i);
+        auto * get_features_fn = (lm_ggml_backend_get_features_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_get_features");
+        if (get_features_fn) {
+            lm_ggml_backend_feature * features = get_features_fn(reg);
+            s += lm_ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }

     return s.c_str();
 }
@@ -21978,28 +22286,6 @@ void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }

-void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
-    fprintf(stream, "\n");
-    fprintf(stream, "###########\n");
-    fprintf(stream, "# Timings #\n");
-    fprintf(stream, "###########\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
-            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
-    fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
-            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
-    fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
-    fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
-    fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
-    fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
-    fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-    fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
-            1.0e6 * ctx->n_eval / ctx->t_eval_us);
-    fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
-            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
-}
-
 // For internal test use
 const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(
     struct llama_context * ctx